Skip to content

Commit 35ebe4c

Browse files
committed
[Clang][OpenMP] Add partial support for Static Device Libraries
An archive containing device code object files can be passed to the clang command line for linking. For each given offload target, clang creates a device-specific archive, which is passed either to llvm-link if the target is amdgpu, or to clang-nvlink-wrapper if the target is nvptx. The -L/-l flags are used to specify these fat archives on the command line. E.g. clang++ -fopenmp -fopenmp-targets=nvptx64 main.cpp -L. -lmylib It currently doesn't support linking an archive directly, like: clang++ -fopenmp -fopenmp-targets=nvptx64 main.cpp libmylib.a Linking with x86 offload also does not work. Reviewed By: ye-luo Differential Revision: https://reviews.llvm.org/D105191
1 parent dcb0e68 commit 35ebe4c

File tree

9 files changed

+549
-5
lines changed

9 files changed

+549
-5
lines changed

clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp

+4
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,10 @@ const char *AMDGCN::OpenMPLinker::constructLLVMLinkCommand(
114114
}
115115
}
116116

117+
AddStaticDeviceLibsLinking(C, *this, JA, Inputs, Args, CmdArgs, "amdgcn",
118+
SubArchName,
119+
/* bitcode SDL?*/ true,
120+
/* PostClang Link? */ false);
117121
// Add an intermediate output file.
118122
CmdArgs.push_back("-o");
119123
const char *OutputFileName =

clang/lib/Driver/ToolChains/Clang.cpp

+34-3
Original file line numberDiff line numberDiff line change
@@ -7745,12 +7745,28 @@ void OffloadBundler::ConstructJob(Compilation &C, const JobAction &JA,
77457745
Triples += Action::GetOffloadKindName(CurKind);
77467746
Triples += '-';
77477747
Triples += CurTC->getTriple().normalize();
7748-
if ((CurKind == Action::OFK_HIP || CurKind == Action::OFK_OpenMP ||
7749-
CurKind == Action::OFK_Cuda) &&
7748+
if ((CurKind == Action::OFK_HIP || CurKind == Action::OFK_Cuda) &&
77507749
CurDep->getOffloadingArch()) {
77517750
Triples += '-';
77527751
Triples += CurDep->getOffloadingArch();
77537752
}
7753+
7754+
// TODO: Replace parsing of -march flag. Can be done by storing GPUArch
7755+
// with each toolchain.
7756+
StringRef GPUArchName;
7757+
if (CurKind == Action::OFK_OpenMP) {
7758+
// Extract GPUArch from -march argument in TC argument list.
7759+
for (unsigned ArgIndex = 0; ArgIndex < TCArgs.size(); ArgIndex++) {
7760+
auto ArchStr = StringRef(TCArgs.getArgString(ArgIndex));
7761+
auto Arch = ArchStr.startswith_insensitive("-march=");
7762+
if (Arch) {
7763+
GPUArchName = ArchStr.substr(7);
7764+
Triples += "-";
7765+
break;
7766+
}
7767+
}
7768+
Triples += GPUArchName.str();
7769+
}
77547770
}
77557771
CmdArgs.push_back(TCArgs.MakeArgString(Triples));
77567772

@@ -7824,12 +7840,27 @@ void OffloadBundler::ConstructJobMultipleOutputs(
78247840
Triples += '-';
78257841
Triples += Dep.DependentToolChain->getTriple().normalize();
78267842
if ((Dep.DependentOffloadKind == Action::OFK_HIP ||
7827-
Dep.DependentOffloadKind == Action::OFK_OpenMP ||
78287843
Dep.DependentOffloadKind == Action::OFK_Cuda) &&
78297844
!Dep.DependentBoundArch.empty()) {
78307845
Triples += '-';
78317846
Triples += Dep.DependentBoundArch;
78327847
}
7848+
// TODO: Replace parsing of -march flag. Can be done by storing GPUArch
7849+
// with each toolchain.
7850+
StringRef GPUArchName;
7851+
if (Dep.DependentOffloadKind == Action::OFK_OpenMP) {
7852+
// Extract GPUArch from -march argument in TC argument list.
7853+
for (unsigned ArgIndex = 0; ArgIndex < TCArgs.size(); ArgIndex++) {
7854+
StringRef ArchStr = StringRef(TCArgs.getArgString(ArgIndex));
7855+
auto Arch = ArchStr.startswith_insensitive("-march=");
7856+
if (Arch) {
7857+
GPUArchName = ArchStr.substr(7);
7858+
Triples += "-";
7859+
break;
7860+
}
7861+
}
7862+
Triples += GPUArchName.str();
7863+
}
78337864
}
78347865

78357866
CmdArgs.push_back(TCArgs.MakeArgString(Triples));

clang/lib/Driver/ToolChains/CommonArgs.cpp

+287
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
#include "clang/Driver/Util.h"
3535
#include "clang/Driver/XRayArgs.h"
3636
#include "llvm/ADT/STLExtras.h"
37+
#include "llvm/ADT/SmallSet.h"
3738
#include "llvm/ADT/SmallString.h"
3839
#include "llvm/ADT/StringExtras.h"
3940
#include "llvm/ADT/StringSwitch.h"
@@ -1587,6 +1588,292 @@ void tools::addX86AlignBranchArgs(const Driver &D, const ArgList &Args,
15871588
}
15881589
}
15891590

1591+
/// SDLSearch: Search for Static Device Library
1592+
/// The search for SDL bitcode files is consistent with how static host
1593+
/// libraries are discovered. That is, the -l option triggers a search for
1594+
/// files in a set of directories called the LINKPATH. The host library search
1595+
/// procedure looks for a specific filename in the LINKPATH. The filename for
1596+
/// a host library is lib<libname>.a or lib<libname>.so. For SDLs, there is an
1597+
/// ordered-set of filenames that are searched. We call this ordered-set of
1598+
/// filenames as SEARCH-ORDER. Since an SDL can either be device-type specific,
1599+
/// architecture specific, or generic across all architectures, a naming
1600+
/// convention and search order is used where the file name embeds the
1601+
/// architecture name <arch-name> (nvptx or amdgcn) and the GPU device type
1602+
/// <device-name> such as sm_30 and gfx906. <device-name> is absent in case of
1603+
/// device-independent SDLs. To reduce congestion in host library directories,
1604+
/// the search first looks for files in the “libdevice” subdirectory. SDLs that
1605+
/// are bc files begin with the prefix “lib”.
1606+
///
1607+
/// Machine-code SDLs can also be managed as an archive (*.a file). The
1608+
/// convention has been to use the prefix “lib”. To avoid confusion with host
1609+
/// archive libraries, we use prefix "libbc-" for the bitcode SDL archives.
1610+
///
1611+
bool tools::SDLSearch(const Driver &D, const llvm::opt::ArgList &DriverArgs,
1612+
llvm::opt::ArgStringList &CC1Args,
1613+
SmallVector<std::string, 8> LibraryPaths, std::string Lib,
1614+
StringRef Arch, StringRef Target, bool isBitCodeSDL,
1615+
bool postClangLink) {
1616+
SmallVector<std::string, 12> SDLs;
1617+
1618+
std::string LibDeviceLoc = "/libdevice";
1619+
std::string LibBcPrefix = "/libbc-";
1620+
std::string LibPrefix = "/lib";
1621+
1622+
if (isBitCodeSDL) {
1623+
// SEARCH-ORDER for Bitcode SDLs:
1624+
// libdevice/libbc-<libname>-<arch-name>-<device-type>.a
1625+
// libbc-<libname>-<arch-name>-<device-type>.a
1626+
// libdevice/libbc-<libname>-<arch-name>.a
1627+
// libbc-<libname>-<arch-name>.a
1628+
// libdevice/libbc-<libname>.a
1629+
// libbc-<libname>.a
1630+
// libdevice/lib<libname>-<arch-name>-<device-type>.bc
1631+
// lib<libname>-<arch-name>-<device-type>.bc
1632+
// libdevice/lib<libname>-<arch-name>.bc
1633+
// lib<libname>-<arch-name>.bc
1634+
// libdevice/lib<libname>.bc
1635+
// lib<libname>.bc
1636+
1637+
for (StringRef Base : {LibBcPrefix, LibPrefix}) {
1638+
const auto *Ext = Base.contains(LibBcPrefix) ? ".a" : ".bc";
1639+
1640+
for (auto Suffix : {Twine(Lib + "-" + Arch + "-" + Target).str(),
1641+
Twine(Lib + "-" + Arch).str(), Twine(Lib).str()}) {
1642+
SDLs.push_back(Twine(LibDeviceLoc + Base + Suffix + Ext).str());
1643+
SDLs.push_back(Twine(Base + Suffix + Ext).str());
1644+
}
1645+
}
1646+
} else {
1647+
// SEARCH-ORDER for Machine-code SDLs:
1648+
// libdevice/lib<libname>-<arch-name>-<device-type>.a
1649+
// lib<libname>-<arch-name>-<device-type>.a
1650+
// libdevice/lib<libname>-<arch-name>.a
1651+
// lib<libname>-<arch-name>.a
1652+
1653+
const auto *Ext = ".a";
1654+
1655+
for (auto Suffix : {Twine(Lib + "-" + Arch + "-" + Target).str(),
1656+
Twine(Lib + "-" + Arch).str()}) {
1657+
SDLs.push_back(Twine(LibDeviceLoc + LibPrefix + Suffix + Ext).str());
1658+
SDLs.push_back(Twine(LibPrefix + Suffix + Ext).str());
1659+
}
1660+
}
1661+
1662+
// The CUDA toolchain does not use a global device llvm-link before the LLVM
1663+
// backend generates ptx. So currently, the use of bitcode SDL for nvptx is
1664+
// only possible with post-clang-cc1 linking. Clang cc1 has a feature that
1665+
// will link libraries after clang compilation while the LLVM IR is still in
1666+
// memory. This utilizes a clang cc1 option called “-mlink-builtin-bitcode”.
1667+
// This is a clang -cc1 option that is generated by the clang driver. The
1668+
// option value must a full path to an existing file.
1669+
bool FoundSDL = false;
1670+
for (auto LPath : LibraryPaths) {
1671+
for (auto SDL : SDLs) {
1672+
auto FullName = Twine(LPath + SDL).str();
1673+
if (llvm::sys::fs::exists(FullName)) {
1674+
if (postClangLink)
1675+
CC1Args.push_back("-mlink-builtin-bitcode");
1676+
CC1Args.push_back(DriverArgs.MakeArgString(FullName));
1677+
FoundSDL = true;
1678+
break;
1679+
}
1680+
}
1681+
if (FoundSDL)
1682+
break;
1683+
}
1684+
return FoundSDL;
1685+
}
1686+
1687+
/// Search if a user provided archive file lib<libname>.a exists in any of
1688+
/// the library paths. If so, add a new command to clang-offload-bundler to
1689+
/// unbundle this archive and create a temporary device specific archive. Name
1690+
/// of this SDL is passed to the llvm-link (for amdgcn) or to the
1691+
/// clang-nvlink-wrapper (for nvptx) commands by the driver.
1692+
bool tools::GetSDLFromOffloadArchive(
1693+
Compilation &C, const Driver &D, const Tool &T, const JobAction &JA,
1694+
const InputInfoList &Inputs, const llvm::opt::ArgList &DriverArgs,
1695+
llvm::opt::ArgStringList &CC1Args, SmallVector<std::string, 8> LibraryPaths,
1696+
StringRef Lib, StringRef Arch, StringRef Target, bool isBitCodeSDL,
1697+
bool postClangLink) {
1698+
1699+
// We don't support bitcode archive bundles for nvptx
1700+
if (isBitCodeSDL && Arch.contains("nvptx"))
1701+
return false;
1702+
1703+
bool FoundAOB = false;
1704+
SmallVector<std::string, 2> AOBFileNames;
1705+
std::string ArchiveOfBundles;
1706+
for (auto LPath : LibraryPaths) {
1707+
ArchiveOfBundles.clear();
1708+
1709+
AOBFileNames.push_back(Twine(LPath + "/libdevice/lib" + Lib + ".a").str());
1710+
AOBFileNames.push_back(Twine(LPath + "/lib" + Lib + ".a").str());
1711+
1712+
for (auto AOB : AOBFileNames) {
1713+
if (llvm::sys::fs::exists(AOB)) {
1714+
ArchiveOfBundles = AOB;
1715+
FoundAOB = true;
1716+
break;
1717+
}
1718+
}
1719+
1720+
if (!FoundAOB)
1721+
continue;
1722+
1723+
StringRef Prefix = isBitCodeSDL ? "libbc-" : "lib";
1724+
std::string OutputLib = D.GetTemporaryPath(
1725+
Twine(Prefix + Lib + "-" + Arch + "-" + Target).str(), "a");
1726+
1727+
C.addTempFile(C.getArgs().MakeArgString(OutputLib.c_str()));
1728+
1729+
ArgStringList CmdArgs;
1730+
SmallString<128> DeviceTriple;
1731+
DeviceTriple += Action::GetOffloadKindName(JA.getOffloadingDeviceKind());
1732+
DeviceTriple += '-';
1733+
std::string NormalizedTriple = T.getToolChain().getTriple().normalize();
1734+
DeviceTriple += NormalizedTriple;
1735+
if (!Target.empty()) {
1736+
DeviceTriple += '-';
1737+
DeviceTriple += Target;
1738+
}
1739+
1740+
std::string UnbundleArg("-unbundle");
1741+
std::string TypeArg("-type=a");
1742+
std::string InputArg("-inputs=" + ArchiveOfBundles);
1743+
std::string OffloadArg("-targets=" + std::string(DeviceTriple));
1744+
std::string OutputArg("-outputs=" + OutputLib);
1745+
1746+
const char *UBProgram = DriverArgs.MakeArgString(
1747+
T.getToolChain().GetProgramPath("clang-offload-bundler"));
1748+
1749+
ArgStringList UBArgs;
1750+
UBArgs.push_back(C.getArgs().MakeArgString(UnbundleArg.c_str()));
1751+
UBArgs.push_back(C.getArgs().MakeArgString(TypeArg.c_str()));
1752+
UBArgs.push_back(C.getArgs().MakeArgString(InputArg.c_str()));
1753+
UBArgs.push_back(C.getArgs().MakeArgString(OffloadArg.c_str()));
1754+
UBArgs.push_back(C.getArgs().MakeArgString(OutputArg.c_str()));
1755+
1756+
// Add this flag to not exit from clang-offload-bundler if no compatible
1757+
// code object is found in heterogenous archive library.
1758+
std::string AdditionalArgs("-allow-missing-bundles");
1759+
UBArgs.push_back(C.getArgs().MakeArgString(AdditionalArgs.c_str()));
1760+
1761+
C.addCommand(std::make_unique<Command>(
1762+
JA, T, ResponseFileSupport::AtFileCurCP(), UBProgram, UBArgs, Inputs,
1763+
InputInfo(&JA, C.getArgs().MakeArgString(OutputLib.c_str()))));
1764+
if (postClangLink)
1765+
CC1Args.push_back("-mlink-builtin-bitcode");
1766+
1767+
CC1Args.push_back(DriverArgs.MakeArgString(OutputLib));
1768+
break;
1769+
}
1770+
1771+
return FoundAOB;
1772+
}
1773+
1774+
// Wrapper function used by driver for adding SDLs during link phase.
1775+
void tools::AddStaticDeviceLibsLinking(Compilation &C, const Tool &T,
1776+
const JobAction &JA,
1777+
const InputInfoList &Inputs,
1778+
const llvm::opt::ArgList &DriverArgs,
1779+
llvm::opt::ArgStringList &CC1Args,
1780+
StringRef Arch, StringRef Target,
1781+
bool isBitCodeSDL, bool postClangLink) {
1782+
AddStaticDeviceLibs(&C, &T, &JA, &Inputs, C.getDriver(), DriverArgs, CC1Args,
1783+
Arch, Target, isBitCodeSDL, postClangLink);
1784+
}
1785+
1786+
// Wrapper function used for post clang linking of bitcode SDLS for nvptx by
1787+
// the CUDA toolchain.
1788+
void tools::AddStaticDeviceLibsPostLinking(const Driver &D,
1789+
const llvm::opt::ArgList &DriverArgs,
1790+
llvm::opt::ArgStringList &CC1Args,
1791+
StringRef Arch, StringRef Target,
1792+
bool isBitCodeSDL, bool postClangLink) {
1793+
AddStaticDeviceLibs(nullptr, nullptr, nullptr, nullptr, D, DriverArgs,
1794+
CC1Args, Arch, Target, isBitCodeSDL, postClangLink);
1795+
}
1796+
1797+
// User defined Static Device Libraries(SDLs) can be passed to clang for
1798+
// offloading GPU compilers. Like static host libraries, the use of a SDL is
1799+
// specified with the -l command line option. The primary difference between
1800+
// host and SDLs is the filenames for SDLs (refer SEARCH-ORDER for Bitcode SDLs
1801+
// and SEARCH-ORDER for Machine-code SDLs for the naming convention).
1802+
// SDLs are of following types:
1803+
//
1804+
// * Bitcode SDLs: They can either be a *.bc file or an archive of *.bc files.
1805+
// For NVPTX, these libraries are post-clang linked following each
1806+
// compilation. For AMDGPU, these libraries are linked one time
1807+
// during the application link phase.
1808+
//
1809+
// * Machine-code SDLs: They are archive files. For NVPTX, the archive members
1810+
// contain cubin for Nvidia GPUs and are linked one time during the
1811+
// link phase by the CUDA SDK linker called nvlink. For AMDGPU, the
1812+
// process for machine code SDLs is still in development. But they
1813+
// will be linked by the LLVM tool lld.
1814+
//
1815+
// * Bundled objects that contain both host and device codes: Bundled objects
1816+
// may also contain library code compiled from source. For NVPTX, the
1817+
// bundle contains cubin. For AMDGPU, the bundle contains bitcode.
1818+
//
1819+
// For Bitcode and Machine-code SDLs, current compiler toolchains hardcode the
1820+
// inclusion of specific SDLs such as math libraries and the OpenMP device
1821+
// library libomptarget.
1822+
void tools::AddStaticDeviceLibs(Compilation *C, const Tool *T,
1823+
const JobAction *JA,
1824+
const InputInfoList *Inputs, const Driver &D,
1825+
const llvm::opt::ArgList &DriverArgs,
1826+
llvm::opt::ArgStringList &CC1Args,
1827+
StringRef Arch, StringRef Target,
1828+
bool isBitCodeSDL, bool postClangLink) {
1829+
1830+
SmallVector<std::string, 8> LibraryPaths;
1831+
// Add search directories from LIBRARY_PATH env variable
1832+
llvm::Optional<std::string> LibPath =
1833+
llvm::sys::Process::GetEnv("LIBRARY_PATH");
1834+
if (LibPath) {
1835+
SmallVector<StringRef, 8> Frags;
1836+
const char EnvPathSeparatorStr[] = {llvm::sys::EnvPathSeparator, '\0'};
1837+
llvm::SplitString(*LibPath, Frags, EnvPathSeparatorStr);
1838+
for (StringRef Path : Frags)
1839+
LibraryPaths.emplace_back(Path.trim());
1840+
}
1841+
1842+
// Add directories from user-specified -L options
1843+
for (std::string Search_Dir : DriverArgs.getAllArgValues(options::OPT_L))
1844+
LibraryPaths.emplace_back(Search_Dir);
1845+
1846+
// Add path to lib-debug folders
1847+
SmallString<256> DefaultLibPath = llvm::sys::path::parent_path(D.Dir);
1848+
llvm::sys::path::append(DefaultLibPath, Twine("lib") + CLANG_LIBDIR_SUFFIX);
1849+
LibraryPaths.emplace_back(DefaultLibPath.c_str());
1850+
1851+
// Build list of Static Device Libraries SDLs specified by -l option
1852+
llvm::SmallSet<std::string, 16> SDLNames;
1853+
static const StringRef HostOnlyArchives[] = {
1854+
"omp", "cudart", "m", "gcc", "gcc_s", "pthread", "hip_hcc"};
1855+
for (auto SDLName : DriverArgs.getAllArgValues(options::OPT_l)) {
1856+
if (!HostOnlyArchives->contains(SDLName)) {
1857+
SDLNames.insert(SDLName);
1858+
}
1859+
}
1860+
1861+
// The search stops as soon as an SDL file is found. The driver then provides
1862+
// the full filename of the SDL to the llvm-link or clang-nvlink-wrapper
1863+
// command. If no SDL is found after searching each LINKPATH with
1864+
// SEARCH-ORDER, it is possible that an archive file lib<libname>.a exists
1865+
// and may contain bundled object files.
1866+
for (auto SDLName : SDLNames) {
1867+
// This is the only call to SDLSearch
1868+
if (!SDLSearch(D, DriverArgs, CC1Args, LibraryPaths, SDLName, Arch, Target,
1869+
isBitCodeSDL, postClangLink)) {
1870+
GetSDLFromOffloadArchive(*C, D, *T, *JA, *Inputs, DriverArgs, CC1Args,
1871+
LibraryPaths, SDLName, Arch, Target,
1872+
isBitCodeSDL, postClangLink);
1873+
}
1874+
}
1875+
}
1876+
15901877
static llvm::opt::Arg *
15911878
getAMDGPUCodeObjectArgument(const Driver &D, const llvm::opt::ArgList &Args) {
15921879
// The last of -mcode-object-v3, -mno-code-object-v3 and

0 commit comments

Comments
 (0)