Skip to content

Commit 73fb605

Browse files
[SYCL][CUDA] Use piclProgramCreateWithBinary with CUDA backend (#1791)
This commit restructures the CUDA backend so that the program manager calls "piclProgramCreateWithBinary" for both OpenCL and CUDA backends instead of branching unnecessarily. Unit tests are also updated accordingly. Signed-off-by: Przemek Malon <przemek.malon@codeplay.com>
1 parent d4a5b71 commit 73fb605

File tree

4 files changed

+103
-85
lines changed

4 files changed

+103
-85
lines changed

sycl/plugins/cuda/pi_cuda.cpp

+55-52
Original file line numberDiff line numberDiff line change
@@ -389,19 +389,20 @@ pi_result enqueueEventWait(pi_queue queue, pi_event event) {
389389
}
390390

391391
_pi_program::_pi_program(pi_context ctxt)
392-
: module_{nullptr}, source_{}, sourceLength_{0}
393-
, refCount_{1}, context_{ctxt}
394-
{
392+
: module_{nullptr}, binary_{},
393+
binarySizeInBytes_{0}, refCount_{1}, context_{ctxt} {
395394
cuda_piContextRetain(context_);
396395
}
397396

398397
_pi_program::~_pi_program() {
399398
cuda_piContextRelease(context_);
400399
}
401400

402-
pi_result _pi_program::create_from_source(const char *source, size_t length) {
403-
source_ = source;
404-
sourceLength_ = length;
401+
pi_result _pi_program::set_binary(const char *source, size_t length) {
402+
assert((binary_ == nullptr && binarySizeInBytes_ == 0) &&
403+
"Re-setting program binary data which has already been set");
404+
binary_ = source;
405+
binarySizeInBytes_ = length;
405406
return PI_SUCCESS;
406407
}
407408

@@ -427,9 +428,9 @@ pi_result _pi_program::build_program(const char *build_options) {
427428
options[3] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
428429
optionVals[3] = (void *)(long)MAX_LOG_SIZE;
429430

430-
auto result = PI_CHECK_ERROR(cuModuleLoadDataEx(
431-
&module_, static_cast<const void *>(source_), numberOfOptions, options,
432-
optionVals));
431+
auto result = PI_CHECK_ERROR(
432+
cuModuleLoadDataEx(&module_, static_cast<const void *>(binary_),
433+
numberOfOptions, options, optionVals));
433434

434435
const auto success = (result == PI_SUCCESS);
435436

@@ -446,8 +447,8 @@ pi_result _pi_program::build_program(const char *build_options) {
446447
/// has_kernel method, so an alternative would be to move the has_kernel
447448
/// query to PI and use cuModuleGetFunction to check for a kernel.
448449
std::string getKernelNames(pi_program program) {
449-
std::string source(program->source_,
450-
program->source_ + program->sourceLength_);
450+
std::string source(program->binary_,
451+
program->binary_ + program->binarySizeInBytes_);
451452
std::regex entries_pattern(".entry\\s+([^\\([:s:]]*)");
452453
std::string names("");
453454
std::smatch match;
@@ -2172,41 +2173,15 @@ pi_result cuda_piMemRetain(pi_mem mem) {
21722173
return PI_SUCCESS;
21732174
}
21742175

2175-
/// Constructs a PI program from a list of PTX or CUBIN binaries.
2176-
/// Note: No calls to CUDA driver API in this function, only store binaries
2177-
/// for later.
2178-
///
2179-
/// \TODO Implement more than one input image
2180-
/// \TODO SYCL RT should use cuda_piclprogramCreateWithBinary instead
2176+
/// Not used as CUDA backend only creates programs from binary.
2177+
/// See \ref cuda_piclProgramCreateWithBinary.
21812178
///
21822179
pi_result cuda_piclProgramCreateWithSource(pi_context context, pi_uint32 count,
21832180
const char **strings,
21842181
const size_t *lengths,
21852182
pi_program *program) {
2186-
2187-
assert(context != nullptr);
2188-
assert(strings != nullptr);
2189-
assert(program != nullptr);
2190-
2191-
pi_result retErr = PI_SUCCESS;
2192-
2193-
if (count == 0) {
2194-
retErr = PI_INVALID_PROGRAM;
2195-
return retErr;
2196-
}
2197-
2198-
assert(count == 1);
2199-
2200-
std::unique_ptr<_pi_program> retProgram{new _pi_program{context}};
2201-
2202-
auto has_length = (lengths != nullptr);
2203-
size_t length = has_length ? lengths[0] : strlen(strings[0]) + 1;
2204-
2205-
retProgram->create_from_source(strings[0], length);
2206-
2207-
*program = retProgram.release();
2208-
2209-
return retErr;
2183+
cl::sycl::detail::pi::die("cuda_piclProgramCreateWithSource not implemented");
2184+
return {};
22102185
}
22112186

22122187
/// Loads the images from a PI program into a CUmodule that can be
@@ -2244,13 +2219,41 @@ pi_result cuda_piProgramCreate(pi_context context, const void *il,
22442219
return {};
22452220
}
22462221

2247-
/// \TODO Not implemented. See \ref cuda_piclProgramCreateWithSource
2222+
/// Loads images from a list of PTX or CUBIN binaries.
2223+
/// Note: No calls to CUDA driver API in this function, only store binaries
2224+
/// for later.
2225+
///
2226+
/// Note: Only supports one device
2227+
///
22482228
pi_result cuda_piclProgramCreateWithBinary(
22492229
pi_context context, pi_uint32 num_devices, const pi_device *device_list,
22502230
const size_t *lengths, const unsigned char **binaries,
2251-
pi_int32 *binary_status, pi_program *errcode_ret) {
2252-
cl::sycl::detail::pi::die("cuda_piclProgramCreateWithBinary not implemented");
2253-
return {};
2231+
pi_int32 *binary_status, pi_program *program) {
2232+
assert(context != nullptr);
2233+
assert(binaries != nullptr);
2234+
assert(program != nullptr);
2235+
assert(device_list != nullptr);
2236+
assert(num_devices == 1 && "CUDA contexts are for a single device");
2237+
assert((context->get_device()->get() == device_list[0]->get()) &&
2238+
"Mismatch between devices context and passed context when creating "
2239+
"program from binary");
2240+
2241+
pi_result retError = PI_SUCCESS;
2242+
2243+
std::unique_ptr<_pi_program> retProgram{new _pi_program{context}};
2244+
2245+
const bool has_length = (lengths != nullptr);
2246+
size_t length = has_length
2247+
? lengths[0]
2248+
: strlen(reinterpret_cast<const char *>(binaries[0])) + 1;
2249+
2250+
assert(length != 0);
2251+
2252+
retProgram->set_binary(reinterpret_cast<const char *>(binaries[0]), length);
2253+
2254+
*program = retProgram.release();
2255+
2256+
return retError;
22542257
}
22552258

22562259
pi_result cuda_piProgramGetInfo(pi_program program, pi_program_info param_name,
@@ -2272,13 +2275,13 @@ pi_result cuda_piProgramGetInfo(pi_program program, pi_program_info param_name,
22722275
&program->context_->deviceId_);
22732276
case PI_PROGRAM_INFO_SOURCE:
22742277
return getInfo(param_value_size, param_value, param_value_size_ret,
2275-
program->source_);
2278+
program->binary_);
22762279
case PI_PROGRAM_INFO_BINARY_SIZES:
22772280
return getInfoArray(1, param_value_size, param_value, param_value_size_ret,
2278-
&program->sourceLength_);
2281+
&program->binarySizeInBytes_);
22792282
case PI_PROGRAM_INFO_BINARIES:
22802283
return getInfoArray(1, param_value_size, param_value, param_value_size_ret,
2281-
&program->source_);
2284+
&program->binary_);
22822285
case PI_PROGRAM_INFO_KERNEL_NAMES: {
22832286
return getInfo(param_value_size, param_value, param_value_size_ret,
22842287
getKernelNames(program).c_str());
@@ -2320,15 +2323,15 @@ pi_result cuda_piProgramLink(pi_context context, pi_uint32 num_devices,
23202323
for (size_t i = 0; i < num_input_programs; ++i) {
23212324
pi_program program = input_programs[i];
23222325
retError = PI_CHECK_ERROR(cuLinkAddData(
2323-
state, CU_JIT_INPUT_PTX, const_cast<char *>(program->source_),
2324-
program->sourceLength_, nullptr, 0, nullptr, nullptr));
2326+
state, CU_JIT_INPUT_PTX, const_cast<char *>(program->binary_),
2327+
program->binarySizeInBytes_, nullptr, 0, nullptr, nullptr));
23252328
}
23262329
void *cubin = nullptr;
23272330
size_t cubinSize = 0;
23282331
retError = PI_CHECK_ERROR(cuLinkComplete(state, &cubin, &cubinSize));
23292332

2330-
retError = retProgram->create_from_source(
2331-
static_cast<const char *>(cubin), cubinSize);
2333+
retError =
2334+
retProgram->set_binary(static_cast<const char *>(cubin), cubinSize);
23322335

23332336
if (retError != PI_SUCCESS) {
23342337
return retError;

sycl/plugins/cuda/pi_cuda.hpp

+3-3
Original file line numberDiff line numberDiff line change
@@ -404,8 +404,8 @@ class _pi_event {
404404
struct _pi_program {
405405
using native_type = CUmodule;
406406
native_type module_;
407-
const char *source_;
408-
size_t sourceLength_;
407+
const char *binary_;
408+
size_t binarySizeInBytes_;
409409
std::atomic_uint32_t refCount_;
410410
_pi_context *context_;
411411

@@ -418,7 +418,7 @@ struct _pi_program {
418418
_pi_program(pi_context ctxt);
419419
~_pi_program();
420420

421-
pi_result create_from_source(const char *source, size_t length);
421+
pi_result set_binary(const char *binary, size_t binarySizeInBytes);
422422

423423
pi_result build_program(const char* build_options);
424424

sycl/source/detail/program_manager/program_manager.cpp

+8-16
Original file line numberDiff line numberDiff line change
@@ -85,22 +85,14 @@ static RT::PiProgram createBinaryProgram(const ContextImplPtr Context,
8585
#endif
8686

8787
RT::PiProgram Program;
88-
89-
// TODO: Implement `piProgramCreateWithBinary` to not require extra logic for
90-
// the CUDA backend.
91-
const auto Backend = Context->getPlugin().getBackend();
92-
if (Backend == backend::cuda) {
93-
// TODO: Reemplace CreateWithSource with CreateWithBinary in CUDA backend
94-
const char *SignedData = reinterpret_cast<const char *>(Data);
95-
Plugin.call<PiApiKind::piclProgramCreateWithSource>(
96-
Context->getHandleRef(), 1 /*one binary*/, &SignedData, &DataLen,
97-
&Program);
98-
} else {
99-
RT::PiDevice Device = getFirstDevice(Context);
100-
pi_int32 BinaryStatus = CL_SUCCESS;
101-
Plugin.call<PiApiKind::piclProgramCreateWithBinary>(
102-
Context->getHandleRef(), 1 /*one binary*/, &Device, &DataLen, &Data,
103-
&BinaryStatus, &Program);
88+
RT::PiDevice Device = getFirstDevice(Context);
89+
pi_int32 BinaryStatus = CL_SUCCESS;
90+
Plugin.call<PiApiKind::piclProgramCreateWithBinary>(
91+
Context->getHandleRef(), 1 /*one binary*/, &Device, &DataLen, &Data,
92+
&BinaryStatus, &Program);
93+
94+
if (BinaryStatus != CL_SUCCESS) {
95+
throw runtime_error("Creating program with binary failed.", BinaryStatus);
10496
}
10597

10698
return Program;

sycl/unittests/pi/cuda/test_kernels.cpp

+37-14
Original file line numberDiff line numberDiff line change
@@ -128,9 +128,11 @@ const char *threeParamsTwoLocal = "\n\
128128
TEST_F(CudaKernelsTest, PICreateProgramAndKernel) {
129129

130130
pi_program prog;
131+
pi_int32 binary_status = PI_SUCCESS;
131132
ASSERT_EQ(
132-
(plugin.call_nocheck<detail::PiApiKind::piclProgramCreateWithSource>(
133-
context_, 1, (const char **)&ptxSource, nullptr, &prog)),
133+
(plugin.call_nocheck<detail::PiApiKind::piclProgramCreateWithBinary>(
134+
context_, 1, &device_, nullptr, (const unsigned char **)&ptxSource,
135+
&binary_status, &prog)),
134136
PI_SUCCESS);
135137

136138
ASSERT_EQ((plugin.call_nocheck<detail::PiApiKind::piProgramBuild>(
@@ -147,10 +149,16 @@ TEST_F(CudaKernelsTest, PICreateProgramAndKernel) {
147149
TEST_F(CudaKernelsTest, PIKernelArgumentSimple) {
148150

149151
pi_program prog;
152+
/// NOTE: `binary_status` currently unsused in the CUDA backend but in case we
153+
/// use it at some point in the future, pass it anyway and check the result.
154+
/// Same goes for all the other tests in this file.
155+
pi_int32 binary_status = PI_SUCCESS;
150156
ASSERT_EQ(
151-
(plugin.call_nocheck<detail::PiApiKind::piclProgramCreateWithSource>(
152-
context_, 1, (const char **)&ptxSource, nullptr, &prog)),
157+
(plugin.call_nocheck<detail::PiApiKind::piclProgramCreateWithBinary>(
158+
context_, 1, &device_, nullptr, (const unsigned char **)&ptxSource,
159+
&binary_status, &prog)),
153160
PI_SUCCESS);
161+
ASSERT_EQ(binary_status, PI_SUCCESS);
154162

155163
ASSERT_EQ((plugin.call_nocheck<detail::PiApiKind::piProgramBuild>(
156164
prog, 1, &device_, "", nullptr, nullptr)),
@@ -174,10 +182,13 @@ TEST_F(CudaKernelsTest, PIKernelArgumentSimple) {
174182
TEST_F(CudaKernelsTest, PIKernelArgumentSetTwice) {
175183

176184
pi_program prog;
185+
pi_int32 binary_status = PI_SUCCESS;
177186
ASSERT_EQ(
178-
(plugin.call_nocheck<detail::PiApiKind::piclProgramCreateWithSource>(
179-
context_, 1, (const char **)&ptxSource, nullptr, &prog)),
187+
(plugin.call_nocheck<detail::PiApiKind::piclProgramCreateWithBinary>(
188+
context_, 1, &device_, nullptr, (const unsigned char **)&ptxSource,
189+
&binary_status, &prog)),
180190
PI_SUCCESS);
191+
ASSERT_EQ(binary_status, PI_SUCCESS);
181192

182193
ASSERT_EQ((plugin.call_nocheck<detail::PiApiKind::piProgramBuild>(
183194
prog, 1, &device_, "", nullptr, nullptr)),
@@ -210,10 +221,13 @@ TEST_F(CudaKernelsTest, PIKernelArgumentSetTwice) {
210221
TEST_F(CudaKernelsTest, PIKernelSetMemObj) {
211222

212223
pi_program prog;
224+
pi_int32 binary_status = PI_SUCCESS;
213225
ASSERT_EQ(
214-
(plugin.call_nocheck<detail::PiApiKind::piclProgramCreateWithSource>(
215-
context_, 1, (const char **)&ptxSource, nullptr, &prog)),
226+
(plugin.call_nocheck<detail::PiApiKind::piclProgramCreateWithBinary>(
227+
context_, 1, &device_, nullptr, (const unsigned char **)&ptxSource,
228+
&binary_status, &prog)),
216229
PI_SUCCESS);
230+
ASSERT_EQ(binary_status, PI_SUCCESS);
217231

218232
ASSERT_EQ((plugin.call_nocheck<detail::PiApiKind::piProgramBuild>(
219233
prog, 1, &device_, "", nullptr, nullptr)),
@@ -242,10 +256,13 @@ TEST_F(CudaKernelsTest, PIKernelSetMemObj) {
242256
TEST_F(CudaKernelsTest, PIkerneldispatch) {
243257

244258
pi_program prog;
259+
pi_int32 binary_status = PI_SUCCESS;
245260
ASSERT_EQ(
246-
(plugin.call_nocheck<detail::PiApiKind::piclProgramCreateWithSource>(
247-
context_, 1, (const char **)&ptxSource, nullptr, &prog)),
261+
(plugin.call_nocheck<detail::PiApiKind::piclProgramCreateWithBinary>(
262+
context_, 1, &device_, nullptr, (const unsigned char **)&ptxSource,
263+
&binary_status, &prog)),
248264
PI_SUCCESS);
265+
ASSERT_EQ(binary_status, PI_SUCCESS);
249266

250267
ASSERT_EQ((plugin.call_nocheck<detail::PiApiKind::piProgramBuild>(
251268
prog, 1, &device_, "", nullptr, nullptr)),
@@ -282,10 +299,13 @@ TEST_F(CudaKernelsTest, PIkerneldispatch) {
282299
TEST_F(CudaKernelsTest, PIkerneldispatchTwo) {
283300

284301
pi_program prog;
302+
pi_int32 binary_status = PI_SUCCESS;
285303
ASSERT_EQ(
286-
(plugin.call_nocheck<detail::PiApiKind::piclProgramCreateWithSource>(
287-
context_, 1, (const char **)&twoParams, nullptr, &prog)),
304+
(plugin.call_nocheck<detail::PiApiKind::piclProgramCreateWithBinary>(
305+
context_, 1, &device_, nullptr, (const unsigned char **)&twoParams,
306+
&binary_status, &prog)),
288307
PI_SUCCESS);
308+
ASSERT_EQ(binary_status, PI_SUCCESS);
289309

290310
ASSERT_EQ((plugin.call_nocheck<detail::PiApiKind::piProgramBuild>(
291311
prog, 1, &device_, "", nullptr, nullptr)),
@@ -333,10 +353,13 @@ TEST_F(CudaKernelsTest, PIkerneldispatchTwo) {
333353
TEST_F(CudaKernelsTest, PIKernelArgumentSetTwiceOneLocal) {
334354

335355
pi_program prog;
356+
pi_int32 binary_status = PI_SUCCESS;
336357
ASSERT_EQ(
337-
(plugin.call_nocheck<detail::PiApiKind::piclProgramCreateWithSource>(
338-
context_, 1, (const char **)&threeParamsTwoLocal, nullptr, &prog)),
358+
(plugin.call_nocheck<detail::PiApiKind::piclProgramCreateWithBinary>(
359+
context_, 1, &device_, nullptr,
360+
(const unsigned char **)&threeParamsTwoLocal, &binary_status, &prog)),
339361
PI_SUCCESS);
362+
ASSERT_EQ(binary_status, PI_SUCCESS);
340363

341364
ASSERT_EQ((plugin.call_nocheck<detail::PiApiKind::piProgramBuild>(
342365
prog, 1, &device_, "", nullptr, nullptr)),

0 commit comments

Comments
 (0)