[SYCL][CUDA] Use piclProgramCreateWithBinary with CUDA backend (#1791)

przemektmalon · web-flow · commit 73fb60522b2b · 2020-06-11T12:15:09.000+03:00
This commit restructures the CUDA backend so that the program manager
calls "piclProgramCreateWithBinary" for both OpenCL and CUDA backends
instead of branching unnecessarily. Unit tests are also updated
accordingly.

Signed-off-by: Przemek Malon &lt;przemek.malon@codeplay.com&gt;
diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp
@@ -389,19 +389,20 @@ pi_result enqueueEventWait(pi_queue queue, pi_event event) {
 }
 
 _pi_program::_pi_program(pi_context ctxt)
-    : module_{nullptr}, source_{}, sourceLength_{0}
-    , refCount_{1}, context_{ctxt} 
-{
+    : module_{nullptr}, binary_{},
+      binarySizeInBytes_{0}, refCount_{1}, context_{ctxt} {
   cuda_piContextRetain(context_);
 }
 
 _pi_program::~_pi_program() {
   cuda_piContextRelease(context_);
 }
 
-pi_result _pi_program::create_from_source(const char *source, size_t length) {
-  source_ = source;
-  sourceLength_ = length;
+pi_result _pi_program::set_binary(const char *source, size_t length) {
+  assert((binary_ == nullptr && binarySizeInBytes_ == 0) &&
+         "Re-setting program binary data which has already been set");
+  binary_ = source;
+  binarySizeInBytes_ = length;
   return PI_SUCCESS;
 }
 
@@ -427,9 +428,9 @@ pi_result _pi_program::build_program(const char *build_options) {
   options[3] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
   optionVals[3] = (void *)(long)MAX_LOG_SIZE;
 
-  auto result = PI_CHECK_ERROR(cuModuleLoadDataEx(
-      &module_, static_cast<const void *>(source_), numberOfOptions, options,
-      optionVals));
+  auto result = PI_CHECK_ERROR(
+      cuModuleLoadDataEx(&module_, static_cast<const void *>(binary_),
+                         numberOfOptions, options, optionVals));
 
   const auto success = (result == PI_SUCCESS);
 
@@ -446,8 +447,8 @@ pi_result _pi_program::build_program(const char *build_options) {
 ///       has_kernel method, so an alternative would be to move the has_kernel
 ///       query to PI and use cuModuleGetFunction to check for a kernel.
 std::string getKernelNames(pi_program program) {
-  std::string source(program->source_,
-                     program->source_ + program->sourceLength_);
+  std::string source(program->binary_,
+                     program->binary_ + program->binarySizeInBytes_);
   std::regex entries_pattern(".entry\\s+([^\\([:s:]]*)");
   std::string names("");
   std::smatch match;
@@ -2172,41 +2173,15 @@ pi_result cuda_piMemRetain(pi_mem mem) {
   return PI_SUCCESS;
 }
 
-/// Constructs a PI program from a list of PTX or CUBIN binaries.
-/// Note: No calls to CUDA driver API in this function, only store binaries
-/// for later.
-///
-/// \TODO Implement more than one input image
-/// \TODO SYCL RT should use cuda_piclprogramCreateWithBinary instead
+/// Not used as CUDA backend only creates programs from binary.
+/// See \ref cuda_piclProgramCreateWithBinary.
 ///
 pi_result cuda_piclProgramCreateWithSource(pi_context context, pi_uint32 count,
                                            const char **strings,
                                            const size_t *lengths,
                                            pi_program *program) {
-
-  assert(context != nullptr);
-  assert(strings != nullptr);
-  assert(program != nullptr);
-
-  pi_result retErr = PI_SUCCESS;
-
-  if (count == 0) {
-    retErr = PI_INVALID_PROGRAM;
-    return retErr;
-  }
-
-  assert(count == 1);
-
-  std::unique_ptr<_pi_program> retProgram{new _pi_program{context}};
-
-  auto has_length = (lengths != nullptr);
-  size_t length = has_length ? lengths[0] : strlen(strings[0]) + 1;
-
-  retProgram->create_from_source(strings[0], length);
-
-  *program = retProgram.release();
-
-  return retErr;
+  cl::sycl::detail::pi::die("cuda_piclProgramCreateWithSource not implemented");
+  return {};
 }
 
 /// Loads the images from a PI program into a CUmodule that can be
@@ -2244,13 +2219,41 @@ pi_result cuda_piProgramCreate(pi_context context, const void *il,
   return {};
 }
 
-/// \TODO Not implemented. See \ref cuda_piclProgramCreateWithSource
+/// Loads images from a list of PTX or CUBIN binaries.
+/// Note: No calls to CUDA driver API in this function, only store binaries
+/// for later.
+///
+/// Note: Only supports one device
+///
 pi_result cuda_piclProgramCreateWithBinary(
     pi_context context, pi_uint32 num_devices, const pi_device *device_list,
     const size_t *lengths, const unsigned char **binaries,
-    pi_int32 *binary_status, pi_program *errcode_ret) {
-  cl::sycl::detail::pi::die("cuda_piclProgramCreateWithBinary not implemented");
-  return {};
+    pi_int32 *binary_status, pi_program *program) {
+  assert(context != nullptr);
+  assert(binaries != nullptr);
+  assert(program != nullptr);
+  assert(device_list != nullptr);
+  assert(num_devices == 1 && "CUDA contexts are for a single device");
+  assert((context->get_device()->get() == device_list[0]->get()) &&
+         "Mismatch between devices context and passed context when creating "
+         "program from binary");
+
+  pi_result retError = PI_SUCCESS;
+
+  std::unique_ptr<_pi_program> retProgram{new _pi_program{context}};
+
+  const bool has_length = (lengths != nullptr);
+  size_t length = has_length
+                      ? lengths[0]
+                      : strlen(reinterpret_cast<const char *>(binaries[0])) + 1;
+
+  assert(length != 0);
+
+  retProgram->set_binary(reinterpret_cast<const char *>(binaries[0]), length);
+
+  *program = retProgram.release();
+
+  return retError;
 }
 
 pi_result cuda_piProgramGetInfo(pi_program program, pi_program_info param_name,
@@ -2272,13 +2275,13 @@ pi_result cuda_piProgramGetInfo(pi_program program, pi_program_info param_name,
                         &program->context_->deviceId_);
   case PI_PROGRAM_INFO_SOURCE:
     return getInfo(param_value_size, param_value, param_value_size_ret,
-                   program->source_);
+                   program->binary_);
   case PI_PROGRAM_INFO_BINARY_SIZES:
     return getInfoArray(1, param_value_size, param_value, param_value_size_ret,
-                        &program->sourceLength_);
+                        &program->binarySizeInBytes_);
   case PI_PROGRAM_INFO_BINARIES:
     return getInfoArray(1, param_value_size, param_value, param_value_size_ret,
-                        &program->source_);
+                        &program->binary_);
   case PI_PROGRAM_INFO_KERNEL_NAMES: {
     return getInfo(param_value_size, param_value, param_value_size_ret,
                    getKernelNames(program).c_str());
@@ -2320,15 +2323,15 @@ pi_result cuda_piProgramLink(pi_context context, pi_uint32 num_devices,
       for (size_t i = 0; i < num_input_programs; ++i) {
         pi_program program = input_programs[i];
         retError = PI_CHECK_ERROR(cuLinkAddData(
-            state, CU_JIT_INPUT_PTX, const_cast<char *>(program->source_),
-            program->sourceLength_, nullptr, 0, nullptr, nullptr));
+            state, CU_JIT_INPUT_PTX, const_cast<char *>(program->binary_),
+            program->binarySizeInBytes_, nullptr, 0, nullptr, nullptr));
       }
       void *cubin = nullptr;
       size_t cubinSize = 0;
       retError = PI_CHECK_ERROR(cuLinkComplete(state, &cubin, &cubinSize));
 
-      retError = retProgram->create_from_source(
-          static_cast<const char *>(cubin), cubinSize);
+      retError =
+          retProgram->set_binary(static_cast<const char *>(cubin), cubinSize);
 
       if (retError != PI_SUCCESS) {
         return retError;
diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp
@@ -404,8 +404,8 @@ class _pi_event {
 struct _pi_program {
   using native_type = CUmodule;
   native_type module_;
-  const char *source_;
-  size_t sourceLength_;
+  const char *binary_;
+  size_t binarySizeInBytes_;
   std::atomic_uint32_t refCount_;
   _pi_context *context_;
 
@@ -418,7 +418,7 @@ struct _pi_program {
   _pi_program(pi_context ctxt);
   ~_pi_program();
 
-  pi_result create_from_source(const char *source, size_t length);
+  pi_result set_binary(const char *binary, size_t binarySizeInBytes);
 
   pi_result build_program(const char* build_options);
 
diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp
@@ -85,22 +85,14 @@ static RT::PiProgram createBinaryProgram(const ContextImplPtr Context,
 #endif
 
   RT::PiProgram Program;
-
-  // TODO: Implement `piProgramCreateWithBinary` to not require extra logic for
-  //       the CUDA backend.
-  const auto Backend = Context->getPlugin().getBackend();
-  if (Backend == backend::cuda) {
-    // TODO: Reemplace CreateWithSource with CreateWithBinary in CUDA backend
-    const char *SignedData = reinterpret_cast<const char *>(Data);
-    Plugin.call<PiApiKind::piclProgramCreateWithSource>(
-        Context->getHandleRef(), 1 /*one binary*/, &SignedData, &DataLen,
-        &Program);
-  } else {
-    RT::PiDevice Device = getFirstDevice(Context);
-    pi_int32 BinaryStatus = CL_SUCCESS;
-    Plugin.call<PiApiKind::piclProgramCreateWithBinary>(
-        Context->getHandleRef(), 1 /*one binary*/, &Device, &DataLen, &Data,
-        &BinaryStatus, &Program);
+  RT::PiDevice Device = getFirstDevice(Context);
+  pi_int32 BinaryStatus = CL_SUCCESS;
+  Plugin.call<PiApiKind::piclProgramCreateWithBinary>(
+      Context->getHandleRef(), 1 /*one binary*/, &Device, &DataLen, &Data,
+      &BinaryStatus, &Program);
+
+  if (BinaryStatus != CL_SUCCESS) {
+    throw runtime_error("Creating program with binary failed.", BinaryStatus);
   }
 
   return Program;
diff --git a/sycl/unittests/pi/cuda/test_kernels.cpp b/sycl/unittests/pi/cuda/test_kernels.cpp
@@ -128,9 +128,11 @@ const char *threeParamsTwoLocal = "\n\
 TEST_F(CudaKernelsTest, PICreateProgramAndKernel) {
 
   pi_program prog;
+  pi_int32 binary_status = PI_SUCCESS;
   ASSERT_EQ(
-      (plugin.call_nocheck<detail::PiApiKind::piclProgramCreateWithSource>(
-          context_, 1, (const char **)&ptxSource, nullptr, &prog)),
+      (plugin.call_nocheck<detail::PiApiKind::piclProgramCreateWithBinary>(
+          context_, 1, &device_, nullptr, (const unsigned char **)&ptxSource,
+          &binary_status, &prog)),
       PI_SUCCESS);
 
   ASSERT_EQ((plugin.call_nocheck<detail::PiApiKind::piProgramBuild>(
@@ -147,10 +149,16 @@ TEST_F(CudaKernelsTest, PICreateProgramAndKernel) {
 TEST_F(CudaKernelsTest, PIKernelArgumentSimple) {
 
   pi_program prog;
+  /// NOTE: `binary_status` currently unsused in the CUDA backend but in case we
+  /// use it at some point in the future, pass it anyway and check the result.
+  /// Same goes for all the other tests in this file.
+  pi_int32 binary_status = PI_SUCCESS;
   ASSERT_EQ(
-      (plugin.call_nocheck<detail::PiApiKind::piclProgramCreateWithSource>(
-          context_, 1, (const char **)&ptxSource, nullptr, &prog)),
+      (plugin.call_nocheck<detail::PiApiKind::piclProgramCreateWithBinary>(
+          context_, 1, &device_, nullptr, (const unsigned char **)&ptxSource,
+          &binary_status, &prog)),
       PI_SUCCESS);
+  ASSERT_EQ(binary_status, PI_SUCCESS);
 
   ASSERT_EQ((plugin.call_nocheck<detail::PiApiKind::piProgramBuild>(
                 prog, 1, &device_, "", nullptr, nullptr)),
@@ -174,10 +182,13 @@ TEST_F(CudaKernelsTest, PIKernelArgumentSimple) {
 TEST_F(CudaKernelsTest, PIKernelArgumentSetTwice) {
 
   pi_program prog;
+  pi_int32 binary_status = PI_SUCCESS;
   ASSERT_EQ(
-      (plugin.call_nocheck<detail::PiApiKind::piclProgramCreateWithSource>(
-          context_, 1, (const char **)&ptxSource, nullptr, &prog)),
+      (plugin.call_nocheck<detail::PiApiKind::piclProgramCreateWithBinary>(
+          context_, 1, &device_, nullptr, (const unsigned char **)&ptxSource,
+          &binary_status, &prog)),
       PI_SUCCESS);
+  ASSERT_EQ(binary_status, PI_SUCCESS);
 
   ASSERT_EQ((plugin.call_nocheck<detail::PiApiKind::piProgramBuild>(
                 prog, 1, &device_, "", nullptr, nullptr)),
@@ -210,10 +221,13 @@ TEST_F(CudaKernelsTest, PIKernelArgumentSetTwice) {
 TEST_F(CudaKernelsTest, PIKernelSetMemObj) {
 
   pi_program prog;
+  pi_int32 binary_status = PI_SUCCESS;
   ASSERT_EQ(
-      (plugin.call_nocheck<detail::PiApiKind::piclProgramCreateWithSource>(
-          context_, 1, (const char **)&ptxSource, nullptr, &prog)),
+      (plugin.call_nocheck<detail::PiApiKind::piclProgramCreateWithBinary>(
+          context_, 1, &device_, nullptr, (const unsigned char **)&ptxSource,
+          &binary_status, &prog)),
       PI_SUCCESS);
+  ASSERT_EQ(binary_status, PI_SUCCESS);
 
   ASSERT_EQ((plugin.call_nocheck<detail::PiApiKind::piProgramBuild>(
                 prog, 1, &device_, "", nullptr, nullptr)),
@@ -242,10 +256,13 @@ TEST_F(CudaKernelsTest, PIKernelSetMemObj) {
 TEST_F(CudaKernelsTest, PIkerneldispatch) {
 
   pi_program prog;
+  pi_int32 binary_status = PI_SUCCESS;
   ASSERT_EQ(
-      (plugin.call_nocheck<detail::PiApiKind::piclProgramCreateWithSource>(
-          context_, 1, (const char **)&ptxSource, nullptr, &prog)),
+      (plugin.call_nocheck<detail::PiApiKind::piclProgramCreateWithBinary>(
+          context_, 1, &device_, nullptr, (const unsigned char **)&ptxSource,
+          &binary_status, &prog)),
       PI_SUCCESS);
+  ASSERT_EQ(binary_status, PI_SUCCESS);
 
   ASSERT_EQ((plugin.call_nocheck<detail::PiApiKind::piProgramBuild>(
                 prog, 1, &device_, "", nullptr, nullptr)),
@@ -282,10 +299,13 @@ TEST_F(CudaKernelsTest, PIkerneldispatch) {
 TEST_F(CudaKernelsTest, PIkerneldispatchTwo) {
 
   pi_program prog;
+  pi_int32 binary_status = PI_SUCCESS;
   ASSERT_EQ(
-      (plugin.call_nocheck<detail::PiApiKind::piclProgramCreateWithSource>(
-          context_, 1, (const char **)&twoParams, nullptr, &prog)),
+      (plugin.call_nocheck<detail::PiApiKind::piclProgramCreateWithBinary>(
+          context_, 1, &device_, nullptr, (const unsigned char **)&twoParams,
+          &binary_status, &prog)),
       PI_SUCCESS);
+  ASSERT_EQ(binary_status, PI_SUCCESS);
 
   ASSERT_EQ((plugin.call_nocheck<detail::PiApiKind::piProgramBuild>(
                 prog, 1, &device_, "", nullptr, nullptr)),
@@ -333,10 +353,13 @@ TEST_F(CudaKernelsTest, PIkerneldispatchTwo) {
 TEST_F(CudaKernelsTest, PIKernelArgumentSetTwiceOneLocal) {
 
   pi_program prog;
+  pi_int32 binary_status = PI_SUCCESS;
   ASSERT_EQ(
-      (plugin.call_nocheck<detail::PiApiKind::piclProgramCreateWithSource>(
-          context_, 1, (const char **)&threeParamsTwoLocal, nullptr, &prog)),
+      (plugin.call_nocheck<detail::PiApiKind::piclProgramCreateWithBinary>(
+          context_, 1, &device_, nullptr,
+          (const unsigned char **)&threeParamsTwoLocal, &binary_status, &prog)),
       PI_SUCCESS);
+  ASSERT_EQ(binary_status, PI_SUCCESS);
 
   ASSERT_EQ((plugin.call_nocheck<detail::PiApiKind::piProgramBuild>(
                 prog, 1, &device_, "", nullptr, nullptr)),