Skip to content

Commit 3a5ee6e

Browse files
Alexandr-KonovalovEwanCbader
authored
[SYCL] Allocate sycl::handler on the stack when possible (#17319)
There are two set of changes: 1. On hot path handler_impl is allocated on stack. 2. handler keeps reference to queue shared_ptr, not shared_ptr. --------- Signed-off-by: Alexandr Konovalov <alexandr.konovalov@intel.com> Co-authored-by: Ewan Crawford <ewan.cr@gmail.com> Co-authored-by: Alexey Bader <alexey.bader@intel.com>
1 parent 57173a0 commit 3a5ee6e

File tree

9 files changed

+138
-33
lines changed

9 files changed

+138
-33
lines changed

sycl/include/sycl/handler.hpp

+33
Original file line numberDiff line numberDiff line change
@@ -426,8 +426,23 @@ class __SYCL_EXPORT handler {
426426
/// \param Queue is a SYCL queue.
427427
/// \param CallerNeedsEvent indicates if the event resulting from this handler
428428
/// is needed by the caller.
429+
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
430+
handler(const std::shared_ptr<detail::queue_impl> &Queue,
431+
bool CallerNeedsEvent);
432+
#else
429433
handler(std::shared_ptr<detail::queue_impl> Queue, bool CallerNeedsEvent);
434+
#endif
430435

436+
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
437+
/// Constructs SYCL handler from the pre-constructed handler_impl and the
438+
/// associated queue. Inside of Graph implementation, the Queue value is not
439+
/// used, for those cases it can be initialized with an empty shared_ptr.
440+
///
441+
/// \param HandlerImpl is a pre-constructed handler_impl.
442+
/// \param Queue is a SYCL queue.
443+
handler(detail::handler_impl *HandlerImpl,
444+
const std::shared_ptr<detail::queue_impl> &Queue);
445+
#else
431446
/// Constructs SYCL handler from the associated queue and the submission's
432447
/// primary and secondary queue.
433448
///
@@ -449,14 +464,17 @@ class __SYCL_EXPORT handler {
449464
__SYCL_DLL_LOCAL handler(std::shared_ptr<detail::queue_impl> Queue,
450465
detail::queue_impl *SecondaryQueue,
451466
bool CallerNeedsEvent);
467+
#endif
452468

469+
#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
453470
/// Constructs SYCL handler from Graph.
454471
///
455472
/// The handler will add the command-group as a node to the graph rather than
456473
/// enqueueing it straight away.
457474
///
458475
/// \param Graph is a SYCL command_graph
459476
handler(std::shared_ptr<ext::oneapi::experimental::detail::graph_impl> Graph);
477+
#endif
460478

461479
void *storeRawArg(const void *Ptr, size_t Size);
462480

@@ -3272,8 +3290,18 @@ class __SYCL_EXPORT handler {
32723290
uint64_t SignalValue);
32733291

32743292
private:
3293+
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
3294+
// In some cases we need to construct handler_impl in heap. Sole propose
3295+
// of MImplOwner is to destroy handler_impl in destructor of handler.
3296+
// Can't use unique_ptr because declaration of handler_impl is not available
3297+
// in this header.
3298+
std::shared_ptr<detail::handler_impl> MImplOwner;
3299+
detail::handler_impl *impl;
3300+
const std::shared_ptr<detail::queue_impl> &MQueue;
3301+
#else
32753302
std::shared_ptr<detail::handler_impl> impl;
32763303
std::shared_ptr<detail::queue_impl> MQueue;
3304+
#endif
32773305
std::vector<detail::LocalAccessorImplPtr> MLocalAccStorage;
32783306
std::vector<std::shared_ptr<detail::stream_impl>> MStreamStorage;
32793307
detail::ABINeutralKernelNameStrT MKernelName;
@@ -3735,6 +3763,11 @@ class __SYCL_EXPORT handler {
37353763

37363764
friend class detail::HandlerAccess;
37373765

3766+
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
3767+
__SYCL_DLL_LOCAL detail::handler_impl *get_impl() { return impl; }
3768+
#else
3769+
__SYCL_DLL_LOCAL detail::handler_impl *get_impl() { return impl.get(); }
3770+
#endif
37383771
// Friend free-functions for asynchronous allocation and freeing.
37393772
__SYCL_EXPORT friend void
37403773
ext::oneapi::experimental::async_free(sycl::handler &h, void *ptr);

sycl/include/sycl/reduction.hpp

+37-17
Original file line numberDiff line numberDiff line change
@@ -140,12 +140,19 @@ template <typename... Ts> ReduTupleT<Ts...> makeReduTupleT(Ts... Elements) {
140140
return sycl::detail::make_tuple(Elements...);
141141
}
142142

143+
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
144+
__SYCL_EXPORT size_t reduGetMaxWGSize(const std::shared_ptr<queue_impl> &Queue,
145+
size_t LocalMemBytesPerWorkItem);
146+
__SYCL_EXPORT size_t reduGetPreferredWGSize(
147+
const std::shared_ptr<queue_impl> &Queue, size_t LocalMemBytesPerWorkItem);
148+
#else
143149
__SYCL_EXPORT size_t reduGetMaxWGSize(std::shared_ptr<queue_impl> Queue,
144150
size_t LocalMemBytesPerWorkItem);
145-
__SYCL_EXPORT size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize,
146-
size_t &NWorkGroups);
147151
__SYCL_EXPORT size_t reduGetPreferredWGSize(std::shared_ptr<queue_impl> &Queue,
148152
size_t LocalMemBytesPerWorkItem);
153+
#endif
154+
__SYCL_EXPORT size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize,
155+
size_t &NWorkGroups);
149156

150157
template <typename T, class BinaryOperation, bool IsOptional>
151158
class ReducerElement;
@@ -1071,7 +1078,12 @@ class reduction_impl_algo {
10711078
std::shared_ptr<int> Counter(malloc_device<int>(1, q), Deleter);
10721079
CGH.addReduction(Counter);
10731080

1074-
addCounterInit(CGH, CGH.MQueue, Counter);
1081+
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
1082+
std::shared_ptr<detail::queue_impl> Queue(CGH.MQueue);
1083+
#else
1084+
std::shared_ptr<detail::queue_impl> &Queue = CGH.MQueue;
1085+
#endif
1086+
addCounterInit(CGH, Queue, Counter);
10751087

10761088
return Counter.get();
10771089
}
@@ -1229,7 +1241,8 @@ template <>
12291241
struct NDRangeReduction<reduction::strategy::local_atomic_and_atomic_cross_wg> {
12301242
template <typename KernelName, int Dims, typename PropertiesT,
12311243
typename KernelType, typename Reduction>
1232-
static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
1244+
static void run(handler &CGH,
1245+
const std::shared_ptr<detail::queue_impl> &Queue,
12331246
nd_range<Dims> NDRange, PropertiesT &Properties,
12341247
Reduction &Redu, KernelType &KernelFunc) {
12351248
static_assert(Reduction::has_identity,
@@ -1280,7 +1293,8 @@ struct NDRangeReduction<
12801293
reduction::strategy::group_reduce_and_last_wg_detection> {
12811294
template <typename KernelName, int Dims, typename PropertiesT,
12821295
typename KernelType, typename Reduction>
1283-
static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
1296+
static void run(handler &CGH,
1297+
const std::shared_ptr<detail::queue_impl> &Queue,
12841298
nd_range<Dims> NDRange, PropertiesT &Properties,
12851299
Reduction &Redu, KernelType &KernelFunc) {
12861300
static_assert(Reduction::has_identity,
@@ -1479,7 +1493,8 @@ void doTreeReductionOnTuple(size_t WorkSize, size_t LID,
14791493
template <> struct NDRangeReduction<reduction::strategy::range_basic> {
14801494
template <typename KernelName, int Dims, typename PropertiesT,
14811495
typename KernelType, typename Reduction>
1482-
static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
1496+
static void run(handler &CGH,
1497+
const std::shared_ptr<detail::queue_impl> &Queue,
14831498
nd_range<Dims> NDRange, PropertiesT &Properties,
14841499
Reduction &Redu, KernelType &KernelFunc) {
14851500
using reducer_type = typename Reduction::reducer_type;
@@ -1590,7 +1605,8 @@ template <>
15901605
struct NDRangeReduction<reduction::strategy::group_reduce_and_atomic_cross_wg> {
15911606
template <typename KernelName, int Dims, typename PropertiesT,
15921607
typename KernelType, typename Reduction>
1593-
static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
1608+
static void run(handler &CGH,
1609+
const std::shared_ptr<detail::queue_impl> &Queue,
15941610
nd_range<Dims> NDRange, PropertiesT &Properties,
15951611
Reduction &Redu, KernelType &KernelFunc) {
15961612
static_assert(Reduction::has_identity,
@@ -1626,7 +1642,8 @@ struct NDRangeReduction<
16261642
reduction::strategy::local_mem_tree_and_atomic_cross_wg> {
16271643
template <typename KernelName, int Dims, typename PropertiesT,
16281644
typename KernelType, typename Reduction>
1629-
static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
1645+
static void run(handler &CGH,
1646+
const std::shared_ptr<detail::queue_impl> &Queue,
16301647
nd_range<Dims> NDRange, PropertiesT &Properties,
16311648
Reduction &Redu, KernelType &KernelFunc) {
16321649
using reducer_type = typename Reduction::reducer_type;
@@ -1687,7 +1704,8 @@ struct NDRangeReduction<
16871704
reduction::strategy::group_reduce_and_multiple_kernels> {
16881705
template <typename KernelName, int Dims, typename PropertiesT,
16891706
typename KernelType, typename Reduction>
1690-
static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
1707+
static void run(handler &CGH,
1708+
const std::shared_ptr<detail::queue_impl> &Queue,
16911709
nd_range<Dims> NDRange, PropertiesT &Properties,
16921710
Reduction &Redu, KernelType &KernelFunc) {
16931711
static_assert(Reduction::has_identity,
@@ -1825,7 +1843,8 @@ struct NDRangeReduction<
18251843
template <> struct NDRangeReduction<reduction::strategy::basic> {
18261844
template <typename KernelName, int Dims, typename PropertiesT,
18271845
typename KernelType, typename Reduction>
1828-
static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
1846+
static void run(handler &CGH,
1847+
const std::shared_ptr<detail::queue_impl> &Queue,
18291848
nd_range<Dims> NDRange, PropertiesT &Properties,
18301849
Reduction &Redu, KernelType &KernelFunc) {
18311850
using element_type = typename Reduction::reducer_element_type;
@@ -2600,9 +2619,9 @@ tuple_select_elements(TupleT Tuple, std::index_sequence<Is...>) {
26002619
template <> struct NDRangeReduction<reduction::strategy::multi> {
26012620
template <typename KernelName, int Dims, typename PropertiesT,
26022621
typename... RestT>
2603-
static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
2604-
nd_range<Dims> NDRange, PropertiesT &Properties,
2605-
RestT... Rest) {
2622+
static void
2623+
run(handler &CGH, const std::shared_ptr<detail::queue_impl> &Queue,
2624+
nd_range<Dims> NDRange, PropertiesT &Properties, RestT... Rest) {
26062625
std::tuple<RestT...> ArgsTuple(Rest...);
26072626
constexpr size_t NumArgs = sizeof...(RestT);
26082627
auto KernelFunc = std::get<NumArgs - 1>(ArgsTuple);
@@ -2644,7 +2663,8 @@ template <> struct NDRangeReduction<reduction::strategy::auto_select> {
26442663

26452664
template <typename KernelName, int Dims, typename PropertiesT,
26462665
typename KernelType, typename Reduction>
2647-
static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
2666+
static void run(handler &CGH,
2667+
const std::shared_ptr<detail::queue_impl> &Queue,
26482668
nd_range<Dims> NDRange, PropertiesT &Properties,
26492669
Reduction &Redu, KernelType &KernelFunc) {
26502670
auto Delegate = [&](auto Impl) {
@@ -2691,9 +2711,9 @@ template <> struct NDRangeReduction<reduction::strategy::auto_select> {
26912711
}
26922712
template <typename KernelName, int Dims, typename PropertiesT,
26932713
typename... RestT>
2694-
static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
2695-
nd_range<Dims> NDRange, PropertiesT &Properties,
2696-
RestT... Rest) {
2714+
static void
2715+
run(handler &CGH, const std::shared_ptr<detail::queue_impl> &Queue,
2716+
nd_range<Dims> NDRange, PropertiesT &Properties, RestT... Rest) {
26972717
return Impl<Strat::multi>::run<KernelName>(CGH, Queue, NDRange, Properties,
26982718
Rest...);
26992719
}

sycl/source/detail/graph_impl.cpp

+10
Original file line numberDiff line numberDiff line change
@@ -502,7 +502,12 @@ graph_impl::add(std::function<void(handler &)> CGF,
502502
const std::vector<sycl::detail::ArgDesc> &Args,
503503
std::vector<std::shared_ptr<node_impl>> &Deps) {
504504
(void)Args;
505+
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
506+
detail::handler_impl HandlerImpl{shared_from_this()};
507+
sycl::handler Handler{&HandlerImpl, std::shared_ptr<detail::queue_impl>{}};
508+
#else
505509
sycl::handler Handler{shared_from_this()};
510+
#endif
506511

507512
#if XPTI_ENABLE_INSTRUMENTATION
508513
// Save code location if one was set in TLS.
@@ -2184,7 +2189,12 @@ void dynamic_command_group_impl::finalizeCGFList(
21842189
const auto &CGF = CGFList[CGFIndex];
21852190
// Handler defined inside the loop so it doesn't appear to the runtime
21862191
// as a single command-group with multiple commands inside.
2192+
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
2193+
detail::handler_impl HandlerImpl{MGraph};
2194+
sycl::handler Handler{&HandlerImpl, std::shared_ptr<detail::queue_impl>{}};
2195+
#else
21872196
sycl::handler Handler{MGraph};
2197+
#endif
21882198
CGF(Handler);
21892199

21902200
if (Handler.getType() != sycl::detail::CGType::Kernel &&

sycl/source/detail/queue_impl.cpp

+13
Original file line numberDiff line numberDiff line change
@@ -316,8 +316,15 @@ event queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF,
316316
const detail::code_location &Loc,
317317
bool IsTopCodeLoc,
318318
const SubmissionInfo &SubmitInfo) {
319+
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
320+
detail::handler_impl HandlerImplVal(SecondaryQueue, CallerNeedsEvent);
321+
detail::handler_impl *HandlerImpl = &HandlerImplVal;
322+
handler Handler(HandlerImpl, Self);
323+
#else
319324
handler Handler(Self, SecondaryQueue, CallerNeedsEvent);
320325
auto &HandlerImpl = detail::getSyclObjImpl(Handler);
326+
#endif
327+
321328
#ifdef XPTI_ENABLE_INSTRUMENTATION
322329
if (xptiTraceEnabled()) {
323330
Handler.saveCodeLoc(Loc, IsTopCodeLoc);
@@ -371,8 +378,14 @@ event queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF,
371378
const detail::code_location &Loc,
372379
bool IsTopCodeLoc,
373380
const SubmissionInfo &SubmitInfo) {
381+
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
382+
detail::handler_impl HandlerImplVal(PrimaryQueue.get(), CallerNeedsEvent);
383+
detail::handler_impl *HandlerImpl = &HandlerImplVal;
384+
handler Handler(HandlerImpl, Self);
385+
#else
374386
handler Handler(Self, CallerNeedsEvent);
375387
auto &HandlerImpl = detail::getSyclObjImpl(Handler);
388+
#endif
376389

377390
#if XPTI_ENABLE_INSTRUMENTATION
378391
if (xptiTraceEnabled()) {

sycl/source/detail/reduction.cpp

+11
Original file line numberDiff line numberDiff line change
@@ -72,9 +72,15 @@ __SYCL_EXPORT uint32_t reduGetMaxNumConcurrentWorkGroups(
7272
return NumThreads;
7373
}
7474

75+
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
76+
__SYCL_EXPORT size_t
77+
reduGetMaxWGSize(const std::shared_ptr<sycl::detail::queue_impl> &Queue,
78+
size_t LocalMemBytesPerWorkItem) {
79+
#else
7580
__SYCL_EXPORT size_t
7681
reduGetMaxWGSize(std::shared_ptr<sycl::detail::queue_impl> Queue,
7782
size_t LocalMemBytesPerWorkItem) {
83+
#endif
7884
device Dev = Queue->get_device();
7985
size_t MaxWGSize = Dev.get_info<sycl::info::device::max_work_group_size>();
8086

@@ -113,8 +119,13 @@ reduGetMaxWGSize(std::shared_ptr<sycl::detail::queue_impl> Queue,
113119
return WGSize;
114120
}
115121

122+
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
123+
__SYCL_EXPORT size_t reduGetPreferredWGSize(
124+
const std::shared_ptr<queue_impl> &Queue, size_t LocalMemBytesPerWorkItem) {
125+
#else
116126
__SYCL_EXPORT size_t reduGetPreferredWGSize(std::shared_ptr<queue_impl> &Queue,
117127
size_t LocalMemBytesPerWorkItem) {
128+
#endif
118129
// TODO: Graphs extension explicit API uses a handler with a null queue to
119130
// process CGFs, in future we should have access to the device so we can
120131
// correctly calculate this.

sycl/source/detail/scheduler/commands.hpp

+1
Original file line numberDiff line numberDiff line change
@@ -241,6 +241,7 @@ class Command {
241241
static std::vector<ur_event_handle_t>
242242
getUrEvents(const std::vector<EventImplPtr> &EventImpls,
243243
const QueueImplPtr &CommandQueue, bool IsHostTaskCommand);
244+
244245
/// Collect UR events from EventImpls and filter out some of them in case of
245246
/// in order queue. Does blocking enqueue if event is expected to produce ur
246247
/// event but has empty native handle.

0 commit comments

Comments
 (0)