This repository was archived by the owner on Aug 30, 2024. It is now read-only.

Commit 1ba27e4

sunjiweiswift authored and DDEle committed
opt PVC arch
1 parent 5456fc0 commit 1ba27e4

15 files changed: +186 additions, -207 deletions


examples/05_batch_gemm/batch_gemm.hpp

Lines changed: 14 additions & 14 deletions
@@ -276,20 +276,20 @@ class batch_gemm_t {
           args.matB_base.base, args.matB_ld);
     }
   }
-  if (epilogue_t::msg_type_c != msg_type::unaligned_2d) {
-    if (epilogue_t::msg_type_c == msg_type::block_2d) {
-      implementable &=
-          kernel::block_2d<gpu_arch::XeHpc, dtype_c>::check_tensor(
-              (uint64_t)(args.matC_base.base),
-              args.matrix_n,
-              args.matrix_m * args.batch_size,
-              args.matC_ld);
-    } else {
-      implementable &=
-          kernel::general_1d<gpu_arch::XeHpc, dtype_c>::check_alignment(
-              args.matC_base.base, args.matC_ld);
-    }
-  }
+  // if (epilogue_t::msg_type_c != msg_type::unaligned_2d) {
+  //   if (epilogue_t::msg_type_c == msg_type::block_2d) {
+  //     implementable &=
+  //         kernel::block_2d<gpu_arch::XeHpc, dtype_c>::check_tensor(
+  //             (uint64_t)(args.matC_base.base),
+  //             args.matrix_n,
+  //             args.matrix_m * args.batch_size,
+  //             args.matC_ld);
+  //   } else {
+  //     implementable &=
+  //         kernel::general_1d<gpu_arch::XeHpc, dtype_c>::check_alignment(
+  //             args.matC_base.base, args.matC_ld);
+  //   }
+  // }
 
   return implementable;
 }

examples/07_multi_layer_perceptron/multi_layer_perceptron.hpp

Lines changed: 14 additions & 14 deletions
@@ -451,20 +451,20 @@ class multi_layer_perceptron_t {
           args.matV_base.base, args.matV_ld);
     }
   }
-  if (epilogue_layer2_t::msg_type_c != msg_type::unaligned_2d) {
-    if (epilogue_layer2_t::msg_type_c == msg_type::block_2d) {
-      implementable &=
-          kernel::block_2d<gpu_arch::XeHpc, dtype_c>::check_tensor(
-              (uint64_t)(args.matC_base.base),
-              args.matrix_n_layer2,
-              args.matrix_m_layer2,
-              args.matC_ld);
-    } else {
-      implementable &=
-          kernel::general_1d<gpu_arch::XeHpc, dtype_c>::check_alignment(
-              args.matC_base.base, args.matC_ld);
-    }
-  }
+  // if (epilogue_layer2_t::msg_type_c != msg_type::unaligned_2d) {
+  //   if (epilogue_layer2_t::msg_type_c == msg_type::block_2d) {
+  //     implementable &=
+  //         kernel::block_2d<gpu_arch::XeHpc, dtype_c>::check_tensor(
+  //             (uint64_t)(args.matC_base.base),
+  //             args.matrix_n_layer2,
+  //             args.matrix_m_layer2,
+  //             args.matC_ld);
+  //   } else {
+  //     implementable &=
+  //         kernel::general_1d<gpu_arch::XeHpc, dtype_c>::check_alignment(
+  //             args.matC_base.base, args.matC_ld);
+  //   }
+  // }
 
   return implementable;
 }

include/common/core/arch_config.hpp

Lines changed: 2 additions & 2 deletions
@@ -100,8 +100,8 @@ struct load_store_attr_t<msg_type::block_1d, arch_tag> {
 
 template <>
 struct load_store_attr_t<msg_type::block_1d, gpu_arch::XeHpc> {
-  static constexpr uint32_t max_load_vec_len = 64;
-  static constexpr uint32_t max_store_vec_len = 64;
+  static constexpr uint32_t max_load_vec_len = 512;
+  static constexpr uint32_t max_store_vec_len = 512;
   static constexpr uint32_t max_prefetch_vec_len = 64;
 };

include/common/core/memory.hpp

Lines changed: 57 additions & 59 deletions
@@ -256,7 +256,7 @@ constexpr __ESIMD_NS::atomic_op get_atomic_op(gpu::xetla::atomic_op ao) {
 ///
 template <
     typename Ty,
-    uint8_t NElts = 1,
+    int NElts = 1,
     data_size DS = data_size::default_size,
     cache_hint L1H = cache_hint::cached,
     cache_hint L2H = cache_hint::cached,
@@ -293,7 +293,7 @@ __XETLA_API void xetla_prefetch_global(
 ///
 template <
     typename Ty,
-    uint8_t NElts = 1,
+    int NElts = 1,
    data_size DS = data_size::default_size,
     cache_hint L1H = cache_hint::cached,
     cache_hint L2H = cache_hint::cached>
@@ -385,7 +385,7 @@ __XETLA_API xetla_vector<T, N> xetla_load_global(
 ///
 template <
    typename Ty,
-    uint8_t NElts = 1,
+    int NElts = 1,
     data_size DS = data_size::default_size,
     cache_hint L1H = cache_hint::none,
     cache_hint L2H = cache_hint::none,
@@ -431,7 +431,7 @@ __XETLA_API xetla_vector<Ty, N * NElts> xetla_load_global(
 ///
 template <
     typename Ty,
-    uint8_t NElts = 1,
+    int NElts = 1,
     data_size DS = data_size::default_size,
     cache_hint L1H = cache_hint::none,
     cache_hint L2H = cache_hint::none,
@@ -653,7 +653,7 @@ __XETLA_API void xetla_local_init() {
 ///
 template <
     typename Ty,
-    uint8_t NElts = 1,
+    int NElts = 1,
     data_size DS = data_size::default_size,
     int N>
 __XETLA_API xetla_vector<Ty, N * NElts> xetla_load_local(
@@ -670,35 +670,31 @@ __XETLA_API xetla_vector<Ty, N * NElts> xetla_load_local(
       xetla_cvt<uint64_t, uint32_t>(offsets), pred);
 }
 
-/// @brief SLM block load. (transposed gather with 1 channel).
-/// Collects elements located at slm and returns them as a single \ref
-/// xetla_vector object.
-///
-/// Supported platforms: DG2, PVC
-///
-/// VISA instruction: lsc_load.slm
-///
-/// @tparam Ty is element type.
-/// @tparam NElts is the number of elements to load per address (i.e.
-/// vector_size per SIMD channel).
-/// @tparam DS is the data size.
-/// @param offset [in] is the zero-based offset for SLM buffer in bytes.
-/// @return is a xetla_vector of type T and size NElts.
-///
-template <
-    typename Ty,
-    uint8_t NElts = 1,
-    data_size DS = data_size::default_size>
+/// Loads a contiguous block of SLM memory referenced by the given byte-offset
+/// \p offset, then returns the loaded data as a simd object.
+/// The generated code depends on the combination {T, N, Flags}.
+/// Providing flags specifying the alignment of 16-bytes or more produces more
+/// efficient code. If the alignment is smaller than 16-bytes, then less
+/// efficient gather is generated. If the loaded vector is too long
+/// for 1 flat-load GPU instruction, then a series of flat-loads and/or gathers
+/// may be generated.
+/// @tparam T Element type.
+/// @tparam N Number of elements to load.
+/// @tparam Flags The alignment specifier type tag.
+/// @param byte_offset The byte-offset to load from.
+/// @param Flags Specifies the alignment.
+/// @return A vector of loaded elements.
+///
+template <typename Ty, int NElts = 1, data_size DS = data_size::default_size>
 __XETLA_API xetla_vector<Ty, NElts> xetla_load_local(uint32_t offset) {
   using T = native_type_t<Ty>;
-  DEBUG_INVOKE(
-      dbg_level::core,
-      core::general_1d<gpu_arch::XeHpc, Ty>::template check_restriction<NElts>(
-          (uint64_t)offset));
+  // DEBUG_INVOKE(
+  //     dbg_level::core,
+  //     core::general_1d<gpu_arch::XeHpc, Ty>::template
+  //     check_restriction<NElts>(
+  //         (uint64_t)offset));
 
-  return __ESIMD_ENS::
-      lsc_slm_block_load<T, NElts, gpu::xetla::detail::get_data_size(DS)>(
-          offset);
+  return __ESIMD_NS::slm_block_load<T, NElts>(offset);
 }
 
 /// @brief SLM scattered store.
@@ -719,7 +715,7 @@ __XETLA_API xetla_vector<Ty, NElts> xetla_load_local(uint32_t offset) {
 ///
 template <
     typename Ty,
-    uint8_t NElts = 1,
+    int NElts = 1,
     data_size DS = data_size::default_size,
     int N>
 __XETLA_API void xetla_store_local(
@@ -737,36 +733,38 @@ __XETLA_API void xetla_store_local(
       offsets, vals, pred);
 }
 
-/// @brief SLM block store (transposed SLM scatter with 1 channel).
-/// Scatters elements located to slm.
-///
-/// Supported platforms: DG2, PVC
-///
-/// VISA instruction: lsc_store.slm
-///
-/// @tparam Ty is element type.
-/// @tparam NElts is the number of elements to store per address (i.e.
-/// vector_size per SIMD channel).
-/// @tparam DS is the data size.
-/// @param offset [in] is the zero-based offset for SLM buffer in bytes.
-/// @param vals [in] is values to store.
-///
-template <
-    typename Ty,
-    uint8_t NElts = 1,
-    data_size DS = data_size::default_size>
+/// Stores elements of the vector \p vals to a contiguous block of SLM memory
+/// at the given byte-offset \p offset.
+/// The generated code depends on the combination {T, N, Flags}.
+/// Providing flags specifying the alignment of 16-bytes or more produces more
+/// efficient code. If the alignment is smaller than 16-bytes, then less
+/// efficient scatter is generated. If the stored vector is too long
+/// for 1 flat-store GPU instruction, then a series of flat-store and/or
+/// scatters may be generated.
+/// @tparam T Element type.
+/// @tparam N Number of elements to store.
+/// @tparam Flags The alignment specifier type tag.
+/// @param offset The byte-offset to store at.
+/// @param vals The vector to store.
+/// @param Flags Specifies the alignment.
+///
+template <typename Ty, int NElts = 1, data_size DS = data_size::default_size>
 __XETLA_API void xetla_store_local(
     uint32_t offset,
     xetla_vector<Ty, NElts> vals) {
-  using T = native_type_t<Ty>;
-  DEBUG_INVOKE(
-      dbg_level::core,
-      core::general_1d<gpu_arch::XeHpc, Ty>::template check_restriction<NElts>(
-          offset));
-
-  __ESIMD_ENS::
-      lsc_slm_block_store<T, NElts, gpu::xetla::detail::get_data_size(DS)>(
-          offset, vals);
+  // using T = native_type_t<Ty>;
+  // DEBUG_INVOKE(
+  //     dbg_level::core,
+  //     core::general_1d<gpu_arch::XeHpc, Ty>::template
+  //     check_restriction<NElts>(
+  //         offset));
+
+  // __ESIMD_ENS::
+  //     lsc_slm_block_store<T, NElts, gpu::xetla::detail::get_data_size(DS)>(
+  //         offset, vals);
+  // __ESIMD_NS::properties props{};
+
+  __ESIMD_NS::slm_block_store<Ty, NElts>(offset, vals);
 }
 
 /// @brief SLM scattered atomic (0 src).
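A note on the NElts change that runs through this file: the block_1d limits in arch_config.hpp go from 64 to 512 in the same commit, and a uint8_t template parameter wraps modulo 256, so any value of 256 or more silently collapses; widening NElts to int avoids that and lines up with the element-count parameter passed to the __ESIMD_NS::slm_block_load / slm_block_store calls above. The standalone snippet below uses hypothetical block_u8 / block_int templates (not XeTLA code) purely to illustrate the wrap-around:

#include <cstdint>
#include <iostream>

// Hypothetical stand-ins for a vector-length template parameter; they exist
// only to show the uint8_t vs. int difference, they are not part of XeTLA.
template <uint8_t NElts>
struct block_u8 { static constexpr unsigned value = NElts; };

template <int NElts>
struct block_int { static constexpr int value = NElts; };

int main() {
  // 512 does not fit in uint8_t: the cast wraps modulo 256 and yields 0.
  std::cout << block_u8<static_cast<uint8_t>(512)>::value << '\n';  // prints 0
  // The widened int parameter carries the intended length unchanged.
  std::cout << block_int<512>::value << '\n';                       // prints 512
}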

include/group/cooperative_reduction.hpp

Lines changed: 1 addition & 1 deletion
@@ -95,7 +95,7 @@ class cooperative_reduce_t<
   static constexpr uint32_t block_size_x =
       gpu::xetla::subgroup::detail::gcd<tile_size_x, src_block_size_x>::value;
   static constexpr uint32_t block_size_y =
-      (tile_size_y > src_block_size_y) ? src_block_size_y : tile_size_y;
+      std::min(src_block_size_y, tile_size_y);
 
   using local_st_tile_desc_t = subgroup::tile_desc_t<
       sg_tile_n,

include/group/epilogue/impl/default_xe.hpp

Lines changed: 16 additions & 13 deletions
@@ -70,9 +70,9 @@ class epilogue_t<
   }
 
  public:
-  static constexpr msg_type msg_type_c =
-      (mem_space_c == mem_space::global ? msg_type::block_2d
-                                        : msg_type::scatter);
+  // static constexpr msg_type msg_type_c =
+  //     (mem_space_c == mem_space::global ? msg_type::block_2d
+  //                                       : msg_type::scatter);
 
   /// @brief Default epilogue.
   /// 1) Convert dtype_acc to dtype_c 2) Overwrite to memory.
@@ -94,6 +94,11 @@ class epilogue_t<
       [[maybe_unused]] uint32_t nbarrier_base = 0) {
     using mat_tile_desc = typename matAcc_t::tile_desc;
     using matC_t = subgroup::tile_t<dtype_c, mat_tile_desc>;
+
+    static constexpr msg_type msg_type_c =
+        subgroup::msg_type_v<mat_tile_desc, mem_desc_c_t>;
+    using matC_payload_t = subgroup::
+        mem_payload_t<mem_desc_c_t, mat_tile_desc, msg_type_c, arch_tag>;
     using matC_payload_t = subgroup::
         mem_payload_t<mem_desc_c_t, mat_tile_desc, msg_type_c, arch_tag>;
     update_sg_tile_tdesc(g, mem_desc_c);
@@ -143,9 +148,7 @@ class epilogue_t<
   using dtype_c = typename mem_desc_c_t::dtype;
   static constexpr mem_layout mem_layout_c = mem_desc_c_t::layout;
   static constexpr mem_space mem_space_c = mem_desc_c_t::space;
-  static constexpr msg_type msg_type_c =
-      (mem_space_c == mem_space::global ? msg_type::block_2d
-                                        : msg_type::scatter);
+
   /// @brief Updates tile base descriptor based on the tid.
   __XETLA_API static void update_sg_tile_tdesc(
       work_group_t& g,
@@ -165,8 +168,6 @@ class epilogue_t<
   }
 
  public:
-  static constexpr bool is_2d_block_c = (msg_type_c == msg_type::block_2d);
-
   /// @brief Default epilogue.
   /// 1) Convert dtype_acc to dtype_c 2) Overwrite to memory.
   /// @tparam matAcc_t Is the type of the input tile.
@@ -190,11 +191,13 @@ class epilogue_t<
       [[maybe_unused]] uint32_t nbarrier_base = 0) {
     using mat_tile_desc = typename matAcc_t::tile_desc;
     using matC_t = subgroup::tile_t<dtype_c, mat_tile_desc>;
-    using matC_payload_t = subgroup::mem_payload_t<
-        mem_desc_t<dtype_c, mem_layout_c, mem_space_c>,
-        mat_tile_desc,
-        msg_type_c,
-        arch_tag>;
+
+    // static constexpr msg_type msg_type_c = msg_type::block_2d;
+    static constexpr msg_type msg_type_c =
+        subgroup::msg_type_v<mat_tile_desc, mem_desc_c_t>;
+
+    using matC_payload_t = subgroup::
+        mem_payload_t<mem_desc_c_t, mat_tile_desc, msg_type_c, arch_tag>;
 
     update_sg_tile_tdesc(g, mem_desc_c);
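The epilogue changes above drop the class-level msg_type_c constant (previously hard-coded to block_2d for global memory) and instead query subgroup::msg_type_v<mat_tile_desc, mem_desc_c_t> at the point where the payload type is built. A minimal sketch of that idea follows, under the assumption that the trait essentially dispatches on memory space and block-2D compatibility; the enum values mirror the ones in this diff, but pick_msg_type is a hypothetical helper, not the real XeTLA trait:

// Hypothetical sketch only; XeTLA's subgroup::msg_type_v is a trait over the
// tile descriptor and memory descriptor, not this function.
enum class mem_space { global, local };
enum class msg_type { block_2d, unaligned_2d, scatter };

template <mem_space Space>
constexpr msg_type pick_msg_type(bool surface_is_block2d_compatible) {
  if constexpr (Space == mem_space::local) {
    return msg_type::scatter;  // SLM destinations keep the scatter path.
  } else {
    return surface_is_block2d_compatible
        ? msg_type::block_2d      // aligned global surface: 2D block stores
        : msg_type::unaligned_2d; // otherwise fall back to unaligned 2D
  }
}

static_assert(pick_msg_type<mem_space::local>(true) == msg_type::scatter);
static_assert(pick_msg_type<mem_space::global>(false) == msg_type::unaligned_2d);

Choosing the message type per call site lets an unaligned global surface fall back to unaligned_2d rather than failing a block-2D restriction, which is consistent with the block-2D tensor checks being commented out in the example kernels earlier in this commit.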

include/group/epilogue/impl/tile_op_xe.hpp

Lines changed: 1 addition & 0 deletions
@@ -127,6 +127,7 @@ class epilogue_t<
       uint32_t nbarrier_base = 0) {
     using mat_tile_desc = typename matAcc_t::tile_desc;
     using matC_t = subgroup::tile_t<dtype_c, mat_tile_desc>;
+    // static constexpr msg_type msg_type_c = msg_type::block_2d;
     static constexpr msg_type msg_type_c =
         subgroup::msg_type_v<mat_tile_desc, mem_desc_c_t>;
     using matC_payload_t = subgroup::

include/kernel/gemm/impl/kslicing_xe.hpp

Lines changed: 12 additions & 12 deletions
@@ -387,18 +387,18 @@ class gemm_universal_t<
           args.matB_base.base, args.matB_ld);
     }
   }
-  if (epilogue_t::msg_type_c != msg_type::unaligned_2d) {
-    if (epilogue_t::msg_type_c == msg_type::block_2d) {
-      implementable &= kernel::block_2d<arch_tag, dtype_c>::check_tensor(
-          (uint64_t)(args.matC_base.base),
-          args.matrix_n,
-          args.matrix_m,
-          args.matC_ld);
-    } else {
-      implementable &= kernel::general_1d<arch_tag, dtype_c>::check_alignment(
-          args.matC_base.base, args.matC_ld);
-    }
-  }
+  // if (epilogue_t::msg_type_c != msg_type::unaligned_2d) {
+  //   if (epilogue_t::msg_type_c == msg_type::block_2d) {
+  //     implementable &= kernel::block_2d<arch_tag, dtype_c>::check_tensor(
+  //         (uint64_t)(args.matC_base.base),
+  //         args.matrix_n,
+  //         args.matrix_m,
+  //         args.matC_ld);
+  //   } else {
+  //     implementable &= kernel::general_1d<arch_tag, dtype_c>::check_alignment(
+  //         args.matC_base.base, args.matC_ld);
+  //   }
+  // }
 
   return implementable;
 }
