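tile_load: hoist 2D load block width/height into ld_blk_width/ld_blk_height constants, re-enable max_trans_load_height_in_elem, and remove commented-out dead code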

sunjiweiswift · sunjiweiswift · commit 20c2056b72fb · 2024-08-29T02:41:27.000Z
diff --git a/include/subgroup/tile/impl/load_xe.hpp b/include/subgroup/tile/impl/load_xe.hpp
@@ -119,8 +119,9 @@ tile_load(tile_t& tile, payload_t& payload) {
   static constexpr uint32_t max_load_width_in_elem =
       load_store_attr::max_load_width_in_bytes / sizeof(dtype);
 
-  //   static constexpr uint32_t max_trans_load_height_in_elem =
-  //       load_store_attr::max_trans_load_height_in_elem;
+  static constexpr uint32_t max_trans_load_height_in_elem =
+      load_store_attr::max_trans_load_height_in_elem;
+
   static constexpr uint32_t max_load_height_in_elem =
       load_store_attr::max_load_height_in_elem;
 
@@ -130,6 +131,22 @@ tile_load(tile_t& tile, payload_t& payload) {
   static constexpr uint32_t elems_per_reg =
       register_bytes_t<arch_tag>::reg_in_bytes / sizeof(dtype);
 
+  static constexpr uint32_t max_ld_blk_width_in_elem =
+      trans ? max_trans_load_width_in_elem : max_load_width_in_elem;
+
+  static constexpr uint32_t max_ld_blk_height_in_elem =
+      trans ? max_trans_load_height_in_elem : max_load_height_in_elem;
+
+  static constexpr uint32_t ld_blk_width =
+      std::min(
+          mem_transpose ? block_size_y : block_size_x,
+          max_ld_blk_width_in_elem) /
+      scale_factor;
+  static constexpr uint32_t ld_blk_height = std::min(
+      mem_transpose ? block_size_x : block_size_y, max_ld_blk_height_in_elem);
+
+      
+
   static constexpr uint32_t ld_blk_size_y_limit =
       mem_transpose ? max_trans_load_width_in_elem : max_load_height_in_elem;
   static constexpr uint32_t ld_blk_size_y = reg_transpose
@@ -211,12 +228,11 @@ tile_load(tile_t& tile, payload_t& payload) {
             scale_factor;
         uint32_t address_offset_y =
             mem_transpose ? offset_x : (offset_y + ii * ld_blk_size_y);
+
         reg_tmp.xetla_format<native_type_t<load_dtype>>() = xetla_load_global<
             native_type_t<load_dtype>,
-            (trans ? ld_blk_size_y : block_size_x) / scale_factor,
-            (trans ? block_size_x : ld_blk_size_y),
-            // block_size_x / scale_factor,
-            // ld_blk_size_y,
+            ld_blk_width,
+            ld_blk_height,
             arr_len,
             trans,
             mem_transform,
@@ -261,11 +277,6 @@ tile_load(tile_t& tile, payload_t& payload) {
             (mem_transpose ? remained_blk_size_y : block_size_x) / scale_factor;
         constexpr uint8_t block_height =
             mem_transpose ? block_size_x : remained_blk_size_y;
-        // constexpr uint32_t block_widthx_widthy_arrlen =
-        //     (block_width - 1) | ((block_height - 1) << 8);
-        // gpu::xetla::detail::xetla_set_block_widthx_widthy_arrlen(
-        //     tdesc.xetla_format<uint32_t>(), block_widthx_widthy_arrlen);
-
         reg_blk.xetla_select<load_elems, 1>(remained_start)
             .xetla_format<native_type_t<load_dtype>>() = xetla_load_global<
             native_type_t<load_dtype>,
@@ -283,15 +294,6 @@ tile_load(tile_t& tile, payload_t& payload) {
             payload.surface_pitch,
             payload.offset_x + offset_x / scale_factor,
             payload.offset_y + offset_y + remained_start_y);
-
-        // xetla_tload_global<
-        // load_dtype,
-        // (load_elems / scale_factor),
-        // L1,
-        // L2,
-        // trans,
-        // mem_transform,
-        // arch_tag>(tdesc);
       }
     }
   }
@@ -304,15 +306,7 @@ tile_load(tile_t& tile, payload_t& payload) {
         (!reg_transpose && (remained_size_y > ld_blk_size_y_limit))
         ? ld_blk_size_y_limit
         : remained_size_y;
-    // auto payload_row = payload_2d.xetla_select<num_block_x, 1, 16, 1>(
-    //     num_block_y * num_block_x, 0);
-    // detail::reset_tile_desc_core<
-    //     num_block_x,
-    //     block_size_x,
-    //     remained_ld_blk_size_y,
-    //     scale_factor,
-    //     arr_len,
-    //     mem_transpose>(payload_row);
+
 #pragma unroll
     for (uint32_t j = 0; j < num_block_x; j += arr_len) {
       int32_t offset_x = j * block_size_x;