Commit 4c63c65

Add CUTLASS-based row-wise scaled sparse FP8 kernel
1 parent ceceea5 commit 4c63c65

21 files changed: +1336 -76 lines
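
The new op, rowwise_scaled_linear_sparse_cutlass_f8f8(A, A_scale, B_sp, B_meta, B_scale, bias), multiplies an FP8 activation A by a 2:4-compressed FP8 weight (B_sp plus metadata B_meta, produced by to_sparse_semi_structured_cutlass_sm9x_f8) and applies a per-row scale to each operand, as exercised by the benchmark and test below. The following standalone Python sketch is illustrative only and not part of the commit; it restates the dense reference math the new test checks the kernel against.

import torch

# Dense reference for the row-wise scaled sparse FP8 matmul (illustrative only;
# requires a PyTorch build with float8 dtypes):
#   out = (A @ B.T) * A_scale[:, None] * B_scale[None, :] (+ bias)
# where A is the FP8 activation, B the 2:4-sparse FP8 weight, and both scales
# are applied per row.
m, n, k = 16, 32, 64
A = torch.randn(m, k, dtype=torch.half).to(torch.float8_e5m2)
A_scale = torch.randn(m, dtype=torch.half)
B = torch.randn(n, k, dtype=torch.half).to(torch.float8_e4m3fn)
B_scale = torch.randn(n, dtype=torch.half)

out_ref = (A.float() @ B.float().T) * A_scale.view(m, 1) * B_scale.view(1, n)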
New file (benchmark script):

@@ -0,0 +1,62 @@
import pandas as pd
import torch
from tqdm import tqdm
from triton.testing import do_bench

from torchao.ops import (
    rowwise_scaled_linear_sparse_cutlass_f8f8,
    to_sparse_semi_structured_cutlass_sm9x_f8,
)


def benchmark_microseconds(f, *args):
    return do_bench(lambda: f(*args), return_mode="median") * 1e3


def get_problem(m: int, n: int, k: int):
    dev = torch.device("cuda")

    A = torch.randn((m, k), dtype=torch.half, device=dev).to(torch.float8_e5m2)
    A_scale = torch.randn((m,), dtype=torch.half, device=dev)
    B = torch.randn((n, k), dtype=torch.half, device=dev).to(torch.float8_e4m3fn)
    B_sp, B_meta = to_sparse_semi_structured_cutlass_sm9x_f8(B)
    B_scale = torch.randn((n,), dtype=torch.half, device=dev)
    C = None

    return A, A_scale, B_sp, B_meta, B_scale, C


def benchmark(m: int, k: int, n: int):
    dev = torch.device("cuda")
    A_ref = torch.randn((m, k), dtype=torch.half, device=dev)
    B_ref = torch.randn((n, k), dtype=torch.half, device=dev)
    fp16_time = benchmark_microseconds(torch.nn.functional.linear, A_ref, B_ref)

    A, A_scale, B_sp, B_meta, B_scale, C = get_problem(m, n, k)
    rowwise_scaled_linear_sparse_cutlass_f8f8_time = benchmark_microseconds(
        rowwise_scaled_linear_sparse_cutlass_f8f8, A, A_scale, B_sp, B_meta, B_scale, C
    )

    return {
        "m": m,
        "k": k,
        "n": n,
        "fp16_latency (ms)": fp16_time,
        "rowwise_scaled_linear_sparse_cutlass_f8f8 latency (ms)": rowwise_scaled_linear_sparse_cutlass_f8f8_time,
        "f8f8 speedup (d/s)": fp16_time
        / rowwise_scaled_linear_sparse_cutlass_f8f8_time,
    }


if __name__ == "__main__":
    k_vals = (8192, 8192, 8192, 28672)
    n_vals = (8192, 10240, 57344, 8192)

    results = []
    for m in tqdm([1 << i for i in range(10)]):
        for n, k in zip(n_vals, k_vals):
            results.append(benchmark(m, k, n))

    df = pd.DataFrame(results)
    df.to_csv("rowwise_scaled_linear_sparse_cutlass_time_results.csv", index=False)
    print(df.to_markdown(index=False))
docs/source/api_ref_dtypes.rst

+1

@@ -28,6 +28,7 @@ Layouts and Tensor Subclasses
     MarlinQQQLayout
     Int4CPULayout
     CutlassInt4PackedLayout
+    CutlassSemiSparseLayout

 Quantization techniques
 -----------------------

setup.py

+12

@@ -265,6 +265,18 @@ def get_extensions():
                "-I" + cutlass_include_dir,
                "-I" + cutlass_tools_include_dir,
                "-I" + cutlass_extensions_include_dir,
+               "-DNDEBUG" if not debug_mode else "",
+               "-DCUTE_USE_PACKED_TUPLE=1",
+               "-DCUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED",
+               "-DCUTLASS_ENABLE_TENSOR_CORE_MMA=1",
+               "-DCUTLASS_DEBUG_TRACE_LEVEL=0",
+               "--use_fast_math",
+               "--ftemplate-backtrace-limit=0",
+               # "--keep",
+               # "--ptxas-options=--verbose,--register-usage-level=5,--warn-on-local-memory-usage",
+               # "--resource-usage",
+               # "-lineinfo",
+               # "-DCUTLASS_ENABLE_GDC_FOR_SM90", # https://github.com/NVIDIA/cutlass/blob/main/media/docs/dependent_kernel_launch.md
            ]
        )
    else:

(A comma is added after "-DCUTE_USE_PACKED_TUPLE=1"; without it the two string literals concatenate into a single malformed nvcc flag.)

test/test_rowwise_scaled_linear_cutlass.py

+1 -1

@@ -57,7 +57,7 @@ def run_test_for_op(op, xq_bits, wq_bits, dtype, batch_size, size_mnk, use_bias)
     )
     assert torch.all(wq_zeros == 0)
     if wq_bits == 4:
-        wq = (wq_s8[:, 1::2] << 4) | (wq_s8[:, 0::2] & 0xF)
+        wq = (wq_s8[..., 1::2] << 4) | (wq_s8[..., 0::2] & 0xF)
     else:
         wq = wq_s8
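
The switch from [:, ...] to [..., ...] generalizes the int4 nibble packing from 2-D weights to weights with a leading batch dimension. A standalone illustration, not part of the diff:

import torch

# Pack pairs of int4 values (stored in int8) into single bytes; "..." keeps the
# expression valid for both plain (n, k) weights and batched (b, n, k) weights.
wq_s8 = torch.randint(-8, 8, (3, 4, 8), dtype=torch.int8)
packed = (wq_s8[..., 1::2] << 4) | (wq_s8[..., 0::2] & 0xF)
print(packed.shape)  # last dimension halved: torch.Size([3, 4, 4])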

New file (test script):

@@ -0,0 +1,125 @@
import itertools

import pytest
import torch
from torch.testing._internal.common_cuda import SM90OrLater

from torchao.dtypes import (
    Float8Layout,
    to_affine_quantized_floatx,
)
from torchao.ops import (
    rowwise_scaled_linear_sparse_cutlass_f8f8,
    to_sparse_semi_structured_cutlass_sm9x_f8,
)
from torchao.quantization.utils import _get_per_token_block_size
from torchao.sparsity.utils import create_semi_structured_tensor

X_W_DTYPES = [(torch.float16, torch.float16), (torch.bfloat16, torch.bfloat16)]
XQ_WQ_DTYPES = [
    (torch.float8_e5m2, torch.float8_e4m3fn),
    (torch.float8_e4m3fn, torch.float8_e4m3fn),
]
BATCH_SIZE = [1, 4]
SIZE_MNK = [
    (2, 128, 256),
    (3, 128, 256),
    (13, 128, 256),
    (27, 128, 128),
    (33, 128, 64),
    (65, 128, 32),
]
USE_BIAS = [False, True]
BIAS_DTYPE = [torch.float16]
TEST_PARAMS = list(
    itertools.product(
        X_W_DTYPES,
        XQ_WQ_DTYPES,
        BATCH_SIZE,
        SIZE_MNK,
        USE_BIAS,
        BIAS_DTYPE,
    )
)


def run_test_for_op(
    op,
    x_dtype,
    w_dtype,
    xq_dtype,
    wq_dtype,
    batch_size,
    size_mnk,
    use_bias,
    bias_dtype,
):
    size_m, size_n, size_k = size_mnk

    x = torch.randn((batch_size, size_m, size_k), dtype=x_dtype, device="cuda")
    w = create_semi_structured_tensor(size_n, size_k, dtype=w_dtype)
    bias = torch.rand((size_n,), dtype=bias_dtype, device="cuda") if use_bias else None

    x_aqt = to_affine_quantized_floatx(
        input_float=x,
        target_dtype=xq_dtype,
        block_size=_get_per_token_block_size(x),
        _layout=Float8Layout(mm_config=None),
    )
    xq, xq_scales, zero_points = x_aqt.tensor_impl.get_plain()
    assert zero_points is None

    w_aqt = to_affine_quantized_floatx(
        input_float=w,
        target_dtype=wq_dtype,
        block_size=_get_per_token_block_size(w),
        _layout=Float8Layout(mm_config=None),
    )
    wq, wq_scales, zero_points = w_aqt.tensor_impl.get_plain()
    assert zero_points is None
    wq_sp, wq_sp_meta = to_sparse_semi_structured_cutlass_sm9x_f8(wq)
    wq_sp_scales = wq_scales

    xq_2d = xq.view(-1, xq.shape[-1])
    size_m_2d = xq_2d.shape[0]
    output_ref = (
        (xq_2d.float() @ wq.float().T)
        * xq_scales.view(size_m_2d, 1)
        * wq_scales.view(1, size_n)
    )
    if bias is not None:
        output_ref += bias
    output_ref = output_ref.to(x.dtype).reshape(x.shape[:-1] + (size_n,))

    fn_inputs = (xq, xq_scales, wq_sp, wq_sp_meta, wq_sp_scales, bias)
    try:
        output = op(*fn_inputs)
    except NotImplementedError:
        pytest.xfail("operator not implemented")

    torch.testing.assert_close(output, output_ref, rtol=1e-2, atol=5e-3)


@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
@pytest.mark.skipif(not SM90OrLater, reason="FP8 is only supported on H100+ devices")
@pytest.mark.parametrize(
    "x_w_dtypes, xq_wq_dtypes, batch_size, size_mnk, use_bias, bias_dtype",
    TEST_PARAMS,
)
def test_rowwise_scaled_linear_sparse_cutlass_f8f8(
    x_w_dtypes,
    xq_wq_dtypes,
    batch_size,
    size_mnk,
    use_bias,
    bias_dtype,
):
    run_test_for_op(
        rowwise_scaled_linear_sparse_cutlass_f8f8,
        *x_w_dtypes,
        *xq_wq_dtypes,
        batch_size,
        size_mnk,
        use_bias,
        bias_dtype,
    )

torchao/_models/llama/generate.py

+42 -20

@@ -334,11 +334,13 @@ def ffn_or_attn_only(mod, fqn):

     if quantization:
         from torchao.quantization import (
+            Float8DynamicActivationFloat8SemiSparseWeightConfig,
             autoquant,
             float8_dynamic_activation_float8_weight,
             float8_weight_only,
             fpx_weight_only,
             gemlite_uintx_weight_only,
+            int4_dynamic_activation_int4_weight,
             int4_weight_only,
             int8_dynamic_activation_int4_weight,
             int8_dynamic_activation_int8_weight,

@@ -434,18 +436,30 @@ def ffn_or_attn_only(mod, fqn):
            ]
        ), f"int4wo group_size needs to be one of [32,64,128,256] but got {group_size}"
        quantize_(model, int4_weight_only(group_size=group_size, use_hqq=use_hqq))
-    elif "int8adq-int4w-symm" in quantization:
+    elif "int4dq-" in quantization:
         from torchao.dtypes import CutlassInt4PackedLayout

-        quantize_(
-            model,
-            int8_dynamic_activation_int4_weight(
-                group_size=None,
-                mapping_type=MappingType.SYMMETRIC,
-                act_mapping_type=MappingType.SYMMETRIC,
-                layout=CutlassInt4PackedLayout(),
-            ),
-        )
+        nbits = int(quantization.removeprefix("int4dq-"))
+        assert nbits == 4 or nbits == 8
+        if nbits == 4:
+            quantize_(
+                model,
+                int4_dynamic_activation_int4_weight(
+                    mapping_type=MappingType.SYMMETRIC,
+                    act_mapping_type=MappingType.SYMMETRIC,
+                    layout=CutlassInt4PackedLayout(),
+                ),
+            )
+        elif nbits == 8:
+            quantize_(
+                model,
+                int8_dynamic_activation_int4_weight(
+                    group_size=None,
+                    mapping_type=MappingType.SYMMETRIC,
+                    act_mapping_type=MappingType.SYMMETRIC,
+                    layout=CutlassInt4PackedLayout(),
+                ),
+            )
     if "marlin" in quantization:
         if "qqq" in quantization:
             from torchao.dtypes import MarlinQQQLayout

@@ -564,16 +578,24 @@ def ffn_or_attn_only(mod, fqn):
     elif "float8wo" in quantization:
         quantize_(model, float8_weight_only())
     elif "float8dq" in quantization:
-        granularity = str(quantization.split("-")[-1])
-        if granularity == "tensor":
-            granularity = PerTensor()
-        elif granularity == "row":
-            granularity = PerRow()
+        if sparsity and "semi" in sparsity:
+            quantize_(
+                model,
+                Float8DynamicActivationFloat8SemiSparseWeightConfig(),
+                filter_fn=ffn_only
+            )
         else:
-            granularity = PerTensor()
-        quantize_(
-            model, float8_dynamic_activation_float8_weight(granularity=granularity)
-        )
+            granularity = str(quantization.split("-")[-1])
+            if granularity == "tensor":
+                granularity = PerTensor()
+            elif granularity == "row":
+                granularity = PerRow()
+            else:
+                granularity = PerTensor()
+            quantize_(
+                model,
+                float8_dynamic_activation_float8_weight(granularity=granularity),
+            )
     elif "autoquant_v2" in quantization:
         from torchao._models._eval import InputRecorder
         from torchao._models.llama.model import prepare_inputs_for_model

@@ -1130,7 +1152,7 @@ def callback(x):
        help=(
            "Which quantization techniques to apply: int8dq, int8wo, fp6, int4wo-<groupsize>, int4wo-<groupsize>-hqq, autoquant, "
            + "autoquant-int4, autoquant-gemlite-int4, autoquant-float8, autoquant-sparse, autoquant-all, uintx-<nbits>-<groupsize>, uintx-<nbits>-<groupsize>-hqq, sparse-marlin, spinquant, "
-           + "embed-int8wo, marlin_qqq, gemlite-<pack_bitwidth>-<nbits>-<groupsize>, int8adq-int4w-symm"
+           + "embed-int8wo, marlin_qqq, gemlite-<pack_bitwidth>-<nbits>-<groupsize>, float8dq, int4dq-<nbits>"
        ),
    )
    parser.add_argument(

torchao/csrc/cuda/rowwise_scaled_linear_cutlass/rowwise_scaled_linear_cutlass.cuh

+2

@@ -11,6 +11,8 @@
 #endif

 #if defined(BUILD_ROWWISE_SCALED_LINEAR_CUTLASS)
+#include <cuda_runtime.h>
+#include <cutlass/cutlass.h>
 #include <cutlass/gemm/device/gemm_universal.h>
 #include <cutlass/gemm/device/gemm_universal_adapter.h>
 #include <cutlass/gemm/kernel/default_gemm_universal_with_visitor.h>

torchao/csrc/cuda/rowwise_scaled_linear_cutlass/rowwise_scaled_linear_cutlass_s4s4.cu

+9 -8

@@ -13,15 +13,16 @@ rowwise_scaled_linear_cutlass_s4s4(
       __func__, " : The input datatypes combination ", xq.dtype(),
       " for xq and ", wq.dtype(), " for wq is not supported");

+#if defined(BUILD_ROWWISE_SCALED_LINEAR_CUTLASS)
   // Dispatch to appropriate kernel template.
-#if defined(BUILD_ROWWISE_SCALED_LINEAR_CUTLASS)
-  // We get ElementA/ElementB types from the header
-  return rowwise_scaled_linear_cutlass<cutlass::int4b_t, cutlass::int4b_t>(
-      xq, x_scale, wq, w_scale, bias);
-#else
-  TORCH_CHECK(false, "CUTLASS kernels not built - rowwise_scaled_linear_cutlass_s4s4 not available");
-  return at::Tensor{};
-#endif
+  using ElementA = cutlass::int4b_t;
+  using ElementB = cutlass::int4b_t;
+  return rowwise_scaled_linear_cutlass<ElementA, ElementB>(
+      xq, x_scale, wq, w_scale, bias);
+#else
+  TORCH_CHECK_NOT_IMPLEMENTED(false, OPERATOR_NAME);
+  return at::Tensor{};
+#endif
 }

 TORCH_LIBRARY_IMPL(torchao, CUDA, m) {

torchao/csrc/cuda/rowwise_scaled_linear_cutlass/rowwise_scaled_linear_cutlass_s8s4.cu

+6 -5

@@ -1,4 +1,5 @@
 #include <torch/library.h>
+
 #include "rowwise_scaled_linear_cutlass.cuh"

 namespace torchao {

@@ -13,13 +14,13 @@ rowwise_scaled_linear_cutlass_s8s4(
       " for xq and ", wq.dtype(), " for wq is not supported");

 #if defined(BUILD_ROWWISE_SCALED_LINEAR_CUTLASS)
-  // Define ElementA as int8_t since it's a standard type
+  // Dispatch to appropriate kernel template.
   using ElementA = int8_t;
-  // ElementB comes from cutlass header
-  return rowwise_scaled_linear_cutlass<ElementA, cutlass::int4b_t>(
-      xq, x_scale, wq, w_scale, bias);
+  using ElementB = cutlass::int4b_t;
+  return rowwise_scaled_linear_cutlass<ElementA, ElementB>(
+      xq, x_scale, wq, w_scale, bias);
 #else
-  TORCH_CHECK(false, "CUTLASS kernels not built - rowwise_scaled_linear_cutlass_s8s4 not available");
+  TORCH_CHECK_NOT_IMPLEMENTED(false, OPERATOR_NAME);
   return at::Tensor{};
 #endif
 }