feat: add argument shape, device, contiguity, and workspace-size checks in torchao/ops.py

Diogo-V · Diogo-V · commit 26cfc08bec7a · 2024-08-20T18:45:01.000Z
diff --git a/torchao/dtypes/affine_quantized_tensor.py b/torchao/dtypes/affine_quantized_tensor.py
@@ -1053,11 +1053,14 @@ def _linear_fp_act_int4_weight_sparse_marlin_impl(input_tensor, weight_tensor, b
     original_shape = weight_tensor.layout_tensor.original_shape
     num_bits = weight_tensor.layout_tensor.num_bits
 
+    size_m = input_tensor.shape[0]
+    size_n = original_shape[0]
+    size_k = input_tensor.shape[1]
     workspace_24 = marlin_24_workspace(original_shape[1])
 
     out = torchao.ops.marlin_24_gemm(
-        input_tensor, sparse_w_int4, meta, scale, workspace_24, 
-        num_bits, input_tensor.shape[0], original_shape[1], input_tensor.shape[1],
+        input_tensor, sparse_w_int4, meta, scale, 
+        workspace_24, num_bits, size_m, size_n, size_k
     )
     torch.cuda.synchronize()
 
diff --git a/torchao/ops.py b/torchao/ops.py
@@ -204,5 +204,60 @@ def _(
     size_n: int,
     size_k: int,
 ) -> Tensor:
-    # NOTE: Checks in kernel
+    TILE_SIZE = 16
+    MIN_THREAD_N = 128
+    MAX_PARALLELISM = 64
+
+    # Verify num_bits
+    torch._check(bits == 4 or bits == 8, lambda: f"num_bits must be 4 or 8. Got = {bits}")
+    pack_factor = 32 // bits
+
+    # Verify M
+    torch._check(size_m == x.size(0), lambda: f"Shape mismatch: x.size(0) = {x.size(0)}, size_m = {size_m}")
+
+    # Verify K
+    torch._check(size_k == x.size(1), lambda: f"Shape mismatch: x.size(1) = {x.size(1)}, size_k = {size_k}")
+    torch._check(size_k % TILE_SIZE == 0, lambda: f"size_k = {size_k} is not divisible by tile_size = {TILE_SIZE}")
+    torch._check((size_k // TILE_SIZE // 2) == weight_marlin.size(0), lambda: f"Shape mismatch: weight_marlin.size(0) = {weight_marlin.size(0)}, size_k = {size_k}, tile_size = {TILE_SIZE}")
+
+    # Verify N
+    torch._check(s.size(1) == size_n, lambda: f"s.size(1) = {s.size(1)}, size_n = {size_n}")
+    torch._check(weight_marlin.size(1) % TILE_SIZE == 0, lambda: f"weight_marlin.size(1) = {weight_marlin.size(1)} is not divisible by tile_size = {TILE_SIZE}")
+
+    actual_size_n = (weight_marlin.size(1) // TILE_SIZE) * pack_factor
+    torch._check(size_n == actual_size_n, lambda: f"size_n = {size_n}, actual_size_n = {actual_size_n}")
+
+    # Verify meta
+    torch._check(meta.size(0) == size_k // 8 // 2 // 2, lambda: f"meta.size(0) = {meta.size(0)} is not size_k / 8 / 2 / 2 = {size_k // 8 // 2 // 2}")
+    torch._check(meta.size(1) == size_n * 2, lambda: f"meta.size(1) = {meta.size(1)} is not size_n * 2 = {size_n * 2}")
+
+    # Verify A device and strides
+    torch._check(x.is_cuda, lambda: "x is not on GPU")
+    torch._check(x.is_contiguous(), lambda: "x is not contiguous")
+
+    # Verify B device and strides
+    torch._check(weight_marlin.is_cuda, lambda: "weight_marlin is not on GPU")
+    torch._check(weight_marlin.is_contiguous(), lambda: "weight_marlin is not contiguous")
+
+    # Verify meta device and strides
+    torch._check(meta.is_cuda, lambda: "meta is not on GPU")
+    torch._check(meta.is_contiguous(), lambda: "meta is not contiguous")
+
+    # Verify scales device and strides
+    torch._check(s.is_cuda, lambda: "s is not on GPU")
+    torch._check(s.is_contiguous(), lambda: "s is not contiguous")
+
+    # Verify groupsize
+    groupsize = -1
+    if s.size(0) > 1:
+        torch._check(size_k % s.size(0) == 0, lambda: f"size_k = {size_k} is not divisible by s.size(0) = {s.size(0)}")
+        groupsize = size_k // s.size(0)
+        groupsize //= 2  # Because of 24
+    torch._check(groupsize == -1 or groupsize == 64, lambda: f"Unexpected groupsize = {groupsize}")
+
+    # Verify workspace size
+    torch._check(size_n % MIN_THREAD_N == 0, lambda: f"size_n = {size_n} is not divisible by min_thread_n = {MIN_THREAD_N}")
+    min_workspace_size = (size_n // MIN_THREAD_N) * MAX_PARALLELISM
+    torch._check(workspace.numel() >= min_workspace_size, lambda: f"workspace.numel = {workspace.numel()} is below min_workspace_size = {min_workspace_size}")
+
     return torch.empty((x.size(0), s.size(1)), dtype=x.dtype, device=x.device)