
Commit 324404e

Q4 cache: Add groupwise Hadamard transform
1 parent 740a19a

2 files changed: +54 -15

doc/qcache_eval.md (+19 -14)
@@ -15,20 +15,23 @@ The tl;dr:
Token-level perplexity tests for various full-precision and quantized models using FP16, FP8 and Q4 cache
modes. Dataset is The Pile, 10 rows of 512 tokens per test.

-Model | Precision | FP16 cache | FP8 cache | Q4 cache
---------|-----------|---------------|-----------|---------
-Mistral 7B Instruct | 3.0 bpw | 13.33 | 13.43 | 13.41
--- | 3.5 bpw | 13.07 | 13.14 | 13.12
--- | 4.0 bpw | 12.90 | 12.90 | 12.90
--- | 5.0 bpw | 12.73 | 12.73 | 12.75
--- | 6.0 bpw | 12.73 | 12.75 | 12.74
--- | FP16 | 12.69 | 12.71 | 12.72
-Mixtral 8x7B | 3.5 bpw | 10.27 | 10.41 | 10.39
--- | 4.0 bpw | 10.09 | 10.26 | 10.23
--- | 5.0 bpw | 10.02 | 10.16 | 10.15
-Llama2 7B | 4.0 bpw | 11.43 | 11.92 | 11.74
--- | 5.0 bpw | 11.13 | 11.40 | 11.31
--- | FP16 | 10.91 | 11.24 | 11.16
+Results are updated for the new method, which uses Hadamard rotations on the keys/values. Old results for
+version 0.0.18 and prior are kept for reference.
+
+Model | Precision | FP16 cache | FP8 cache | Q4 cache (old) | Q4 cache
+--------|---------|-------------|-----------|-------|----------
+Mistral 7B Instruct | 3.0 bpw | **13.33** | 13.43 | 13.41 | **13.37**
+-- | 3.5 bpw | **13.07** | 13.14 | 13.12 | **13.09**
+-- | 4.0 bpw | **12.90** | 12.90 | 12.90 | **12.90**
+-- | 5.0 bpw | **12.73** | 12.73 | 12.75 | **12.75**
+-- | 6.0 bpw | **12.73** | 12.75 | 12.74 | **12.74**
+-- | FP16 | **12.69** | 12.71 | 12.72 | **12.69**
+Mixtral 8x7B | 3.5 bpw | **10.27** | 10.41 | 10.39 | **10.32**
+-- | 4.0 bpw | **10.09** | 10.26 | 10.23 | **10.19**
+-- | 5.0 bpw | **10.02** | 10.16 | 10.15 | **10.04**
+Llama2 7B | 4.0 bpw | **11.43** | 11.92 | 11.74 | **11.60**
+-- | 5.0 bpw | **11.13** | 11.40 | 11.31 | **11.19**
+-- | FP16 | **10.91** | 11.24 | 11.16 | **11.05**
### HumanEval
@@ -37,6 +40,8 @@ The following are HumanEval tests on various full-precision and quantized models
respectively. Number of samples per task is limited to 10 (still giving 39360 completions in total, produced
over about 24 hours.)

+The following tests were done prior to the improvements in 0.0.18-dev.
+
#### pass@1

Model | Precision | FP16 cache | Q4 cache | diff
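The doc change states the idea but not the motivation: the Q4 cache quantizes in groups against a per-group absmax (visible in the kernel below), and a single outlier inflates that absmax, so the small values in the group lose most of their 4-bit precision. Rotating the group by a Hadamard matrix first spreads the outlier's energy across all elements. A toy host-side sketch (illustrative only, not code from this repo; a 4-point transform instead of the kernels' 32-point one):

```cuda
// Toy demo (not from this repo): rotating a group by an unnormalized
// Hadamard transform spreads one outlier across all elements, so the
// per-group absmax that sets the 4-bit step size is no longer
// dominated by a single value.
#include <cstdio>

int main()
{
    float v[4] = { 8.0f, 0.1f, -0.2f, 0.1f };   // absmax 8.0, set by one outlier

    // Two butterfly stages = 4-point Walsh-Hadamard transform
    for (int i = 1; i < 4; i <<= 1)
        for (int t = 0; t < 4; t++)
            if (!(t & i))
            {
                float a = v[t], b = v[t + i];
                v[t] = a + b;
                v[t + i] = a - b;
            }

    // Normalized output: every element is now ~4.0 in magnitude, so
    // quantization error is shared across the group instead of being
    // concentrated in the small values.
    for (int t = 0; t < 4; t++) printf("%6.2f\n", v[t] / 2.0f);   // 1/sqrt(4)
    return 0;
}
```

The kernels in cache.cu below do the same over two interleaved 32-element groups per warp, using `__shfl_xor_sync` instead of memory accesses.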

exllamav2/exllamav2_ext/cuda/cache.cu (+35 -1)
@@ -7,6 +7,7 @@
#define THREADS 32
#define BLOCKSIZE_Q 256
#define THREADS_Q (BLOCKSIZE_Q / 2)
+#define HADAMARD_Q4

// The upper 8 bits of FP16 are equivalent to FP8 E5M2.
//
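An aside on the context comment above: FP16 is laid out as 1 sign, 5 exponent and 10 mantissa bits, and FP8 E5M2 as 1-5-2, so keeping only the top byte of an FP16 value is a truncating conversion to E5M2. A minimal sketch of that observation (the helper name is hypothetical, not from this file):

```cuda
#include <cstdint>

// FP16 (1-5-10) and FP8 E5M2 (1-5-2) share sign and exponent layout,
// so the top byte of an FP16 bit pattern is its E5M2 truncation
// (rounding toward zero in the mantissa).
__host__ __device__ inline uint8_t fp16_bits_to_fp8_e5m2(uint16_t h16)
{
    return static_cast<uint8_t>(h16 >> 8);
}
```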
@@ -164,6 +165,22 @@ __global__ void fp16_to_q4_kv_kernel
    half2 w2 = in2[t];
    half2 o = w2;

+    // Perform Hadamard transform on two interleaved 32-element groups. Don't scale the output by 1/sqrt(32)
+    // here; instead, scale by 1/32 when dequantizing
+
+    #ifdef HADAMARD_Q4
+
+    for (int i = 1; i < 32; i <<= 1)
+    {
+        half2 pw2 = __shfl_xor_sync(0xffffffff, w2, i, 32);
+        uint32_t* w2i = reinterpret_cast<uint32_t*>(&w2);
+        int32_t sfm = -static_cast<int32_t>(t & i) >> 31;
+        *w2i ^= (sfm & 0x80008000);
+        w2 = __hadd2(w2, pw2);
+    }
+
+    #endif
+
    // Max abs value for lane_id 0..15, 16..31

    half2 absmax2 = __habs2(w2);
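For reference, the warp-shuffle loop above is a five-stage butterfly network: `__shfl_xor_sync(..., i, 32)` fetches the value held by partner lane `t ^ i`, and `sfm` arithmetic-shifts bit `i` of the lane index into an all-ones mask, so lanes with that bit set flip the signs of both packed halves (XOR with 0x80008000) before adding. The effect, written as a plain host-side sketch (not repo code), is the unnormalized 32-point Walsh-Hadamard transform:

```cuda
// Reference: unnormalized 32-point Walsh-Hadamard transform, in place.
// Five butterfly stages, matching the kernel's loop over i = 1, 2, 4, 8, 16.
void hadamard32_ref(float* v)
{
    for (int i = 1; i < 32; i <<= 1)
        for (int t = 0; t < 32; t++)
            if (!(t & i))
            {
                float a = v[t];
                float b = v[t + i];
                v[t]     = a + b;   // lane with bit i clear gets the sum
                v[t + i] = a - b;   // lane with bit i set gets the difference
            }
}
```

Because each stage computes (a + b, a - b) with coefficients of magnitude 1, applying the transform twice multiplies the input by 32; the kernel defers all of that scaling to dequantization.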
@@ -176,7 +193,7 @@ __global__ void fp16_to_q4_kv_kernel

    // Normalize

-    half2 c_8 = __half2half2(__int2half_rn(8));
+    half2 c_8 = __half2half2(__float2half_rn(8));
    half c_i = __float2half_rn(1.0f / 8.0f);

    w2 = __h2div(w2, absmax2);
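The one-line change here swaps the integer conversion intrinsic `__int2half_rn(8)` for the float one; the constant's value is unchanged. Reading off the visible constants (a reconstruction, since the rounding and packing code falls outside this hunk): dividing by the per-group absmax m puts values in [-1, 1], `c_8 = 8` scales them to the 4-bit range, and `c_i = 1/8` is the inverse factor used on the way back:

```latex
\[
  w \;\longmapsto\; \frac{8\,w}{m} \in [-8,\,8],
  \qquad
  \hat{w} \;=\; \frac{m}{8} \cdot q
  \quad\text{where } q \text{ is the stored 4-bit value.}
\]
```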
@@ -255,6 +272,23 @@ __global__ void q4_to_fp16_kv_kernel
    half2 w2 = __halves2half2(w0, w1);
    w2 = __hmul2(w2, scale2);

+    // Perform Hadamard transform on two interleaved 32-element groups. Scaling was skipped when quantizing,
+    // so the result is scaled by 1/32 here
+
+    #ifdef HADAMARD_Q4
+
+    for (int i = 1; i < 32; i <<= 1)
+    {
+        half2 pw2 = __shfl_xor_sync(0xffffffff, w2, i, 32);
+        uint32_t* w2i = reinterpret_cast<uint32_t*>(&w2);
+        int32_t sfm = -static_cast<int32_t>(t & i) >> 31;
+        *w2i ^= (sfm & 0x80008000);
+        w2 = __hadd2(w2, pw2);
+    }
+    w2 = __hmul2(w2, __float2half2_rn(1.0f / 32.0f));
+
+    #endif
+
    // Store

    half2* out2 = (half2*) (out + block_offset);
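The single multiply by 1/32 at the end of the inverse transform follows from the standard identity for the unnormalized (entries of magnitude 1, symmetric Sylvester-construction) Hadamard matrix:

```latex
\[
  H_{32} H_{32} \;=\; 32\, I_{32}
  \quad\Longrightarrow\quad
  \tfrac{1}{32}\, H_{32}\bigl(H_{32}\, x\bigr) \;=\; x ,
\]
```

so skipping the 1/sqrt(32) normalization in both kernels and applying one 1/32 factor here recovers the original keys/values up to quantization error.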
