Skip to content

Commit 5808fcf

Browse files
committed
Use full range for q4_2 quantization
1 parent d09f97e commit 5808fcf

File tree

1 file changed

+11
-10
lines changed

1 file changed

+11
-10
lines changed

ggml.c

+11 -10
Original file line number | Diff line number | Diff line change
@@ -1200,13 +1200,17 @@ static void quantize_row_q4_2_reference(const float * restrict x, block_q4_2 * r
12001200

12011201
for (int i = 0; i < nb; i++) {
12021202
float amax = 0.0f; // absolute max
1203+
float max = 0.0f;
12031204

12041205
for (int l = 0; l < QK4_2; l++) {
12051206
const float v = x[i*QK4_2 + l];
1206-
amax = MAX(amax, fabsf(v));
1207+
if (amax < fabsf(v)) {
1208+
amax = fabsf(v);
1209+
max = v;
1210+
}
12071211
}
12081212

1209-
const float d = amax / ((1 << 3) - 1);
1213+
const float d = max / -8;
12101214

12111215
const float id = d ? 1.0f/d : 0.0f;
12121216

@@ -1216,8 +1220,8 @@ static void quantize_row_q4_2_reference(const float * restrict x, block_q4_2 * r
12161220
const float v0 = x[i*QK4_2 + l + 0]*id;
12171221
const float v1 = x[i*QK4_2 + l + 1]*id;
12181222

1219-
const uint8_t vi0 = (uint8_t)(v0 + 8.5f);
1220-
const uint8_t vi1 = (uint8_t)(v1 + 8.5f);
1223+
const uint8_t vi0 = MIN(15, (uint8_t)(v0 + 8.5f));
1224+
const uint8_t vi1 = MIN(15, (uint8_t)(v1 + 8.5f));
12211225

12221226
assert(vi0 < 16);
12231227
assert(vi1 < 16);
@@ -1311,9 +1315,7 @@ static void quantize_row_q4_2(const float * restrict x, void * restrict vy, int
13111315

13121316
block_q4_2 * restrict y = vy;
13131317

1314-
//quantize_row_q4_2_reference(x, y, k);
1315-
// This produces the exact same format, just better match to the input floats ("better" as measured by RMSE)
1316-
quantize_row_q4_2_rmse(x, y, k);
1318+
quantize_row_q4_2_reference(x, y, k);
13171319
}
13181320

13191321
static void quantize_row_q4_3_reference(const float * restrict x, block_q4_3 * restrict y, int k) {
@@ -1864,7 +1866,7 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
18641866
[GGML_TYPE_Q4_2] = {
18651867
.dequantize_row_q = dequantize_row_q4_2,
18661868
.quantize_row_q = quantize_row_q4_2,
1867-
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_2_rmse, //quantize_row_q4_2_reference,
1869+
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_2_reference,
18681870
.quantize_row_q_dot = quantize_row_q8_0,
18691871
.vec_dot_q = ggml_vec_dot_q4_2_q8_0,
18701872
},
@@ -12196,8 +12198,7 @@ size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t *
1219612198
for (int j = 0; j < n; j += k) {
1219712199
block_q4_2 * restrict y = (block_q4_2 *)dst + j/QK4_2;
1219812200

12199-
//quantize_row_q4_2_reference(src + j, y, k);
12200-
quantize_row_q4_2_rmse(src + j, y, k);
12201+
quantize_row_q4_2_reference(src + j, y, k);
1220112202

1220212203
for (int i = 0; i < nb; i++) {
1220312204
for (int l = 0; l < QK4_2; l += 2) {

0 commit comments

Comments
 (0)