@@ -1200,13 +1200,17 @@ static void quantize_row_q4_2_reference(const float * restrict x, block_q4_2 * r
1200
1200
1201
1201
for (int i = 0 ; i < nb ; i ++ ) {
1202
1202
float amax = 0.0f ; // absolute max
1203
+ float max = 0.0f ;
1203
1204
1204
1205
for (int l = 0 ; l < QK4_2 ; l ++ ) {
1205
1206
const float v = x [i * QK4_2 + l ];
1206
- amax = MAX (amax , fabsf (v ));
1207
+ if (amax < fabsf (v )) {
1208
+ amax = fabsf (v );
1209
+ max = v ;
1210
+ }
1207
1211
}
1208
1212
1209
- const float d = amax / (( 1 << 3 ) - 1 ) ;
1213
+ const float d = max / -8 ;
1210
1214
1211
1215
const float id = d ? 1.0f /d : 0.0f ;
1212
1216
@@ -1216,8 +1220,8 @@ static void quantize_row_q4_2_reference(const float * restrict x, block_q4_2 * r
1216
1220
const float v0 = x [i * QK4_2 + l + 0 ]* id ;
1217
1221
const float v1 = x [i * QK4_2 + l + 1 ]* id ;
1218
1222
1219
- const uint8_t vi0 = ( uint8_t )(v0 + 8.5f );
1220
- const uint8_t vi1 = ( uint8_t )(v1 + 8.5f );
1223
+ const uint8_t vi0 = MIN ( 15 , ( uint8_t )(v0 + 8.5f ) );
1224
+ const uint8_t vi1 = MIN ( 15 , ( uint8_t )(v1 + 8.5f ) );
1221
1225
1222
1226
assert (vi0 < 16 );
1223
1227
assert (vi1 < 16 );
@@ -1311,9 +1315,7 @@ static void quantize_row_q4_2(const float * restrict x, void * restrict vy, int
1311
1315
1312
1316
block_q4_2 * restrict y = vy ;
1313
1317
1314
- //quantize_row_q4_2_reference(x, y, k);
1315
- // This produces the exact same format, just better match to the input floats ("better" as measured by RMSE)
1316
- quantize_row_q4_2_rmse (x , y , k );
1318
+ quantize_row_q4_2_reference (x , y , k );
1317
1319
}
1318
1320
1319
1321
static void quantize_row_q4_3_reference (const float * restrict x , block_q4_3 * restrict y , int k ) {
@@ -1864,7 +1866,7 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
1864
1866
[GGML_TYPE_Q4_2 ] = {
1865
1867
.dequantize_row_q = dequantize_row_q4_2 ,
1866
1868
.quantize_row_q = quantize_row_q4_2 ,
1867
- .quantize_row_q_reference = (quantize_row_q_t ) quantize_row_q4_2_rmse , // quantize_row_q4_2_reference,
1869
+ .quantize_row_q_reference = (quantize_row_q_t ) quantize_row_q4_2_reference ,
1868
1870
.quantize_row_q_dot = quantize_row_q8_0 ,
1869
1871
.vec_dot_q = ggml_vec_dot_q4_2_q8_0 ,
1870
1872
},
@@ -12196,8 +12198,7 @@ size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t *
12196
12198
for (int j = 0 ; j < n ; j += k ) {
12197
12199
block_q4_2 * restrict y = (block_q4_2 * )dst + j /QK4_2 ;
12198
12200
12199
- //quantize_row_q4_2_reference(src + j, y, k);
12200
- quantize_row_q4_2_rmse (src + j , y , k );
12201
+ quantize_row_q4_2_reference (src + j , y , k );
12201
12202
12202
12203
for (int i = 0 ; i < nb ; i ++ ) {
12203
12204
for (int l = 0 ; l < QK4_2 ; l += 2 ) {
0 commit comments