@@ -740,19 +740,19 @@ inline static float vaddvq_f32(float32x4_t v) {
740
740
return vgetq_lane_f32 (v , 0 ) + vgetq_lane_f32 (v , 1 ) + vgetq_lane_f32 (v , 2 ) + vgetq_lane_f32 (v , 3 );
741
741
}
742
742
743
- float vminvq_f32 (float32x4_t v ) {
743
+ inline static float vminvq_f32 (float32x4_t v ) {
// Returns the smallest of the four float lanes of v.
// Each lane is read with vgetq_lane_f32; lanes 0/1 and 2/3 are reduced pairwise
// with MIN, and the two partial results are reduced with MIN again.
// The "+" signature above differs from the "-" one only by adding `inline static`.
// NOTE(review): MIN is defined outside this excerpt; if it is a classic
// function-like macro its arguments may be evaluated more than once - confirm.
// NOTE(review): presumably a fallback for targets without the native
// intrinsic of the same name - confirm against the enclosing #if.
744
744
return
745
745
MIN (MIN (vgetq_lane_f32 (v , 0 ), vgetq_lane_f32 (v , 1 )),
746
746
MIN (vgetq_lane_f32 (v , 2 ), vgetq_lane_f32 (v , 3 )));
747
747
}
748
748
749
- float vmaxvq_f32 (float32x4_t v ) {
749
+ inline static float vmaxvq_f32 (float32x4_t v ) {
// Returns the largest of the four float lanes of v.
// Each lane is read with vgetq_lane_f32; lanes 0/1 and 2/3 are reduced pairwise
// with MAX, and the two partial results are reduced with MAX again.
// The "+" signature above differs from the "-" one only by adding `inline static`.
// NOTE(review): MAX is defined outside this excerpt; if it is a classic
// function-like macro its arguments may be evaluated more than once - confirm.
// NOTE(review): presumably a fallback for targets without the native
// intrinsic of the same name - confirm against the enclosing #if.
750
750
return
751
751
MAX (MAX (vgetq_lane_f32 (v , 0 ), vgetq_lane_f32 (v , 1 )),
752
752
MAX (vgetq_lane_f32 (v , 2 ), vgetq_lane_f32 (v , 3 )));
753
753
}
754
754
755
- int32x4_t vcvtnq_s32_f32 (float32x4_t v ) {
755
+ inline static int32x4_t vcvtnq_s32_f32 (float32x4_t v ) {
756
756
int32x4_t res ;
757
757
758
758
res [0 ] = roundf (vgetq_lane_f32 (v , 0 ));
@@ -766,7 +766,6 @@ int32x4_t vcvtnq_s32_f32(float32x4_t v) {
766
766
#endif
767
767
#endif
768
768
769
-
770
769
#define QK4_0 32
771
770
typedef struct {
772
771
ggml_fp16_t d ; // delta
@@ -1056,6 +1055,39 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
1056
1055
y [i ].qs [4 * j + 3 ] = vgetq_lane_s32 (vi , 3 );
1057
1056
}
1058
1057
}
1058
+ #elif defined(__wasm_simd128__ )
1059
+ for (int i = 0 ; i < nb ; i ++ ) {
1060
+ v128_t srcv [8 ];
1061
+ v128_t asrcv [8 ];
1062
+ v128_t amaxv [8 ];
1063
+
1064
+ for (int j = 0 ; j < 8 ; j ++ ) srcv [j ] = wasm_v128_load (x + i * 32 + 4 * j );
1065
+ for (int j = 0 ; j < 8 ; j ++ ) asrcv [j ] = wasm_f32x4_abs (srcv [j ]);
1066
+
1067
+ for (int j = 0 ; j < 4 ; j ++ ) amaxv [2 * j ] = wasm_f32x4_max (asrcv [2 * j ], asrcv [2 * j + 1 ]);
1068
+ for (int j = 0 ; j < 2 ; j ++ ) amaxv [4 * j ] = wasm_f32x4_max (amaxv [4 * j ], amaxv [4 * j + 2 ]);
1069
+ for (int j = 0 ; j < 1 ; j ++ ) amaxv [8 * j ] = wasm_f32x4_max (amaxv [8 * j ], amaxv [8 * j + 4 ]);
1070
+
1071
+ const float amax = MAX (MAX (wasm_f32x4_extract_lane (amaxv [0 ], 0 ),
1072
+ wasm_f32x4_extract_lane (amaxv [0 ], 1 )),
1073
+ MAX (wasm_f32x4_extract_lane (amaxv [0 ], 2 ),
1074
+ wasm_f32x4_extract_lane (amaxv [0 ], 3 )));
1075
+
1076
+ const float d = amax / ((1 << 7 ) - 1 );
1077
+ const float id = d ? 1.0f /d : 0.0f ;
1078
+
1079
+ y [i ].d = GGML_FP32_TO_FP16 (d );
1080
+
1081
+ for (int j = 0 ; j < 8 ; j ++ ) {
1082
+ const v128_t v = wasm_f32x4_mul (srcv [j ], wasm_f32x4_splat (id ));
1083
+ const v128_t vi = wasm_i32x4_trunc_sat_f32x4 (v );
1084
+
1085
+ y [i ].qs [4 * j + 0 ] = wasm_i32x4_extract_lane (vi , 0 );
1086
+ y [i ].qs [4 * j + 1 ] = wasm_i32x4_extract_lane (vi , 1 );
1087
+ y [i ].qs [4 * j + 2 ] = wasm_i32x4_extract_lane (vi , 2 );
1088
+ y [i ].qs [4 * j + 3 ] = wasm_i32x4_extract_lane (vi , 3 );
1089
+ }
1090
+ }
1059
1091
#elif defined(__AVX2__ ) || defined(__AVX__ )
1060
1092
for (int i = 0 ; i < nb ; i ++ ) {
1061
1093
// Load elements into 4 AVX vectors
@@ -1224,6 +1256,48 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int
1224
1256
1225
1257
y [i ].s = d * vaddvq_s32 (accv );
1226
1258
}
1259
+ #elif defined(__wasm_simd128__ )
1260
+ for (int i = 0 ; i < nb ; i ++ ) {
1261
+ v128_t srcv [8 ];
1262
+ v128_t asrcv [8 ];
1263
+ v128_t amaxv [8 ];
1264
+
1265
+ for (int j = 0 ; j < 8 ; j ++ ) srcv [j ] = wasm_v128_load (x + i * 32 + 4 * j );
1266
+ for (int j = 0 ; j < 8 ; j ++ ) asrcv [j ] = wasm_f32x4_abs (srcv [j ]);
1267
+
1268
+ for (int j = 0 ; j < 4 ; j ++ ) amaxv [2 * j ] = wasm_f32x4_max (asrcv [2 * j ], asrcv [2 * j + 1 ]);
1269
+ for (int j = 0 ; j < 2 ; j ++ ) amaxv [4 * j ] = wasm_f32x4_max (amaxv [4 * j ], amaxv [4 * j + 2 ]);
1270
+ for (int j = 0 ; j < 1 ; j ++ ) amaxv [8 * j ] = wasm_f32x4_max (amaxv [8 * j ], amaxv [8 * j + 4 ]);
1271
+
1272
+ const float amax = MAX (MAX (wasm_f32x4_extract_lane (amaxv [0 ], 0 ),
1273
+ wasm_f32x4_extract_lane (amaxv [0 ], 1 )),
1274
+ MAX (wasm_f32x4_extract_lane (amaxv [0 ], 2 ),
1275
+ wasm_f32x4_extract_lane (amaxv [0 ], 3 )));
1276
+
1277
+ const float d = amax / ((1 << 7 ) - 1 );
1278
+ const float id = d ? 1.0f /d : 0.0f ;
1279
+
1280
+ y [i ].d = d ;
1281
+
1282
+ v128_t accv = wasm_i32x4_splat (0 );
1283
+
1284
+ for (int j = 0 ; j < 8 ; j ++ ) {
1285
+ const v128_t v = wasm_f32x4_mul (srcv [j ], wasm_f32x4_splat (id ));
1286
+ const v128_t vi = wasm_i32x4_trunc_sat_f32x4 (v );
1287
+
1288
+ y [i ].qs [4 * j + 0 ] = wasm_i32x4_extract_lane (vi , 0 );
1289
+ y [i ].qs [4 * j + 1 ] = wasm_i32x4_extract_lane (vi , 1 );
1290
+ y [i ].qs [4 * j + 2 ] = wasm_i32x4_extract_lane (vi , 2 );
1291
+ y [i ].qs [4 * j + 3 ] = wasm_i32x4_extract_lane (vi , 3 );
1292
+
1293
+ accv = wasm_i32x4_add (accv , vi );
1294
+ }
1295
+
1296
+ y [i ].s = d * (wasm_i32x4_extract_lane (accv , 0 ) +
1297
+ wasm_i32x4_extract_lane (accv , 1 ) +
1298
+ wasm_i32x4_extract_lane (accv , 2 ) +
1299
+ wasm_i32x4_extract_lane (accv , 3 ));
1300
+ }
1227
1301
#elif defined(__AVX2__ ) || defined(__AVX__ )
1228
1302
for (int i = 0 ; i < nb ; i ++ ) {
1229
1303
// Load elements into 4 AVX vectors
@@ -2598,7 +2672,6 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
2598
2672
const block_q8_0 * restrict y0 = & y [i ];
2599
2673
2600
2674
const v128_t m4b = wasm_i8x16_splat (0x0F );
2601
- const v128_t s16b = wasm_i8x16_splat (0x10 );
2602
2675
2603
2676
// extract the 5th bit
2604
2677
memcpy (& qh , x0 -> qh , sizeof (qh ));
@@ -2636,15 +2709,14 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
2636
2709
const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h );
2637
2710
const v128_t v1hh = wasm_i16x8_extend_high_i8x16 (v1h );
2638
2711
2639
- const float x0d = GGML_FP16_TO_FP32 (x0 -> d );
2640
-
2641
2712
// dot product
2642
2713
sumv = wasm_f32x4_add (sumv , wasm_f32x4_mul (wasm_f32x4_convert_i32x4 (
2643
2714
wasm_i32x4_add (
2644
2715
wasm_i32x4_add (wasm_i32x4_dot_i16x8 (v0lfl , v1ll ),
2645
2716
wasm_i32x4_dot_i16x8 (v0lfh , v1lh )),
2646
2717
wasm_i32x4_add (wasm_i32x4_dot_i16x8 (v0hfl , v1hl ),
2647
- wasm_i32x4_dot_i16x8 (v0hfh , v1hh )))), wasm_f32x4_splat (x0d * y0 -> d )));
2718
+ wasm_i32x4_dot_i16x8 (v0hfh , v1hh )))),
2719
+ wasm_f32x4_splat (GGML_FP16_TO_FP32 (x0 -> d ) * GGML_FP16_TO_FP32 (y0 -> d ))));
2648
2720
}
2649
2721
2650
2722
* s = wasm_f32x4_extract_lane (sumv , 0 ) + wasm_f32x4_extract_lane (sumv , 1 ) +
@@ -2868,8 +2940,6 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
2868
2940
const v128_t v0l = wasm_v128_and (v0 , m4b );
2869
2941
const v128_t v0h = wasm_u8x16_shr (v0 , 4 );
2870
2942
2871
- static bool x = true;
2872
-
2873
2943
// add high bit
2874
2944
const v128_t v0lf = wasm_v128_or (v0l , qhl );
2875
2945
const v128_t v0hf = wasm_v128_or (v0h , qhh );
@@ -2892,11 +2962,11 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
2892
2962
// dot product
2893
2963
sumv = wasm_f32x4_add (sumv ,
2894
2964
wasm_f32x4_mul (wasm_f32x4_convert_i32x4 (wasm_i32x4_add (
2895
- wasm_i32x4_add (wasm_i32x4_dot_i16x8 (v0lfl , v1ll ),
2896
- wasm_i32x4_dot_i16x8 (v0lfh , v1lh )),
2897
- wasm_i32x4_add (wasm_i32x4_dot_i16x8 (v0hfl , v1hl ),
2898
- wasm_i32x4_dot_i16x8 (v0hfh , v1hh )))),
2899
- wasm_f32x4_splat (GGML_FP16_TO_FP32 (x0 -> d ) * y0 -> d ));
2965
+ wasm_i32x4_add (wasm_i32x4_dot_i16x8 (v0lfl , v1ll ),
2966
+ wasm_i32x4_dot_i16x8 (v0lfh , v1lh )),
2967
+ wasm_i32x4_add (wasm_i32x4_dot_i16x8 (v0hfl , v1hl ),
2968
+ wasm_i32x4_dot_i16x8 (v0hfh , v1hh )))),
2969
+ wasm_f32x4_splat (GGML_FP16_TO_FP32 (x0 -> d ) * y0 -> d ))) ;
2900
2970
}
2901
2971
2902
2972
* s = wasm_f32x4_extract_lane (sumv , 0 ) + wasm_f32x4_extract_lane (sumv , 1 ) +
0 commit comments