Skip to content

Commit 429b978

Browse files
committed
ggml : update WASM SIMD
1 parent e410cfc commit 429b978

File tree

2 files changed

+86
-16
lines changed

2 files changed

+86
-16
lines changed

bindings/javascript/whisper.js

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

ggml.c

Lines changed: 85 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -740,19 +740,19 @@ inline static float vaddvq_f32(float32x4_t v) {
740740
return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
741741
}
742742

743-
float vminvq_f32(float32x4_t v) {
743+
inline static float vminvq_f32(float32x4_t v) {
744744
return
745745
MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
746746
MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
747747
}
748748

749-
float vmaxvq_f32(float32x4_t v) {
749+
inline static float vmaxvq_f32(float32x4_t v) {
750750
return
751751
MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
752752
MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
753753
}
754754

755-
int32x4_t vcvtnq_s32_f32(float32x4_t v) {
755+
inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
756756
int32x4_t res;
757757

758758
res[0] = roundf(vgetq_lane_f32(v, 0));
@@ -766,7 +766,6 @@ int32x4_t vcvtnq_s32_f32(float32x4_t v) {
766766
#endif
767767
#endif
768768

769-
770769
#define QK4_0 32
771770
typedef struct {
772771
ggml_fp16_t d; // delta
@@ -1056,6 +1055,39 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
10561055
y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
10571056
}
10581057
}
1058+
#elif defined(__wasm_simd128__)
1059+
for (int i = 0; i < nb; i++) {
1060+
v128_t srcv [8];
1061+
v128_t asrcv[8];
1062+
v128_t amaxv[8];
1063+
1064+
for (int j = 0; j < 8; j++) srcv[j] = wasm_v128_load(x + i*32 + 4*j);
1065+
for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]);
1066+
1067+
for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]);
1068+
for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]);
1069+
for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]);
1070+
1071+
const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0),
1072+
wasm_f32x4_extract_lane(amaxv[0], 1)),
1073+
MAX(wasm_f32x4_extract_lane(amaxv[0], 2),
1074+
wasm_f32x4_extract_lane(amaxv[0], 3)));
1075+
1076+
const float d = amax / ((1 << 7) - 1);
1077+
const float id = d ? 1.0f/d : 0.0f;
1078+
1079+
y[i].d = GGML_FP32_TO_FP16(d);
1080+
1081+
for (int j = 0; j < 8; j++) {
1082+
const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id));
1083+
const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v);
1084+
1085+
y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0);
1086+
y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1);
1087+
y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2);
1088+
y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3);
1089+
}
1090+
}
10591091
#elif defined(__AVX2__) || defined(__AVX__)
10601092
for (int i = 0; i < nb; i++) {
10611093
// Load elements into 4 AVX vectors
@@ -1224,6 +1256,48 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int
12241256

12251257
y[i].s = d * vaddvq_s32(accv);
12261258
}
1259+
#elif defined(__wasm_simd128__)
1260+
for (int i = 0; i < nb; i++) {
1261+
v128_t srcv [8];
1262+
v128_t asrcv[8];
1263+
v128_t amaxv[8];
1264+
1265+
for (int j = 0; j < 8; j++) srcv[j] = wasm_v128_load(x + i*32 + 4*j);
1266+
for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]);
1267+
1268+
for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]);
1269+
for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]);
1270+
for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]);
1271+
1272+
const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0),
1273+
wasm_f32x4_extract_lane(amaxv[0], 1)),
1274+
MAX(wasm_f32x4_extract_lane(amaxv[0], 2),
1275+
wasm_f32x4_extract_lane(amaxv[0], 3)));
1276+
1277+
const float d = amax / ((1 << 7) - 1);
1278+
const float id = d ? 1.0f/d : 0.0f;
1279+
1280+
y[i].d = d;
1281+
1282+
v128_t accv = wasm_i32x4_splat(0);
1283+
1284+
for (int j = 0; j < 8; j++) {
1285+
const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id));
1286+
const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v);
1287+
1288+
y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0);
1289+
y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1);
1290+
y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2);
1291+
y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3);
1292+
1293+
accv = wasm_i32x4_add(accv, vi);
1294+
}
1295+
1296+
y[i].s = d * (wasm_i32x4_extract_lane(accv, 0) +
1297+
wasm_i32x4_extract_lane(accv, 1) +
1298+
wasm_i32x4_extract_lane(accv, 2) +
1299+
wasm_i32x4_extract_lane(accv, 3));
1300+
}
12271301
#elif defined(__AVX2__) || defined(__AVX__)
12281302
for (int i = 0; i < nb; i++) {
12291303
// Load elements into 4 AVX vectors
@@ -2598,7 +2672,6 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
25982672
const block_q8_0 * restrict y0 = &y[i];
25992673

26002674
const v128_t m4b = wasm_i8x16_splat(0x0F);
2601-
const v128_t s16b = wasm_i8x16_splat(0x10);
26022675

26032676
// extract the 5th bit
26042677
memcpy(&qh, x0->qh, sizeof(qh));
@@ -2636,15 +2709,14 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
26362709
const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
26372710
const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
26382711

2639-
const float x0d = GGML_FP16_TO_FP32(x0->d);
2640-
26412712
// dot product
26422713
sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
26432714
wasm_i32x4_add(
26442715
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
26452716
wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
26462717
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
2647-
wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d)));
2718+
wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
2719+
wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d))));
26482720
}
26492721

26502722
*s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
@@ -2868,8 +2940,6 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
28682940
const v128_t v0l = wasm_v128_and (v0, m4b);
28692941
const v128_t v0h = wasm_u8x16_shr(v0, 4);
28702942

2871-
static bool x = true;
2872-
28732943
// add high bit
28742944
const v128_t v0lf = wasm_v128_or(v0l, qhl);
28752945
const v128_t v0hf = wasm_v128_or(v0h, qhh);
@@ -2892,11 +2962,11 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
28922962
// dot product
28932963
sumv = wasm_f32x4_add(sumv,
28942964
wasm_f32x4_mul(wasm_f32x4_convert_i32x4(wasm_i32x4_add(
2895-
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
2896-
wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
2897-
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
2898-
wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
2899-
wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * y0->d));
2965+
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
2966+
wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
2967+
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
2968+
wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
2969+
wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * y0->d)));
29002970
}
29012971

29022972
*s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +

0 commit comments

Comments
 (0)