 #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products

+#if defined(GGML_USE_HIPBLAS)
+#define __CUDA_ARCH__ 1300
+
+typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
+static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
+    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
+    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+    const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
+    return reinterpret_cast<const int&>(c);
+}
+
+static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
+#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
+    c = __builtin_amdgcn_sdot4(a, b, c, false);
+#else
+    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
+    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+    c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
+#endif
+    return c;
+}
+#endif
+
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
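
The block above back-fills two CUDA device intrinsics that the HIP toolchain does not provide: __vsubss4 (per-byte subtraction with signed saturation, built on __builtin_elementwise_sub_sat) and __dp4a (byte-wise dot product with accumulate). On AMD GPUs that expose the sdot4 instruction (gfx906, gfx908, gfx90a, gfx1030) the shim maps __dp4a to __builtin_amdgcn_sdot4; elsewhere it falls back to explicit element-wise multiplies. Defining __CUDA_ARCH__ as 1300 presumably lets the existing compute-capability guards (such as MIN_CC_DP4A above) select the fast integer paths when compiling for HIP. As a rough reference, the fallback branch computes the same thing as this scalar host-side sketch (hypothetical helper, not part of the patch):

    #include <stdint.h>

    // Reference semantics of __dp4a: treat each 32-bit int as four signed
    // bytes, multiply pairwise, and add the four products to the accumulator.
    static int dp4a_reference(const int a, const int b, int c) {
        const int8_t * va = (const int8_t *) &a;
        const int8_t * vb = (const int8_t *) &b;
        for (int i = 0; i < 4; ++i) {
            c += (int) va[i] * (int) vb[i];
        }
        return c;
    }
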
@@ -1396,8 +1419,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
         return;
     }

-    y[ib].ds.x = d;
-    y[ib].ds.y = sum;
+    reinterpret_cast<half&>(y[ib].ds.x) = d;
+    reinterpret_cast<half&>(y[ib].ds.y) = sum;
 }

 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
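
In quantize_q8_1 the per-block scale d and the per-block sum are stored in the half2 field ds of block_q8_1. The direct component assignments are replaced by half-typed stores through reinterpret_cast, presumably because the .x/.y members of half2 are not assignable from float in the same way under ROCm's headers, while the memory layout of the block must stay identical. Under that assumption, the same packing could also be written with the standard fp16 conversion intrinsics, roughly as follows (a sketch with a hypothetical helper, not what the patch does):

    // Hypothetical alternative: build the half2 in one step from the two
    // float scalars instead of writing its components individually.
    static __device__ __forceinline__ half2 pack_ds(const float d, const float sum) {
        return __halves2half2(__float2half(d), __float2half(sum));
    }
    // usage: y[ib].ds = pack_ds(d, sum);
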
@@ -1609,8 +1632,8 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
 #else
     const float2 dm8f = __half22float2(dm8);
     const float2 ds8f = __half22float2(ds8);
-    const float d8d8 = dm8.x * ds8.x;
-    const float m8s8 = dm8.y * ds8.y;
+    const float d8d8 = __low2float(dm8) * __low2float(ds8);
+    const float m8s8 = __high2float(dm8) * __high2float(ds8);
 #endif // GGML_CUDA_F16

     // scale second part of sum by QI8_1/vdr to compensate for multiple threads adding it
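
This hunk and those that follow apply the same idea on the read side: component accesses like ds.x and ds.y on half2 values are replaced with the __low2half, __low2float and __high2float intrinsics, which are available under both CUDA and HIP and do not depend on how the vendor header spells the half2 members. On CUDA the two spellings should agree; the hypothetical helper below (illustration only, not in the patch) shows the intended equivalence:

    // Extract both halves of a block's ds field as floats via intrinsics.
    // __low2float(ds)  corresponds to reading ds.x,
    // __high2float(ds) corresponds to reading ds.y.
    static __device__ __forceinline__ float2 ds_as_float2(const half2 ds) {
        return make_float2(__low2float(ds), __high2float(ds));
    }
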
@@ -2380,7 +2403,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
         u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
     }

-    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, bq8_1->ds.x);
+    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, __low2half(bq8_1->ds));
 }

 static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
@@ -2478,7 +2501,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
 #pragma unroll
     for (int i = 0; i < QR2_K; ++i) {
         u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
-        d8[i] = bq8_1[bq8_offset + i].ds.x;
+        d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
     }

     return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
@@ -2605,7 +2628,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
 #pragma unroll
     for (int i = 0; i < QR3_K; ++i) {
         u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
-        d8[i] = bq8_1[bq8_offset + i].ds.x;
+        d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
     }

     return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
@@ -2782,7 +2805,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(

     for (int i = 0; i < QR4_K; ++i) {
         const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-        d8[i] = bq8i->ds.x;
+        d8[i] = __low2half(bq8i->ds);

         const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
         u[2*i+0] = q8[0];
@@ -2809,8 +2832,8 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
     const float dall = bq4_K->d[0];
     const float dmin = bq4_K->d[1];

-    const float d8_1 = bq8_1[0].ds.x;
-    const float d8_2 = bq8_1[1].ds.x;
+    const float d8_1 = __low2float(bq8_1[0].ds);
+    const float d8_2 = __low2float(bq8_1[1].ds);

     const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
     const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
@@ -2977,7 +3000,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
 #pragma unroll
     for (int i = 0; i < QR5_K; ++i) {
         const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-        d8[i] = bq8i->ds.x;
+        d8[i] = __low2float(bq8i->ds);

         const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
         u[2*i+0] = q8[0];
@@ -2995,8 +3018,8 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(

     const float d = bq5_K->d;

-    const float d8_1 = bq8_1[0].ds.x;
-    const float d8_2 = bq8_1[1].ds.x;
+    const float d8_1 = __low2half(bq8_1[0].ds);
+    const float d8_2 = __low2half(bq8_1[1].ds);

     const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
     const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
@@ -3157,7 +3180,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
 #pragma unroll
     for (int i = 0; i < QR6_K; ++i) {
         u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
-        d8[i] = bq8_1[bq8_offset + 2*i].ds.x;
+        d8[i] = __low2half(bq8_1[bq8_offset + 2*i].ds);
     }

     return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
@@ -3336,7 +3359,7 @@ static __global__ void mul_mat_q(
                 *dsi_dst = *dsi_src;
             } else {
                 float * dfi_dst = (float *) dsi_dst;
-                *dfi_dst = (*dsi_src).x;
+                *dfi_dst = __low2half(*dsi_src);
             }
         }