@@ -119,6 +119,56 @@ pub unsafe fn _mm256_shuffle_pd(a: f64x4, b: f64x4, imm8: i32) -> f64x4 {
119
119
}
120
120
}
121
121
122
+ /// Shuffle single-precision (32-bit) floating-point elements in `a` within
123
+ /// 128-bit lanes using the control in `imm8`.
124
+ #[ inline( always) ]
125
+ #[ target_feature = "+avx" ]
126
+ #[ cfg_attr( test, assert_instr( vshufps, imm8 = 0x0 ) ) ]
127
+ pub unsafe fn _mm256_shuffle_ps ( a : f32x8 , b : f32x8 , imm8 : i32 ) -> f32x8 {
128
+ let imm8 = ( imm8 & 0xFF ) as u8 ;
129
+ macro_rules! shuffle4 {
130
+ ( $a: expr, $b: expr, $c: expr, $d: expr, $e: expr, $f: expr, $g: expr, $h: expr) => {
131
+ simd_shuffle8( a, b, [ $a, $b, $c, $d, $e, $f, $g, $h] ) ;
132
+ }
133
+ }
134
+ macro_rules! shuffle3 {
135
+ ( $a: expr, $b: expr, $c: expr, $e: expr, $f: expr, $g: expr) => {
136
+ match ( imm8 >> 6 ) & 0x3 {
137
+ 0 => shuffle4!( $a, $b, $c, 8 , $e, $f, $g, 12 ) ,
138
+ 1 => shuffle4!( $a, $b, $c, 9 , $e, $f, $g, 13 ) ,
139
+ 2 => shuffle4!( $a, $b, $c, 10 , $e, $f, $g, 14 ) ,
140
+ _ => shuffle4!( $a, $b, $c, 11 , $e, $f, $g, 15 ) ,
141
+ }
142
+ }
143
+ }
144
+ macro_rules! shuffle2 {
145
+ ( $a: expr, $b: expr, $e: expr, $f: expr) => {
146
+ match ( imm8 >> 4 ) & 0x3 {
147
+ 0 => shuffle3!( $a, $b, 8 , $e, $f, 12 ) ,
148
+ 1 => shuffle3!( $a, $b, 9 , $e, $f, 13 ) ,
149
+ 2 => shuffle3!( $a, $b, 10 , $e, $f, 14 ) ,
150
+ _ => shuffle3!( $a, $b, 11 , $e, $f, 15 ) ,
151
+ }
152
+ }
153
+ }
154
+ macro_rules! shuffle1 {
155
+ ( $a: expr, $e: expr) => {
156
+ match ( imm8 >> 2 ) & 0x3 {
157
+ 0 => shuffle2!( $a, 0 , $e, 4 ) ,
158
+ 1 => shuffle2!( $a, 1 , $e, 5 ) ,
159
+ 2 => shuffle2!( $a, 2 , $e, 6 ) ,
160
+ _ => shuffle2!( $a, 3 , $e, 7 ) ,
161
+ }
162
+ }
163
+ }
164
+ match ( imm8 >> 0 ) & 0x3 {
165
+ 0 => shuffle1 ! ( 0 , 4 ) ,
166
+ 1 => shuffle1 ! ( 1 , 5 ) ,
167
+ 2 => shuffle1 ! ( 2 , 6 ) ,
168
+ _ => shuffle1 ! ( 3 , 7 ) ,
169
+ }
170
+ }
171
+
122
172
/// Compute the bitwise NOT of packed double-precision (64-bit) floating-point
123
173
/// elements in `a`
124
174
/// and then AND with `b`.
@@ -393,6 +443,56 @@ pub unsafe fn _mm256_blend_pd(a: f64x4, b: f64x4, imm8: i32) -> f64x4 {
393
443
}
394
444
}
395
445
446
+ /// Blend packed single-precision (32-bit) floating-point elements from
447
+ /// `a` and `b` using control mask `imm8`.
448
+ #[ inline( always) ]
449
+ #[ target_feature = "+avx" ]
450
+ #[ cfg_attr( test, assert_instr( vblendps, imm8 = 9 ) ) ]
451
+ pub unsafe fn _mm256_blend_ps ( a : f32x8 , b : f32x8 , imm8 : i32 ) -> f32x8 {
452
+ let imm8 = ( imm8 & 0xFF ) as u8 ;
453
+ macro_rules! blend4 {
454
+ ( $a: expr, $b: expr, $c: expr, $d: expr, $e: expr, $f: expr, $g: expr, $h: expr) => {
455
+ simd_shuffle8( a, b, [ $a, $b, $c, $d, $e, $f, $g, $h] ) ;
456
+ }
457
+ }
458
+ macro_rules! blend3 {
459
+ ( $a: expr, $b: expr, $c: expr, $d: expr, $e: expr, $f: expr) => {
460
+ match ( imm8 >> 6 ) & 0b11 {
461
+ 0b00 => blend4!( $a, $b, $c, $d, $e, $f, 6 , 7 ) ,
462
+ 0b01 => blend4!( $a, $b, $c, $d, $e, $f, 14 , 7 ) ,
463
+ 0b10 => blend4!( $a, $b, $c, $d, $e, $f, 6 , 15 ) ,
464
+ _ => blend4!( $a, $b, $c, $d, $e, $f, 14 , 15 ) ,
465
+ }
466
+ }
467
+ }
468
+ macro_rules! blend2 {
469
+ ( $a: expr, $b: expr, $c: expr, $d: expr) => {
470
+ match ( imm8 >> 4 ) & 0b11 {
471
+ 0b00 => blend3!( $a, $b, $c, $d, 4 , 5 ) ,
472
+ 0b01 => blend3!( $a, $b, $c, $d, 12 , 5 ) ,
473
+ 0b10 => blend3!( $a, $b, $c, $d, 4 , 13 ) ,
474
+ _ => blend3!( $a, $b, $c, $d, 12 , 13 ) ,
475
+ }
476
+ }
477
+ }
478
+ macro_rules! blend1 {
479
+ ( $a: expr, $b: expr) => {
480
+ match ( imm8 >> 2 ) & 0b11 {
481
+ 0b00 => blend2!( $a, $b, 2 , 3 ) ,
482
+ 0b01 => blend2!( $a, $b, 10 , 3 ) ,
483
+ 0b10 => blend2!( $a, $b, 2 , 11 ) ,
484
+ _ => blend2!( $a, $b, 10 , 11 ) ,
485
+ }
486
+ }
487
+ }
488
+ match imm8 & 0b11 {
489
+ 0b00 => blend1 ! ( 0 , 1 ) ,
490
+ 0b01 => blend1 ! ( 8 , 1 ) ,
491
+ 0b10 => blend1 ! ( 0 , 9 ) ,
492
+ _ => blend1 ! ( 8 , 9 ) ,
493
+ }
494
+ }
495
+
396
496
/// Blend packed double-precision (64-bit) floating-point elements from
397
497
/// `a` and `b` using `c` as a mask.
398
498
#[ inline( always) ]
@@ -1437,6 +1537,18 @@ pub unsafe fn _mm256_testc_si256(a: i64x4, b: i64x4) -> i32 {
1437
1537
ptestc256 ( a, b)
1438
1538
}
1439
1539
1540
+ /// Compute the bitwise AND of 256 bits (representing integer data) in `a` and
1541
+ /// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0.
1542
+ /// Compute the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if
1543
+ /// the result is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and
1544
+ /// `CF` values are zero, otherwise return 0.
1545
+ #[ inline( always) ]
1546
+ #[ target_feature = "+avx" ]
1547
+ #[ cfg_attr( test, assert_instr( vptest) ) ]
1548
+ pub unsafe fn _mm256_testnzc_si256 ( a : i64x4 , b : i64x4 ) -> i32 {
1549
+ ptestnzc256 ( a, b)
1550
+ }
1551
+
1440
1552
/// Compute the bitwise AND of 256 bits (representing double-precision (64-bit)
1441
1553
/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
1442
1554
/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
@@ -2272,6 +2384,8 @@ extern "C" {
2272
2384
fn ptestz256 ( a : i64x4 , b : i64x4 ) -> i32 ;
2273
2385
#[ link_name = "llvm.x86.avx.ptestc.256" ]
2274
2386
fn ptestc256 ( a : i64x4 , b : i64x4 ) -> i32 ;
2387
+ #[ link_name = "llvm.x86.avx.ptestnzc.256" ]
2388
+ fn ptestnzc256 ( a : i64x4 , b : i64x4 ) -> i32 ;
2275
2389
#[ link_name = "llvm.x86.avx.vtestz.pd.256" ]
2276
2390
fn vtestzpd256 ( a : f64x4 , b : f64x4 ) -> i32 ;
2277
2391
#[ link_name = "llvm.x86.avx.vtestc.pd.256" ]
@@ -2375,6 +2489,15 @@ mod tests {
2375
2489
assert_eq ! ( r, e) ;
2376
2490
}
2377
2491
2492
+ #[ simd_test = "avx" ]
2493
+ unsafe fn _mm256_shuffle_ps ( ) {
2494
+ let a = f32x8:: new ( 1. , 4. , 5. , 8. , 9. , 12. , 13. , 16. ) ;
2495
+ let b = f32x8:: new ( 2. , 3. , 6. , 7. , 10. , 11. , 14. , 15. ) ;
2496
+ let r = avx:: _mm256_shuffle_ps ( a, b, 0x0F ) ;
2497
+ let e = f32x8:: new ( 8. , 8. , 2. , 2. , 16. , 16. , 10. , 10. ) ;
2498
+ assert_eq ! ( r, e) ;
2499
+ }
2500
+
2378
2501
#[ simd_test = "avx" ]
2379
2502
unsafe fn _mm256_andnot_pd ( ) {
2380
2503
let a = f64x4:: splat ( 0. ) ;
@@ -2421,7 +2544,7 @@ mod tests {
2421
2544
#[ simd_test = "avx" ]
2422
2545
unsafe fn _mm256_min_ps ( ) {
2423
2546
let a = f32x8:: new ( 1. , 4. , 5. , 8. , 9. , 12. , 13. , 16. ) ;
2424
- let b = f32x8:: new ( 2. , 3. , 6. , 7. , 10.0 , 11. , 14. , 15. ) ;
2547
+ let b = f32x8:: new ( 2. , 3. , 6. , 7. , 10. , 11. , 14. , 15. ) ;
2425
2548
let r = avx:: _mm256_min_ps ( a, b) ;
2426
2549
let e = f32x8:: new ( 1. , 3. , 5. , 7. , 9. , 11. , 13. , 15. ) ;
2427
2550
assert_eq ! ( r, e) ;
@@ -2439,9 +2562,9 @@ mod tests {
2439
2562
#[ simd_test = "avx" ]
2440
2563
unsafe fn _mm256_mul_ps ( ) {
2441
2564
let a = f32x8:: new ( 1. , 2. , 3. , 4. , 5. , 6. , 7. , 8. ) ;
2442
- let b = f32x8:: new ( 9. , 10.0 , 11. , 12. , 13. , 14. , 15. , 16. ) ;
2565
+ let b = f32x8:: new ( 9. , 10. , 11. , 12. , 13. , 14. , 15. , 16. ) ;
2443
2566
let r = avx:: _mm256_mul_ps ( a, b) ;
2444
- let e = f32x8:: new ( 9. , 20.0 , 33. , 48. , 65. , 84. , 105. , 128. ) ;
2567
+ let e = f32x8:: new ( 9. , 20. , 33. , 48. , 65. , 84. , 105. , 128. ) ;
2445
2568
assert_eq ! ( r, e) ;
2446
2569
}
2447
2570
@@ -2560,7 +2683,7 @@ mod tests {
2560
2683
#[ simd_test = "avx" ]
2561
2684
unsafe fn _mm256_div_ps ( ) {
2562
2685
let a = f32x8:: new ( 4. , 9. , 16. , 25. , 4. , 9. , 16. , 25. ) ;
2563
- let b = f32x8:: new ( 4. , 3. , 2. , 5. , 8. , 9. , 64. , 50.0 ) ;
2686
+ let b = f32x8:: new ( 4. , 3. , 2. , 5. , 8. , 9. , 64. , 50. ) ;
2564
2687
let r = avx:: _mm256_div_ps ( a, b) ;
2565
2688
let e = f32x8:: new ( 1. , 3. , 8. , 5. , 0.5 , 1. , 0.25 , 0.5 ) ;
2566
2689
assert_eq ! ( r, e) ;
@@ -2587,6 +2710,18 @@ mod tests {
2587
2710
assert_eq ! ( r, f64x4:: new( 4. , 3. , 2. , 5. ) ) ;
2588
2711
}
2589
2712
2713
+ #[ simd_test = "avx" ]
2714
+ unsafe fn _mm256_blend_ps ( ) {
2715
+ let a = f32x8:: new ( 1. , 4. , 5. , 8. , 9. , 12. , 13. , 16. ) ;
2716
+ let b = f32x8:: new ( 2. , 3. , 6. , 7. , 10. , 11. , 14. , 15. ) ;
2717
+ let r = avx:: _mm256_blend_ps ( a, b, 0x0 ) ;
2718
+ assert_eq ! ( r, f32x8:: new( 1. , 4. , 5. , 8. , 9. , 12. , 13. , 16. ) ) ;
2719
+ let r = avx:: _mm256_blend_ps ( a, b, 0x3 ) ;
2720
+ assert_eq ! ( r, f32x8:: new( 2. , 3. , 5. , 8. , 9. , 12. , 13. , 16. ) ) ;
2721
+ let r = avx:: _mm256_blend_ps ( a, b, 0xF ) ;
2722
+ assert_eq ! ( r, f32x8:: new( 2. , 3. , 6. , 7. , 9. , 12. , 13. , 16. ) ) ;
2723
+ }
2724
+
2590
2725
#[ simd_test = "avx" ]
2591
2726
unsafe fn _mm256_blendv_pd ( ) {
2592
2727
let a = f64x4:: new ( 4. , 9. , 16. , 25. ) ;
@@ -2600,23 +2735,23 @@ mod tests {
2600
2735
#[ simd_test = "avx" ]
2601
2736
unsafe fn _mm256_blendv_ps ( ) {
2602
2737
let a = f32x8:: new ( 4. , 9. , 16. , 25. , 4. , 9. , 16. , 25. ) ;
2603
- let b = f32x8:: new ( 4. , 3. , 2. , 5. , 8. , 9. , 64. , 50.0 ) ;
2738
+ let b = f32x8:: new ( 4. , 3. , 2. , 5. , 8. , 9. , 64. , 50. ) ;
2604
2739
#[ cfg_attr( rustfmt, rustfmt_skip) ]
2605
2740
let c = f32x8:: new (
2606
2741
0. , 0. , 0. , 0. , !0 as f32 , !0 as f32 , !0 as f32 , !0 as f32 ,
2607
2742
) ;
2608
2743
let r = avx:: _mm256_blendv_ps ( a, b, c) ;
2609
- let e = f32x8:: new ( 4. , 9. , 16. , 25. , 8. , 9. , 64. , 50.0 ) ;
2744
+ let e = f32x8:: new ( 4. , 9. , 16. , 25. , 8. , 9. , 64. , 50. ) ;
2610
2745
assert_eq ! ( r, e) ;
2611
2746
}
2612
2747
2613
2748
#[ simd_test = "avx" ]
2614
2749
unsafe fn _mm256_dp_ps ( ) {
2615
2750
let a = f32x8:: new ( 4. , 9. , 16. , 25. , 4. , 9. , 16. , 25. ) ;
2616
- let b = f32x8:: new ( 4. , 3. , 2. , 5. , 8. , 9. , 64. , 50.0 ) ;
2751
+ let b = f32x8:: new ( 4. , 3. , 2. , 5. , 8. , 9. , 64. , 50. ) ;
2617
2752
let r = avx:: _mm256_dp_ps ( a, b, 0xFF ) ;
2618
2753
let e =
2619
- f32x8:: new ( 200.0 , 200.0 , 200.0 , 200.0 , 2387. , 2387. , 2387. , 2387. ) ;
2754
+ f32x8:: new ( 200. , 200. , 200. , 200. , 2387. , 2387. , 2387. , 2387. ) ;
2620
2755
assert_eq ! ( r, e) ;
2621
2756
}
2622
2757
@@ -2638,7 +2773,7 @@ mod tests {
2638
2773
#[ simd_test = "avx" ]
2639
2774
unsafe fn _mm256_hadd_ps ( ) {
2640
2775
let a = f32x8:: new ( 4. , 9. , 16. , 25. , 4. , 9. , 16. , 25. ) ;
2641
- let b = f32x8:: new ( 4. , 3. , 2. , 5. , 8. , 9. , 64. , 50.0 ) ;
2776
+ let b = f32x8:: new ( 4. , 3. , 2. , 5. , 8. , 9. , 64. , 50. ) ;
2642
2777
let r = avx:: _mm256_hadd_ps ( a, b) ;
2643
2778
let e = f32x8:: new ( 13. , 41. , 7. , 7. , 13. , 41. , 17. , 114. ) ;
2644
2779
assert_eq ! ( r, e) ;
@@ -2668,7 +2803,7 @@ mod tests {
2668
2803
#[ simd_test = "avx" ]
2669
2804
unsafe fn _mm256_hsub_ps ( ) {
2670
2805
let a = f32x8:: new ( 4. , 9. , 16. , 25. , 4. , 9. , 16. , 25. ) ;
2671
- let b = f32x8:: new ( 4. , 3. , 2. , 5. , 8. , 9. , 64. , 50.0 ) ;
2806
+ let b = f32x8:: new ( 4. , 3. , 2. , 5. , 8. , 9. , 64. , 50. ) ;
2672
2807
let r = avx:: _mm256_hsub_ps ( a, b) ;
2673
2808
let e = f32x8:: new ( -5. , -9. , 1. , -3. , -5. , -9. , -1. , 14. ) ;
2674
2809
assert_eq ! ( r, e) ;
@@ -2821,7 +2956,7 @@ mod tests {
2821
2956
2822
2957
#[ simd_test = "avx" ]
2823
2958
unsafe fn _mm256_extractf128_ps ( ) {
2824
- let a = f32x8:: new ( 4. , 3. , 2. , 5. , 8. , 9. , 64. , 50.0 ) ;
2959
+ let a = f32x8:: new ( 4. , 3. , 2. , 5. , 8. , 9. , 64. , 50. ) ;
2825
2960
let r = avx:: _mm256_extractf128_ps ( a, 0 ) ;
2826
2961
let e = f32x4:: new ( 4. , 3. , 2. , 5. ) ;
2827
2962
assert_eq ! ( r, e) ;
@@ -2890,10 +3025,10 @@ mod tests {
2890
3025
2891
3026
#[ simd_test = "avx" ]
2892
3027
unsafe fn _mm256_permutevar_ps ( ) {
2893
- let a = f32x8:: new ( 4. , 3. , 2. , 5. , 8. , 9. , 64. , 50.0 ) ;
3028
+ let a = f32x8:: new ( 4. , 3. , 2. , 5. , 8. , 9. , 64. , 50. ) ;
2894
3029
let b = i32x8:: new ( 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 ) ;
2895
3030
let r = avx:: _mm256_permutevar_ps ( a, b) ;
2896
- let e = f32x8:: new ( 3. , 2. , 5. , 4. , 9. , 64. , 50.0 , 8. ) ;
3031
+ let e = f32x8:: new ( 3. , 2. , 5. , 4. , 9. , 64. , 50. , 8. ) ;
2897
3032
assert_eq ! ( r, e) ;
2898
3033
}
2899
3034
@@ -2908,9 +3043,9 @@ mod tests {
2908
3043
2909
3044
#[ simd_test = "avx" ]
2910
3045
unsafe fn _mm256_permute_ps ( ) {
2911
- let a = f32x8:: new ( 4. , 3. , 2. , 5. , 8. , 9. , 64. , 50.0 ) ;
3046
+ let a = f32x8:: new ( 4. , 3. , 2. , 5. , 8. , 9. , 64. , 50. ) ;
2912
3047
let r = avx:: _mm256_permute_ps ( a, 0x1b ) ;
2913
- let e = f32x8:: new ( 5. , 2. , 3. , 4. , 50.0 , 64. , 9. , 8. ) ;
3048
+ let e = f32x8:: new ( 5. , 2. , 3. , 4. , 50. , 64. , 9. , 8. ) ;
2914
3049
assert_eq ! ( r, e) ;
2915
3050
}
2916
3051
@@ -3022,10 +3157,10 @@ mod tests {
3022
3157
3023
3158
#[ simd_test = "avx" ]
3024
3159
unsafe fn _mm256_insertf128_ps ( ) {
3025
- let a = f32x8:: new ( 4. , 3. , 2. , 5. , 8. , 9. , 64. , 50.0 ) ;
3160
+ let a = f32x8:: new ( 4. , 3. , 2. , 5. , 8. , 9. , 64. , 50. ) ;
3026
3161
let b = f32x4:: new ( 4. , 9. , 16. , 25. ) ;
3027
3162
let r = avx:: _mm256_insertf128_ps ( a, b, 0 ) ;
3028
- let e = f32x8:: new ( 4. , 9. , 16. , 25. , 8. , 9. , 64. , 50.0 ) ;
3163
+ let e = f32x8:: new ( 4. , 9. , 16. , 25. , 8. , 9. , 64. , 50. ) ;
3029
3164
assert_eq ! ( r, e) ;
3030
3165
}
3031
3166
@@ -3112,10 +3247,10 @@ mod tests {
3112
3247
3113
3248
#[ simd_test = "avx" ]
3114
3249
unsafe fn _mm256_loadu_ps ( ) {
3115
- let a = & [ 4. , 3. , 2. , 5. , 8. , 9. , 64. , 50.0 ] ;
3250
+ let a = & [ 4. , 3. , 2. , 5. , 8. , 9. , 64. , 50. ] ;
3116
3251
let p = a. as_ptr ( ) ;
3117
3252
let r = avx:: _mm256_loadu_ps ( black_box ( p) ) ;
3118
- let e = f32x8:: new ( 4. , 3. , 2. , 5. , 8. , 9. , 64. , 50.0 ) ;
3253
+ let e = f32x8:: new ( 4. , 3. , 2. , 5. , 8. , 9. , 64. , 50. ) ;
3119
3254
assert_eq ! ( r, e) ;
3120
3255
}
3121
3256
@@ -3357,6 +3492,18 @@ mod tests {
3357
3492
assert_eq ! ( r, 1 ) ;
3358
3493
}
3359
3494
3495
+ #[ simd_test = "avx" ]
3496
+ unsafe fn _mm256_testnzc_si256 ( ) {
3497
+ let a = i64x4:: new ( 1 , 2 , 3 , 4 ) ;
3498
+ let b = i64x4:: new ( 5 , 6 , 7 , 8 ) ;
3499
+ let r = avx:: _mm256_testnzc_si256 ( a, b) ;
3500
+ assert_eq ! ( r, 1 ) ;
3501
+ let a = i64x4:: new ( 0 , 0 , 0 , 0 ) ;
3502
+ let b = i64x4:: new ( 0 , 0 , 0 , 0 ) ;
3503
+ let r = avx:: _mm256_testnzc_si256 ( a, b) ;
3504
+ assert_eq ! ( r, 0 ) ;
3505
+ }
3506
+
3360
3507
#[ simd_test = "avx" ]
3361
3508
unsafe fn _mm256_testz_pd ( ) {
3362
3509
let a = f64x4:: new ( 1. , 2. , 3. , 4. ) ;
0 commit comments