Skip to content

Commit 46d64f0

Browse files
gwenn authored and alexcrichton committed
* avx: _mm256_testnzc_si256
* avx: _mm256_shuffle_ps — 8 levels of macro expansion takes too long to compile
* avx: remove useless 0 in tests
* avx: _mm256_shuffle_ps — macro expansion can be reduced to four levels
* avx: _mm256_blend_ps — copy/paste from avx2::_mm256_blend_epi32
1 parent 5a4a1f4 commit 46d64f0

File tree

1 file changed

+166
-19
lines changed

1 file changed

+166
-19
lines changed

src/x86/avx.rs

+166-19
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,56 @@ pub unsafe fn _mm256_shuffle_pd(a: f64x4, b: f64x4, imm8: i32) -> f64x4 {
119119
}
120120
}
121121

122+
/// Shuffle single-precision (32-bit) floating-point elements in `a` within
123+
/// 128-bit lanes using the control in `imm8`.
124+
#[inline(always)]
125+
#[target_feature = "+avx"]
126+
#[cfg_attr(test, assert_instr(vshufps, imm8 = 0x0))]
127+
pub unsafe fn _mm256_shuffle_ps(a: f32x8, b: f32x8, imm8: i32) -> f32x8 {
128+
let imm8 = (imm8 & 0xFF) as u8;
129+
macro_rules! shuffle4 {
130+
($a:expr, $b:expr, $c:expr, $d:expr, $e:expr, $f:expr, $g: expr, $h: expr) => {
131+
simd_shuffle8(a, b, [$a, $b, $c, $d, $e, $f, $g, $h]);
132+
}
133+
}
134+
macro_rules! shuffle3 {
135+
($a:expr, $b: expr, $c: expr, $e:expr, $f:expr, $g:expr) => {
136+
match (imm8 >> 6) & 0x3 {
137+
0 => shuffle4!($a, $b, $c, 8, $e, $f, $g, 12),
138+
1 => shuffle4!($a, $b, $c, 9, $e, $f, $g, 13),
139+
2 => shuffle4!($a, $b, $c, 10, $e, $f, $g, 14),
140+
_ => shuffle4!($a, $b, $c, 11, $e, $f, $g, 15),
141+
}
142+
}
143+
}
144+
macro_rules! shuffle2 {
145+
($a:expr, $b:expr, $e:expr, $f:expr) => {
146+
match (imm8 >> 4) & 0x3 {
147+
0 => shuffle3!($a, $b, 8, $e, $f, 12),
148+
1 => shuffle3!($a, $b, 9, $e, $f, 13),
149+
2 => shuffle3!($a, $b, 10, $e, $f, 14),
150+
_ => shuffle3!($a, $b, 11, $e, $f, 15),
151+
}
152+
}
153+
}
154+
macro_rules! shuffle1 {
155+
($a:expr, $e:expr) => {
156+
match (imm8 >> 2) & 0x3 {
157+
0 => shuffle2!($a, 0, $e, 4),
158+
1 => shuffle2!($a, 1, $e, 5),
159+
2 => shuffle2!($a, 2, $e, 6),
160+
_ => shuffle2!($a, 3, $e, 7),
161+
}
162+
}
163+
}
164+
match (imm8 >> 0) & 0x3 {
165+
0 => shuffle1!(0, 4),
166+
1 => shuffle1!(1, 5),
167+
2 => shuffle1!(2, 6),
168+
_ => shuffle1!(3, 7),
169+
}
170+
}
171+
122172
/// Compute the bitwise NOT of packed double-precision (64-bit) floating-point
123173
/// elements in `a`
124174
/// and then AND with `b`.
@@ -393,6 +443,56 @@ pub unsafe fn _mm256_blend_pd(a: f64x4, b: f64x4, imm8: i32) -> f64x4 {
393443
}
394444
}
395445

446+
/// Blend packed single-precision (32-bit) floating-point elements from
447+
/// `a` and `b` using control mask `imm8`.
448+
#[inline(always)]
449+
#[target_feature = "+avx"]
450+
#[cfg_attr(test, assert_instr(vblendps, imm8 = 9))]
451+
pub unsafe fn _mm256_blend_ps(a: f32x8, b: f32x8, imm8: i32) -> f32x8 {
452+
let imm8 = (imm8 & 0xFF) as u8;
453+
macro_rules! blend4 {
454+
($a:expr, $b:expr, $c:expr, $d:expr, $e:expr, $f:expr, $g:expr, $h:expr) => {
455+
simd_shuffle8(a, b, [$a, $b, $c, $d, $e, $f, $g, $h]);
456+
}
457+
}
458+
macro_rules! blend3 {
459+
($a:expr, $b:expr, $c:expr, $d:expr, $e:expr, $f:expr) => {
460+
match (imm8 >> 6) & 0b11 {
461+
0b00 => blend4!($a, $b, $c, $d, $e, $f, 6, 7),
462+
0b01 => blend4!($a, $b, $c, $d, $e, $f, 14, 7),
463+
0b10 => blend4!($a, $b, $c, $d, $e, $f, 6, 15),
464+
_ => blend4!($a, $b, $c, $d, $e, $f, 14, 15),
465+
}
466+
}
467+
}
468+
macro_rules! blend2 {
469+
($a:expr, $b:expr, $c:expr, $d:expr) => {
470+
match (imm8 >> 4) & 0b11 {
471+
0b00 => blend3!($a, $b, $c, $d, 4, 5),
472+
0b01 => blend3!($a, $b, $c, $d, 12, 5),
473+
0b10 => blend3!($a, $b, $c, $d, 4, 13),
474+
_ => blend3!($a, $b, $c, $d, 12, 13),
475+
}
476+
}
477+
}
478+
macro_rules! blend1 {
479+
($a:expr, $b:expr) => {
480+
match (imm8 >> 2) & 0b11 {
481+
0b00 => blend2!($a, $b, 2, 3),
482+
0b01 => blend2!($a, $b, 10, 3),
483+
0b10 => blend2!($a, $b, 2, 11),
484+
_ => blend2!($a, $b, 10, 11),
485+
}
486+
}
487+
}
488+
match imm8 & 0b11 {
489+
0b00 => blend1!(0, 1),
490+
0b01 => blend1!(8, 1),
491+
0b10 => blend1!(0, 9),
492+
_ => blend1!(8, 9),
493+
}
494+
}
495+
396496
/// Blend packed double-precision (64-bit) floating-point elements from
397497
/// `a` and `b` using `c` as a mask.
398498
#[inline(always)]
@@ -1437,6 +1537,18 @@ pub unsafe fn _mm256_testc_si256(a: i64x4, b: i64x4) -> i32 {
14371537
ptestc256(a, b)
14381538
}
14391539

1540+
/// Compute the bitwise AND of 256 bits (representing integer data) in `a` and
/// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0.
/// Compute the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if
/// the result is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and
/// `CF` values are zero, otherwise return 0.
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vptest))]
pub unsafe fn _mm256_testnzc_si256(a: i64x4, b: i64x4) -> i32 {
    // Thin wrapper over the `llvm.x86.avx.ptestnzc.256` intrinsic
    // (declared in this file's extern "C" block); compiles to VPTEST.
    ptestnzc256(a, b)
}
1551+
14401552
/// Compute the bitwise AND of 256 bits (representing double-precision (64-bit)
14411553
/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
14421554
/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
@@ -2272,6 +2384,8 @@ extern "C" {
22722384
fn ptestz256(a: i64x4, b: i64x4) -> i32;
22732385
#[link_name = "llvm.x86.avx.ptestc.256"]
22742386
fn ptestc256(a: i64x4, b: i64x4) -> i32;
2387+
#[link_name = "llvm.x86.avx.ptestnzc.256"]
2388+
fn ptestnzc256(a: i64x4, b: i64x4) -> i32;
22752389
#[link_name = "llvm.x86.avx.vtestz.pd.256"]
22762390
fn vtestzpd256(a: f64x4, b: f64x4) -> i32;
22772391
#[link_name = "llvm.x86.avx.vtestc.pd.256"]
@@ -2375,6 +2489,15 @@ mod tests {
23752489
assert_eq!(r, e);
23762490
}
23772491

2492+
#[simd_test = "avx"]
2493+
unsafe fn _mm256_shuffle_ps() {
2494+
let a = f32x8::new(1., 4., 5., 8., 9., 12., 13., 16.);
2495+
let b = f32x8::new(2., 3., 6., 7., 10., 11., 14., 15.);
2496+
let r = avx::_mm256_shuffle_ps(a, b, 0x0F);
2497+
let e = f32x8::new(8., 8., 2., 2., 16., 16., 10., 10.);
2498+
assert_eq!(r, e);
2499+
}
2500+
23782501
#[simd_test = "avx"]
23792502
unsafe fn _mm256_andnot_pd() {
23802503
let a = f64x4::splat(0.);
@@ -2421,7 +2544,7 @@ mod tests {
24212544
#[simd_test = "avx"]
24222545
unsafe fn _mm256_min_ps() {
24232546
let a = f32x8::new(1., 4., 5., 8., 9., 12., 13., 16.);
2424-
let b = f32x8::new(2., 3., 6., 7., 10.0, 11., 14., 15.);
2547+
let b = f32x8::new(2., 3., 6., 7., 10., 11., 14., 15.);
24252548
let r = avx::_mm256_min_ps(a, b);
24262549
let e = f32x8::new(1., 3., 5., 7., 9., 11., 13., 15.);
24272550
assert_eq!(r, e);
@@ -2439,9 +2562,9 @@ mod tests {
24392562
#[simd_test = "avx"]
24402563
unsafe fn _mm256_mul_ps() {
24412564
let a = f32x8::new(1., 2., 3., 4., 5., 6., 7., 8.);
2442-
let b = f32x8::new(9., 10.0, 11., 12., 13., 14., 15., 16.);
2565+
let b = f32x8::new(9., 10., 11., 12., 13., 14., 15., 16.);
24432566
let r = avx::_mm256_mul_ps(a, b);
2444-
let e = f32x8::new(9., 20.0, 33., 48., 65., 84., 105., 128.);
2567+
let e = f32x8::new(9., 20., 33., 48., 65., 84., 105., 128.);
24452568
assert_eq!(r, e);
24462569
}
24472570

@@ -2560,7 +2683,7 @@ mod tests {
25602683
#[simd_test = "avx"]
25612684
unsafe fn _mm256_div_ps() {
25622685
let a = f32x8::new(4., 9., 16., 25., 4., 9., 16., 25.);
2563-
let b = f32x8::new(4., 3., 2., 5., 8., 9., 64., 50.0);
2686+
let b = f32x8::new(4., 3., 2., 5., 8., 9., 64., 50.);
25642687
let r = avx::_mm256_div_ps(a, b);
25652688
let e = f32x8::new(1., 3., 8., 5., 0.5, 1., 0.25, 0.5);
25662689
assert_eq!(r, e);
@@ -2587,6 +2710,18 @@ mod tests {
25872710
assert_eq!(r, f64x4::new(4., 3., 2., 5.));
25882711
}
25892712

2713+
#[simd_test = "avx"]
2714+
unsafe fn _mm256_blend_ps() {
2715+
let a = f32x8::new(1., 4., 5., 8., 9., 12., 13., 16.);
2716+
let b = f32x8::new(2., 3., 6., 7., 10., 11., 14., 15.);
2717+
let r = avx::_mm256_blend_ps(a, b, 0x0);
2718+
assert_eq!(r, f32x8::new(1., 4., 5., 8., 9., 12., 13., 16.));
2719+
let r = avx::_mm256_blend_ps(a, b, 0x3);
2720+
assert_eq!(r, f32x8::new(2., 3., 5., 8., 9., 12., 13., 16.));
2721+
let r = avx::_mm256_blend_ps(a, b, 0xF);
2722+
assert_eq!(r, f32x8::new(2., 3., 6., 7., 9., 12., 13., 16.));
2723+
}
2724+
25902725
#[simd_test = "avx"]
25912726
unsafe fn _mm256_blendv_pd() {
25922727
let a = f64x4::new(4., 9., 16., 25.);
@@ -2600,23 +2735,23 @@ mod tests {
26002735
#[simd_test = "avx"]
26012736
unsafe fn _mm256_blendv_ps() {
26022737
let a = f32x8::new(4., 9., 16., 25., 4., 9., 16., 25.);
2603-
let b = f32x8::new(4., 3., 2., 5., 8., 9., 64., 50.0);
2738+
let b = f32x8::new(4., 3., 2., 5., 8., 9., 64., 50.);
26042739
#[cfg_attr(rustfmt, rustfmt_skip)]
26052740
let c = f32x8::new(
26062741
0., 0., 0., 0., !0 as f32, !0 as f32, !0 as f32, !0 as f32,
26072742
);
26082743
let r = avx::_mm256_blendv_ps(a, b, c);
2609-
let e = f32x8::new(4., 9., 16., 25., 8., 9., 64., 50.0);
2744+
let e = f32x8::new(4., 9., 16., 25., 8., 9., 64., 50.);
26102745
assert_eq!(r, e);
26112746
}
26122747

26132748
#[simd_test = "avx"]
26142749
unsafe fn _mm256_dp_ps() {
26152750
let a = f32x8::new(4., 9., 16., 25., 4., 9., 16., 25.);
2616-
let b = f32x8::new(4., 3., 2., 5., 8., 9., 64., 50.0);
2751+
let b = f32x8::new(4., 3., 2., 5., 8., 9., 64., 50.);
26172752
let r = avx::_mm256_dp_ps(a, b, 0xFF);
26182753
let e =
2619-
f32x8::new(200.0, 200.0, 200.0, 200.0, 2387., 2387., 2387., 2387.);
2754+
f32x8::new(200., 200., 200., 200., 2387., 2387., 2387., 2387.);
26202755
assert_eq!(r, e);
26212756
}
26222757

@@ -2638,7 +2773,7 @@ mod tests {
26382773
#[simd_test = "avx"]
26392774
unsafe fn _mm256_hadd_ps() {
26402775
let a = f32x8::new(4., 9., 16., 25., 4., 9., 16., 25.);
2641-
let b = f32x8::new(4., 3., 2., 5., 8., 9., 64., 50.0);
2776+
let b = f32x8::new(4., 3., 2., 5., 8., 9., 64., 50.);
26422777
let r = avx::_mm256_hadd_ps(a, b);
26432778
let e = f32x8::new(13., 41., 7., 7., 13., 41., 17., 114.);
26442779
assert_eq!(r, e);
@@ -2668,7 +2803,7 @@ mod tests {
26682803
#[simd_test = "avx"]
26692804
unsafe fn _mm256_hsub_ps() {
26702805
let a = f32x8::new(4., 9., 16., 25., 4., 9., 16., 25.);
2671-
let b = f32x8::new(4., 3., 2., 5., 8., 9., 64., 50.0);
2806+
let b = f32x8::new(4., 3., 2., 5., 8., 9., 64., 50.);
26722807
let r = avx::_mm256_hsub_ps(a, b);
26732808
let e = f32x8::new(-5., -9., 1., -3., -5., -9., -1., 14.);
26742809
assert_eq!(r, e);
@@ -2821,7 +2956,7 @@ mod tests {
28212956

28222957
#[simd_test = "avx"]
28232958
unsafe fn _mm256_extractf128_ps() {
2824-
let a = f32x8::new(4., 3., 2., 5., 8., 9., 64., 50.0);
2959+
let a = f32x8::new(4., 3., 2., 5., 8., 9., 64., 50.);
28252960
let r = avx::_mm256_extractf128_ps(a, 0);
28262961
let e = f32x4::new(4., 3., 2., 5.);
28272962
assert_eq!(r, e);
@@ -2890,10 +3025,10 @@ mod tests {
28903025

28913026
#[simd_test = "avx"]
28923027
unsafe fn _mm256_permutevar_ps() {
2893-
let a = f32x8::new(4., 3., 2., 5., 8., 9., 64., 50.0);
3028+
let a = f32x8::new(4., 3., 2., 5., 8., 9., 64., 50.);
28943029
let b = i32x8::new(1, 2, 3, 4, 5, 6, 7, 8);
28953030
let r = avx::_mm256_permutevar_ps(a, b);
2896-
let e = f32x8::new(3., 2., 5., 4., 9., 64., 50.0, 8.);
3031+
let e = f32x8::new(3., 2., 5., 4., 9., 64., 50., 8.);
28973032
assert_eq!(r, e);
28983033
}
28993034

@@ -2908,9 +3043,9 @@ mod tests {
29083043

29093044
#[simd_test = "avx"]
29103045
unsafe fn _mm256_permute_ps() {
2911-
let a = f32x8::new(4., 3., 2., 5., 8., 9., 64., 50.0);
3046+
let a = f32x8::new(4., 3., 2., 5., 8., 9., 64., 50.);
29123047
let r = avx::_mm256_permute_ps(a, 0x1b);
2913-
let e = f32x8::new(5., 2., 3., 4., 50.0, 64., 9., 8.);
3048+
let e = f32x8::new(5., 2., 3., 4., 50., 64., 9., 8.);
29143049
assert_eq!(r, e);
29153050
}
29163051

@@ -3022,10 +3157,10 @@ mod tests {
30223157

30233158
#[simd_test = "avx"]
30243159
unsafe fn _mm256_insertf128_ps() {
3025-
let a = f32x8::new(4., 3., 2., 5., 8., 9., 64., 50.0);
3160+
let a = f32x8::new(4., 3., 2., 5., 8., 9., 64., 50.);
30263161
let b = f32x4::new(4., 9., 16., 25.);
30273162
let r = avx::_mm256_insertf128_ps(a, b, 0);
3028-
let e = f32x8::new(4., 9., 16., 25., 8., 9., 64., 50.0);
3163+
let e = f32x8::new(4., 9., 16., 25., 8., 9., 64., 50.);
30293164
assert_eq!(r, e);
30303165
}
30313166

@@ -3112,10 +3247,10 @@ mod tests {
31123247

31133248
#[simd_test = "avx"]
31143249
unsafe fn _mm256_loadu_ps() {
3115-
let a = &[4., 3., 2., 5., 8., 9., 64., 50.0];
3250+
let a = &[4., 3., 2., 5., 8., 9., 64., 50.];
31163251
let p = a.as_ptr();
31173252
let r = avx::_mm256_loadu_ps(black_box(p));
3118-
let e = f32x8::new(4., 3., 2., 5., 8., 9., 64., 50.0);
3253+
let e = f32x8::new(4., 3., 2., 5., 8., 9., 64., 50.);
31193254
assert_eq!(r, e);
31203255
}
31213256

@@ -3357,6 +3492,18 @@ mod tests {
33573492
assert_eq!(r, 1);
33583493
}
33593494

3495+
#[simd_test = "avx"]
3496+
unsafe fn _mm256_testnzc_si256() {
3497+
let a = i64x4::new(1, 2, 3, 4);
3498+
let b = i64x4::new(5, 6, 7, 8);
3499+
let r = avx::_mm256_testnzc_si256(a, b);
3500+
assert_eq!(r, 1);
3501+
let a = i64x4::new(0, 0, 0, 0);
3502+
let b = i64x4::new(0, 0, 0, 0);
3503+
let r = avx::_mm256_testnzc_si256(a, b);
3504+
assert_eq!(r, 0);
3505+
}
3506+
33603507
#[simd_test = "avx"]
33613508
unsafe fn _mm256_testz_pd() {
33623509
let a = f64x4::new(1., 2., 3., 4.);

0 commit comments

Comments
 (0)