Skip to content

Commit dcb61bb

Browse files
dlrobertsonalexcrichton
authored andcommitted
[x86] Implement avx2 broadcast intrinsics (rust-lang#97)
Implement - _mm_broadcastb_epi8 - _mm256_broadcastb_epi8 - _mm_broadcastd_epi32 - _mm256_broadcastd_epi32 - _mm_bradcastq_epi64 - _mm256_broadcastq_epi64 - _mm_broadcastsd_pd - _mm256_broadcastsd_pd - _mm256_broadcastsi128_si256 - _mm_broadcastss_ps - _mm256_broadcastss_ps - _mm_broadcastw_epi16 - _mm256_broadcast2_epi16
1 parent afba6fc commit dcb61bb

File tree

1 file changed

+219
-15
lines changed

1 file changed

+219
-15
lines changed

src/x86/avx2.rs

+219-15
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
use simd_llvm::{simd_shuffle8, simd_shuffle32};
1+
use simd_llvm::{simd_shuffle2, simd_shuffle4, simd_shuffle8};
2+
use simd_llvm::{simd_shuffle16, simd_shuffle32};
23
use v256::*;
34
use v128::*;
45
use x86::__m256i;
@@ -245,20 +246,130 @@ pub unsafe fn _mm256_blendv_epi8(a:i8x32,b:i8x32,mask:__m256i) -> i8x32 {
245246
pblendvb(a,b,mask)
246247
}
247248

248-
// TODO _mm_broadcastb_epi8
249-
// TODO _mm256_broadcastb_epi8
250-
// TODO _mm_broadcastd_epi32
251-
// TODO _mm256_broadcastd_epi32
252-
// TODO _mm_broadcastq_epi64
253-
// TODO _mm256_broadcastq_epi64
254-
// TODO _mm_broadcastsd_pd
255-
// TODO _mm256_broadcastsd_pd
256-
// TODO _mm_broadcastsi128_si256
257-
// TODO _mm256_broadcastsi128_si256
258-
// TODO _mm_broadcastss_ps
259-
// TODO _mm256_broadcastss_ps
260-
// TODO _mm_broadcastw_epi16
261-
// TODO _mm256_broadcastw_epi16
249+
/// Broadcast the low packed 8-bit integer from `a` to all elements of
250+
/// the 128-bit returned value.
251+
#[inline(always)]
252+
#[target_feature = "+avx2"]
253+
#[cfg_attr(test, assert_instr(vpbroadcastb))]
254+
pub unsafe fn _mm_broadcastb_epi8(a: i8x16) -> i8x16 {
255+
simd_shuffle16(a, i8x16::splat(0i8), [0u32; 16])
256+
}
257+
258+
/// Broadcast the low packed 8-bit integer from `a` to all elements of
259+
/// the 256-bit returned value.
260+
#[inline(always)]
261+
#[target_feature = "+avx2"]
262+
#[cfg_attr(test, assert_instr(vpbroadcastb))]
263+
pub unsafe fn _mm256_broadcastb_epi8(a: i8x16) -> i8x32 {
264+
simd_shuffle32(a, i8x16::splat(0i8), [0u32; 32])
265+
}
266+
267+
// NB: simd_shuffle4 with integer data types for `a` and `b` is
268+
// often compiled to vbroadcastss.
269+
/// Broadcast the low packed 32-bit integer from `a` to all elements of
270+
/// the 128-bit returned value.
271+
#[inline(always)]
272+
#[target_feature = "+avx2"]
273+
#[cfg_attr(test, assert_instr(vbroadcastss))]
274+
pub unsafe fn _mm_broadcastd_epi32(a: i32x4) -> i32x4 {
275+
simd_shuffle4(a, i32x4::splat(0i32), [0u32; 4])
276+
}
277+
278+
// NB: simd_shuffle4 with integer data types for `a` and `b` is
279+
// often compiled to vbroadcastss.
280+
/// Broadcast the low packed 32-bit integer from `a` to all elements of
281+
/// the 256-bit returned value.
282+
#[inline(always)]
283+
#[target_feature = "+avx2"]
284+
#[cfg_attr(test, assert_instr(vbroadcastss))]
285+
pub unsafe fn _mm256_broadcastd_epi32(a: i32x4) -> i32x8 {
286+
simd_shuffle8(a, i32x4::splat(0i32), [0u32; 8])
287+
}
288+
289+
/// Broadcast the low packed 64-bit integer from `a` to all elements of
290+
/// the 128-bit returned value.
291+
#[inline(always)]
292+
#[target_feature = "+avx2"]
293+
#[cfg_attr(test, assert_instr(vpbroadcastq))]
294+
pub unsafe fn _mm_broadcastq_epi64(a: i64x2) -> i64x2 {
295+
simd_shuffle2(a, i64x2::splat(0i64), [0u32; 2])
296+
}
297+
298+
// NB: simd_shuffle4 with integer data types for `a` and `b` is
299+
// often compiled to vbroadcastsd.
300+
/// Broadcast the low packed 64-bit integer from `a` to all elements of
301+
/// the 256-bit returned value.
302+
#[inline(always)]
303+
#[target_feature = "+avx2"]
304+
#[cfg_attr(test, assert_instr(vbroadcastsd))]
305+
pub unsafe fn _mm256_broadcastq_epi64(a: i64x2) -> i64x4 {
306+
simd_shuffle4(a, i64x2::splat(0i64), [0u32; 4])
307+
}
308+
309+
/// Broadcast the low double-precision (64-bit) floating-point element
310+
/// from `a` to all elements of the 128-bit returned value.
311+
#[inline(always)]
312+
#[target_feature = "+avx2"]
313+
#[cfg_attr(test, assert_instr(vmovddup))]
314+
pub unsafe fn _mm_broadcastsd_pd(a: f64x2) -> f64x2 {
315+
simd_shuffle2(a, f64x2::splat(0f64), [0u32; 2])
316+
}
317+
318+
/// Broadcast the low double-precision (64-bit) floating-point element
319+
/// from `a` to all elements of the 256-bit returned value.
320+
#[inline(always)]
321+
#[target_feature = "+avx2"]
322+
#[cfg_attr(test, assert_instr(vbroadcastsd))]
323+
pub unsafe fn _mm256_broadcastsd_pd(a: f64x2) -> f64x4 {
324+
simd_shuffle4(a, f64x2::splat(0f64), [0u32; 4])
325+
}
326+
327+
// NB: broadcastsi128_si256 is often compiled to vinsertf128 or
328+
// vbroadcastf128.
329+
/// Broadcast 128 bits of integer data from a to all 128-bit lanes in
330+
/// the 256-bit returned value.
331+
#[inline(always)]
332+
#[target_feature = "+avx2"]
333+
pub unsafe fn _mm256_broadcastsi128_si256(a: i64x2) -> i64x4 {
334+
simd_shuffle4(a, i64x2::splat(0i64), [0, 1, 0, 1])
335+
}
336+
337+
/// Broadcast the low single-precision (32-bit) floating-point element
338+
/// from `a` to all elements of the 128-bit returned value.
339+
#[inline(always)]
340+
#[target_feature = "+avx2"]
341+
#[cfg_attr(test, assert_instr(vbroadcastss))]
342+
pub unsafe fn _mm_broadcastss_ps(a: f32x4) -> f32x4 {
343+
simd_shuffle4(a, f32x4::splat(0f32), [0u32; 4])
344+
}
345+
346+
/// Broadcast the low single-precision (32-bit) floating-point element
347+
/// from `a` to all elements of the 256-bit returned value.
348+
#[inline(always)]
349+
#[target_feature = "+avx2"]
350+
#[cfg_attr(test, assert_instr(vbroadcastss))]
351+
pub unsafe fn _mm256_broadcastss_ps(a: f32x4) -> f32x8 {
352+
simd_shuffle8(a, f32x4::splat(0f32), [0u32; 8])
353+
}
354+
355+
/// Broadcast the low packed 16-bit integer from a to all elements of
356+
/// the 128-bit returned value
357+
#[inline(always)]
358+
#[target_feature = "+avx2"]
359+
#[cfg_attr(test, assert_instr(vpbroadcastw))]
360+
pub unsafe fn _mm_broadcastw_epi16(a: i16x8) -> i16x8 {
361+
simd_shuffle8(a, i16x8::splat(0i16), [0u32; 8])
362+
}
363+
364+
/// Broadcast the low packed 16-bit integer from a to all elements of
365+
/// the 256-bit returned value
366+
#[inline(always)]
367+
#[target_feature = "+avx2"]
368+
#[cfg_attr(test, assert_instr(vpbroadcastw))]
369+
pub unsafe fn _mm256_broadcastw_epi16(a: i16x8) -> i16x16 {
370+
simd_shuffle16(a, i16x8::splat(0i16), [0u32; 16])
371+
}
372+
262373
// TODO _mm256_bslli_epi128
263374
// TODO _mm256_bsrli_epi128
264375

@@ -1517,6 +1628,99 @@ mod tests {
15171628
assert_eq!(r,e);
15181629
}
15191630

1631+
#[simd_test = "avx2"]
1632+
unsafe fn _mm_broadcastb_epi8() {
1633+
let a = i8x16::splat(0x00).replace(0, 0x2a);
1634+
let res = avx2::_mm_broadcastb_epi8(a);
1635+
assert_eq!(res, i8x16::splat(0x2a));
1636+
}
1637+
1638+
#[simd_test = "avx2"]
1639+
unsafe fn _mm256_broadcastb_epi8() {
1640+
let a = i8x16::splat(0x00).replace(0, 0x2a);
1641+
let res = avx2::_mm256_broadcastb_epi8(a);
1642+
assert_eq!(res, i8x32::splat(0x2a));
1643+
}
1644+
1645+
#[simd_test = "avx2"]
1646+
unsafe fn _mm_broadcastd_epi32() {
1647+
let a = i32x4::splat(0x00).replace(0, 0x2a).replace(1, 0x8000000);
1648+
let res = avx2::_mm_broadcastd_epi32(a);
1649+
assert_eq!(res, i32x4::splat(0x2a));
1650+
}
1651+
1652+
#[simd_test = "avx2"]
1653+
unsafe fn _mm256_broadcastd_epi32() {
1654+
let a = i32x4::splat(0x00).replace(0, 0x2a).replace(1, 0x8000000);
1655+
let res = avx2::_mm256_broadcastd_epi32(a);
1656+
assert_eq!(res, i32x8::splat(0x2a));
1657+
}
1658+
1659+
#[simd_test = "avx2"]
1660+
unsafe fn _mm_broadcastq_epi64() {
1661+
let a = i64x2::splat(0x00).replace(0, 0x1ffffffff);
1662+
let res = avx2::_mm_broadcastq_epi64(a);
1663+
assert_eq!(res, i64x2::splat(0x1ffffffff));
1664+
}
1665+
1666+
#[simd_test = "avx2"]
1667+
unsafe fn _mm256_broadcastq_epi64() {
1668+
let a = i64x2::splat(0x00).replace(0, 0x1ffffffff);
1669+
let res = avx2::_mm256_broadcastq_epi64(a);
1670+
assert_eq!(res, i64x4::splat(0x1ffffffff));
1671+
}
1672+
1673+
#[simd_test = "avx2"]
1674+
unsafe fn _mm_broadcastsd_pd() {
1675+
let a = f64x2::splat(3.14f64).replace(0, 6.28f64);
1676+
let res = avx2::_mm_broadcastsd_pd(a);
1677+
assert_eq!(res, f64x2::splat(6.28f64));
1678+
}
1679+
1680+
#[simd_test = "avx2"]
1681+
unsafe fn _mm256_broadcastsd_pd() {
1682+
let a = f64x2::splat(3.14f64).replace(0, 6.28f64);
1683+
let res = avx2::_mm256_broadcastsd_pd(a);
1684+
assert_eq!(res, f64x4::splat(6.28f64));
1685+
}
1686+
1687+
#[simd_test = "avx2"]
1688+
unsafe fn _mm256_broadcastsi128_si256() {
1689+
let a = i64x2::new(0x0987654321012334, 0x5678909876543210);
1690+
let res = avx2::_mm256_broadcastsi128_si256(a);
1691+
let retval = i64x4::new(0x0987654321012334, 0x5678909876543210,
1692+
0x0987654321012334, 0x5678909876543210);
1693+
assert_eq!(res, retval);
1694+
}
1695+
1696+
#[simd_test = "avx2"]
1697+
unsafe fn _mm_broadcastss_ps() {
1698+
let a = f32x4::splat(3.14f32).replace(0, 6.28f32);
1699+
let res = avx2::_mm_broadcastss_ps(a);
1700+
assert_eq!(res, f32x4::splat(6.28f32));
1701+
}
1702+
1703+
#[simd_test = "avx2"]
1704+
unsafe fn _mm256_broadcastss_ps() {
1705+
let a = f32x4::splat(3.14f32).replace(0, 6.28f32);
1706+
let res = avx2::_mm256_broadcastss_ps(a);
1707+
assert_eq!(res, f32x8::splat(6.28f32));
1708+
}
1709+
1710+
#[simd_test = "avx2"]
1711+
unsafe fn _mm_broadcastw_epi16() {
1712+
let a = i16x8::splat(0x2a).replace(0, 0x22b);
1713+
let res = avx2::_mm_broadcastw_epi16(a);
1714+
assert_eq!(res, i16x8::splat(0x22b));
1715+
}
1716+
1717+
#[simd_test = "avx2"]
1718+
unsafe fn _mm256_broadcastw_epi16() {
1719+
let a = i16x8::splat(0x2a).replace(0, 0x22b);
1720+
let res = avx2::_mm256_broadcastw_epi16(a);
1721+
assert_eq!(res, i16x16::splat(0x22b));
1722+
}
1723+
15201724
#[simd_test = "avx2"]
15211725
unsafe fn _mm256_cmpeq_epi8() {
15221726
let a = i8x32::new(

0 commit comments

Comments
 (0)