Factor out simd

adamant-pwn · adamant-pwn · commit 88a38e3a74af · 2025-04-28T10:54:47.000+02:00
diff --git a/cp-algo/math/cvector.hpp b/cp-algo/math/cvector.hpp
@@ -1,42 +1,22 @@
 #ifndef CP_ALGO_MATH_CVECTOR_HPP
 #define CP_ALGO_MATH_CVECTOR_HPP
+#include "../util/simd.hpp"
 #include "../util/complex.hpp"
 #include "../util/checkpoint.hpp"
 #include "../util/big_alloc.hpp"
-#include <experimental/simd>
 #include <ranges>
 
 namespace stdx = std::experimental;
 namespace cp_algo::math::fft {
-    using ftype = double;
     static constexpr size_t flen = 4;
-    static constexpr size_t bytes = flen * sizeof(ftype);
+    using ftype = double; // scalar floating-point type used by the FFT code
+    using vftype = simd<ftype, flen>; // vector of flen (4) ftype lanes, built from the simd alias in ../util/simd.hpp
     using point = complex<ftype>;
-    using vftype [[gnu::vector_size(bytes)]] = ftype;
     using vpoint = complex<vftype>;
     static constexpr vftype vz = {};
     vpoint vi(vpoint const& r) {
         return {-imag(r), real(r)};
     }
-    vftype abs(vftype a) {
-        return a < 0 ? -a : a;
-    }
-    using i64x4 [[gnu::vector_size(bytes)]] = int64_t;
-    using u64x4 [[gnu::vector_size(bytes)]] = uint64_t;
-    auto lround(vftype a) {
-        return __builtin_convertvector(a < 0 ? a - 0.5 : a + 0.5, i64x4);
-    }
-    auto round(vftype a) {
-        return __builtin_convertvector(lround(a), vftype);
-    }
-    u64x4 montgomery_reduce(u64x4 x, u64x4 mod, u64x4 imod) {
-        auto x_ninv = _mm256_mul_epu32(__m256i(x), __m256i(imod));
-        auto x_res = _mm256_add_epi64(__m256i(x), _mm256_mul_epu32(x_ninv, __m256i(mod)));
-        return u64x4(_mm256_bsrli_epi128(x_res, 4));
-    }
-    u64x4 montgomery_mul(u64x4 x, u64x4 y, u64x4 mod, u64x4 imod) {
-        return montgomery_reduce(u64x4(_mm256_mul_epu32(__m256i(x), __m256i(y))), mod, imod);
-    }
 
     struct cvector {
         std::vector<vpoint, big_alloc<vpoint>> r;
@@ -99,8 +79,7 @@ namespace cp_algo::math::fft {
         }
         template<int step>
         static void exec_on_eval(size_t n, size_t k, auto &&callback) {
-            point factor = root(4 * step * n);
-            callback(factor * eval_point(step * k));
+            callback(root(4 * step * n) * eval_point(step * k)); // pass root(4*step*n) * eval_point(step*k) straight to callback, no named temporary
         }
 
         void dot(cvector const& t) {
diff --git a/cp-algo/util/simd.hpp b/cp-algo/util/simd.hpp
@@ -0,0 +1,61 @@
+#ifndef CP_ALGO_UTIL_SIMD_HPP
+#define CP_ALGO_UTIL_SIMD_HPP
+#include <experimental/simd>
+#include <cstdint>
+#include <cstddef>
+namespace cp_algo {
+    template<typename T, size_t len> // T: lane type, len: number of lanes
+    using simd [[gnu::vector_size(len * sizeof(T))]] = T; // GNU vector-extension type occupying len * sizeof(T) bytes
+    using i64x4 = simd<int64_t, 4>; // 4 lanes of int64_t
+    using u64x4 = simd<uint64_t, 4>; // 4 lanes of uint64_t
+    using u32x8 = simd<uint32_t, 8>; // 8 lanes of uint32_t
+    using u32x4 = simd<uint32_t, 4>; // 4 lanes of uint32_t
+
+    template<typename Simd>
+    Simd abs(Simd a) { // lane-wise absolute value of a; the AVX2 path assumes Simd is a 4 x double vector (256-bit _pd intrinsic)
+#ifdef __AVX2__
+        return _mm256_andnot_pd(_mm256_set1_pd(-0.0), a); // clear only the sign bit; masking with +inf bits (0x7FF0...0) also zeroed the mantissa
+#else
+        return a < 0 ? -a : a; // per-lane select: negate the negative lanes
+#endif
+    }
+
+    template<typename Simd>
+    i64x4 lround(Simd a) { // round each lane of a to an integer and return the result as i64x4 (so a must have 4 lanes)
+#ifdef __AVX2__
+        return __builtin_convertvector(_mm256_round_pd(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC), i64x4); // NOTE(review): ties presumably go to even here, unlike the fallback below -- confirm; assumes Simd is 4 x double
+#else
+        return __builtin_convertvector(a < 0 ? a - 0.5 : a + 0.5, i64x4); // shift each lane by 0.5 away from zero, then convert to int64 lanes
+#endif
+    }
+
+    template<typename Simd>
+    Simd round(Simd a) { // round each lane of a to an integral value, returned in the same vector type
+#ifdef __AVX2__
+        return _mm256_round_pd(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); // assumes Simd is a 4 x double vector (256-bit _pd intrinsic) -- TODO confirm at call sites
+#else
+        return __builtin_convertvector(lround(a), Simd); // go through lround: convert to i64x4, then back to Simd
+#endif
+    }
+
+    inline u64x4 montgomery_reduce(u64x4 x, u64x4 mod, u64x4 imod) { // per-lane (x + low32(x * imod) * mod) >> 32; inline because this is defined in a header (one definition per program otherwise)
+#ifdef __AVX2__
+        auto x_ninv = _mm256_mul_epu32(__m256i(x), __m256i(imod)); // 32x32 -> 64-bit product of the low halves of each 64-bit lane
+        auto x_res = _mm256_add_epi64(__m256i(x), _mm256_mul_epu32(x_ninv, __m256i(mod))); // x + low32(x_ninv) * low32(mod)
+        return u64x4(_mm256_bsrli_epi128(x_res, 4)); // byte-shift right by 4 inside each 128-bit half; NOTE(review): matches the fallback's >> 32 only if the low 32 bits of x_res are zero, i.e. imod is the matching Montgomery constant -- confirm
+#else
+        auto x_ninv = x * imod; // only the low 32 bits are used below
+        auto x_res = x + ((x_ninv << 32) >> 32) * mod; // (v << 32) >> 32 keeps the low 32 bits of each unsigned lane
+        return u64x4(x_res >> 32); // drop the low 32 bits of each lane
+#endif
+    }
+
+    inline u64x4 montgomery_mul(u64x4 x, u64x4 y, u64x4 mod, u64x4 imod) { // per-lane product of x and y passed through montgomery_reduce; inline because this is defined in a header
+#ifdef __AVX2__
+        return montgomery_reduce(u64x4(_mm256_mul_epu32(__m256i(x), __m256i(y))), mod, imod); // multiplies only the low 32 bits of each 64-bit lane
+#else
+        return montgomery_reduce(x * y, mod, imod); // NOTE(review): full 64-bit lane product; agrees with the AVX2 path only when lanes of x and y are below 2^32 -- confirm
+#endif
+    }
+}
+#endif // CP_ALGO_UTIL_SIMD_HPP