numerics
Loading...
Searching...
No Matches
matrix.cpp
Go to the documentation of this file.
1/// @file core/backends/opt/matrix.cpp
2/// @brief SIMD backend -- hand-written AVX2/NEON matmul and matvec
3///
4/// Compile-time dispatch:
5/// NUMERICS_HAS_AVX2 -> AVX-256 + FMA (x86-64, 4 doubles/register)
6/// NUMERICS_HAS_NEON -> ARM NEON (AArch64, 2 doubles/register)
7/// neither -> falls back to cache-blocked scalar
8///
9/// Both backends use the same register-tile structure:
10/// Outer cache tile: ii -> jj -> kk (B tile stays in L2)
11/// Inner reg tile: 4 rows x 4 cols (AVX: 4 YMM regs; NEON: 8 Q-regs)
12/// Hot k loop: one vector FMA per row, zero loop overhead for j
13
14#include "core/matrix.hpp"
15#include "../seq/impl.hpp"
16#include <algorithm>
17#include <cassert>
18
19#ifdef NUMERICS_HAS_AVX2
20# include <immintrin.h>
21#endif
22
23#ifdef NUMERICS_HAS_NEON
24# include <arm_neon.h>
25#endif
26
28
29static_assert(sizeof(real) == 8, "SIMD kernels require real == double");
30
31// AVX-256 backend
32#ifdef NUMERICS_HAS_AVX2
33
34static inline void avx_tile_4x4(const Matrix& A, const Matrix& B, Matrix& C,
36{
37 const idx N = B.cols();
38 real* Crow = C.data() + ir * N;
39
40 __m256d c0 = _mm256_loadu_pd(Crow + 0 * N + jr);
41 __m256d c1 = _mm256_loadu_pd(Crow + 1 * N + jr);
42 __m256d c2 = _mm256_loadu_pd(Crow + 2 * N + jr);
43 __m256d c3 = _mm256_loadu_pd(Crow + 3 * N + jr);
44
45 for (idx k = kk; k < k_lim; ++k) {
46 __m256d b = _mm256_loadu_pd(B.data() + k * N + jr);
47 c0 = _mm256_fmadd_pd(_mm256_set1_pd(A(ir+0, k)), b, c0);
51 }
52
53 _mm256_storeu_pd(Crow + 0 * N + jr, c0);
54 _mm256_storeu_pd(Crow + 1 * N + jr, c1);
55 _mm256_storeu_pd(Crow + 2 * N + jr, c2);
56 _mm256_storeu_pd(Crow + 3 * N + jr, c3);
57}
58
/// Cache-blocked AVX matmul driver: C = A * B.
///
/// Tile order ii -> jj -> kk matches the file header's note that the
/// current B tile stays cache-resident. Inside each cache tile the i/j
/// space is walked in 4x4 register tiles (avx_tile_4x4); leftover rows
/// and columns fall through to scalar cleanup loops, so arbitrary
/// M, N, K are handled.
///
/// @param A          MxK left operand (row-major)
/// @param B          KxN right operand (row-major)
/// @param C          MxN output, overwritten (zero-filled up front)
/// @param block_size cache-tile edge length, in elements
static void matmul_avx(const Matrix& A, const Matrix& B, Matrix& C, idx block_size)
{
    const idx M = A.rows(), K = A.cols(), N = B.cols();
    // C accumulates across successive kk tiles, so it must start at zero.
    std::fill_n(C.data(), M * N, real(0));

    for (idx ii = 0; ii < M; ii += block_size) {
        const idx i_lim = std::min(ii + block_size, M);
        for (idx jj = 0; jj < N; jj += block_size) {
            const idx j_lim = std::min(jj + block_size, N);
            for (idx kk = 0; kk < K; kk += block_size) {
                const idx k_lim = std::min(kk + block_size, K);
                idx ir = ii;
                // Full 4-row strips: vector 4x4 tiles first.
                for (; ir + 4 <= i_lim; ir += 4) {
                    idx jr = jj;
                    for (; jr + 4 <= j_lim; jr += 4)
                        avx_tile_4x4(A, B, C, ir, jr, kk, k_lim);
                    // Column remainder (< 4 cols left): scalar, but still
                    // 4 rows at a time so each B(k,jr) load is reused 4x.
                    for (; jr < j_lim; ++jr) {
                        real c0 = C(ir+0, jr), c1 = C(ir+1, jr);
                        real c2 = C(ir+2, jr), c3 = C(ir+3, jr);
                        for (idx k = kk; k < k_lim; ++k) {
                            real b = B(k, jr);
                            c0 += A(ir+0, k) * b; c1 += A(ir+1, k) * b;
                            c2 += A(ir+2, k) * b; c3 += A(ir+3, k) * b;
                        }
                        C(ir+0, jr) = c0; C(ir+1, jr) = c1;
                        C(ir+2, jr) = c2; C(ir+3, jr) = c3;
                    }
                }
                // Row remainder (< 4 rows left): plain i-k-j scalar loop.
                for (; ir < i_lim; ++ir) {
                    for (idx k = kk; k < k_lim; ++k) {
                        const real a_ik = A(ir, k);
                        for (idx j = jj; j < j_lim; ++j)
                            C(ir, j) += a_ik * B(k, j);
                    }
                }
            }
        }
    }
}
98
99static void matvec_avx(const Matrix& A, const Vector& x, Vector& y)
100{
101 const idx M = A.rows(), N = A.cols();
102 for (idx i = 0; i < M; ++i) {
104 idx j = 0;
105 for (; j + 4 <= N; j += 4) {
106 __m256d a = _mm256_loadu_pd(A.data() + i * N + j);
109 }
112 __m128d sum = _mm_add_pd(lo, hi);
115 for (; j < N; ++j)
116 result += A(i, j) * x[j];
117 y[i] = result;
118 }
119}
120
121#endif // NUMERICS_HAS_AVX2
122
123// ARM NEON backend
124#ifdef NUMERICS_HAS_NEON
125
/// Accumulate one 4x4 register tile of C += A * B over k in [kk, k_lim).
///
/// NEON Q registers hold 2 doubles, so the 4x4 tile needs 8 registers:
/// a lo/hi pair per row. The hot k loop loads row k of B once (two
/// halves) and issues two lane-broadcast FMAs per row of the tile.
///
/// @param A     left operand (row-major)
/// @param B     right operand (row-major, leading dim = B.cols())
/// @param C     accumulator; assumes C.cols() == B.cols() == N
/// @param ir    top row index of the tile (ir+3 must be < C.rows())
/// @param jr    left column index of the tile (jr+3 must be < N)
/// @param kk    first k index of this cache block
/// @param k_lim one-past-last k index of this cache block
static inline void neon_tile_4x4(const Matrix& A, const Matrix& B, Matrix& C,
                                 idx ir, idx jr, idx kk, idx k_lim)
{
    const idx N = B.cols();
    real* Crow = C.data() + ir * N;

    // Load the 4x4 C tile: lo/hi Q-register halves for each of 4 rows.
    float64x2_t c0lo = vld1q_f64(Crow + 0*N+jr), c0hi = vld1q_f64(Crow + 0*N+jr+2);
    float64x2_t c1lo = vld1q_f64(Crow + 1*N+jr), c1hi = vld1q_f64(Crow + 1*N+jr+2);
    float64x2_t c2lo = vld1q_f64(Crow + 2*N+jr), c2hi = vld1q_f64(Crow + 2*N+jr+2);
    float64x2_t c3lo = vld1q_f64(Crow + 3*N+jr), c3hi = vld1q_f64(Crow + 3*N+jr+2);

    for (idx k = kk; k < k_lim; ++k) {
        const real* Brow = B.data() + k * N + jr;
        float64x2_t blo = vld1q_f64(Brow);
        float64x2_t bhi = vld1q_f64(Brow + 2);
        // vfmaq_n_f64 broadcasts the scalar A(i,k) across both lanes.
        c0lo = vfmaq_n_f64(c0lo, blo, A(ir+0, k)); c0hi = vfmaq_n_f64(c0hi, bhi, A(ir+0, k));
        c1lo = vfmaq_n_f64(c1lo, blo, A(ir+1, k)); c1hi = vfmaq_n_f64(c1hi, bhi, A(ir+1, k));
        c2lo = vfmaq_n_f64(c2lo, blo, A(ir+2, k)); c2hi = vfmaq_n_f64(c2hi, bhi, A(ir+2, k));
        c3lo = vfmaq_n_f64(c3lo, blo, A(ir+3, k)); c3hi = vfmaq_n_f64(c3hi, bhi, A(ir+3, k));
    }

    vst1q_f64(Crow + 0*N+jr, c0lo); vst1q_f64(Crow + 0*N+jr+2, c0hi);
    vst1q_f64(Crow + 1*N+jr, c1lo); vst1q_f64(Crow + 1*N+jr+2, c1hi);
    vst1q_f64(Crow + 2*N+jr, c2lo); vst1q_f64(Crow + 2*N+jr+2, c2hi);
    vst1q_f64(Crow + 3*N+jr, c3lo); vst1q_f64(Crow + 3*N+jr+2, c3hi);
}
153
/// Cache-blocked NEON matmul driver: C = A * B.
///
/// Identical tiling structure to matmul_avx — ii -> jj -> kk cache tiles
/// with 4x4 register tiles (neon_tile_4x4) and scalar cleanup loops for
/// the row/column remainders — so arbitrary M, N, K are handled.
///
/// @param A          MxK left operand (row-major)
/// @param B          KxN right operand (row-major)
/// @param C          MxN output, overwritten (zero-filled up front)
/// @param block_size cache-tile edge length, in elements
static void matmul_neon(const Matrix& A, const Matrix& B, Matrix& C, idx block_size)
{
    const idx M = A.rows(), K = A.cols(), N = B.cols();
    // C accumulates across successive kk tiles, so it must start at zero.
    std::fill_n(C.data(), M * N, real(0));

    for (idx ii = 0; ii < M; ii += block_size) {
        const idx i_lim = std::min(ii + block_size, M);
        for (idx jj = 0; jj < N; jj += block_size) {
            const idx j_lim = std::min(jj + block_size, N);
            for (idx kk = 0; kk < K; kk += block_size) {
                const idx k_lim = std::min(kk + block_size, K);
                idx ir = ii;
                // Full 4-row strips: vector 4x4 tiles first.
                for (; ir + 4 <= i_lim; ir += 4) {
                    idx jr = jj;
                    for (; jr + 4 <= j_lim; jr += 4)
                        neon_tile_4x4(A, B, C, ir, jr, kk, k_lim);
                    // Column remainder (< 4 cols left): scalar, but still
                    // 4 rows at a time so each B(k,jr) load is reused 4x.
                    for (; jr < j_lim; ++jr) {
                        real c0 = C(ir+0,jr), c1 = C(ir+1,jr);
                        real c2 = C(ir+2,jr), c3 = C(ir+3,jr);
                        for (idx k = kk; k < k_lim; ++k) {
                            real b = B(k,jr);
                            c0 += A(ir+0,k)*b; c1 += A(ir+1,k)*b;
                            c2 += A(ir+2,k)*b; c3 += A(ir+3,k)*b;
                        }
                        C(ir+0,jr)=c0; C(ir+1,jr)=c1;
                        C(ir+2,jr)=c2; C(ir+3,jr)=c3;
                    }
                }
                // Row remainder (< 4 rows left): plain i-k-j scalar loop.
                for (; ir < i_lim; ++ir) {
                    for (idx k = kk; k < k_lim; ++k) {
                        const real a_ik = A(ir,k);
                        for (idx j = jj; j < j_lim; ++j)
                            C(ir,j) += a_ik * B(k,j);
                    }
                }
            }
        }
    }
}
193
/// NEON matvec: y = A * x, one row dot-product at a time.
///
/// Each row accumulates 2 products per iteration into a Q register,
/// which is reduced with a horizontal add; a scalar tail loop picks up
/// the last column when N is odd.
///
/// @param A MxN row-major matrix
/// @param x length-N input vector
/// @param y length-M output vector, overwritten
static void matvec_neon(const Matrix& A, const Vector& x, Vector& y)
{
    const idx M = A.rows(), N = A.cols();
    for (idx i = 0; i < M; ++i) {
        float64x2_t acc = vdupq_n_f64(0.0);
        idx j = 0;
        for (; j + 2 <= N; j += 2) {
            float64x2_t a = vld1q_f64(A.data() + i * N + j);
            float64x2_t xv = vld1q_f64(x.data() + j);
            acc = vfmaq_f64(acc, a, xv);
        }
        // Horizontal add of the two accumulator lanes.
        real result = vaddvq_f64(acc);
        // Scalar tail for the final column when N is odd.
        for (; j < N; ++j)
            result += A(i,j) * x[j];
        y[i] = result;
    }
}
211
212#endif // NUMERICS_HAS_NEON
213
214// Implementations -- compile-time dispatch to best available SIMD backend
215
216void matmul(const Matrix& A, const Matrix& B, Matrix& C, idx block_size) {
217#if defined(NUMERICS_HAS_AVX2)
219#elif defined(NUMERICS_HAS_NEON)
221#else
223#endif
224}
225
226void matvec(const Matrix& A, const Vector& x, Vector& y) {
227#if defined(NUMERICS_HAS_AVX2)
228 matvec_avx(A, x, y);
229#elif defined(NUMERICS_HAS_NEON)
230 matvec_neon(A, x, y);
231#else
233#endif
234}
235
236} // namespace num::backends::simd
Dense row-major matrix with optional GPU storage.
Definition matrix.hpp:12
Matrix operations.
void matvec(const Matrix &A, const Vector &x, Vector &y)
Definition matrix.cpp:24
void matmul_blocked(const Matrix &A, const Matrix &B, Matrix &C, idx block_size)
Definition matrix.cpp:73
void matvec(const Matrix &A, const Vector &x, Vector &y)
Definition matrix.cpp:226
void matmul(const Matrix &A, const Matrix &B, Matrix &C, idx block_size)
Definition matrix.cpp:216
double real
Definition types.hpp:10
constexpr T ipow(T x) noexcept
Compute x^N at compile time via repeated squaring.
std::size_t idx
Definition types.hpp:11