numerics
Loading...
Searching...
No Matches
matrix.cpp
Go to the documentation of this file.
1/// @file core/backends/seq/matrix.cpp
2/// @brief Sequential and blocked C++ matrix operations
3///
4/// seq: textbook reference implementations (correct, readable, always available).
5/// blocked: cache-blocked loops that compiler can auto-vectorize.
6/// matmul_blocked and matmul_register_blocked live here -- they are pure
7/// optimization algorithms, not class implementation.
8
9#include "core/matrix.hpp"
10#include <algorithm>
11
12namespace num::backends::seq {
13
14void matmul(const Matrix& A, const Matrix& B, Matrix& C) {
15 const idx M = A.rows(), K = A.cols(), N = B.cols();
16 for (idx i = 0; i < M; ++i)
17 for (idx j = 0; j < N; ++j) {
18 C(i, j) = 0;
19 for (idx k = 0; k < K; ++k)
20 C(i, j) += A(i, k) * B(k, j);
21 }
22}
23
24void matvec(const Matrix& A, const Vector& x, Vector& y) {
25 for (idx i = 0; i < A.rows(); ++i) {
26 y[i] = 0;
27 for (idx j = 0; j < A.cols(); ++j)
28 y[i] += A(i, j) * x[j];
29 }
30}
31
32void matadd(real alpha, const Matrix& A, real beta, const Matrix& B, Matrix& C) {
33 for (idx i = 0; i < A.size(); ++i)
34 C.data()[i] = alpha * A.data()[i] + beta * B.data()[i];
35}
36
37// Cache-blocked matrix multiply
38//
39// WHY THE NAIVE LOOP IS SLOW
40// All three matrices are stored row-major. In the naive i-j-k order:
41//
42// for i: <- row of A and C
43// for j: <- column of B, element of C
44// for k: <- inner dimension
45// C(i,j) += A(i,k)*B(k,j)
46//
47// A(i,k) is read left-to-right along row i as k increases -- cache-friendly.
48// B(k,j) for fixed j and varying k is a stride-N access (column of B) --
49// every step jumps N doubles = N*8 bytes. For N=512 that is 4 KB per step,
50// thrashing the cache and stalling on every B load.
51//
52// HOW BLOCKING FIXES IT
53// Divide A, B, C into BLOCK*BLOCK tiles. The working set for one tile
54// triple is 3 * BLOCK^2 * 8 bytes:
55//
56// BLOCK = 32 -> 24 KB (fits in a 32 KB L1 data cache)
57// BLOCK = 64 -> 98 KB (fits in a 256 KB L2 cache)
58// BLOCK = 128 -> 393 KB (fits in a 512 KB L2 or L3 at 4 MB)
59//
60// Within each tile the hot j loop accesses B(k,j) left-to-right along a
61// single row of B -- now fully sequential. The hardware prefetcher can
62// recognise the pattern and hide latency.
63//
64// OUTER LOOP ORDER: ii -> jj -> kk
65// For fixed (ii, jj), the C tile stays in L2 for the entire kk loop.
66//
67// INNER (MICRO-KERNEL) LOOP ORDER: i -> k -> j
68// A(i,k) is hoisted into a register, turning the inner kernel into:
69// for j in [jj, jj+B):
70// C[i][j] += scalar * B[k][j] <- AXPY on contiguous memory
71// Trivially auto-vectorisable.
72
73void matmul_blocked(const Matrix& A, const Matrix& B, Matrix& C, idx block_size) {
74 const idx M = A.rows(), K = A.cols(), N = B.cols();
75 std::fill_n(C.data(), M * N, real(0));
76
77 for (idx ii = 0; ii < M; ii += block_size) {
78 const idx i_end = std::min(ii + block_size, M);
79 for (idx jj = 0; jj < N; jj += block_size) {
80 const idx j_end = std::min(jj + block_size, N);
81 for (idx kk = 0; kk < K; kk += block_size) {
82 const idx k_end = std::min(kk + block_size, K);
83 for (idx i = ii; i < i_end; ++i) {
84 for (idx k = kk; k < k_end; ++k) {
85 const real a_ik = A(i, k);
86 for (idx j = jj; j < j_end; ++j)
87 C(i, j) += a_ik * B(k, j);
88 }
89 }
90 }
91 }
92 }
93}
94
95// Register-blocked matrix multiply
96//
97// Extends cache blocking with a REG*REG register tile inside each cache tile.
98// The accumulator for each small C sub-block is kept in local doubles
99// (promoted to registers by the compiler) for the full k sweep, then written
100// back to C once -- eliminating the per-iteration load+store of C.
101//
102// Register blocking only pays off when combined with explicit SIMD, where each
103// j-step processes 4 doubles simultaneously. This file is the conceptual
104// bridge: the loop structure here is exactly what matmul_avx implements, with
105// scalar loads replaced by vector intrinsics.
106
109 const idx M = A.rows(), K = A.cols(), N = B.cols();
110 std::fill_n(C.data(), M * N, real(0));
111
112 for (idx ii = 0; ii < M; ii += block_size) {
113 const idx i_lim = std::min(ii + block_size, M);
114 for (idx jj = 0; jj < N; jj += block_size) {
115 const idx j_lim = std::min(jj + block_size, N);
116 for (idx kk = 0; kk < K; kk += block_size) {
117 const idx k_lim = std::min(kk + block_size, K);
118 for (idx ir = ii; ir < i_lim; ir += reg_size) {
119 const idx ri = std::min(ir + reg_size, i_lim);
120 for (idx jr = jj; jr < j_lim; jr += reg_size) {
121 const idx rj = std::min(jr + reg_size, j_lim);
122 real c[4][4] = {};
123 for (idx i = ir; i < ri; ++i)
124 for (idx j = jr; j < rj; ++j)
125 c[i - ir][j - jr] = C(i, j);
126 for (idx k = kk; k < k_lim; ++k) {
127 for (idx i = ir; i < ri; ++i) {
128 const real a_ik = A(i, k);
129 for (idx j = jr; j < rj; ++j)
130 c[i - ir][j - jr] += a_ik * B(k, j);
131 }
132 }
133 for (idx i = ir; i < ri; ++i)
134 for (idx j = jr; j < rj; ++j)
135 C(i, j) = c[i - ir][j - jr];
136 }
137 }
138 }
139 }
140 }
141}
142
143} // namespace num::backends::seq
Dense row-major matrix with optional GPU storage.
Definition matrix.hpp:12
Matrix operations.
void matmul_register_blocked(const Matrix &A, const Matrix &B, Matrix &C, idx block_size, idx reg_size)
Definition matrix.cpp:107
void matmul(const Matrix &A, const Matrix &B, Matrix &C)
Definition matrix.cpp:14
void matvec(const Matrix &A, const Vector &x, Vector &y)
Definition matrix.cpp:24
void matmul_blocked(const Matrix &A, const Matrix &B, Matrix &C, idx block_size)
Definition matrix.cpp:73
void matadd(real alpha, const Matrix &A, real beta, const Matrix &B, Matrix &C)
Definition matrix.cpp:32
double real
Definition types.hpp:10
real beta(real a, real b)
B(a, b) – beta function.
Definition math.hpp:242
constexpr T ipow(T x) noexcept
Compute x^N at compile time via repeated squaring.
std::size_t idx
Definition types.hpp:11