numerics 0.1.0
Loading...
Searching...
No Matches
matrix.cpp
Go to the documentation of this file.
1/// @file core/backends/seq/matrix.cpp
2/// @brief Sequential and blocked C++ matrix operations
3///
4/// seq: textbook reference implementations (correct, readable, always
5/// available). blocked: cache-blocked loops that compiler can auto-vectorize.
6/// matmul_blocked and matmul_register_blocked live here -- they are pure
7/// optimization algorithms, not class implementation.
8
9#include "core/matrix.hpp"
10#include <algorithm>
11
12namespace num::backends::seq {
13
14void matmul(const Matrix& A, const Matrix& B, Matrix& C) {
15 const idx M = A.rows(), K = A.cols(), N = B.cols();
16 for (idx i = 0; i < M; ++i)
17 for (idx j = 0; j < N; ++j) {
18 C(i, j) = 0;
19 for (idx k = 0; k < K; ++k)
20 C(i, j) += A(i, k) * B(k, j);
21 }
22}
23
24void matvec(const Matrix& A, const Vector& x, Vector& y) {
25 for (idx i = 0; i < A.rows(); ++i) {
26 y[i] = 0;
27 for (idx j = 0; j < A.cols(); ++j)
28 y[i] += A(i, j) * x[j];
29 }
30}
31
32void matadd(real alpha,
33 const Matrix& A,
34 real beta,
35 const Matrix& B,
36 Matrix& C) {
37 for (idx i = 0; i < A.size(); ++i)
38 C.data()[i] = alpha * A.data()[i] + beta * B.data()[i];
39}
40
41// Cache-blocked matrix multiply
42//
43// WHY THE NAIVE LOOP IS SLOW
44// All three matrices are stored row-major. In the naive i-j-k order:
45//
46// for i: <- row of A and C
47// for j: <- column of B, element of C
48// for k: <- inner dimension
49// C(i,j) += A(i,k)*B(k,j)
50//
51// A(i,k) is read left-to-right along row i as k increases -- cache-friendly.
52// B(k,j) for fixed j and varying k is a stride-N access (column of B) --
53// every step jumps N doubles = N*8 bytes. For N=512 that is 4 KB per step,
54// thrashing the cache and stalling on every B load.
55//
56// HOW BLOCKING FIXES IT
57// Divide A, B, C into BLOCK*BLOCK tiles. The working set for one tile
58// triple is 3 * BLOCK^2 * 8 bytes:
59//
60// BLOCK = 32 -> 24 KB (fits in a 32 KB L1 data cache)
61//   BLOCK =  64 ->  96 KB (fits in a 256 KB L2 cache)
62//   BLOCK = 128 -> 384 KB (fits in a 512 KB L2 or L3 at 4 MB)
63//
64// Within each tile the hot j loop accesses B(k,j) left-to-right along a
65// single row of B -- now fully sequential. The hardware prefetcher can
66// recognise the pattern and hide latency.
67//
68// OUTER LOOP ORDER: ii -> jj -> kk
69// For fixed (ii, jj), the C tile stays in L2 for the entire kk loop.
70//
71// INNER (MICRO-KERNEL) LOOP ORDER: i -> k -> j
72// A(i,k) is hoisted into a register, turning the inner kernel into:
73//      for j in [jj, jj+BLOCK):
74// C[i][j] += scalar * B[k][j] <- AXPY on contiguous memory
75// Trivially auto-vectorisable.
76
77void matmul_blocked(const Matrix& A,
78 const Matrix& B,
79 Matrix& C,
80 idx block_size) {
81 const idx M = A.rows(), K = A.cols(), N = B.cols();
82 std::fill_n(C.data(), M * N, real(0));
83
84 for (idx ii = 0; ii < M; ii += block_size) {
85 const idx i_end = std::min(ii + block_size, M);
86 for (idx jj = 0; jj < N; jj += block_size) {
87 const idx j_end = std::min(jj + block_size, N);
88 for (idx kk = 0; kk < K; kk += block_size) {
89 const idx k_end = std::min(kk + block_size, K);
90 for (idx i = ii; i < i_end; ++i) {
91 for (idx k = kk; k < k_end; ++k) {
92 const real a_ik = A(i, k);
93 for (idx j = jj; j < j_end; ++j)
94 C(i, j) += a_ik * B(k, j);
95 }
96 }
97 }
98 }
99 }
100}
101
102// Register-blocked matrix multiply
103//
104// Extends cache blocking with a REG*REG register tile inside each cache tile.
105// The accumulator for each small C sub-block is kept in local doubles
106// (promoted to registers by the compiler) for the full k sweep, then written
107// back to C once -- eliminating the per-iteration load+store of C.
108//
109// Register blocking only pays off when combined with explicit SIMD, where each
110// j-step processes 4 doubles simultaneously. This file is the conceptual
111// bridge: the loop structure here is exactly what matmul_avx implements, with
112// scalar loads replaced by vector intrinsics.
113
115 const Matrix& B,
116 Matrix& C,
117 idx block_size,
118 idx reg_size) {
119 const idx M = A.rows(), K = A.cols(), N = B.cols();
120 std::fill_n(C.data(), M * N, real(0));
121
122 for (idx ii = 0; ii < M; ii += block_size) {
123 const idx i_lim = std::min(ii + block_size, M);
124 for (idx jj = 0; jj < N; jj += block_size) {
125 const idx j_lim = std::min(jj + block_size, N);
126 for (idx kk = 0; kk < K; kk += block_size) {
127 const idx k_lim = std::min(kk + block_size, K);
128 for (idx ir = ii; ir < i_lim; ir += reg_size) {
129 const idx ri = std::min(ir + reg_size, i_lim);
130 for (idx jr = jj; jr < j_lim; jr += reg_size) {
131 const idx rj = std::min(jr + reg_size, j_lim);
132 real c[4][4] = {};
133 for (idx i = ir; i < ri; ++i)
134 for (idx j = jr; j < rj; ++j)
135 c[i - ir][j - jr] = C(i, j);
136 for (idx k = kk; k < k_lim; ++k) {
137 for (idx i = ir; i < ri; ++i) {
138 const real a_ik = A(i, k);
139 for (idx j = jr; j < rj; ++j)
140 c[i - ir][j - jr] += a_ik * B(k, j);
141 }
142 }
143 for (idx i = ir; i < ri; ++i)
144 for (idx j = jr; j < rj; ++j)
145 C(i, j) = c[i - ir][j - jr];
146 }
147 }
148 }
149 }
150 }
151}
152
153} // namespace num::backends::seq
Dense row-major matrix with optional GPU storage.
Definition matrix.hpp:12
constexpr idx size() const noexcept
Definition matrix.hpp:26
real * data()
Definition matrix.hpp:28
constexpr idx rows() const noexcept
Definition matrix.hpp:24
constexpr idx cols() const noexcept
Definition matrix.hpp:25
Matrix operations.
void matmul_register_blocked(const Matrix &A, const Matrix &B, Matrix &C, idx block_size, idx reg_size)
Definition matrix.cpp:114
void matmul(const Matrix &A, const Matrix &B, Matrix &C)
Definition matrix.cpp:14
void matvec(const Matrix &A, const Vector &x, Vector &y)
Definition matrix.cpp:24
void matmul_blocked(const Matrix &A, const Matrix &B, Matrix &C, idx block_size)
Definition matrix.cpp:77
void matadd(real alpha, const Matrix &A, real beta, const Matrix &B, Matrix &C)
Definition matrix.cpp:32
double real
Definition types.hpp:10
real beta(real a, real b)
B(a, b) – beta function.
Definition math.hpp:248
std::size_t idx
Definition types.hpp:11