12 for (
idx i = 0; i < M; ++i)
13 for (
idx j = 0; j < N; ++j) {
15 for (
idx k = 0; k < K; ++k)
16 C(i, j) += A(i, k) * B(k, j);
30 std::fill_n(C.
data(), M * N,
real(0));
32 for (
idx ii = 0; ii < M; ii += block_size) {
33 const idx i_end = std::min(ii + block_size, M);
34 for (
idx jj = 0; jj < N; jj += block_size) {
35 const idx j_end = std::min(jj + block_size, N);
36 for (
idx kk = 0; kk < K; kk += block_size) {
37 const idx k_end = std::min(kk + block_size, K);
38 for (
idx i = ii; i < i_end; ++i) {
39 for (
idx k = kk; k < k_end; ++k) {
40 const real a_ik = A(i, k);
41 for (
idx j = jj; j < j_end; ++j)
42 C(i, j) += a_ik * B(k, j);
56 std::fill_n(C.
data(), M * N,
real(0));
58 for (
idx ii = 0; ii < M; ii += block_size) {
59 const idx i_lim = std::min(ii + block_size, M);
60 for (
idx jj = 0; jj < N; jj += block_size) {
61 const idx j_lim = std::min(jj + block_size, N);
62 for (
idx kk = 0; kk < K; kk += block_size) {
63 const idx k_lim = std::min(kk + block_size, K);
64 for (
idx ir = ii; ir < i_lim; ir += reg_size) {
65 const idx ri = std::min(ir + reg_size, i_lim);
66 for (
idx jr = jj; jr < j_lim; jr += reg_size) {
67 const idx rj = std::min(jr + reg_size, j_lim);
69 for (
idx i = ir; i < ri; ++i)
70 for (
idx j = jr; j < rj; ++j)
71 c[i - ir][j - jr] = C(i, j);
72 for (
idx k = kk; k < k_lim; ++k) {
73 for (
idx i = ir; i < ri; ++i) {
74 const real a_ik = A(i, k);
75 for (
idx j = jr; j < rj; ++j)
76 c[i - ir][j - jr] += a_ik * B(k, j);
79 for (
idx i = ir; i < ri; ++i)
80 for (
idx j = jr; j < rj; ++j)
81 C(i, j) = c[i - ir][j - jr];
constexpr idx rows() const noexcept
constexpr idx size() const noexcept
constexpr idx cols() const noexcept
Dense row-major matrix templated over scalar type T.
void matmul_register_blocked(const Matrix &A, const Matrix &B, Matrix &C, idx block_size, idx reg_size)
void matmul(const Matrix &A, const Matrix &B, Matrix &C)
void matvec(const Matrix &A, const Vector &x, Vector &y)
void matmul_blocked(const Matrix &A, const Matrix &B, Matrix &C, idx block_size)
void matadd(real alpha, const Matrix &A, real beta, const Matrix &B, Matrix &C)
NUM_K_AINLINE void matvec(T *NUM_K_RESTRICT y, const T *NUM_K_RESTRICT A, const T *NUM_K_RESTRICT x, idx m, idx n) noexcept
y[i] = sum_j A[i*n + j] * x[j] (m x n row-major matrix)
NUM_K_AINLINE void axpbyz(T *NUM_K_RESTRICT z, const T *NUM_K_RESTRICT x, const T *NUM_K_RESTRICT y, T a, T b, idx n) noexcept
z[i] = a*x[i] + b*y[i].
real beta(real a, real b)
B(a, b) – beta function.
Tier-1 kernel: raw-pointer, inline, zero-overhead inner loops.