15#include "backends/seq/impl.hpp"
17#include "backends/omp/impl.hpp"
26 , data_(new
real[rows * cols]()) {}
31 , data_(new
real[rows * cols]) {
32 std::fill_n(data_.get(),
size(), val);
43 , data_(new
real[o.size()]) {
44 std::copy_n(o.data_.get(),
size(), data_.get());
50 , data_(std::move(o.data_))
51 , d_data_(o.d_data_) {
52 o.rows_ = o.cols_ = 0;
61 std::copy_n(o.data_.get(),
size(), data_.get());
72 data_ = std::move(o.data_);
74 o.rows_ = o.cols_ = 0;
Dense row-major matrix with optional GPU storage.
constexpr idx size() const noexcept
Matrix & operator=(const Matrix &)
Private declarations for the BLAS backend. Only included by src/core/vector.cpp and src/core/matrix.cpp.
Private declarations for the GPU (CUDA) backend. Only included by src/core/vector.cpp and src/core/matrix.cpp.
Private declarations for the SIMD backend. Only included by src/core/vector.cpp and src/core/matrix.cpp.
void matmul(const Matrix &A, const Matrix &B, Matrix &C)
void matadd(real alpha, const Matrix &A, real beta, const Matrix &B, Matrix &C)
void matvec(const Matrix &A, const Vector &x, Vector &y)
void matmul(const Matrix &A, const Matrix &B, Matrix &C)
void matvec(const Matrix &A, const Vector &x, Vector &y)
void matvec(const Matrix &A, const Vector &x, Vector &y)
void matadd(real alpha, const Matrix &A, real beta, const Matrix &B, Matrix &C)
void matmul(const Matrix &A, const Matrix &B, Matrix &C)
void matmul_register_blocked(const Matrix &A, const Matrix &B, Matrix &C, idx block_size, idx reg_size)
void matmul(const Matrix &A, const Matrix &B, Matrix &C)
void matvec(const Matrix &A, const Vector &x, Vector &y)
void matmul_blocked(const Matrix &A, const Matrix &B, Matrix &C, idx block_size)
void matadd(real alpha, const Matrix &A, real beta, const Matrix &B, Matrix &C)
void matvec(const Matrix &A, const Vector &x, Vector &y)
void matmul(const Matrix &A, const Matrix &B, Matrix &C, idx block_size)
void to_device(real *dst, const real *src, idx n)
Copy host to device.
void free(real *ptr)
Free device memory.
real * alloc(idx n)
Allocate device memory.
void to_host(real *dst, const real *src, idx n)
Copy device to host.
void matmul_simd(const Matrix &A, const Matrix &B, Matrix &C, idx block_size=64)
C = A * B (SIMD-accelerated)
Backend
Selects which backend handles a linalg operation.
@ gpu
CUDA – custom kernels or cuBLAS.
@ omp
OpenMP parallel blocked loops.
@ blocked
Cache-blocked; compiler auto-vectorizes inner loops.
@ simd
Hand-written SIMD intrinsics (AVX2 or NEON).
@ blas
cblas – OpenBLAS, MKL, Apple Accelerate (Level-1/2/3).
@ lapack
LAPACKE – industry-standard factorizations, SVD, eigenproblems.
@ seq
Naive textbook loops – always available.
void matvec_simd(const Matrix &A, const Vector &x, Vector &y)
y = A * x (SIMD-accelerated)
real beta(real a, real b)
B(a, b) – beta function.
void matvec(const Matrix &A, const Vector &x, Vector &y, Backend b=default_backend)
y = A * x
void matmul_blocked(const Matrix &A, const Matrix &B, Matrix &C, idx block_size=64)
C = A * B (cache-blocked)
void matmul(const Matrix &A, const Matrix &B, Matrix &C, Backend b=default_backend)
C = A * B.
void matadd(real alpha, const Matrix &A, real beta, const Matrix &B, Matrix &C, Backend b=default_backend)
C = alpha*A + beta*B.
void matmul_register_blocked(const Matrix &A, const Matrix &B, Matrix &C, idx block_size=64, idx reg_size=4)
C = A * B (register-blocked)