14#include "backends/seq/impl.hpp"
16#include "backends/omp/impl.hpp"
23 : rows_(rows), cols_(cols), data_(
new real[rows * cols]()) {}
26 : rows_(rows), cols_(cols), data_(
new real[rows * cols]) {
27 std::fill_n(data_.get(),
size(),
val);
35 : rows_(
o.rows_), cols_(
o.cols_), data_(
new real[
o.size()]) {
36 std::copy_n(
o.data_.get(),
size(), data_.get());
40 : rows_(
o.rows_), cols_(
o.cols_), data_(std::move(
o.data_)), d_data_(
o.d_data_) {
41 o.rows_ =
o.cols_ = 0;
50 std::copy_n(
o.data_.get(),
size(), data_.get());
60 data_ = std::move(
o.data_);
62 o.rows_ =
o.cols_ = 0;
Dense row-major matrix with optional GPU storage.
constexpr idx size() const noexcept
Matrix & operator=(const Matrix &)
void matmul(const Matrix &A, const Matrix &B, Matrix &C)
void matadd(real alpha, const Matrix &A, real beta, const Matrix &B, Matrix &C)
void matvec(const Matrix &A, const Vector &x, Vector &y)
void matmul(const Matrix &A, const Matrix &B, Matrix &C)
void matvec(const Matrix &A, const Vector &x, Vector &y)
void matvec(const Matrix &A, const Vector &x, Vector &y)
void matadd(real alpha, const Matrix &A, real beta, const Matrix &B, Matrix &C)
void matmul(const Matrix &A, const Matrix &B, Matrix &C)
void matmul_register_blocked(const Matrix &A, const Matrix &B, Matrix &C, idx block_size, idx reg_size)
void matmul(const Matrix &A, const Matrix &B, Matrix &C)
void matvec(const Matrix &A, const Vector &x, Vector &y)
void matmul_blocked(const Matrix &A, const Matrix &B, Matrix &C, idx block_size)
void matadd(real alpha, const Matrix &A, real beta, const Matrix &B, Matrix &C)
void matvec(const Matrix &A, const Vector &x, Vector &y)
void matmul(const Matrix &A, const Matrix &B, Matrix &C, idx block_size)
void to_device(real *dst, const real *src, idx n)
Copy host to device.
void free(real *ptr)
Free device memory.
real * alloc(idx n)
Allocate device memory.
void to_host(real *dst, const real *src, idx n)
Copy device to host.
void matmul_simd(const Matrix &A, const Matrix &B, Matrix &C, idx block_size=64)
C = A * B (SIMD-accelerated)
Backend
Selects which backend handles a linalg operation.
@ gpu
CUDA – custom kernels or cuBLAS.
@ omp
OpenMP parallel blocked loops.
@ blocked
Cache-blocked; compiler auto-vectorizes inner loops.
@ simd
Hand-written SIMD intrinsics (AVX2 or NEON).
@ blas
cblas/LAPACKE – OpenBLAS, MKL, Apple Accelerate.
@ seq
Naive textbook loops – always available.
void matvec_simd(const Matrix &A, const Vector &x, Vector &y)
y = A * x (SIMD-accelerated)
real beta(real a, real b)
B(a, b) – beta function.
constexpr T ipow(T x) noexcept
Compute x^N at compile time via repeated squaring.
void matvec(const Matrix &A, const Vector &x, Vector &y, Backend b=default_backend)
y = A * x
void matmul_blocked(const Matrix &A, const Matrix &B, Matrix &C, idx block_size=64)
C = A * B (cache-blocked)
void matmul(const Matrix &A, const Matrix &B, Matrix &C, Backend b=default_backend)
C = A * B.
void matadd(real alpha, const Matrix &A, real beta, const Matrix &B, Matrix &C, Backend b=default_backend)
C = alpha*A + beta*B.
void matmul_register_blocked(const Matrix &A, const Matrix &B, Matrix &C, idx block_size=64, idx reg_size=4)
C = A * B (register-blocked)
Private declarations for the BLAS backend. Only included by src/core/vector.cpp and src/core/matrix.cpp.
Private declarations for the GPU (CUDA) backend. Only included by src/core/vector.cpp and src/core/matrix.cpp.
Private declarations for the SIMD backend. Only included by src/core/vector.cpp and src/core/matrix.cpp.