15#include "../seq/impl.hpp"
19#ifdef NUMERICS_HAS_AVX2
20# include <immintrin.h>
23#ifdef NUMERICS_HAS_NEON
29static_assert(
sizeof(
real) == 8,
"SIMD kernels require real == double");
32#ifdef NUMERICS_HAS_AVX2
37 const idx N =
B.cols();
61 const idx M =
A.rows(),
K =
A.cols(), N =
B.cols();
62 std::fill_n(
C.data(),
M * N,
real(0));
101 const idx M =
A.rows(), N =
A.cols();
105 for (;
j + 4 <= N;
j += 4) {
124#ifdef NUMERICS_HAS_NEON
129 const idx N =
B.cols();
156 const idx M =
A.rows(),
K =
A.cols(), N =
B.cols();
157 std::fill_n(
C.data(),
M * N,
real(0));
196 const idx M =
A.rows(), N =
A.cols();
200 for (;
j + 2 <= N;
j += 2) {
217#if defined(NUMERICS_HAS_AVX2)
219#elif defined(NUMERICS_HAS_NEON)
227#if defined(NUMERICS_HAS_AVX2)
229#elif defined(NUMERICS_HAS_NEON)
Dense row-major matrix with optional GPU storage.
void matvec(const Matrix &A, const Vector &x, Vector &y)
void matmul_blocked(const Matrix &A, const Matrix &B, Matrix &C, idx block_size)
void matvec(const Matrix &A, const Vector &x, Vector &y)
void matmul(const Matrix &A, const Matrix &B, Matrix &C, idx block_size)
constexpr T ipow(T x) noexcept
Compute x^N at compile time via repeated squaring.