14#ifdef NUMERICS_HAS_CUDA
21 : n_(n), kl_(kl), ku_(ku), ldab_(2 * kl + ku + 1) {
22 if (n == 0)
throw std::invalid_argument(
"BandedMatrix: n must be positive");
23 data_ = std::make_unique<real[]>(ldab_ * n_);
28 std::fill_n(data_.get(), ldab_ * n_,
val);
32#ifdef NUMERICS_HAS_CUDA
39 data_ = std::make_unique<real[]>(ldab_ * n_);
40 std::memcpy(data_.get(),
other.data_.get(), ldab_ * n_ *
sizeof(
real));
46 data_(std::move(
other.data_)), d_data_(
other.d_data_) {
48 other.d_data_ =
nullptr;
54 data_ = std::make_unique<real[]>(ldab_ * n_);
55 std::memcpy(data_.get(),
other.data_.get(), ldab_ * n_ *
sizeof(
real));
56#ifdef NUMERICS_HAS_CUDA
57 if (d_data_) {
cuda::free(d_data_); d_data_ =
nullptr; }
65#ifdef NUMERICS_HAS_CUDA
69 data_ = std::move(
other.data_);
70 d_data_ =
other.d_data_;
77 return data_[(kl_ + ku_ +
i -
j) +
j * ldab_];
81 return data_[(kl_ + ku_ +
i -
j) +
j * ldab_];
85 return data_[
band_row + col * ldab_];
89 return data_[
band_row + col * ldab_];
93 return (
j <=
i + ku_) && (
i <=
j + kl_);
97#ifdef NUMERICS_HAS_CUDA
104#ifdef NUMERICS_HAS_CUDA
105 if (d_data_)
cuda::to_host(data_.get(), d_data_, ldab_ * n_);
112 const idx n =
A.size(), kl =
A.kl(), ku =
A.ku(), ldab =
A.ldab();
118 for (
idx j = 0;
j < n; ++
j) {
119 const idx kv = ku + kl;
176 const idx n =
A.size(), kl =
A.kl(), ku =
A.ku(), ldab =
A.ldab();
179 if (
b.size() != n)
throw std::invalid_argument(
"banded_lu_solve: dimension mismatch");
181 const idx kv = ku + kl;
183 for (
idx i = 0;
i < n; ++
i)
186 for (
idx j = 0;
j < n; ++
j) {
188 const idx last = std::min(
j + kl, n - 1);
193 for (
idx i =
j + 1;
i <= last; ++
i)
198 for (
idx j = n;
j > 0; --
j) {
199 const idx col =
j - 1;
200 x[col] /=
ab[
kv + col * ldab];
202 const idx first = (col > ku) ? col - ku : 0;
207 for (
idx i = first;
i < col; ++
i)
208 x[
i] -=
ab[
kv +
i - col + col * ldab] *
xc;
215 const idx n =
A.size(), kl =
A.kl(), ku =
A.ku(), ldab =
A.ldab();
217 const idx kv = ku + kl;
220 #pragma omp parallel for if(nrhs > 16)
222 for (
idx rhs = 0; rhs <
nrhs; ++rhs) {
223 real* x =
B + rhs * n;
224 for (
idx i = 0;
i < n; ++
i)
226 for (
idx j = 0;
j < n; ++
j) {
228 const idx last = std::min(
j + kl, n - 1);
230 for (
idx i =
j + 1;
i <= last; ++
i)
234 for (
idx j = n;
j > 0; --
j) {
235 const idx col =
j - 1;
236 x[col] /=
ab[
kv + col * ldab];
238 const idx first = (col > ku) ? col - ku : 0;
240 for (
idx i = first;
i < col; ++
i)
241 x[
i] -=
ab[
kv +
i - col + col * ldab] *
xc;
248 const idx n =
A.size();
249 if (
b.size() != n || x.
size() != n)
250 throw std::invalid_argument(
"banded_solve: dimension mismatch");
253 auto ipiv = std::make_unique<idx[]>(n);
270 const idx n =
A.size(), kl =
A.kl(), ku =
A.ku();
272 throw std::invalid_argument(
"banded_gemv: dimension mismatch");
275 const idx ldab =
A.ldab();
279 const idx kv = ku + kl;
281 if (
beta == 0.0) std::memset(
yp, 0, n *
sizeof(
real));
285 #pragma omp parallel for schedule(static) if(n > 1000)
287 for (
idx j = 0;
j < n; ++
j) {
291 const idx i_end = std::min(
j + kl, n - 1);
306 const idx n =
A.size(), kl =
A.kl(), ku =
A.ku(), ldab =
A.ldab();
308 const idx kv = ku + kl;
310 for (
idx j = 0;
j < n; ++
j) {
313 const idx i_end = std::min(
j + kl, n - 1);
322 const idx n =
A.size();
323 if (n == 0 ||
anorm == 0.0)
return 0.0;
High-performance banded matrix solver for HPC applications.
Banded matrix with efficient storage for direct solvers.
BandedMatrix & operator=(const BandedMatrix &)
real & operator()(idx i, idx j)
Access element at (row, col) in original matrix coordinates.
bool in_band(idx i, idx j) const
Check if (i,j) is within the band.
real & band(idx band_row, idx col)
Direct access to band storage.
BandedMatrix(idx n, idx kl, idx ku)
Construct a banded matrix.
constexpr idx size() const noexcept
void to_device(real *dst, const real *src, idx n)
Copy host to device.
void free(real *ptr)
Free device memory.
real * alloc(idx n)
Allocate device memory.
void to_host(real *dst, const real *src, idx n)
Copy device to host.
BandedSolverResult banded_solve(const BandedMatrix &A, const Vector &b, Vector &x)
Solve banded system Ax = b (convenience function)
Backend
Selects which backend handles a linalg operation.
@ gpu
CUDA – custom kernels or cuBLAS.
void banded_lu_solve_multi(const BandedMatrix &A, const idx *ipiv, real *B, idx nrhs)
Solve multiple right-hand sides using LU factorization.
real banded_norm1(const BandedMatrix &A)
Compute 1-norm of banded matrix.
real beta(real a, real b)
B(a, b) – beta function.
constexpr T ipow(T x) noexcept
Compute x^N at compile time via repeated squaring.
BandedSolverResult banded_lu(BandedMatrix &A, idx *ipiv)
LU factorization of banded matrix with partial pivoting.
real banded_rcond(const BandedMatrix &A, const idx *ipiv, real anorm)
Estimate reciprocal condition number of banded matrix.
void banded_gemv(real alpha, const BandedMatrix &A, const Vector &x, real beta, Vector &y, Backend backend=default_backend)
Banded matrix-vector product y = alpha*A*x + beta*y.
void banded_lu_solve(const BandedMatrix &A, const idx *ipiv, Vector &b)
Solve banded system using precomputed LU factorization.
void banded_matvec(const BandedMatrix &A, const Vector &x, Vector &y, Backend backend=default_backend)
Banded matrix-vector product y = A*x.
Result from banded solver.