14#ifdef NUMERICS_HAS_CUDA
24 , ldab_(2 * kl + ku + 1) {
26 throw std::invalid_argument(
"BandedMatrix: n must be positive");
27 data_ = std::make_unique<real[]>(ldab_ * n_);
32 std::fill_n(data_.get(), ldab_ * n_, val);
36#ifdef NUMERICS_HAS_CUDA
46 , ldab_(other.ldab_) {
47 data_ = std::make_unique<real[]>(ldab_ * n_);
48 std::memcpy(data_.get(), other.data_.get(), ldab_ * n_ *
sizeof(
real));
56 , data_(std::move(other.data_))
57 , d_data_(other.d_data_) {
59 other.d_data_ =
nullptr;
68 data_ = std::make_unique<real[]>(ldab_ * n_);
69 std::memcpy(data_.get(), other.data_.get(), ldab_ * n_ *
sizeof(
real));
70#ifdef NUMERICS_HAS_CUDA
82#ifdef NUMERICS_HAS_CUDA
90 data_ = std::move(other.data_);
91 d_data_ = other.d_data_;
93 other.d_data_ =
nullptr;
99 return data_[(kl_ + ku_ + i - j) + j * ldab_];
103 return data_[(kl_ + ku_ + i - j) + j * ldab_];
107 return data_[band_row + col * ldab_];
111 return data_[band_row + col * ldab_];
115 return (j <= i + ku_) && (i <= j + kl_);
119#ifdef NUMERICS_HAS_CUDA
127#ifdef NUMERICS_HAS_CUDA
140 for (
idx i = 0; i < n; ++i)
143 for (
idx j = 0; j < n; ++j) {
144 const idx kv = ku + kl;
145 const idx last_row = std::min(j + kl, n - 1);
147 real max_val = std::abs(ab[kv + j * ldab]);
149 for (
idx i = j + 1; i <= last_row; ++i) {
150 real val = std::abs(ab[kv + i - j + j * ldab]);
157 if (max_val == 0.0) {
158 result.success =
false;
159 result.pivot_row = j;
165 const idx col_start = (j > ku) ? j - ku : 0;
166 const idx col_end = std::min(j + kl, n - 1);
167 for (
idx col = col_start; col <= col_end; ++col) {
168 idx band_j = kl + ku + j - col;
169 idx band_pivot = kl + ku + pivot - col;
170 if (band_j < ldab && band_pivot < ldab)
171 std::swap(ab[band_j + col * ldab],
172 ab[band_pivot + col * ldab]);
176 real pivot_val = ab[kv + j * ldab];
177 const idx num_rows = std::min(kl, n - j - 1);
179 if (num_rows > 0 && pivot_val != 0.0) {
180 real inv_pivot = 1.0 / pivot_val;
184 for (
idx i = 1; i <= num_rows; ++i)
185 ab[kv + i + j * ldab] *= inv_pivot;
187 const idx col_end = std::min(j + ku, n - 1);
188 for (
idx k = j + 1; k <= col_end; ++k) {
189 real akj = ab[kv + j - k + k * ldab];
194 for (
idx i = 1; i <= num_rows; ++i) {
195 idx band_row = kv + j + i - k;
197 ab[band_row + k * ldab] -= ab[kv + i + j * ldab]
214 throw std::invalid_argument(
"banded_lu_solve: dimension mismatch");
216 const idx kv = ku + kl;
218 for (
idx i = 0; i < n; ++i)
220 std::swap(x[i], x[ipiv[i]]);
222 for (
idx j = 0; j < n; ++j) {
224 const idx last = std::min(j + kl, n - 1);
229 for (
idx i = j + 1; i <= last; ++i)
230 x[i] -= ab[kv + i - j + j * ldab] * xj;
234 for (
idx j = n; j > 0; --j) {
235 const idx col = j - 1;
236 x[col] /= ab[kv + col * ldab];
238 const idx first = (col > ku) ? col - ku : 0;
243 for (
idx i = first; i < col; ++i)
244 x[i] -= ab[kv + i - col + col * ldab] * xc;
255 const idx kv = ku + kl;
258 #pragma omp parallel for if (nrhs > 16)
260 for (
idx rhs = 0; rhs < nrhs; ++rhs) {
261 real* x = B + rhs * n;
262 for (
idx i = 0; i < n; ++i)
264 std::swap(x[i], x[ipiv[i]]);
265 for (
idx j = 0; j < n; ++j) {
267 const idx last = std::min(j + kl, n - 1);
269 for (
idx i = j + 1; i <= last; ++i)
270 x[i] -= ab[kv + i - j + j * ldab] * xj;
273 for (
idx j = n; j > 0; --j) {
274 const idx col = j - 1;
275 x[col] /= ab[kv + col * ldab];
277 const idx first = (col > ku) ? col - ku : 0;
279 for (
idx i = first; i < col; ++i)
280 x[i] -= ab[kv + i - col + col * ldab] * xc;
291 throw std::invalid_argument(
"banded_solve: dimension mismatch");
294 auto ipiv = std::make_unique<idx[]>(n);
299 for (
idx i = 0; i < n; ++i)
322 throw std::invalid_argument(
"banded_gemv: dimension mismatch");
329 const idx kv = ku + kl;
332 std::memset(yp, 0, n *
sizeof(
real));
333 else if (
beta != 1.0)
334 for (
idx i = 0; i < n; ++i)
338 #pragma omp parallel for schedule(static) if (n > 1000)
340 for (
idx j = 0; j < n; ++j) {
342 real temp = alpha * xp[j];
343 const idx i_start = (j > ku) ? j - ku : 0;
344 const idx i_end = std::min(j + kl, n - 1);
345 for (
idx i = i_start; i <= i_end; ++i) {
349 yp[i] += ab[kv + i - j + j * ldab] * temp;
361 const idx kv = ku + kl;
363 for (
idx j = 0; j < n; ++j) {
365 const idx i_start = (j > ku) ? j - ku : 0;
366 const idx i_end = std::min(j + kl, n - 1);
367 for (
idx i = i_start; i <= i_end; ++i)
368 col_sum += std::abs(ab[kv + i - j + j * ldab]);
369 max_sum = std::max(max_sum, col_sum);
376 if (n == 0 || anorm == 0.0)
381 real ainv_norm = 0.0;
382 for (
idx i = 0; i < n; ++i)
383 ainv_norm += std::abs(y[i]);
384 return 1.0 / (anorm * ainv_norm);
High-performance banded matrix solver for HPC applications.
Banded matrix with efficient storage for direct solvers.
idx kl() const
Number of lower diagonals.
idx ldab() const
Leading dimension of band storage (2*kl + ku + 1)
idx size() const
Matrix dimension.
BandedMatrix & operator=(const BandedMatrix &)
real & operator()(idx i, idx j)
Access element at (row, col) in original matrix coordinates.
bool in_band(idx i, idx j) const
Check if (i,j) is within the band.
real & band(idx band_row, idx col)
Direct access to band storage.
idx ku() const
Number of upper diagonals.
real * data()
Raw pointer to band storage (column-major)
BandedMatrix(idx n, idx kl, idx ku)
Construct a banded matrix.
constexpr idx size() const noexcept
void to_device(real *dst, const real *src, idx n)
Copy host to device.
void free(real *ptr)
Free device memory.
real * alloc(idx n)
Allocate device memory.
void to_host(real *dst, const real *src, idx n)
Copy device to host.
BandedSolverResult banded_solve(const BandedMatrix &A, const Vector &b, Vector &x)
Solve banded system Ax = b (convenience function)
Backend
Selects which backend handles a linalg operation.
@ gpu
CUDA – custom kernels or cuBLAS.
void banded_lu_solve_multi(const BandedMatrix &A, const idx *ipiv, real *B, idx nrhs)
Solve multiple right-hand sides using LU factorization.
real banded_norm1(const BandedMatrix &A)
Compute 1-norm of banded matrix.
real beta(real a, real b)
B(a, b) – beta function.
BandedSolverResult banded_lu(BandedMatrix &A, idx *ipiv)
LU factorization of banded matrix with partial pivoting.
real banded_rcond(const BandedMatrix &A, const idx *ipiv, real anorm)
Estimate reciprocal condition number of banded matrix.
void banded_gemv(real alpha, const BandedMatrix &A, const Vector &x, real beta, Vector &y, Backend backend=default_backend)
Banded matrix-vector product y = alpha*A*x + beta*y.
void banded_lu_solve(const BandedMatrix &A, const idx *ipiv, Vector &b)
Solve banded system using precomputed LU factorization.
void banded_matvec(const BandedMatrix &A, const Vector &x, Vector &y, Backend backend=default_backend)
Banded matrix-vector product y = A*x.
Result from banded solver.
bool success
True if solve succeeded.