14#ifdef NUMERICS_HAS_CUDA
24 ldab_(2 * kl + ku + 1) {
26 throw std::invalid_argument(
"BandedMatrix: n must be positive");
27 data_ = std::make_unique<real[]>(ldab_ * n_);
32 std::fill_n(data_.get(), ldab_ * n_, val);
36#ifdef NUMERICS_HAS_CUDA
47 data_ = std::make_unique<real[]>(ldab_ * n_);
48 std::memcpy(data_.get(), other.data_.get(), ldab_ * n_ *
sizeof(
real));
56 data_(std::move(other.data_)),
57 d_data_(other.d_data_) {
59 other.d_data_ =
nullptr;
68 data_ = std::make_unique<real[]>(ldab_ * n_);
69 std::memcpy(data_.get(), other.data_.get(), ldab_ * n_ *
sizeof(
real));
70#ifdef NUMERICS_HAS_CUDA
82#ifdef NUMERICS_HAS_CUDA
90 data_ = std::move(other.data_);
91 d_data_ = other.d_data_;
93 other.d_data_ =
nullptr;
99 return data_[(kl_ + ku_ + i - j) + j * ldab_];
103 return data_[(kl_ + ku_ + i - j) + j * ldab_];
107 return data_[band_row + col * ldab_];
111 return data_[band_row + col * ldab_];
115 return (j <= i + ku_) && (i <= j + kl_);
119#ifdef NUMERICS_HAS_CUDA
127#ifdef NUMERICS_HAS_CUDA
140 for (
idx i = 0; i < n; ++i)
143 for (
idx j = 0; j < n; ++j) {
144 const idx kv = ku + kl;
145 const idx last_row = std::min(j + kl, n - 1);
147 real max_val = std::abs(ab[kv + j * ldab]);
149 for (
idx i = j + 1; i <= last_row; ++i) {
150 real val = std::abs(ab[kv + i - j + j * ldab]);
157 if (max_val == 0.0) {
158 result.success =
false;
159 result.pivot_row = j;
165 const idx col_start = (j > ku) ? j - ku : 0;
166 const idx col_end = std::min(j + kl, n - 1);
167 for (
idx col = col_start; col <= col_end; ++col) {
168 idx band_j = kl + ku + j - col;
169 idx band_pivot = kl + ku + pivot - col;
170 if (band_j < ldab && band_pivot < ldab)
171 std::swap(ab[band_j + col * ldab], ab[band_pivot + col * ldab]);
175 real pivot_val = ab[kv + j * ldab];
176 const idx num_rows = std::min(kl, n - j - 1);
178 if (num_rows > 0 && pivot_val != 0.0) {
179 real inv_pivot = 1.0 / pivot_val;
183 for (
idx i = 1; i <= num_rows; ++i)
184 ab[kv + i + j * ldab] *= inv_pivot;
186 const idx col_end = std::min(j + ku, n - 1);
187 for (
idx k = j + 1; k <= col_end; ++k) {
188 real akj = ab[kv + j - k + k * ldab];
193 for (
idx i = 1; i <= num_rows; ++i) {
194 idx band_row = kv + j + i - k;
196 ab[band_row + k * ldab] -= ab[kv + i + j * ldab] * akj;
212 throw std::invalid_argument(
"banded_lu_solve: dimension mismatch");
214 const idx kv = ku + kl;
216 for (
idx i = 0; i < n; ++i)
218 std::swap(x[i], x[ipiv[i]]);
220 for (
idx j = 0; j < n; ++j) {
222 const idx last = std::min(j + kl, n - 1);
227 for (
idx i = j + 1; i <= last; ++i)
228 x[i] -= ab[kv + i - j + j * ldab] * xj;
232 for (
idx j = n; j > 0; --j) {
233 const idx col = j - 1;
234 x[col] /= ab[kv + col * ldab];
236 const idx first = (col > ku) ? col - ku : 0;
241 for (
idx i = first; i < col; ++i)
242 x[i] -= ab[kv + i - col + col * ldab] * xc;
250 const idx kv = ku + kl;
253 #pragma omp parallel for if (nrhs > 16)
255 for (
idx rhs = 0; rhs < nrhs; ++rhs) {
256 real* x = B + rhs * n;
257 for (
idx i = 0; i < n; ++i)
259 std::swap(x[i], x[ipiv[i]]);
260 for (
idx j = 0; j < n; ++j) {
262 const idx last = std::min(j + kl, n - 1);
264 for (
idx i = j + 1; i <= last; ++i)
265 x[i] -= ab[kv + i - j + j * ldab] * xj;
268 for (
idx j = n; j > 0; --j) {
269 const idx col = j - 1;
270 x[col] /= ab[kv + col * ldab];
272 const idx first = (col > ku) ? col - ku : 0;
274 for (
idx i = first; i < col; ++i)
275 x[i] -= ab[kv + i - col + col * ldab] * xc;
284 throw std::invalid_argument(
"banded_solve: dimension mismatch");
287 auto ipiv = std::make_unique<idx[]>(n);
292 for (
idx i = 0; i < n; ++i)
312 throw std::invalid_argument(
"banded_gemv: dimension mismatch");
319 const idx kv = ku + kl;
322 std::memset(yp, 0, n *
sizeof(
real));
323 else if (
beta != 1.0)
324 for (
idx i = 0; i < n; ++i)
328 #pragma omp parallel for schedule(static) if (n > 1000)
330 for (
idx j = 0; j < n; ++j) {
332 real temp = alpha * xp[j];
333 const idx i_start = (j > ku) ? j - ku : 0;
334 const idx i_end = std::min(j + kl, n - 1);
335 for (
idx i = i_start; i <= i_end; ++i) {
339 yp[i] += ab[kv + i - j + j * ldab] * temp;
351 const idx kv = ku + kl;
353 for (
idx j = 0; j < n; ++j) {
355 const idx i_start = (j > ku) ? j - ku : 0;
356 const idx i_end = std::min(j + kl, n - 1);
357 for (
idx i = i_start; i <= i_end; ++i)
358 col_sum += std::abs(ab[kv + i - j + j * ldab]);
359 max_sum = std::max(max_sum, col_sum);
366 if (n == 0 || anorm == 0.0)
371 real ainv_norm = 0.0;
372 for (
idx i = 0; i < n; ++i)
373 ainv_norm += std::abs(y[i]);
374 return 1.0 / (anorm * ainv_norm);
Banded matrix storage and solvers.
LAPACK-style band storage.
BandedMatrix & operator=(const BandedMatrix &)
real & operator()(idx i, idx j)
bool in_band(idx i, idx j) const
real & band(idx band_row, idx col)
BandedMatrix(idx n, idx kl, idx ku)
constexpr idx size() const noexcept
void to_device(real *dst, const real *src, idx n)
Copy host to device.
void free(real *ptr)
Free device memory.
real * alloc(idx n)
Allocate device memory.
void to_host(real *dst, const real *src, idx n)
Copy device to host.
BandedSolverResult banded_solve(const BandedMatrix &A, const Vector &b, Vector &x)
Factor and solve .
void banded_lu_solve_multi(const BandedMatrix &A, const idx *ipiv, real *B, idx nrhs)
Solve using a precomputed banded LU factorization.
real banded_norm1(const BandedMatrix &A)
Compute .
real beta(real a, real b)
B(a, b) – beta function.
BandedSolverResult banded_lu(BandedMatrix &A, idx *ipiv)
In-place banded factorization.
real banded_rcond(const BandedMatrix &A, const idx *ipiv, real anorm)
Estimate .
void banded_gemv(real alpha, const BandedMatrix &A, const Vector &x, real beta, Vector &y, Backend backend=default_backend)
Compute .
void banded_lu_solve(const BandedMatrix &A, const idx *ipiv, Vector &b)
Solve using a precomputed banded LU factorization.
void banded_matvec(const BandedMatrix &A, const Vector &x, Vector &y, Backend backend=default_backend)
Compute .