numerics
Loading...
Searching...
No Matches
banded.cpp
Go to the documentation of this file.
1/// @file banded.cpp
2/// @brief High-performance banded matrix solver implementation
3
5#include <algorithm>
6#include <cmath>
7#include <cstring>
8#include <stdexcept>
9
10#ifdef _OPENMP
11#include <omp.h>
12#endif
13
14#ifdef NUMERICS_HAS_CUDA
16#endif
17
18namespace num {
19
21 : n_(n), kl_(kl), ku_(ku), ldab_(2 * kl + ku + 1) {
22 if (n == 0) throw std::invalid_argument("BandedMatrix: n must be positive");
23 data_ = std::make_unique<real[]>(ldab_ * n_);
24}
25
27 : BandedMatrix(n, kl, ku) {
28 std::fill_n(data_.get(), ldab_ * n_, val);
29}
30
32#ifdef NUMERICS_HAS_CUDA
33 if (d_data_) cuda::free(d_data_);
34#endif
35}
36
38 : n_(other.n_), kl_(other.kl_), ku_(other.ku_), ldab_(other.ldab_) {
39 data_ = std::make_unique<real[]>(ldab_ * n_);
40 std::memcpy(data_.get(), other.data_.get(), ldab_ * n_ * sizeof(real));
41}
42
43
45 : n_(other.n_), kl_(other.kl_), ku_(other.ku_), ldab_(other.ldab_),
46 data_(std::move(other.data_)), d_data_(other.d_data_) {
47 other.n_ = 0;
48 other.d_data_ = nullptr;
49}
50
52 if (this != &other) {
53 n_ = other.n_; kl_ = other.kl_; ku_ = other.ku_; ldab_ = other.ldab_;
54 data_ = std::make_unique<real[]>(ldab_ * n_);
55 std::memcpy(data_.get(), other.data_.get(), ldab_ * n_ * sizeof(real));
56#ifdef NUMERICS_HAS_CUDA
57 if (d_data_) { cuda::free(d_data_); d_data_ = nullptr; }
58#endif
59 }
60 return *this;
61}
62
64 if (this != &other) {
65#ifdef NUMERICS_HAS_CUDA
66 if (d_data_) cuda::free(d_data_);
67#endif
68 n_ = other.n_; kl_ = other.kl_; ku_ = other.ku_; ldab_ = other.ldab_;
69 data_ = std::move(other.data_);
70 d_data_ = other.d_data_;
71 other.n_ = 0; other.d_data_ = nullptr;
72 }
73 return *this;
74}
75
77 return data_[(kl_ + ku_ + i - j) + j * ldab_];
78}
79
81 return data_[(kl_ + ku_ + i - j) + j * ldab_];
82}
83
85 return data_[band_row + col * ldab_];
86}
87
89 return data_[band_row + col * ldab_];
90}
91
93 return (j <= i + ku_) && (i <= j + kl_);
94}
95
97#ifdef NUMERICS_HAS_CUDA
98 if (!d_data_) d_data_ = cuda::alloc(ldab_ * n_);
99 cuda::to_device(d_data_, data_.get(), ldab_ * n_);
100#endif
101}
102
104#ifdef NUMERICS_HAS_CUDA
105 if (d_data_) cuda::to_host(data_.get(), d_data_, ldab_ * n_);
106#endif
107}
108
109// LU Factorization with Partial Pivoting
110
112 const idx n = A.size(), kl = A.kl(), ku = A.ku(), ldab = A.ldab();
113 real* ab = A.data();
114 BandedSolverResult result{true, 0, 0.0};
115
116 for (idx i = 0; i < n; ++i) ipiv[i] = i;
117
118 for (idx j = 0; j < n; ++j) {
119 const idx kv = ku + kl;
120 const idx last_row = std::min(j + kl, n - 1);
121
122 real max_val = std::abs(ab[kv + j * ldab]);
123 idx pivot = j;
124 for (idx i = j + 1; i <= last_row; ++i) {
125 real val = std::abs(ab[kv + i - j + j * ldab]);
126 if (val > max_val) { max_val = val; pivot = i; }
127 }
128
129 if (max_val == 0.0) { result.success = false; result.pivot_row = j; return result; }
130
131 ipiv[j] = pivot;
132 if (pivot != j) {
133 const idx col_start = (j > ku) ? j - ku : 0;
134 const idx col_end = std::min(j + kl, n - 1);
135 for (idx col = col_start; col <= col_end; ++col) {
136 idx band_j = kl + ku + j - col;
137 idx band_pivot = kl + ku + pivot - col;
138 if (band_j < ldab && band_pivot < ldab)
139 std::swap(ab[band_j + col * ldab], ab[band_pivot + col * ldab]);
140 }
141 }
142
143 real pivot_val = ab[kv + j * ldab];
144 const idx num_rows = std::min(kl, n - j - 1);
145
146 if (num_rows > 0 && pivot_val != 0.0) {
147 real inv_pivot = 1.0 / pivot_val;
148 #ifdef _OPENMP
149 #pragma omp simd
150 #endif
151 for (idx i = 1; i <= num_rows; ++i)
152 ab[kv + i + j * ldab] *= inv_pivot;
153
154 const idx col_end = std::min(j + ku, n - 1);
155 for (idx k = j + 1; k <= col_end; ++k) {
156 real akj = ab[kv + j - k + k * ldab];
157 if (akj != 0.0) {
158 #ifdef _OPENMP
159 #pragma omp simd
160 #endif
161 for (idx i = 1; i <= num_rows; ++i) {
162 idx band_row = kv + j + i - k;
163 if (band_row < ldab)
164 ab[band_row + k * ldab] -= ab[kv + i + j * ldab] * akj;
165 }
166 }
167 }
168 }
169 }
170 return result;
171}
172
173// Solve Using LU Factorization
174
175void banded_lu_solve(const BandedMatrix& A, const idx* ipiv, Vector& b) {
176 const idx n = A.size(), kl = A.kl(), ku = A.ku(), ldab = A.ldab();
177 const real* ab = A.data();
178 real* x = b.data();
179 if (b.size() != n) throw std::invalid_argument("banded_lu_solve: dimension mismatch");
180
181 const idx kv = ku + kl;
182
183 for (idx i = 0; i < n; ++i)
184 if (ipiv[i] != i) std::swap(x[i], x[ipiv[i]]);
185
186 for (idx j = 0; j < n; ++j) {
187 if (x[j] != 0.0) {
188 const idx last = std::min(j + kl, n - 1);
189 real xj = x[j];
190 #ifdef _OPENMP
191 #pragma omp simd
192 #endif
193 for (idx i = j + 1; i <= last; ++i)
194 x[i] -= ab[kv + i - j + j * ldab] * xj;
195 }
196 }
197
198 for (idx j = n; j > 0; --j) {
199 const idx col = j - 1;
200 x[col] /= ab[kv + col * ldab];
201 if (x[col] != 0.0) {
202 const idx first = (col > ku) ? col - ku : 0;
203 real xc = x[col];
204 #ifdef _OPENMP
205 #pragma omp simd
206 #endif
207 for (idx i = first; i < col; ++i)
208 x[i] -= ab[kv + i - col + col * ldab] * xc;
209 }
210 }
211}
212
214 real* B, idx nrhs) {
215 const idx n = A.size(), kl = A.kl(), ku = A.ku(), ldab = A.ldab();
216 const real* ab = A.data();
217 const idx kv = ku + kl;
218
219 #ifdef _OPENMP
220 #pragma omp parallel for if(nrhs > 16)
221 #endif
222 for (idx rhs = 0; rhs < nrhs; ++rhs) {
223 real* x = B + rhs * n;
224 for (idx i = 0; i < n; ++i)
225 if (ipiv[i] != i) std::swap(x[i], x[ipiv[i]]);
226 for (idx j = 0; j < n; ++j) {
227 if (x[j] != 0.0) {
228 const idx last = std::min(j + kl, n - 1);
229 real xj = x[j];
230 for (idx i = j + 1; i <= last; ++i)
231 x[i] -= ab[kv + i - j + j * ldab] * xj;
232 }
233 }
234 for (idx j = n; j > 0; --j) {
235 const idx col = j - 1;
236 x[col] /= ab[kv + col * ldab];
237 if (x[col] != 0.0) {
238 const idx first = (col > ku) ? col - ku : 0;
239 real xc = x[col];
240 for (idx i = first; i < col; ++i)
241 x[i] -= ab[kv + i - col + col * ldab] * xc;
242 }
243 }
244 }
245}
246
248 const idx n = A.size();
249 if (b.size() != n || x.size() != n)
250 throw std::invalid_argument("banded_solve: dimension mismatch");
251
253 auto ipiv = std::make_unique<idx[]>(n);
255 if (!result.success) return result;
256
257 for (idx i = 0; i < n; ++i) x[i] = b[i];
258 banded_lu_solve(A_work, ipiv.get(), x);
259 return result;
260}
261
262// Matrix-Vector Products
263
264void banded_matvec(const BandedMatrix& A, const Vector& x, Vector& y, Backend backend) {
265 banded_gemv(1.0, A, x, 0.0, y, backend);
266}
267
268void banded_gemv(real alpha, const BandedMatrix& A, const Vector& x,
269 real beta, Vector& y, Backend backend) {
270 const idx n = A.size(), kl = A.kl(), ku = A.ku();
271 if (x.size() != n || y.size() != n)
272 throw std::invalid_argument("banded_gemv: dimension mismatch");
273
274 if (backend != Backend::gpu) {
275 const idx ldab = A.ldab();
276 const real* ab = A.data();
277 const real* xp = x.data();
278 real* yp = y.data();
279 const idx kv = ku + kl;
280
281 if (beta == 0.0) std::memset(yp, 0, n * sizeof(real));
282 else if (beta != 1.0) for (idx i = 0; i < n; ++i) yp[i] *= beta;
283
284 #ifdef _OPENMP
285 #pragma omp parallel for schedule(static) if(n > 1000)
286 #endif
287 for (idx j = 0; j < n; ++j) {
288 if (xp[j] != 0.0) {
289 real temp = alpha * xp[j];
290 const idx i_start = (j > ku) ? j - ku : 0;
291 const idx i_end = std::min(j + kl, n - 1);
292 for (idx i = i_start; i <= i_end; ++i) {
293 #ifdef _OPENMP
294 #pragma omp atomic
295 #endif
296 yp[i] += ab[kv + i - j + j * ldab] * temp;
297 }
298 }
299 }
300 }
301}
302
303// Condition Number Estimation
304
306 const idx n = A.size(), kl = A.kl(), ku = A.ku(), ldab = A.ldab();
307 const real* ab = A.data();
308 const idx kv = ku + kl;
309 real max_sum = 0.0;
310 for (idx j = 0; j < n; ++j) {
311 real col_sum = 0.0;
312 const idx i_start = (j > ku) ? j - ku : 0;
313 const idx i_end = std::min(j + kl, n - 1);
314 for (idx i = i_start; i <= i_end; ++i)
315 col_sum += std::abs(ab[kv + i - j + j * ldab]);
316 max_sum = std::max(max_sum, col_sum);
317 }
318 return max_sum;
319}
320
322 const idx n = A.size();
323 if (n == 0 || anorm == 0.0) return 0.0;
324 Vector y(n, 1.0 / static_cast<real>(n));
327 real ainv_norm = 0.0;
328 for (idx i = 0; i < n; ++i) ainv_norm += std::abs(y[i]);
329 return 1.0 / (anorm * ainv_norm);
330}
331
332} // namespace num
High-performance banded matrix solver for HPC applications.
Banded matrix with efficient storage for direct solvers.
Definition banded.hpp:32
BandedMatrix & operator=(const BandedMatrix &)
Definition banded.cpp:51
real & operator()(idx i, idx j)
Access element at (row, col) in original matrix coordinates.
Definition banded.cpp:76
bool in_band(idx i, idx j) const
Check if (i,j) is within the band.
Definition banded.cpp:92
real & band(idx band_row, idx col)
Direct access to band storage.
Definition banded.cpp:84
BandedMatrix(idx n, idx kl, idx ku)
Construct a banded matrix.
Definition banded.cpp:20
constexpr idx size() const noexcept
Definition vector.hpp:77
CUDA kernel wrappers.
void to_device(real *dst, const real *src, idx n)
Copy host to device.
void free(real *ptr)
Free device memory.
real * alloc(idx n)
Allocate device memory.
void to_host(real *dst, const real *src, idx n)
Copy device to host.
BandedSolverResult banded_solve(const BandedMatrix &A, const Vector &b, Vector &x)
Solve banded system Ax = b (convenience function)
Definition banded.cpp:247
double real
Definition types.hpp:10
Backend
Selects which backend handles a linalg operation.
Definition policy.hpp:19
@ gpu
CUDA – custom kernels or cuBLAS.
void banded_lu_solve_multi(const BandedMatrix &A, const idx *ipiv, real *B, idx nrhs)
Solve multiple right-hand sides using LU factorization.
Definition banded.cpp:213
real banded_norm1(const BandedMatrix &A)
Compute 1-norm of banded matrix.
Definition banded.cpp:305
real beta(real a, real b)
B(a, b) – beta function.
Definition math.hpp:242
constexpr T ipow(T x) noexcept
Compute x^N at compile time via repeated squaring.
std::size_t idx
Definition types.hpp:11
BandedSolverResult banded_lu(BandedMatrix &A, idx *ipiv)
LU factorization of banded matrix with partial pivoting.
Definition banded.cpp:111
real banded_rcond(const BandedMatrix &A, const idx *ipiv, real anorm)
Estimate reciprocal condition number of banded matrix.
Definition banded.cpp:321
void banded_gemv(real alpha, const BandedMatrix &A, const Vector &x, real beta, Vector &y, Backend backend=default_backend)
Banded matrix-vector product y = alpha*A*x + beta*y.
Definition banded.cpp:268
void banded_lu_solve(const BandedMatrix &A, const idx *ipiv, Vector &b)
Solve banded system using precomputed LU factorization.
Definition banded.cpp:175
void banded_matvec(const BandedMatrix &A, const Vector &x, Vector &y, Backend backend=default_backend)
Banded matrix-vector product y = A*x.
Definition banded.cpp:264
Result from banded solver.
Definition banded.hpp:105