numerics 0.1.0
Loading...
Searching...
No Matches
banded.cpp
Go to the documentation of this file.
1/// @file banded.cpp
2/// @brief High-performance banded matrix solver implementation
3
5#include <algorithm>
6#include <cmath>
7#include <cstring>
8#include <stdexcept>
9
10#ifdef _OPENMP
11 #include <omp.h>
12#endif
13
14#ifdef NUMERICS_HAS_CUDA
16#endif
17
18namespace num {
19
21 : n_(n)
22 , kl_(kl)
23 , ku_(ku)
24 , ldab_(2 * kl + ku + 1) {
25 if (n == 0)
26 throw std::invalid_argument("BandedMatrix: n must be positive");
27 data_ = std::make_unique<real[]>(ldab_ * n_);
28}
29
31 : BandedMatrix(n, kl, ku) {
32 std::fill_n(data_.get(), ldab_ * n_, val);
33}
34
36#ifdef NUMERICS_HAS_CUDA
37 if (d_data_)
38 cuda::free(d_data_);
39#endif
40}
41
43 : n_(other.n_)
44 , kl_(other.kl_)
45 , ku_(other.ku_)
46 , ldab_(other.ldab_) {
47 data_ = std::make_unique<real[]>(ldab_ * n_);
48 std::memcpy(data_.get(), other.data_.get(), ldab_ * n_ * sizeof(real));
49}
50
52 : n_(other.n_)
53 , kl_(other.kl_)
54 , ku_(other.ku_)
55 , ldab_(other.ldab_)
56 , data_(std::move(other.data_))
57 , d_data_(other.d_data_) {
58 other.n_ = 0;
59 other.d_data_ = nullptr;
60}
61
63 if (this != &other) {
64 n_ = other.n_;
65 kl_ = other.kl_;
66 ku_ = other.ku_;
67 ldab_ = other.ldab_;
68 data_ = std::make_unique<real[]>(ldab_ * n_);
69 std::memcpy(data_.get(), other.data_.get(), ldab_ * n_ * sizeof(real));
70#ifdef NUMERICS_HAS_CUDA
71 if (d_data_) {
72 cuda::free(d_data_);
73 d_data_ = nullptr;
74 }
75#endif
76 }
77 return *this;
78}
79
81 if (this != &other) {
82#ifdef NUMERICS_HAS_CUDA
83 if (d_data_)
84 cuda::free(d_data_);
85#endif
86 n_ = other.n_;
87 kl_ = other.kl_;
88 ku_ = other.ku_;
89 ldab_ = other.ldab_;
90 data_ = std::move(other.data_);
91 d_data_ = other.d_data_;
92 other.n_ = 0;
93 other.d_data_ = nullptr;
94 }
95 return *this;
96}
97
99 return data_[(kl_ + ku_ + i - j) + j * ldab_];
100}
101
103 return data_[(kl_ + ku_ + i - j) + j * ldab_];
104}
105
107 return data_[band_row + col * ldab_];
108}
109
110real BandedMatrix::band(idx band_row, idx col) const {
111 return data_[band_row + col * ldab_];
112}
113
114bool BandedMatrix::in_band(idx i, idx j) const {
115 return (j <= i + ku_) && (i <= j + kl_);
116}
117
119#ifdef NUMERICS_HAS_CUDA
120 if (!d_data_)
121 d_data_ = cuda::alloc(ldab_ * n_);
122 cuda::to_device(d_data_, data_.get(), ldab_ * n_);
123#endif
124}
125
127#ifdef NUMERICS_HAS_CUDA
128 if (d_data_)
129 cuda::to_host(data_.get(), d_data_, ldab_ * n_);
130#endif
131}
132
133// LU Factorization with Partial Pivoting
134
136 const idx n = A.size(), kl = A.kl(), ku = A.ku(), ldab = A.ldab();
137 real* ab = A.data();
138 BandedSolverResult result{true, 0, 0.0};
139
140 for (idx i = 0; i < n; ++i)
141 ipiv[i] = i;
142
143 for (idx j = 0; j < n; ++j) {
144 const idx kv = ku + kl;
145 const idx last_row = std::min(j + kl, n - 1);
146
147 real max_val = std::abs(ab[kv + j * ldab]);
148 idx pivot = j;
149 for (idx i = j + 1; i <= last_row; ++i) {
150 real val = std::abs(ab[kv + i - j + j * ldab]);
151 if (val > max_val) {
152 max_val = val;
153 pivot = i;
154 }
155 }
156
157 if (max_val == 0.0) {
158 result.success = false;
159 result.pivot_row = j;
160 return result;
161 }
162
163 ipiv[j] = pivot;
164 if (pivot != j) {
165 const idx col_start = (j > ku) ? j - ku : 0;
166 const idx col_end = std::min(j + kl, n - 1);
167 for (idx col = col_start; col <= col_end; ++col) {
168 idx band_j = kl + ku + j - col;
169 idx band_pivot = kl + ku + pivot - col;
170 if (band_j < ldab && band_pivot < ldab)
171 std::swap(ab[band_j + col * ldab],
172 ab[band_pivot + col * ldab]);
173 }
174 }
175
176 real pivot_val = ab[kv + j * ldab];
177 const idx num_rows = std::min(kl, n - j - 1);
178
179 if (num_rows > 0 && pivot_val != 0.0) {
180 real inv_pivot = 1.0 / pivot_val;
181#ifdef _OPENMP
182 #pragma omp simd
183#endif
184 for (idx i = 1; i <= num_rows; ++i)
185 ab[kv + i + j * ldab] *= inv_pivot;
186
187 const idx col_end = std::min(j + ku, n - 1);
188 for (idx k = j + 1; k <= col_end; ++k) {
189 real akj = ab[kv + j - k + k * ldab];
190 if (akj != 0.0) {
191#ifdef _OPENMP
192 #pragma omp simd
193#endif
194 for (idx i = 1; i <= num_rows; ++i) {
195 idx band_row = kv + j + i - k;
196 if (band_row < ldab)
197 ab[band_row + k * ldab] -= ab[kv + i + j * ldab]
198 * akj;
199 }
200 }
201 }
202 }
203 }
204 return result;
205}
206
207// Solve Using LU Factorization
208
209void banded_lu_solve(const BandedMatrix& A, const idx* ipiv, Vector& b) {
210 const idx n = A.size(), kl = A.kl(), ku = A.ku(), ldab = A.ldab();
211 const real* ab = A.data();
212 real* x = b.data();
213 if (b.size() != n)
214 throw std::invalid_argument("banded_lu_solve: dimension mismatch");
215
216 const idx kv = ku + kl;
217
218 for (idx i = 0; i < n; ++i)
219 if (ipiv[i] != i)
220 std::swap(x[i], x[ipiv[i]]);
221
222 for (idx j = 0; j < n; ++j) {
223 if (x[j] != 0.0) {
224 const idx last = std::min(j + kl, n - 1);
225 real xj = x[j];
226#ifdef _OPENMP
227 #pragma omp simd
228#endif
229 for (idx i = j + 1; i <= last; ++i)
230 x[i] -= ab[kv + i - j + j * ldab] * xj;
231 }
232 }
233
234 for (idx j = n; j > 0; --j) {
235 const idx col = j - 1;
236 x[col] /= ab[kv + col * ldab];
237 if (x[col] != 0.0) {
238 const idx first = (col > ku) ? col - ku : 0;
239 real xc = x[col];
240#ifdef _OPENMP
241 #pragma omp simd
242#endif
243 for (idx i = first; i < col; ++i)
244 x[i] -= ab[kv + i - col + col * ldab] * xc;
245 }
246 }
247}
248
250 const idx* ipiv,
251 real* B,
252 idx nrhs) {
253 const idx n = A.size(), kl = A.kl(), ku = A.ku(), ldab = A.ldab();
254 const real* ab = A.data();
255 const idx kv = ku + kl;
256
257#ifdef _OPENMP
258 #pragma omp parallel for if (nrhs > 16)
259#endif
260 for (idx rhs = 0; rhs < nrhs; ++rhs) {
261 real* x = B + rhs * n;
262 for (idx i = 0; i < n; ++i)
263 if (ipiv[i] != i)
264 std::swap(x[i], x[ipiv[i]]);
265 for (idx j = 0; j < n; ++j) {
266 if (x[j] != 0.0) {
267 const idx last = std::min(j + kl, n - 1);
268 real xj = x[j];
269 for (idx i = j + 1; i <= last; ++i)
270 x[i] -= ab[kv + i - j + j * ldab] * xj;
271 }
272 }
273 for (idx j = n; j > 0; --j) {
274 const idx col = j - 1;
275 x[col] /= ab[kv + col * ldab];
276 if (x[col] != 0.0) {
277 const idx first = (col > ku) ? col - ku : 0;
278 real xc = x[col];
279 for (idx i = first; i < col; ++i)
280 x[i] -= ab[kv + i - col + col * ldab] * xc;
281 }
282 }
283 }
284}
285
287 const Vector& b,
288 Vector& x) {
289 const idx n = A.size();
290 if (b.size() != n || x.size() != n)
291 throw std::invalid_argument("banded_solve: dimension mismatch");
292
293 BandedMatrix A_work = A;
294 auto ipiv = std::make_unique<idx[]>(n);
295 BandedSolverResult result = banded_lu(A_work, ipiv.get());
296 if (!result.success)
297 return result;
298
299 for (idx i = 0; i < n; ++i)
300 x[i] = b[i];
301 banded_lu_solve(A_work, ipiv.get(), x);
302 return result;
303}
304
305// Matrix-Vector Products
306
308 const Vector& x,
309 Vector& y,
310 Backend backend) {
311 banded_gemv(1.0, A, x, 0.0, y, backend);
312}
313
314void banded_gemv(real alpha,
315 const BandedMatrix& A,
316 const Vector& x,
317 real beta,
318 Vector& y,
319 Backend backend) {
320 const idx n = A.size(), kl = A.kl(), ku = A.ku();
321 if (x.size() != n || y.size() != n)
322 throw std::invalid_argument("banded_gemv: dimension mismatch");
323
324 if (backend != Backend::gpu) {
325 const idx ldab = A.ldab();
326 const real* ab = A.data();
327 const real* xp = x.data();
328 real* yp = y.data();
329 const idx kv = ku + kl;
330
331 if (beta == 0.0)
332 std::memset(yp, 0, n * sizeof(real));
333 else if (beta != 1.0)
334 for (idx i = 0; i < n; ++i)
335 yp[i] *= beta;
336
337#ifdef _OPENMP
338 #pragma omp parallel for schedule(static) if (n > 1000)
339#endif
340 for (idx j = 0; j < n; ++j) {
341 if (xp[j] != 0.0) {
342 real temp = alpha * xp[j];
343 const idx i_start = (j > ku) ? j - ku : 0;
344 const idx i_end = std::min(j + kl, n - 1);
345 for (idx i = i_start; i <= i_end; ++i) {
346#ifdef _OPENMP
347 #pragma omp atomic
348#endif
349 yp[i] += ab[kv + i - j + j * ldab] * temp;
350 }
351 }
352 }
353 }
354}
355
356// Condition Number Estimation
357
359 const idx n = A.size(), kl = A.kl(), ku = A.ku(), ldab = A.ldab();
360 const real* ab = A.data();
361 const idx kv = ku + kl;
362 real max_sum = 0.0;
363 for (idx j = 0; j < n; ++j) {
364 real col_sum = 0.0;
365 const idx i_start = (j > ku) ? j - ku : 0;
366 const idx i_end = std::min(j + kl, n - 1);
367 for (idx i = i_start; i <= i_end; ++i)
368 col_sum += std::abs(ab[kv + i - j + j * ldab]);
369 max_sum = std::max(max_sum, col_sum);
370 }
371 return max_sum;
372}
373
374real banded_rcond(const BandedMatrix& A, const idx* ipiv, real anorm) {
375 const idx n = A.size();
376 if (n == 0 || anorm == 0.0)
377 return 0.0;
378 Vector y(n, 1.0 / static_cast<real>(n));
379 BandedMatrix A_copy = A;
380 banded_lu_solve(A_copy, ipiv, y);
381 real ainv_norm = 0.0;
382 for (idx i = 0; i < n; ++i)
383 ainv_norm += std::abs(y[i]);
384 return 1.0 / (anorm * ainv_norm);
385}
386
387} // namespace num
High-performance banded matrix solver for HPC applications.
Banded matrix with efficient storage for direct solvers.
Definition banded.hpp:33
idx kl() const
Number of lower diagonals.
Definition banded.hpp:58
idx ldab() const
Leading dimension of band storage (2*kl + ku + 1)
Definition banded.hpp:67
idx size() const
Matrix dimension.
Definition banded.hpp:53
BandedMatrix & operator=(const BandedMatrix &)
Definition banded.cpp:62
real & operator()(idx i, idx j)
Access element at (row, col) in original matrix coordinates.
Definition banded.cpp:98
bool in_band(idx i, idx j) const
Check if (i,j) is within the band.
Definition banded.cpp:114
real & band(idx band_row, idx col)
Direct access to band storage.
Definition banded.cpp:106
idx ku() const
Number of upper diagonals.
Definition banded.hpp:61
real * data()
Raw pointer to band storage (column-major)
Definition banded.hpp:83
BandedMatrix(idx n, idx kl, idx ku)
Construct a banded matrix.
Definition banded.cpp:20
constexpr idx size() const noexcept
Definition vector.hpp:80
CUDA kernel wrappers.
void to_device(real *dst, const real *src, idx n)
Copy host to device.
void free(real *ptr)
Free device memory.
real * alloc(idx n)
Allocate device memory.
void to_host(real *dst, const real *src, idx n)
Copy device to host.
BandedSolverResult banded_solve(const BandedMatrix &A, const Vector &b, Vector &x)
Solve banded system Ax = b (convenience function)
Definition banded.cpp:286
double real
Definition types.hpp:10
Backend
Selects which backend handles a linalg operation.
Definition policy.hpp:19
@ gpu
CUDA – custom kernels or cuBLAS.
void banded_lu_solve_multi(const BandedMatrix &A, const idx *ipiv, real *B, idx nrhs)
Solve multiple right-hand sides using LU factorization.
Definition banded.cpp:249
real banded_norm1(const BandedMatrix &A)
Compute 1-norm of banded matrix.
Definition banded.cpp:358
real beta(real a, real b)
B(a, b) – beta function.
Definition math.hpp:248
std::size_t idx
Definition types.hpp:11
BandedSolverResult banded_lu(BandedMatrix &A, idx *ipiv)
LU factorization of banded matrix with partial pivoting.
Definition banded.cpp:135
real banded_rcond(const BandedMatrix &A, const idx *ipiv, real anorm)
Estimate reciprocal condition number of banded matrix.
Definition banded.cpp:374
void banded_gemv(real alpha, const BandedMatrix &A, const Vector &x, real beta, Vector &y, Backend backend=default_backend)
Banded matrix-vector product y = alpha*A*x + beta*y.
Definition banded.cpp:314
void banded_lu_solve(const BandedMatrix &A, const idx *ipiv, Vector &b)
Solve banded system using precomputed LU factorization.
Definition banded.cpp:209
void banded_matvec(const BandedMatrix &A, const Vector &x, Vector &y, Backend backend=default_backend)
Banded matrix-vector product y = A*x.
Definition banded.cpp:307
Result from banded solver.
Definition banded.hpp:106
bool success
True if solve succeeded.
Definition banded.hpp:107