numerics 0.1.0
Loading...
Searching...
No Matches
banded.cpp
Go to the documentation of this file.
1/// @file banded.cpp
2/// @brief High-performance banded matrix solver implementation
3
5#include <algorithm>
6#include <cmath>
7#include <cstring>
8#include <stdexcept>
9
10#ifdef _OPENMP
11 #include <omp.h>
12#endif
13
14#ifdef NUMERICS_HAS_CUDA
16#endif
17
18namespace num {
19
21 : n_(n),
22 kl_(kl),
23 ku_(ku),
24 ldab_(2 * kl + ku + 1) {
25 if (n == 0)
26 throw std::invalid_argument("BandedMatrix: n must be positive");
27 data_ = std::make_unique<real[]>(ldab_ * n_);
28}
29
31 : BandedMatrix(n, kl, ku) {
32 std::fill_n(data_.get(), ldab_ * n_, val);
33}
34
36#ifdef NUMERICS_HAS_CUDA
37 if (d_data_)
38 cuda::free(d_data_);
39#endif
40}
41
43 : n_(other.n_),
44 kl_(other.kl_),
45 ku_(other.ku_),
46 ldab_(other.ldab_) {
47 data_ = std::make_unique<real[]>(ldab_ * n_);
48 std::memcpy(data_.get(), other.data_.get(), ldab_ * n_ * sizeof(real));
49}
50
52 : n_(other.n_),
53 kl_(other.kl_),
54 ku_(other.ku_),
55 ldab_(other.ldab_),
56 data_(std::move(other.data_)),
57 d_data_(other.d_data_) {
58 other.n_ = 0;
59 other.d_data_ = nullptr;
60}
61
63 if (this != &other) {
64 n_ = other.n_;
65 kl_ = other.kl_;
66 ku_ = other.ku_;
67 ldab_ = other.ldab_;
68 data_ = std::make_unique<real[]>(ldab_ * n_);
69 std::memcpy(data_.get(), other.data_.get(), ldab_ * n_ * sizeof(real));
70#ifdef NUMERICS_HAS_CUDA
71 if (d_data_) {
72 cuda::free(d_data_);
73 d_data_ = nullptr;
74 }
75#endif
76 }
77 return *this;
78}
79
81 if (this != &other) {
82#ifdef NUMERICS_HAS_CUDA
83 if (d_data_)
84 cuda::free(d_data_);
85#endif
86 n_ = other.n_;
87 kl_ = other.kl_;
88 ku_ = other.ku_;
89 ldab_ = other.ldab_;
90 data_ = std::move(other.data_);
91 d_data_ = other.d_data_;
92 other.n_ = 0;
93 other.d_data_ = nullptr;
94 }
95 return *this;
96}
97
99 return data_[(kl_ + ku_ + i - j) + j * ldab_];
100}
101
103 return data_[(kl_ + ku_ + i - j) + j * ldab_];
104}
105
107 return data_[band_row + col * ldab_];
108}
109
110real BandedMatrix::band(idx band_row, idx col) const {
111 return data_[band_row + col * ldab_];
112}
113
114bool BandedMatrix::in_band(idx i, idx j) const {
115 return (j <= i + ku_) && (i <= j + kl_);
116}
117
119#ifdef NUMERICS_HAS_CUDA
120 if (!d_data_)
121 d_data_ = cuda::alloc(ldab_ * n_);
122 cuda::to_device(d_data_, data_.get(), ldab_ * n_);
123#endif
124}
125
127#ifdef NUMERICS_HAS_CUDA
128 if (d_data_)
129 cuda::to_host(data_.get(), d_data_, ldab_ * n_);
130#endif
131}
132
133// LU Factorization with Partial Pivoting
134
136 const idx n = A.size(), kl = A.kl(), ku = A.ku(), ldab = A.ldab();
137 real* ab = A.data();
138 BandedSolverResult result{true, 0, 0.0};
139
140 for (idx i = 0; i < n; ++i)
141 ipiv[i] = i;
142
143 for (idx j = 0; j < n; ++j) {
144 const idx kv = ku + kl;
145 const idx last_row = std::min(j + kl, n - 1);
146
147 real max_val = std::abs(ab[kv + j * ldab]);
148 idx pivot = j;
149 for (idx i = j + 1; i <= last_row; ++i) {
150 real val = std::abs(ab[kv + i - j + j * ldab]);
151 if (val > max_val) {
152 max_val = val;
153 pivot = i;
154 }
155 }
156
157 if (max_val == 0.0) {
158 result.success = false;
159 result.pivot_row = j;
160 return result;
161 }
162
163 ipiv[j] = pivot;
164 if (pivot != j) {
165 const idx col_start = (j > ku) ? j - ku : 0;
166 const idx col_end = std::min(j + kl, n - 1);
167 for (idx col = col_start; col <= col_end; ++col) {
168 idx band_j = kl + ku + j - col;
169 idx band_pivot = kl + ku + pivot - col;
170 if (band_j < ldab && band_pivot < ldab)
171 std::swap(ab[band_j + col * ldab], ab[band_pivot + col * ldab]);
172 }
173 }
174
175 real pivot_val = ab[kv + j * ldab];
176 const idx num_rows = std::min(kl, n - j - 1);
177
178 if (num_rows > 0 && pivot_val != 0.0) {
179 real inv_pivot = 1.0 / pivot_val;
180#ifdef _OPENMP
181 #pragma omp simd
182#endif
183 for (idx i = 1; i <= num_rows; ++i)
184 ab[kv + i + j * ldab] *= inv_pivot;
185
186 const idx col_end = std::min(j + ku, n - 1);
187 for (idx k = j + 1; k <= col_end; ++k) {
188 real akj = ab[kv + j - k + k * ldab];
189 if (akj != 0.0) {
190#ifdef _OPENMP
191 #pragma omp simd
192#endif
193 for (idx i = 1; i <= num_rows; ++i) {
194 idx band_row = kv + j + i - k;
195 if (band_row < ldab)
196 ab[band_row + k * ldab] -= ab[kv + i + j * ldab] * akj;
197 }
198 }
199 }
200 }
201 }
202 return result;
203}
204
205// Solve Using LU Factorization
206
207void banded_lu_solve(const BandedMatrix& A, const idx* ipiv, Vector& b) {
208 const idx n = A.size(), kl = A.kl(), ku = A.ku(), ldab = A.ldab();
209 const real* ab = A.data();
210 real* x = b.data();
211 if (b.size() != n)
212 throw std::invalid_argument("banded_lu_solve: dimension mismatch");
213
214 const idx kv = ku + kl;
215
216 for (idx i = 0; i < n; ++i)
217 if (ipiv[i] != i)
218 std::swap(x[i], x[ipiv[i]]);
219
220 for (idx j = 0; j < n; ++j) {
221 if (x[j] != 0.0) {
222 const idx last = std::min(j + kl, n - 1);
223 real xj = x[j];
224#ifdef _OPENMP
225 #pragma omp simd
226#endif
227 for (idx i = j + 1; i <= last; ++i)
228 x[i] -= ab[kv + i - j + j * ldab] * xj;
229 }
230 }
231
232 for (idx j = n; j > 0; --j) {
233 const idx col = j - 1;
234 x[col] /= ab[kv + col * ldab];
235 if (x[col] != 0.0) {
236 const idx first = (col > ku) ? col - ku : 0;
237 real xc = x[col];
238#ifdef _OPENMP
239 #pragma omp simd
240#endif
241 for (idx i = first; i < col; ++i)
242 x[i] -= ab[kv + i - col + col * ldab] * xc;
243 }
244 }
245}
246
247void banded_lu_solve_multi(const BandedMatrix& A, const idx* ipiv, real* B, idx nrhs) {
248 const idx n = A.size(), kl = A.kl(), ku = A.ku(), ldab = A.ldab();
249 const real* ab = A.data();
250 const idx kv = ku + kl;
251
252#ifdef _OPENMP
253 #pragma omp parallel for if (nrhs > 16)
254#endif
255 for (idx rhs = 0; rhs < nrhs; ++rhs) {
256 real* x = B + rhs * n;
257 for (idx i = 0; i < n; ++i)
258 if (ipiv[i] != i)
259 std::swap(x[i], x[ipiv[i]]);
260 for (idx j = 0; j < n; ++j) {
261 if (x[j] != 0.0) {
262 const idx last = std::min(j + kl, n - 1);
263 real xj = x[j];
264 for (idx i = j + 1; i <= last; ++i)
265 x[i] -= ab[kv + i - j + j * ldab] * xj;
266 }
267 }
268 for (idx j = n; j > 0; --j) {
269 const idx col = j - 1;
270 x[col] /= ab[kv + col * ldab];
271 if (x[col] != 0.0) {
272 const idx first = (col > ku) ? col - ku : 0;
273 real xc = x[col];
274 for (idx i = first; i < col; ++i)
275 x[i] -= ab[kv + i - col + col * ldab] * xc;
276 }
277 }
278 }
279}
280
282 const idx n = A.size();
283 if (b.size() != n || x.size() != n)
284 throw std::invalid_argument("banded_solve: dimension mismatch");
285
286 BandedMatrix A_work = A;
287 auto ipiv = std::make_unique<idx[]>(n);
288 BandedSolverResult result = banded_lu(A_work, ipiv.get());
289 if (!result.success)
290 return result;
291
292 for (idx i = 0; i < n; ++i)
293 x[i] = b[i];
294 banded_lu_solve(A_work, ipiv.get(), x);
295 return result;
296}
297
298// Matrix-Vector Products
299
300void banded_matvec(const BandedMatrix& A, const Vector& x, Vector& y, Backend backend) {
301 banded_gemv(1.0, A, x, 0.0, y, backend);
302}
303
304void banded_gemv(real alpha,
305 const BandedMatrix& A,
306 const Vector& x,
307 real beta,
308 Vector& y,
309 Backend backend) {
310 const idx n = A.size(), kl = A.kl(), ku = A.ku();
311 if (x.size() != n || y.size() != n)
312 throw std::invalid_argument("banded_gemv: dimension mismatch");
313
314 if (backend != Backend::gpu) {
315 const idx ldab = A.ldab();
316 const real* ab = A.data();
317 const real* xp = x.data();
318 real* yp = y.data();
319 const idx kv = ku + kl;
320
321 if (beta == 0.0)
322 std::memset(yp, 0, n * sizeof(real));
323 else if (beta != 1.0)
324 for (idx i = 0; i < n; ++i)
325 yp[i] *= beta;
326
327#ifdef _OPENMP
328 #pragma omp parallel for schedule(static) if (n > 1000)
329#endif
330 for (idx j = 0; j < n; ++j) {
331 if (xp[j] != 0.0) {
332 real temp = alpha * xp[j];
333 const idx i_start = (j > ku) ? j - ku : 0;
334 const idx i_end = std::min(j + kl, n - 1);
335 for (idx i = i_start; i <= i_end; ++i) {
336#ifdef _OPENMP
337 #pragma omp atomic
338#endif
339 yp[i] += ab[kv + i - j + j * ldab] * temp;
340 }
341 }
342 }
343 }
344}
345
346// Condition Number Estimation
347
349 const idx n = A.size(), kl = A.kl(), ku = A.ku(), ldab = A.ldab();
350 const real* ab = A.data();
351 const idx kv = ku + kl;
352 real max_sum = 0.0;
353 for (idx j = 0; j < n; ++j) {
354 real col_sum = 0.0;
355 const idx i_start = (j > ku) ? j - ku : 0;
356 const idx i_end = std::min(j + kl, n - 1);
357 for (idx i = i_start; i <= i_end; ++i)
358 col_sum += std::abs(ab[kv + i - j + j * ldab]);
359 max_sum = std::max(max_sum, col_sum);
360 }
361 return max_sum;
362}
363
364real banded_rcond(const BandedMatrix& A, const idx* ipiv, real anorm) {
365 const idx n = A.size();
366 if (n == 0 || anorm == 0.0)
367 return 0.0;
368 Vector y(n, 1.0 / static_cast<real>(n));
369 BandedMatrix A_copy = A;
370 banded_lu_solve(A_copy, ipiv, y);
371 real ainv_norm = 0.0;
372 for (idx i = 0; i < n; ++i)
373 ainv_norm += std::abs(y[i]);
374 return 1.0 / (anorm * ainv_norm);
375}
376
377} // namespace num
Banded matrix storage and solvers.
LAPACK-style band storage.
Definition banded.hpp:16
idx kl() const
Definition banded.hpp:33
idx ldab() const
Definition banded.hpp:39
idx size() const
Definition banded.hpp:29
BandedMatrix & operator=(const BandedMatrix &)
Definition banded.cpp:62
real & operator()(idx i, idx j)
Definition banded.cpp:98
bool in_band(idx i, idx j) const
Definition banded.cpp:114
real & band(idx band_row, idx col)
Definition banded.cpp:106
idx ku() const
Definition banded.hpp:35
real * data()
Definition banded.hpp:47
BandedMatrix(idx n, idx kl, idx ku)
Definition banded.cpp:20
constexpr idx size() const noexcept
Definition vector.hpp:83
CUDA kernel wrappers.
void to_device(real *dst, const real *src, idx n)
Copy host to device.
void free(real *ptr)
Free device memory.
real * alloc(idx n)
Allocate device memory.
void to_host(real *dst, const real *src, idx n)
Copy device to host.
BandedSolverResult banded_solve(const BandedMatrix &A, const Vector &b, Vector &x)
Factor and solve .
Definition banded.cpp:281
double real
Definition types.hpp:10
Backend
Definition policy.hpp:7
void banded_lu_solve_multi(const BandedMatrix &A, const idx *ipiv, real *B, idx nrhs)
Solve using a precomputed banded LU factorization.
Definition banded.cpp:247
real banded_norm1(const BandedMatrix &A)
Compute .
Definition banded.cpp:348
real beta(real a, real b)
B(a, b) – beta function.
Definition math.hpp:248
std::size_t idx
Definition types.hpp:11
BandedSolverResult banded_lu(BandedMatrix &A, idx *ipiv)
In-place banded factorization.
Definition banded.cpp:135
real banded_rcond(const BandedMatrix &A, const idx *ipiv, real anorm)
Estimate .
Definition banded.cpp:364
void banded_gemv(real alpha, const BandedMatrix &A, const Vector &x, real beta, Vector &y, Backend backend=default_backend)
Compute .
Definition banded.cpp:304
void banded_lu_solve(const BandedMatrix &A, const idx *ipiv, Vector &b)
Solve using a precomputed banded LU factorization.
Definition banded.cpp:207
void banded_matvec(const BandedMatrix &A, const Vector &x, Vector &y, Backend backend=default_backend)
Compute .
Definition banded.cpp:300