numerics 0.1.0
Loading...
Searching...
No Matches
raw.hpp
Go to the documentation of this file.
1/// @file kernel/raw.hpp
2/// @brief Tier-1 kernel: raw-pointer, inline, zero-overhead inner loops.
3///
4/// These are the lowest-level building blocks in the numerics stack.
5/// Every higher tier (type-safe wrappers, operator interface, solvers)
6/// ultimately reduces to these loops.
7///
8/// Design invariants:
9/// - All functions are NUM_K_AINLINE: the compiler always sees the loop body
10/// and can vectorize, fuse, or eliminate it during inlining.
11/// - NUM_K_RESTRICT on every pointer: tells the compiler inputs do not alias,
12/// which is necessary for auto-vectorization of most loops.
13/// - noexcept throughout: no exception machinery in hot paths.
14/// - No heap allocation: callers own all memory.
15/// - BLAS dispatch (#ifdef NUMERICS_HAS_BLAS): routes to cblas Level-1/2
16/// when available. BLAS handles its own internal threading and is safe
17/// to call from any context (no nested-OMP issue).
18/// - No OpenMP: OMP at raw level causes nested-parallelism problems when
19/// a higher-level kernel wraps these in its own parallel region.
20/// OMP parallelism lives at Tier 2.
21///
22/// Operations:
23/// --- Level 1 (vector) ---
24/// scale(x, alpha, n) x[i] *= alpha
25/// axpy(y, x, alpha, n) y[i] += alpha * x[i]
26/// axpby(y, x, a, b, n) y[i] = a*x[i] + b*y[i] [fused, no BLAS eq.]
27/// axpbyz(z, x, y, a, b, n) z[i] = a*x[i] + b*y[i] [fused, no BLAS eq.]
28/// dot(x, y, n) return sum x[i]*y[i]
29/// norm_sq(x, n) return sum x[i]^2 [no BLAS sqrt cost]
30/// norm(x, n) return sqrt(norm_sq)
31/// l1_norm(x, n) return sum |x[i]|
32/// linf_norm(x, n) return max |x[i]|
33/// sum(x, n) return sum x[i] [no BLAS eq.]
34///
35/// --- Level 2 (matrix-vector, row-major) ---
36/// matvec(y, A, x, m, n) y[i] = sum_j A[i*n+j] * x[j]
37/// ger(A, x, y, alpha, m, n) A[i*n+j] += alpha * x[i] * y[j]
38/// trsv_lower(x, L, b, n) solve Lx = b (L lower triangular)
39/// trsv_upper(x, U, b, n) solve Ux = b (U upper triangular)
40///
41/// Include this header when writing a fused custom kernel that needs to
42/// compose multiple operations into a single memory pass.
43/// For ordinary user code prefer num::kernel:: (Tier 2).
44#pragma once
45
46#include "core/types.hpp"
47#include <cmath>
48#include <cstring>
49
50#ifdef NUMERICS_HAS_BLAS
51# include <cblas.h>
52#endif
53
54// ---------------------------------------------------------------------------
55// Portability annotations
56//
57// NUM_K_AINLINE -- force-inline (compiler always sees the loop body)
58// NUM_K_RESTRICT -- no-alias pointer hint (enables auto-vectorization)
59// NUM_K_IVDEP -- assert no loop-carried dependencies to the vectorizer
60//
61// All three are no-ops on unsupported compilers; the code still compiles,
62// just without the extra hints.
63// ---------------------------------------------------------------------------
64
65#if defined(__GNUC__) || defined(__clang__)
66# define NUM_K_AINLINE [[gnu::always_inline]] inline
67# define NUM_K_RESTRICT __restrict__
68# define NUM_K_IVDEP _Pragma("GCC ivdep")
69#else
70# define NUM_K_AINLINE inline
71# define NUM_K_RESTRICT
72# define NUM_K_IVDEP
73#endif
74
76
77// ===========================================================================
78// Level 1: vector operations
79// ===========================================================================
80
81/// @brief x[i] *= alpha
83void scale(real* NUM_K_RESTRICT x, real alpha, idx n) noexcept {
84#ifdef NUMERICS_HAS_BLAS
85 cblas_dscal(static_cast<int>(n), alpha, x, 1);
86#else
88 for (idx i = 0; i < n; ++i) {
89 x[i] *= alpha;
90 }
91#endif
92}
93
94/// @brief y[i] += alpha * x[i]
97 real alpha, idx n) noexcept {
98#ifdef NUMERICS_HAS_BLAS
99 cblas_daxpy(static_cast<int>(n), alpha, x, 1, y, 1);
100#else
102 for (idx i = 0; i < n; ++i) {
103 y[i] += alpha * x[i];
104 }
105#endif
106}
107
108/// @brief y[i] = a*x[i] + b*y[i] (fused scale-and-add, one memory pass)
109///
110/// No direct BLAS equivalent. Avoids a separate scale pass then axpy pass:
111/// compared to scale(y,b); axpy(a,x,y), this reads y only once.
114 real a, real b, idx n) noexcept {
116 for (idx i = 0; i < n; ++i) {
117 y[i] = (a * x[i]) + (b * y[i]);
118 }
119}
120
121/// @brief z[i] = a*x[i] + b*y[i] (fused, three-array, one pass each)
122///
123/// No direct BLAS equivalent. z may alias x or y; otherwise fully independent.
126 const real* NUM_K_RESTRICT x,
127 const real* NUM_K_RESTRICT y,
128 real a, real b, idx n) noexcept {
130 for (idx i = 0; i < n; ++i) {
131 z[i] = (a * x[i]) + (b * y[i]);
132 }
133}
134
135/// @brief dot product: return sum x[i] * y[i]
138 idx n) noexcept {
139#ifdef NUMERICS_HAS_BLAS
140 return cblas_ddot(static_cast<int>(n), x, 1, y, 1);
141#else
142 real s = real(0);
144 for (idx i = 0; i < n; ++i) {
145 s += x[i] * y[i];
146 }
147 return s;
148#endif
149}
150
151/// @brief sum x[i]^2 (no sqrt; use for convergence checks to avoid sqrt cost)
152[[nodiscard]] NUM_K_AINLINE
153real norm_sq(const real* NUM_K_RESTRICT x, idx n) noexcept {
154 real s = real(0);
156 for (idx i = 0; i < n; ++i) {
157 s += x[i] * x[i];
158 }
159 return s;
160}
161
162/// @brief Euclidean norm: sqrt(sum x[i]^2)
163[[nodiscard]] NUM_K_AINLINE
164real norm(const real* NUM_K_RESTRICT x, idx n) noexcept {
165#ifdef NUMERICS_HAS_BLAS
166 return cblas_dnrm2(static_cast<int>(n), x, 1);
167#else
168 return std::sqrt(norm_sq(x, n));
169#endif
170}
171
172/// @brief L1 norm: sum |x[i]|
173[[nodiscard]] NUM_K_AINLINE
174real l1_norm(const real* NUM_K_RESTRICT x, idx n) noexcept {
175#ifdef NUMERICS_HAS_BLAS
176 return cblas_dasum(static_cast<int>(n), x, 1);
177#else
178 real s = real(0);
180 for (idx i = 0; i < n; ++i) {
181 s += std::abs(x[i]);
182 }
183 return s;
184#endif
185}
186
187/// @brief L-infinity norm: max |x[i]|
188[[nodiscard]] NUM_K_AINLINE
189real linf_norm(const real* NUM_K_RESTRICT x, idx n) noexcept {
190#ifdef NUMERICS_HAS_BLAS
191 // cblas_idamax returns the 0-based index of the element with max |value|
192 const int k = cblas_idamax(static_cast<int>(n), x, 1);
193 return std::abs(x[k]);
194#else
195 real mx = real(0);
196 for (idx i = 0; i < n; ++i) {
197 const real v = std::abs(x[i]);
198 if (v > mx) {
199 mx = v;
200 }
201 }
202 return mx;
203#endif
204}
205
206/// @brief Scalar sum: return sum x[i]
207///
208/// No BLAS equivalent (cblas_dasum sums absolute values; this does not).
209[[nodiscard]] NUM_K_AINLINE
210real sum(const real* NUM_K_RESTRICT x, idx n) noexcept {
211 real s = real(0);
213 for (idx i = 0; i < n; ++i) {
214 s += x[i];
215 }
216 return s;
217}
218
219// ===========================================================================
220// Level 2: matrix-vector operations (row-major storage throughout)
221// ===========================================================================
222
223/// @brief y[i] = sum_j A[i*n + j] * x[j] (m x n row-major matrix)
224///
225/// y must be pre-allocated with size m. y and x must not alias A.
228 const real* NUM_K_RESTRICT A,
229 const real* NUM_K_RESTRICT x,
230 idx m, idx n) noexcept {
231#ifdef NUMERICS_HAS_BLAS
232 cblas_dgemv(CblasRowMajor, CblasNoTrans,
233 static_cast<int>(m), static_cast<int>(n),
234 1.0, A, static_cast<int>(n),
235 x, 1,
236 0.0, y, 1);
237#else
238 for (idx i = 0; i < m; ++i) {
239 real s = real(0);
240 const real* row = A + (i * n);
242 for (idx j = 0; j < n; ++j) {
243 s += row[j] * x[j];
244 }
245 y[i] = s;
246 }
247#endif
248}
249
250/// @brief Rank-1 update: A[i*n + j] += alpha * x[i] * y[j] (m x n row-major)
251///
252/// BLAS dger equivalent. Inner loop over j is independent and vectorizable.
255 const real* NUM_K_RESTRICT x,
256 const real* NUM_K_RESTRICT y,
257 real alpha, idx m, idx n) noexcept {
258#ifdef NUMERICS_HAS_BLAS
259 cblas_dger(CblasRowMajor,
260 static_cast<int>(m), static_cast<int>(n),
261 alpha, x, 1, y, 1,
262 A, static_cast<int>(n));
263#else
264 for (idx i = 0; i < m; ++i) {
265 real* NUM_K_RESTRICT row = A + (i * n);
266 const real axi = alpha * x[i];
268 for (idx j = 0; j < n; ++j) {
269 row[j] += axi * y[j];
270 }
271 }
272#endif
273}
274
275/// @brief Forward substitution: solve Lx = b, L lower triangular (n x n, row-major).
276///
277/// x must be pre-allocated with size n. b and x must not alias each other.
278/// Uses BLAS dtrsv when available (copies b -> x first, then solves in-place).
281 const real* NUM_K_RESTRICT L,
282 const real* NUM_K_RESTRICT b,
283 idx n) noexcept {
284#ifdef NUMERICS_HAS_BLAS
285 std::memcpy(x, b, n * sizeof(real));
286 cblas_dtrsv(CblasRowMajor, CblasLower, CblasNoTrans, CblasNonUnit,
287 static_cast<int>(n), L, static_cast<int>(n), x, 1);
288#else
289 for (idx i = 0; i < n; ++i) {
290 real s = b[i];
291 const real* row = L + i * n;
292 for (idx j = 0; j < i; ++j) {
293 s -= row[j] * x[j];
294 }
295 x[i] = s / row[i];
296 }
297#endif
298}
299
300/// @brief Back substitution: solve Ux = b, U upper triangular (n x n, row-major).
301///
302/// x must be pre-allocated with size n. b and x must not alias each other.
303/// Uses BLAS dtrsv when available (copies b -> x first, then solves in-place).
306 const real* NUM_K_RESTRICT U,
307 const real* NUM_K_RESTRICT b,
308 idx n) noexcept {
309#ifdef NUMERICS_HAS_BLAS
310 std::memcpy(x, b, n * sizeof(real));
311 cblas_dtrsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit,
312 static_cast<int>(n), U, static_cast<int>(n), x, 1);
313#else
314 for (idx i = n; i-- > 0;) {
315 real s = b[i];
316 const real* row = U + i * n;
317 for (idx j = i + 1; j < n; ++j) {
318 s -= row[j] * x[j];
319 }
320 x[i] = s / row[i];
321 }
322#endif
323}
324
325} // namespace num::kernel::raw
Core type definitions.
NUM_K_AINLINE real l1_norm(const real *NUM_K_RESTRICT x, idx n) noexcept
L1 norm: sum |x[i]|.
Definition raw.hpp:174
NUM_K_AINLINE real linf_norm(const real *NUM_K_RESTRICT x, idx n) noexcept
L-infinity norm: max |x[i]|.
Definition raw.hpp:189
NUM_K_AINLINE real norm_sq(const real *NUM_K_RESTRICT x, idx n) noexcept
sum x[i]^2 (no sqrt; use for convergence checks to avoid sqrt cost)
Definition raw.hpp:153
NUM_K_AINLINE void ger(real *NUM_K_RESTRICT A, const real *NUM_K_RESTRICT x, const real *NUM_K_RESTRICT y, real alpha, idx m, idx n) noexcept
Rank-1 update: A[i*n + j] += alpha * x[i] * y[j] (m x n row-major)
Definition raw.hpp:254
NUM_K_AINLINE real norm(const real *NUM_K_RESTRICT x, idx n) noexcept
Euclidean norm: sqrt(sum x[i]^2)
Definition raw.hpp:164
NUM_K_AINLINE void axpby(real *NUM_K_RESTRICT y, const real *NUM_K_RESTRICT x, real a, real b, idx n) noexcept
y[i] = a*x[i] + b*y[i] (fused scale-and-add, one memory pass)
Definition raw.hpp:113
NUM_K_AINLINE void trsv_lower(real *NUM_K_RESTRICT x, const real *NUM_K_RESTRICT L, const real *NUM_K_RESTRICT b, idx n) noexcept
Forward substitution: solve Lx = b, L lower triangular (n x n, row-major).
Definition raw.hpp:280
NUM_K_AINLINE void axpbyz(real *NUM_K_RESTRICT z, const real *NUM_K_RESTRICT x, const real *NUM_K_RESTRICT y, real a, real b, idx n) noexcept
z[i] = a*x[i] + b*y[i] (fused, three-array, one pass each)
Definition raw.hpp:125
NUM_K_AINLINE void trsv_upper(real *NUM_K_RESTRICT x, const real *NUM_K_RESTRICT U, const real *NUM_K_RESTRICT b, idx n) noexcept
Back substitution: solve Ux = b, U upper triangular (n x n, row-major).
Definition raw.hpp:305
NUM_K_AINLINE real sum(const real *NUM_K_RESTRICT x, idx n) noexcept
Scalar sum: return sum x[i].
Definition raw.hpp:210
NUM_K_AINLINE real dot(const real *NUM_K_RESTRICT x, const real *NUM_K_RESTRICT y, idx n) noexcept
dot product: return sum x[i] * y[i]
Definition raw.hpp:137
NUM_K_AINLINE void matvec(real *NUM_K_RESTRICT y, const real *NUM_K_RESTRICT A, const real *NUM_K_RESTRICT x, idx m, idx n) noexcept
y[i] = sum_j A[i*n + j] * x[j] (m x n row-major matrix)
Definition raw.hpp:227
NUM_K_AINLINE void axpy(real *NUM_K_RESTRICT y, const real *NUM_K_RESTRICT x, real alpha, idx n) noexcept
y[i] += alpha * x[i]
Definition raw.hpp:96
NUM_K_AINLINE void scale(real *NUM_K_RESTRICT x, real alpha, idx n) noexcept
x[i] *= alpha
Definition raw.hpp:83
double real
Definition types.hpp:10
std::size_t idx
Definition types.hpp:11
#define NUM_K_RESTRICT
Definition raw.hpp:71
#define NUM_K_IVDEP
Definition raw.hpp:72
#define NUM_K_AINLINE
Definition raw.hpp:70