numerics
Loading...
Searching...
No Matches
cuda_ops.hpp
Go to the documentation of this file.
1/// @file cuda_ops.hpp
2/// @brief CUDA kernel wrappers
3#pragma once
4
5#include "core/types.hpp"
6
7namespace num::cuda {
8
9/// @brief Allocate device memory
10real* alloc(idx n);
11
12/// @brief Free device memory
13void free(real* ptr);
14
15/// @brief Copy host to device
16void to_device(real* dst, const real* src, idx n);
17
18/// @brief Copy device to host
19void to_host(real* dst, const real* src, idx n);
20
21/// @brief v = alpha * v
22void scale(real* v, idx n, real alpha);
23
24/// @brief z = x + y
25void add(const real* x, const real* y, real* z, idx n);
26
27/// @brief y = alpha*x + y
28void axpy(real alpha, const real* x, real* y, idx n);
29
30/// @brief Dot product of x and y
31real dot(const real* x, const real* y, idx n);
32
33/// @brief y = A * x (row-major A)
34void matvec(const real* A, const real* x, real* y, idx rows, idx cols);
35
36/// @brief C = A * B
37void matmul(const real* A, const real* B, real* C, idx m, idx k, idx n);
38
39/// @brief Batched Thomas algorithm for tridiagonal systems
40/// @param a Lower diagonals (batch_size arrays of size n-1, packed consecutively)
41/// @param b Main diagonals (batch_size arrays of size n)
42/// @param c Upper diagonals (batch_size arrays of size n-1, packed consecutively)
43/// @param d Right-hand sides (batch_size arrays of size n)
44/// @param x Solution vectors (batch_size arrays of size n)
45/// @param n Size of each system
46/// @param batch_size Number of independent systems to solve
47void thomas_batched(const real* a, const real* b, const real* c,
48 const real* d, real* x, idx n, idx batch_size);
49
50} // namespace num::cuda
Core type definitions.
void to_device(real *dst, const real *src, idx n)
Copy host to device.
void scale(real *v, idx n, real alpha)
v = alpha * v
void thomas_batched(const real *a, const real *b, const real *c, const real *d, real *x, idx n, idx batch_size)
Batched Thomas algorithm for tridiagonal systems.
void matmul(const real *A, const real *B, real *C, idx m, idx k, idx n)
C = A * B.
void free(real *ptr)
Free device memory.
real * alloc(idx n)
Allocate device memory.
void to_host(real *dst, const real *src, idx n)
Copy device to host.
void add(const real *x, const real *y, real *z, idx n)
z = x + y
void axpy(real alpha, const real *x, real *y, idx n)
y = alpha*x + y
real dot(const real *x, const real *y, idx n)
Dot product of x and y.
void matvec(const real *A, const real *x, real *y, idx rows, idx cols)
y = A * x (row-major A)
double real
Definition types.hpp:10
constexpr T ipow(T x) noexcept
Compute x^N at compile time via repeated squaring.
std::size_t idx
Definition types.hpp:11