numerics/api/cuda__ops_8hpp_source.html

/// @file cuda_ops.hpp

/// @brief CUDA kernel wrappers

#pragma once


#include "core/types.hpp"


/// Host-callable declarations for the CUDA kernel wrappers.
///
/// Buffers are passed as raw `real*` pointers and sizes as `idx`; both types
/// come from core/types.hpp.
///
/// NOTE(review): this header does not state which memory space each pointer
/// must live in. Apart from the host side of to_device()/to_host(), the
/// pointer arguments are presumably device pointers obtained from alloc() --
/// confirm against the definitions.
/// NOTE(review): all size arguments are presumably counts of `real` elements,
/// not bytes -- confirm against the definitions.
namespace num::cuda {


/// @brief Allocate device memory
/// @param n  Amount to allocate (presumably a number of `real` elements --
///           TODO confirm)
/// @return Pointer to the allocated memory; presumably released with
///         num::cuda::free() -- TODO confirm

real *alloc(idx n);


/// @brief Free device memory
/// @param ptr  Pointer to release (presumably one returned by alloc())
/// @note Inside namespace num::cuda an unqualified call to `free` resolves to
///       this function, because it hides the C library's `::free`. Spell
///       `::free` / `std::free` explicitly when the host deallocator is meant.

void free(real *ptr);


/// @brief Copy host to device
/// @param dst  Destination buffer (device side of the copy)
/// @param src  Source buffer (host side of the copy)
/// @param n    Amount to copy (presumably a number of `real` elements --
///             TODO confirm)

void to_device(real *dst, const real *src, idx n);


/// @brief Copy device to host
/// @param dst  Destination buffer (host side of the copy)
/// @param src  Source buffer (device side of the copy)
/// @param n    Amount to copy (presumably a number of `real` elements --
///             TODO confirm)

void to_host(real *dst, const real *src, idx n);


/// @brief v = alpha * v
/// @param v      Vector, read and overwritten with the scaled values
/// @param n      Length of v
/// @param alpha  Scale factor

void scale(real *v, idx n, real alpha);


/// @brief z = x + y
/// @param x  First operand (read-only)
/// @param y  Second operand (read-only)
/// @param z  Output vector
/// @param n  Length of x, y and z
/// NOTE(review): whether z may alias x or y is not stated here -- confirm.

void add(const real *x, const real *y, real *z, idx n);


/// @brief y = alpha*x + y
/// @param alpha  Scalar multiplier applied to x
/// @param x      Input vector (read-only)
/// @param y      Vector, read and overwritten with the result
/// @param n      Length of x and y

void axpy(real alpha, const real *x, real *y, idx n);


/// @brief Dot product
/// @param x  First vector (read-only)
/// @param y  Second vector (read-only)
/// @param n  Length of x and y
/// @return The dot product of x and y, returned by value

real dot(const real *x, const real *y, idx n);


/// @brief y = A * x (row-major A)
/// @param A     Matrix in row-major order (presumably rows x cols entries --
///              TODO confirm)
/// @param x     Input vector (presumably cols entries)
/// @param y     Output vector (presumably rows entries)
/// @param rows  Number of rows of A
/// @param cols  Number of columns of A

void matvec(const real *A, const real *x, real *y, idx rows, idx cols);


/// @brief C = A * B
/// @param A  Left operand (read-only)
/// @param B  Right operand (read-only)
/// @param C  Output matrix
/// @param m  Matrix dimension
/// @param k  Matrix dimension
/// @param n  Matrix dimension
/// NOTE(review): presumably A is m x k, B is k x n and C is m x n, all stored
/// row-major like matvec() -- neither the shapes nor the storage order is
/// stated in this header; confirm against the definition.

void matmul(const real *A, const real *B, real *C, idx m, idx k, idx n);


/// @brief Batched Thomas algorithm for tridiagonal systems
/// @param a  Lower diagonals (batch_size arrays of size n-1, packed
///           consecutively)
/// @param b  Main diagonals (batch_size arrays of size n)
/// @param c  Upper diagonals (batch_size arrays of size n-1, packed
///           consecutively)
/// @param d  Right-hand sides (batch_size arrays of size n)
/// @param x  Solution vectors (batch_size arrays of size n)
/// @param n  Size of each system
/// @param batch_size  Number of independent systems to solve
/// NOTE(review): b, d and x are presumably packed consecutively in the same
/// way as a and c -- the header only says so for a and c; confirm.

void thomas_batched(const real *a, const real *b, const real *c, const real *d,

                    real *x, idx n, idx batch_size);


} // namespace num::cuda


types.hpp
Core type definitions.

num::cuda
Definition cuda_ops.hpp:7

num::cuda::to_device
void to_device(real *dst, const real *src, idx n)
Copy host to device.
Definition cuda_stubs.cpp:16

num::cuda::scale
void scale(real *v, idx n, real alpha)
v = alpha * v
Definition cuda_stubs.cpp:22

num::cuda::thomas_batched
void thomas_batched(const real *a, const real *b, const real *c, const real *d, real *x, idx n, idx batch_size)
Batched Thomas algorithm for tridiagonal systems.
Definition cuda_stubs.cpp:40

num::cuda::matmul
void matmul(const real *A, const real *B, real *C, idx m, idx k, idx n)
C = A * B.
Definition cuda_stubs.cpp:37

num::cuda::free
void free(real *ptr)
Free device memory.
Definition cuda_stubs.cpp:13

num::cuda::alloc
real * alloc(idx n)
Allocate device memory.
Definition cuda_stubs.cpp:10

num::cuda::to_host
void to_host(real *dst, const real *src, idx n)
Copy device to host.
Definition cuda_stubs.cpp:19

num::cuda::add
void add(const real *x, const real *y, real *z, idx n)
z = x + y
Definition cuda_stubs.cpp:25

num::cuda::axpy
void axpy(real alpha, const real *x, real *y, idx n)
y = alpha*x + y
Definition cuda_stubs.cpp:28

num::cuda::dot
real dot(const real *x, const real *y, idx n)
Dot product.
Definition cuda_stubs.cpp:31

num::cuda::matvec
void matvec(const real *A, const real *x, real *y, idx rows, idx cols)
y = A * x (row-major A)
Definition cuda_stubs.cpp:34

num::real
double real
Definition types.hpp:10

num::idx
std::size_t idx
Definition types.hpp:11