#include "core/parallel/cuda_ops.hpp"
#include <stdexcept>

Functions
real *	num::cuda::alloc (idx n)
	Allocate device memory.

void	num::cuda::free (real *ptr)
	Free device memory.

void	num::cuda::to_device (real *dst, const real *src, idx n)
	Copy host to device.

void	num::cuda::to_host (real *dst, const real *src, idx n)
	Copy device to host.

void	num::cuda::scale (real *v, idx n, real alpha)
	v = alpha * v

void	num::cuda::add (const real *x, const real *y, real *z, idx n)
	z = x + y

void	num::cuda::axpy (real alpha, const real *x, real *y, idx n)
	y = alpha*x + y

real	num::cuda::dot (const real *x, const real *y, idx n)
	dot product

void	num::cuda::matvec (const real *A, const real *x, real *y, idx rows, idx cols)
	y = A * x (row-major A)

void	num::cuda::matmul (const real *A, const real *B, real *C, idx m, idx k, idx n)
	C = A * B.

void	num::cuda::thomas_batched (const real *a, const real *b, const real *c, const real *d, real *x, idx n, idx batch_size)
	Batched Thomas algorithm for tridiagonal systems.

Namespaces

Functions