#include "core/parallel/cuda_ops.hpp"
#include <stdexcept>
Go to the source code of this file.
|
| real * | num::cuda::alloc (idx n) |
| | Allocate device memory.
|
| |
| void | num::cuda::free (real *ptr) |
| | Free device memory.
|
| |
| void | num::cuda::to_device (real *dst, const real *src, idx n) |
| | Copy host to device.
|
| |
| void | num::cuda::to_host (real *dst, const real *src, idx n) |
| | Copy device to host.
|
| |
| void | num::cuda::scale (real *v, idx n, real alpha) |
| | v = alpha * v
|
| |
| void | num::cuda::add (const real *x, const real *y, real *z, idx n) |
| | z = x + y
|
| |
| void | num::cuda::axpy (real alpha, const real *x, real *y, idx n) |
| | y = alpha * x + y
|
| |
| real | num::cuda::dot (const real *x, const real *y, idx n) |
| | Dot product of x and y.
|
| |
| void | num::cuda::matvec (const real *A, const real *x, real *y, idx rows, idx cols) |
| | y = A * x (row-major A)
|
| |
| void | num::cuda::matmul (const real *A, const real *B, real *C, idx m, idx k, idx n) |
| | C = A * B.
|
| |
| void | num::cuda::thomas_batched (const real *a, const real *b, const real *c, const real *d, real *x, idx n, idx batch_size) |
| | Batched Thomas algorithm for tridiagonal systems.
|
| |