|
| constexpr int | size () const |
| | Get the size of the Vec. Provided mostly to satisfy named requirements. More...
|
| |
|
constexpr bool | empty () const |
| |
| constexpr | Vec ()=default |
| | Vec of default-constructed elements of this type and length. More...
|
| |
|
constexpr | Vec (const Vec< T, S > &)=default |
| |
|
constexpr Vec< T, S > & | operator= (const Vec< T, S > &)=default |
| |
| template<typename E , typename... Es> |
| constexpr | Vec (E head, Es... tail) |
| | Construct a Vec containing the given variadic list of elements. More...
|
| |
| template<int InSize> |
| constexpr | Vec (const T(&arr)[InSize]) |
| | Construct a Vec from a constant array of elements. More...
|
| |
| constexpr iterator | begin () |
| | Returns an iterator to the first element of the container. More...
|
| |
| constexpr const_iterator | begin () const |
| | Returns a const iterator to the first element of the container. More...
|
| |
| constexpr iterator | end () |
| | Returns an iterator to one past the last element of the container. More...
|
| |
| constexpr const_iterator | end () const |
| | Returns a const iterator to one past the last element of the container. More...
|
| |
| __host__ std::vector< T > | toVector () |
| | Convert a Vec to a std::vector. More...
|
| |
| constexpr auto | toTuple () |
| | Convert the Vec to a std::tuple of elements of type T. More...
|
| |
| constexpr T & | operator[] (int i) |
| | Access operator. More...
|
| |
| constexpr const T & | operator[] (int i) const |
| | Access operator. More...
|
| |
| constexpr BareValueType | read (int i) const |
| | Returns a copy of the value at index i. More...
|
| |
| template<int I> |
| constexpr T & | get () |
| | Read the element at index I. More...
|
| |
| template<int I> |
| constexpr const T & | get () const |
| | Read the element at index I. More...
|
| |
| template<typename Q , int OtherSize> |
| constexpr bool | operator== (const Vec< Q, OtherSize > &other) const |
| | Equality operator. More...
|
| |
|
template<typename Q , int OtherSize> |
| constexpr bool | operator!= (const Vec< Q, OtherSize > &other) const |
| |
| template<typename Op > |
| constexpr auto | map (const Op &op=Op{}) const |
| | Maps the Vec with the given callable, providing a Vec result of the same length. More...
|
| |
| constexpr Vec< T, Size > | operator- () const |
| | Unary negation operator. More...
|
| |
| template<typename Q > |
| constexpr Vec< T, Size > & | operator= (const Q &value) |
| | Unary scalar assignment. More...
|
| |
| constexpr void | replace (T from, T to) |
| | In-place replacement of all instances of `from` with `to`. More...
|
| |
| template<typename Op > |
| constexpr T | reduce (Op op=Op{}) const |
| | Reduce the vector elements using a binary operator. More...
|
| |
| template<int NewSize> |
| constexpr Vec< T, NewSize > | slice (int start) const |
| | Get a copy of a contiguous slice of this Vec. More...
|
| |
| template<int InSize, int ToWrite = InSize> |
| constexpr void | splice (int I, const Vec< T, InSize > &in) |
| | Overwrite a segment of this Vec with ToWrite values from another, starting at position I. More...
|
| |
| template<int I, int Len> |
| constexpr sp::Vec< T, Size+Len > | insert (const sp::Vec< T, Len > &newValues) const |
| | Insert multiple values. More...
|
| |
| template<int I> |
| constexpr auto | insert (const T &newValue) const |
| | Insert a single value. More...
|
| |
| constexpr sp::Vec< T, Size > | reverse () const |
| | Get the reverse of this Vec. More...
|
| |
| template<int I, int N = 1> |
| constexpr auto | erase () const |
| | Erase one or more values. More...
|
| |
| constexpr auto | eraseLast () const |
| | Erase the last value. More...
|
| |
| constexpr auto | eraseFirst () const |
| | Erase the first value. More...
|
| |
| template<bool Foo = false, typename std::enable_if_t< Foo||Size==1 > * = nullptr> |
| | operator T () const |
| | Allow 1-vectors to convert to the corresponding scalar. More...
|
| |
template<typename T, int S>
class sp::Vec< T, S >
A vector of S elements of type T.
Like CUDA's float4 and friends, it facilitates the use of vectorised memory instructions by making an alignment promise. Vec also provides a number of convenience features:
- Arithmetic operators allowing you to write math involving Vecs in the obvious way. This lets you very directly write vectorisable loops.
- Better metaprogramming, since the vector size and type are exposed as template parameters.
- Full constexpr support, allowing its use in highly elaborate metaprograms.
- Functional-programming-style operators like map() and reduce() for transforming Vecs.
- List-style manipulations like insert() and slice().
- Conversion to/from std::vector (CPU-only).
To facilitate interoperability, Vec transparently converts to/from CUDA vector types.
A Vec in memory is always aligned suitably for use with vector memory instructions.
Examples
The following example kernels:
- Load two 4-vectors in each thread
- Add them together
- Write the result back to memory.
- Assume the input length is a multiple of four elements, to keep the examples short.
Each kernel has an example calling function showing how the data is initialised and moved around.
Example Using Vanilla CUDA
/// Vanilla-CUDA example kernel: each thread adds one float4 of Y onto the
/// corresponding float4 of X, writing the result back into X.
///
/// @param X In/out buffer, accessed as an array of float4.
/// @param Y Input buffer, accessed as an array of float4.
/// @param N Number of float elements in each buffer (assumed to be a multiple of four).
///
/// NOTE(review): the float4 accesses presumably require X and Y to be suitably
/// aligned for float4 - confirm at the call site.
__global__ void sumKernelVanilla(float* X, float* Y, int N) {
    // `offset` indexes float4s, while N counts floats: scale before bounds-checking
    // so that threads beyond the end of the data exit early.
    int offset = blockIdx.x * blockDim.x + threadIdx.x;
    if (offset * 4 >= N) {
        return;
    }
    // The float* -> float4* casts trigger -Wcast-align, so silence it for this region only.
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wcast-align"
    float4 xRead = ((float4*) X)[offset];
    float4 yRead = ((float4*) Y)[offset];
    xRead.w += yRead.w;
    xRead.x += yRead.x;
    xRead.y += yRead.y;
    xRead.z += yRead.z;
    // Store to this thread's own slot; writing through `*((float4*) X)` would
    // make every thread overwrite element 0.
    ((float4*) X)[offset] = xRead;
#pragma clang diagnostic pop
}
float *a, *b = nullptr;
fprintf(stderr,
"GPU memory allocation failed");
}
fprintf(stderr,
"Copy from host to GPU failed");
}
sumKernelVanilla<<<1, 1, 0, stream>>>(a, b, 4);
return x;
}
Stream createStream(const std::string &name) const
static Device & getActive()
__host__ __device__ cudaError_t cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind)
__host__ __device__ cudaError_t cudaMalloc(void **devPtr, size_t size)
__host__ __device__ cudaError_t cudaMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0))
Example Using Speclib
int offset = 4 * (blockIdx.x * blockDim.x + threadIdx.x);
if (offset >= X.
dim(0)) {
return;
}
X.vectorWrite(offset, outValue);
}
int dim(int d) const
Behaviour common to all TensorLikes.
Definition: TensorLike.hpp:233
Represents a Tensor: a multidimensional array that can represent a multilinear map.
Definition: Tensor.hpp:32
A vector of S elements of type T.
Definition: Vec.hpp:71
// Example driver for sumKernelSpeclib: fills two tensors, launches the kernel
// on them, and returns the first tensor.
// NOTE(review): nomadicX, nomadicY and stream are not declared in this excerpt;
// presumably their declarations were lost from the example - confirm against
// the original listing.
auto speclibTest() {
// Obtain mutable host tensors for both operands.
auto hostX = nomadicX.mutableHostTensor(stream);
auto hostY = nomadicY.mutableHostTensor(stream);
// Fill the first four elements of each with the values 1..4.
for (int i = 0; i < 4; i++) {
hostX[i] = i + 1;
hostY[i] = i + 1;
}
// Obtain mutable device tensors to pass to the kernel.
auto deviceX = nomadicX.mutableDeviceTensor(stream);
auto deviceY = nomadicY.mutableDeviceTensor(stream);
// Launch one block of one thread, with no dynamic shared memory, on `stream`.
sumKernelSpeclib<<<1, 1, 0, stream>>>(deviceX, deviceY);
// NOTE(review): presumably brings the result back to the host; the meaning of
// the `true` argument is not visible here - confirm.
nomadicX.hostTensor(stream, true);
return nomadicX;
}
Represents a TensorLike that can be synchronised between the CPU and various GPUs.
Definition: NomadicTensor.hpp:36
On top of being shorter, this example:
- Bounds-checks (and alignment-checks) the memory accesses when compiled in debug mode, producing a descriptive error message if something goes wrong. In release mode, both programs compile to the same thing.
- Adapts seamlessly to any type by replacing float with a template parameter.