Represents a CUDA stream. More...
#include <Stream.hpp>
Public Member Functions | |
Device & | getDevice () const |
Get a reference to the device this stream is associated with. More... | |
int | getPriority () const |
Query the stream's priority. More... | |
bool | isIdle () const |
Return true if the stream has no ongoing GPU jobs. More... | |
const Stream & | synchronize () const |
Wait for all work on this stream to finish. More... | |
Event | recordEvent (std::string &name) const |
Create an event representing the completion of all tasks currently queued on this stream. More... | |
Event | recordEvent () const |
BlockingEvent | recordBlockingEvent (std::string &name) const |
Record an event that the host can synchronise with. More... | |
BlockingEvent | recordBlockingEvent () const |
template<typename... KernelArgTypes> | |
const Stream & | launchKernel (void kernelFunction(KernelArgTypes...), dim3 gridDim, dim3 blockDim, int dynamicSMem, void **kernelArgs) const |
template<typename... KernelArgTypes> | |
const Stream & | launchKernel (void kernelFunction(KernelArgTypes...), dim3 gridDim, dim3 blockDim, int dynamicSMem, KernelArgTypes... kernelArgs) const |
Overload which takes the kernel arguments as a parameter pack rather than a single variable. More... | |
const Stream & | launchHostFunc (const std::function< void()> &fn) const |
Enqueue a host function (or closure) on the stream. More... | |
const Stream & | awaitEvent (const Event &e) const |
Enqueue a wait operation for the completion of the given event. More... | |
const Stream & | memset (__device void *dst, uint8_t value, size_t count) const |
const Stream & | memset16 (__device void *dst, uint16_t value, size_t count) const |
const Stream & | memset32 (__device void *dst, uint32_t value, size_t count) const |
const Stream & | memset (__flat void *dst, uint8_t value, size_t count) const |
const Stream & | memset16 (__flat void *dst, uint16_t value, size_t count) const |
const Stream & | memset32 (__flat void *dst, uint32_t value, size_t count) const |
const Stream & | memset (void *dst, uint8_t value, size_t count) const |
const Stream & | memset16 (void *dst, uint16_t value, size_t count) const |
const Stream & | memset32 (void *dst, uint32_t value, size_t count) const |
template<typename T > | |
const Stream & | setMemory (__device T *dst, T value, size_t elements) const |
template<typename T > | |
const Stream & | setMemory (__flat T *dst, T value, size_t elements) const |
template<typename T > | |
const Stream & | setMemory (T *dst, T value, size_t elements) const |
template<typename T > | |
const Stream & | zeroMemory (__device T *dst, size_t elements) const |
Enqueue a device memset on the stream to set elements values from dst to 0. More... | |
template<typename T > | |
const Stream & | zeroMemory (__flat T *dst, size_t elements) const |
Enqueue a host or device memset on the stream to set elements values from dst to 0. More... | |
template<typename T > | |
const Stream & | zeroMemory (T *dst, size_t elements) const |
Enqueue a host memset on the stream to set elements values from dst to 0. More... | |
const Stream & | memset2D (__device void *dst, size_t dstPitch, uint8_t value, size_t w, size_t h) const |
const Stream & | memset2D16 (__device void *dst, size_t dstPitch, uint16_t value, size_t w, size_t h) const |
const Stream & | memset2D32 (__device void *dst, size_t dstPitch, uint32_t value, size_t w, size_t h) const |
const Stream & | memset2D (__flat void *dst, size_t dstPitch, uint8_t value, size_t w, size_t h) const |
const Stream & | memset2D16 (__flat void *dst, size_t dstPitch, uint16_t value, size_t w, size_t h) const |
const Stream & | memset2D32 (__flat void *dst, size_t dstPitch, uint32_t value, size_t w, size_t h) const |
const Stream & | memset2D (void *dst, size_t dstPitch, uint8_t value, size_t w, size_t h) const |
const Stream & | memset2D16 (void *dst, size_t dstPitch, uint16_t value, size_t w, size_t h) const |
const Stream & | memset2D32 (void *dst, size_t dstPitch, uint32_t value, size_t w, size_t h) const |
template<typename T > | |
const Stream & | setMemory2D (__device T *dst, size_t dstPitch, T value, size_t w, size_t h) const |
template<typename T > | |
const Stream & | setMemory2D (__flat T *dst, size_t dstPitch, T value, size_t w, size_t h) const |
template<typename T > | |
const Stream & | setMemory2D (T *dst, size_t dstPitch, T value, size_t w, size_t h) const |
const Stream & | memcpy (void *dst, const void *src, size_t count) const |
const Stream & | memcpy (__device void *dst, const void *src, size_t count) const |
const Stream & | memcpy (void *dst, __device const void *src, size_t count) const |
const Stream & | memcpy (__device void *dst, __device const void *src, size_t count) const |
const Stream & | memcpy (__flat void *dst, __flat const void *src, size_t count) const |
const Stream & | memcpy (__flat void *dst, const void *src, size_t count) const |
const Stream & | memcpy (__flat void *dst, __device const void *src, size_t count) const |
const Stream & | memcpy (void *dst, __flat const void *src, size_t count) const |
const Stream & | memcpy (__device void *dst, __flat const void *src, size_t count) const |
template<typename T > | |
const Stream & | copyMemory (T *dst, const T *src, size_t count) const |
Enqueue a host-to-host copy. More... | |
template<typename T > | |
const Stream & | copyMemory (__device T *dst, const T *src, size_t count) const |
Enqueue a host-to-device copy Handles types larger than 1 byte automatically. More... | |
template<typename T > | |
const Stream & | copyMemory (T *dst, __device const T *src, size_t count) const |
Enqueue a device-to-host copy Handles types larger than 1 byte automatically. More... | |
template<typename T > | |
const Stream & | copyMemory (__device T *dst, __device const T *src, size_t count) const |
Enqueue a device-to-device copy. More... | |
template<typename T > | |
const Stream & | copyMemory (__flat T *dst, __flat const T *src, size_t count) const |
Flat address space copy. More... | |
template<typename T > | |
const Stream & | copyMemory (__flat T *dst, const T *src, size_t count) const |
template<typename T > | |
const Stream & | copyMemory (__flat T *dst, __device const T *src, size_t count) const |
template<typename T > | |
const Stream & | copyMemory (T *dst, __flat const T *src, size_t count) const |
template<typename T > | |
const Stream & | copyMemory (__device T *dst, __flat const T *src, size_t count) const |
Public Member Functions inherited from sp::RAIIObject< StreamAllocator, int > | |
RAIIObject (const CTorArgs &... args) | |
Allocate a new object and take ownership of it. More... | |
RAIIObject (const APIType &obj, bool own=true) | |
Wrap an existing object. More... | |
const APIType | get () const |
Get the underlying C API object (eg. cudaStream_t ) More... | |
APIType | get () |
Get the underlying C API object (eg. cudaStream_t ) More... | |
APIType | operator* () |
const APIType | operator* () const |
operator APIType () const | |
Implicitly convert to the C API type, so you can just pass this object to the C library whence it came. More... | |
operator APIType () | |
Friends | |
class | Device |
Additional Inherited Members | |
Protected Types inherited from sp::RAIIObject< StreamAllocator, int > | |
using | APIType = typename AllocType::APIType |
The C API type. Something like cudaStream_t . More... | |
using | UnderlyingType = std::remove_pointer_t< APIType > |
Represents a CUDA stream.
This object will transparently convert to a cudaStream_t
. This means you can use it in kernel launches just like a normal cuda stream.
Enqueue a wait operation for the completion of the given event.
const Stream & sp::Stream::copyMemory | ( | __device T * | dst, |
__device const T * | src, | ||
size_t | count | ||
) | const |
Enqueue a device-to-device copy.
Handles types larger than 1 byte automatically
This can do copies between devices as well as copies within the same device
const Stream & sp::Stream::copyMemory | ( | __device T * | dst, |
const T * | src, | ||
size_t | count | ||
) | const |
Enqueue a host-to-device copy Handles types larger than 1 byte automatically.
const Stream & sp::Stream::copyMemory | ( | __flat T * | dst, |
__flat const T * | src, | ||
size_t | count | ||
) | const |
Flat address space copy.
Use this overload when the source/destination locations aren't know.
const Stream & sp::Stream::copyMemory | ( | T * | dst, |
__device const T * | src, | ||
size_t | count | ||
) | const |
Enqueue a device-to-host copy Handles types larger than 1 byte automatically.
const Stream & sp::Stream::copyMemory | ( | T * | dst, |
const T * | src, | ||
size_t | count | ||
) | const |
Enqueue a host-to-host copy.
Handles types larger than 1 byte automatically
The following example includes usage of all four variants of this function:
If you need to queue up a host-host memcpy
on a stream, this will have lower overhead than using launchHostFunc()
. This version is also sometimes useful to call implicitly via overload resolution in templated code.
Device & sp::Stream::getDevice | ( | ) | const |
Get a reference to the device this stream is associated with.
int sp::Stream::getPriority | ( | ) | const |
Query the stream's priority.
bool sp::Stream::isIdle | ( | ) | const |
Return true if the stream has no ongoing GPU jobs.
const Stream & sp::Stream::launchHostFunc | ( | const std::function< void()> & | fn | ) | const |
Enqueue a host function (or closure) on the stream.
Adds a host-side function call to the stream's work queue. This is significantly cheaper than synchronising the stream and then calling the desired host function. This allows you to queue up work beyond the desired host function, so you don't have a nasty synchronous stall after your host function while you launch something else on the GPU, and the overhead of an asynchronously-queued host function is less than the cost of a full synchronisation.
The following example allocates an sp::StringBuffer
, uses a GPU kernel to populate it, copies the string back to the host, and passes the host-side buffer to a verify()
function to do some host-side processing on it.
All stream interactions shown are asynchronous: control will most likely reach the end of this example code before the GPU has done anything.
cudaLaunchHostFunc
(it isn't allowed to use most CUDA APIs).const Stream & sp::Stream::launchKernel | ( | void | kernelFunctionKernelArgTypes..., |
dim3 | gridDim, | ||
dim3 | blockDim, | ||
int | dynamicSMem, | ||
KernelArgTypes... | kernelArgs | ||
) | const |
Overload which takes the kernel arguments as a parameter pack rather than a single variable.
BlockingEvent sp::Stream::recordBlockingEvent | ( | std::string & | name | ) | const |
Record an event that the host can synchronise with.
Like recordEvent()
, but the returned event is a BlockingEvent
, allowing the host to synchronise with it using BlockingEvent::synchronize()
. This is more expensive than a regular Event
, and should only be used if you need the host-synchronisation feature.
name | An optional name for the event. This name may appear in profilers, debuggers, or other tools. |
Event sp::Stream::recordEvent | ( | std::string & | name | ) | const |
Create an event representing the completion of all tasks currently queued on this stream.
Enqueuing the returned event on another stream will make that stream wait for this stream to complete all tasks that were enqueued at the moment this function was called.
name | An optional name for the event. This name may appear in profilers, debuggers, or other tools. |
const Stream & sp::Stream::synchronize | ( | ) | const |
Wait for all work on this stream to finish.
const Stream & sp::Stream::zeroMemory | ( | __device T * | dst, |
size_t | elements | ||
) | const |
Enqueue a device memset on the stream to set elements
values from dst
to 0.
dst | Device memory pointer to begin at |
elements | The number of elements to be zeroed |
const Stream & sp::Stream::zeroMemory | ( | __flat T * | dst, |
size_t | elements | ||
) | const |
Enqueue a host or device memset on the stream to set elements
values from dst
to 0.
dst | Host or device memory pointer to begin at |
elements | The number of elements to be zeroed |