libSCALE  0.2.0
A modern C++ CUDA API
sp::Stream Class Reference

Represents a CUDA stream. More...

#include <Stream.hpp>

Inheritance diagram for sp::Stream (inherits sp::RAIIObject< StreamAllocator, int >).

Public Member Functions

Device & getDevice () const
 Get a reference to the device this stream is associated with. More...
 
int getPriority () const
 Query the stream's priority. More...
 
bool isIdle () const
 Return true if the stream has no ongoing GPU jobs. More...
 
const Stream & synchronize () const
 Wait for all work on this stream to finish. More...
 
Event recordEvent (std::string &name) const
 Create an event representing the completion of all tasks currently queued on this stream. More...
 
Event recordEvent () const
 
BlockingEvent recordBlockingEvent (std::string &name) const
 Record an event that the host can synchronise with. More...
 
BlockingEvent recordBlockingEvent () const
 
template<typename... KernelArgTypes>
const Stream & launchKernel (void kernelFunction(KernelArgTypes...), dim3 gridDim, dim3 blockDim, int dynamicSMem, void **kernelArgs) const
 
template<typename... KernelArgTypes>
const Stream & launchKernel (void kernelFunction(KernelArgTypes...), dim3 gridDim, dim3 blockDim, int dynamicSMem, KernelArgTypes... kernelArgs) const
 Overload which takes the kernel arguments as a parameter pack rather than a single variable. More...
 
const Stream & launchHostFunc (const std::function< void()> &fn) const
 Enqueue a host function (or closure) on the stream. More...
 
const Stream & awaitEvent (const Event &e) const
 Enqueue a wait operation for the completion of the given event. More...
 
const Stream & memset (__device void *dst, uint8_t value, size_t count) const
 
const Stream & memset16 (__device void *dst, uint16_t value, size_t count) const
 
const Stream & memset32 (__device void *dst, uint32_t value, size_t count) const
 
const Stream & memset (__flat void *dst, uint8_t value, size_t count) const
 
const Stream & memset16 (__flat void *dst, uint16_t value, size_t count) const
 
const Stream & memset32 (__flat void *dst, uint32_t value, size_t count) const
 
const Stream & memset (void *dst, uint8_t value, size_t count) const
 
const Stream & memset16 (void *dst, uint16_t value, size_t count) const
 
const Stream & memset32 (void *dst, uint32_t value, size_t count) const
 
template<typename T >
const Stream & setMemory (__device T *dst, T value, size_t elements) const
 
template<typename T >
const Stream & setMemory (__flat T *dst, T value, size_t elements) const
 
template<typename T >
const Stream & setMemory (T *dst, T value, size_t elements) const
 
template<typename T >
const Stream & zeroMemory (__device T *dst, size_t elements) const
 Enqueue a device memset on the stream that sets elements values starting at dst to 0. More...
 
template<typename T >
const Stream & zeroMemory (__flat T *dst, size_t elements) const
 Enqueue a host or device memset on the stream that sets elements values starting at dst to 0. More...
 
template<typename T >
const Stream & zeroMemory (T *dst, size_t elements) const
 Enqueue a host memset on the stream that sets elements values starting at dst to 0. More...
 
const Stream & memset2D (__device void *dst, size_t dstPitch, uint8_t value, size_t w, size_t h) const
 
const Stream & memset2D16 (__device void *dst, size_t dstPitch, uint16_t value, size_t w, size_t h) const
 
const Stream & memset2D32 (__device void *dst, size_t dstPitch, uint32_t value, size_t w, size_t h) const
 
const Stream & memset2D (__flat void *dst, size_t dstPitch, uint8_t value, size_t w, size_t h) const
 
const Stream & memset2D16 (__flat void *dst, size_t dstPitch, uint16_t value, size_t w, size_t h) const
 
const Stream & memset2D32 (__flat void *dst, size_t dstPitch, uint32_t value, size_t w, size_t h) const
 
const Stream & memset2D (void *dst, size_t dstPitch, uint8_t value, size_t w, size_t h) const
 
const Stream & memset2D16 (void *dst, size_t dstPitch, uint16_t value, size_t w, size_t h) const
 
const Stream & memset2D32 (void *dst, size_t dstPitch, uint32_t value, size_t w, size_t h) const
 
template<typename T >
const Stream & setMemory2D (__device T *dst, size_t dstPitch, T value, size_t w, size_t h) const
 
template<typename T >
const Stream & setMemory2D (__flat T *dst, size_t dstPitch, T value, size_t w, size_t h) const
 
template<typename T >
const Stream & setMemory2D (T *dst, size_t dstPitch, T value, size_t w, size_t h) const
 
const Stream & memcpy (void *dst, const void *src, size_t count) const
 
const Stream & memcpy (__device void *dst, const void *src, size_t count) const
 
const Stream & memcpy (void *dst, __device const void *src, size_t count) const
 
const Stream & memcpy (__device void *dst, __device const void *src, size_t count) const
 
const Stream & memcpy (__flat void *dst, __flat const void *src, size_t count) const
 
const Stream & memcpy (__flat void *dst, const void *src, size_t count) const
 
const Stream & memcpy (__flat void *dst, __device const void *src, size_t count) const
 
const Stream & memcpy (void *dst, __flat const void *src, size_t count) const
 
const Stream & memcpy (__device void *dst, __flat const void *src, size_t count) const
 
template<typename T >
const Stream & copyMemory (T *dst, const T *src, size_t count) const
 Enqueue a host-to-host copy. More...
 
template<typename T >
const Stream & copyMemory (__device T *dst, const T *src, size_t count) const
 Enqueue a host-to-device copy. Handles types larger than 1 byte automatically. More...
 
template<typename T >
const Stream & copyMemory (T *dst, __device const T *src, size_t count) const
 Enqueue a device-to-host copy. Handles types larger than 1 byte automatically. More...
 
template<typename T >
const Stream & copyMemory (__device T *dst, __device const T *src, size_t count) const
 Enqueue a device-to-device copy. More...
 
template<typename T >
const Stream & copyMemory (__flat T *dst, __flat const T *src, size_t count) const
 Flat address space copy. More...
 
template<typename T >
const Stream & copyMemory (__flat T *dst, const T *src, size_t count) const
 
template<typename T >
const Stream & copyMemory (__flat T *dst, __device const T *src, size_t count) const
 
template<typename T >
const Stream & copyMemory (T *dst, __flat const T *src, size_t count) const
 
template<typename T >
const Stream & copyMemory (__device T *dst, __flat const T *src, size_t count) const
 
- Public Member Functions inherited from sp::RAIIObject< StreamAllocator, int >
 RAIIObject (const CTorArgs &... args)
 Allocate a new object and take ownership of it. More...
 
 RAIIObject (const APIType &obj, bool own=true)
 Wrap an existing object. More...
 
const APIType get () const
 Get the underlying C API object (eg. cudaStream_t) More...
 
APIType get ()
 Get the underlying C API object (eg. cudaStream_t) More...
 
APIType operator* ()
 
const APIType operator* () const
 
 operator APIType () const
 Implicitly convert to the C API type, so you can just pass this object to the C library whence it came. More...
 
 operator APIType ()
 

Friends

class Device
 

Additional Inherited Members

- Protected Types inherited from sp::RAIIObject< StreamAllocator, int >
using APIType = typename AllocType::APIType
 The C API type. Something like cudaStream_t. More...
 
using UnderlyingType = std::remove_pointer_t< APIType >
 

Detailed Description

Represents a CUDA stream.

This object will transparently convert to a cudaStream_t. This means you can use it in kernel launches just like a normal CUDA stream.
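
As an illustration (a minimal sketch, not taken from the library's own samples: the fill kernel is hypothetical), the same Stream object can be passed both to the triple-chevron launch syntax and to C API calls that expect a cudaStream_t:

Example

__global__ void fill(char *dst, char value, size_t n) { // hypothetical kernel
    size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) dst[i] = value;
}

void demo(sp::Device &gpu) {
    sp::Stream stream = gpu.createStream();
    auto devicePointer = gpu.allocateMemory<char>(256);
    // The Stream converts implicitly to cudaStream_t, so it slots straight into the launch configuration...
    fill<<<1, 256, 0, stream>>>(devicePointer.get(), 'x', 256);
    // ...and into any CUDA C API call that takes a cudaStream_t.
    cudaStreamSynchronize(stream);
}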

Member Function Documentation

◆ awaitEvent()

const Stream & sp::Stream::awaitEvent ( const Event &  e) const

Enqueue a wait operation for the completion of the given event.

See also
cudaStreamWaitEvent()
recordEvent()
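
For instance (a hedged sketch: both kernels are hypothetical, and the event type is assumed to be sp::Event as returned by recordEvent()), this is how one stream can be made to wait for work recorded on another:

Example

__global__ void produce();  // hypothetical kernel that writes some device data
__global__ void consume();  // hypothetical kernel that reads it

sp::Stream producerStream = gpu.createStream();
sp::Stream consumerStream = gpu.createStream();
produce<<<1, 256, 0, producerStream>>>();
sp::Event ready = producerStream.recordEvent();
// Work queued on consumerStream after this call will not start until everything
// queued on producerStream before recordEvent() has finished.
consumerStream.awaitEvent(ready);
consume<<<1, 256, 0, consumerStream>>>();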

◆ copyMemory() [1/5]

template<typename T >
const Stream & sp::Stream::copyMemory ( __device T *  dst,
__device const T *  src,
size_t  count 
) const

Enqueue a device-to-device copy.

Handles types larger than 1 byte automatically.

This can do copies between devices as well as copies within the same device.
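
A brief sketch (assuming, as the wording above suggests, that count is an element count rather than a byte count; gpu is an sp::Device as in the other examples on this page):

Example

sp::Stream stream = gpu.createStream();
auto src = gpu.allocateMemory<float>(1024);
auto dst = gpu.allocateMemory<float>(1024);
stream.setMemory(src.get(), 1.0f, 1024);        // fill the source buffer with 1.0f
stream.copyMemory(dst.get(), src.get(), 1024);  // device-to-device copy of 1024 floats
stream.synchronize();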

◆ copyMemory() [2/5]

template<typename T >
const Stream & sp::Stream::copyMemory ( __device T *  dst,
const T *  src,
size_t  count 
) const

Enqueue a host-to-device copy. Handles types larger than 1 byte automatically.

◆ copyMemory() [3/5]

template<typename T >
const Stream & sp::Stream::copyMemory ( __flat T *  dst,
__flat const T *  src,
size_t  count 
) const

Flat address space copy.

Use this overload when the source/destination locations aren't known.

◆ copyMemory() [4/5]

template<typename T >
const Stream & sp::Stream::copyMemory ( T *  dst,
__device const T *  src,
size_t  count 
) const

Enqueue a device-to-host copy. Handles types larger than 1 byte automatically.

◆ copyMemory() [5/5]

template<typename T >
const Stream & sp::Stream::copyMemory ( T *  dst,
const T *  src,
size_t  count 
) const

Enqueue a host-to-host copy.

Handles types larger than 1 byte automatically.

The following example includes usage of all four variants of this function:

Example

sp::Stream stream = gpu.createStream();
std::string origin = "aaaaaaaaaa";
std::string result = "aaaaaaaaaa";
auto devicePointer1 = gpu.allocateMemory<char>(origin.size());
auto devicePointer2 = gpu.allocateMemory<char>(origin.size());
// Move the data around
stream.copyMemory(devicePointer1.get(), origin.data(), origin.size());
conditionalFill<<<1, 256, 0, stream>>>(devicePointer1.get(), 'b', 'a', origin.size());
stream.copyMemory(devicePointer2.get(), devicePointer1.get(), origin.size());
conditionalFill<<<1, 256, 0, stream>>>(devicePointer2.get(), 'c', 'b', origin.size());
stream.copyMemory(result.data(), devicePointer2.get(), origin.size());
stream.synchronize();
verify(result, "cccccccccc", 10);

If you need to queue up a host-host memcpy on a stream, this will have lower overhead than using launchHostFunc(). This version is also sometimes useful to call implicitly via overload resolution in templated code.
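
For example (a sketch with hypothetical host buffers; count is again treated as an element count), a host-to-host copy can be queued directly instead of wrapping a memcpy in launchHostFunc():

Example

std::vector<int> staging(1024, 42);
std::vector<int> destination(1024);
// Queue the host-to-host copy on the stream; it runs in stream order, after the work already queued.
stream.copyMemory(destination.data(), staging.data(), staging.size());
stream.launchHostFunc([&]() {
    // By the time this runs, destination has been populated.
    processOnHost(destination);  // hypothetical host function
});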

◆ getDevice()

Device & sp::Stream::getDevice ( ) const

Get a reference to the device this stream is associated with.

◆ getPriority()

int sp::Stream::getPriority ( ) const

Query the stream's priority.

See also
cudaStreamGetPriority

◆ isIdle()

bool sp::Stream::isIdle ( ) const

Return true if the stream has no ongoing GPU jobs.

See also
cudaStreamQuery
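
A small sketch of the polling pattern this enables (the kernel and the host-side work are hypothetical):

Example

sp::Stream stream = gpu.createStream();
longRunningKernel<<<64, 256, 0, stream>>>();  // hypothetical kernel
// Do useful host work while the GPU is still busy, instead of blocking in synchronize().
while (!stream.isIdle()) {
    doSomeHostWork();                         // hypothetical host function
}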

◆ launchHostFunc()

const Stream & sp::Stream::launchHostFunc ( const std::function< void()> &  fn) const

Enqueue a host function (or closure) on the stream.

Adds a host-side function call to the stream's work queue. This is significantly cheaper than synchronising the stream and then calling the desired host function: you can keep queuing work beyond the host function, so there is no nasty synchronous stall on the host while you launch something else on the GPU, and the overhead of an asynchronously-queued host function is less than the cost of a full synchronisation.

The following example allocates a device buffer, uses a GPU kernel to populate it, copies the string back to a host-side std::string, and passes the host-side buffer to a verify() function to do some host-side processing on it.

All stream interactions shown are asynchronous: control will most likely reach the end of this example code before the GPU has done anything.

Example

void foo() {
    sp::Stream stream = gpu.createStream();
    // Use a string as a host-side buffer/destination
    std::string buffer = "aaaaaaaaaa";
    // Allocate appropriate memory on the device and pass a pointer to that memory to the kernel
    auto devicePointer = gpu.allocateMemory<char>(buffer.size());
    fill<<<1, 128, 0, stream>>>(devicePointer.get(), 'b', buffer.size()); // GPU memory now "bbbbbbbbbb"
    // Queue a copy from the GPU to our host buffer
    stream.copyMemory(buffer.data(), devicePointer.get(), buffer.size());
    // Queue a host-side call that uses the data in the buffer, which should be filled once this is called
    stream.launchHostFunc([&]() {
        verify(buffer, "bbbbbbbbbb", 10);
    });
    // Could queue more stuff here, and it would occur right after the above host function runs, with far less
    // delay than if we had done `cudaStreamSynchronise(stream); someHostFunc(); someMoreKernels<<<...>>>(...);`.
}
Note
On NVIDIA targets, the given lambda is subject to the same restrictions documented for cudaLaunchHostFunc (it isn't allowed to use most CUDA APIs).
See also
cudaLaunchHostFunc

◆ launchKernel()

template<typename... KernelArgTypes>
const Stream & sp::Stream::launchKernel ( void  kernelFunction(KernelArgTypes...),
dim3  gridDim,
dim3  blockDim,
int  dynamicSMem,
KernelArgTypes...  kernelArgs 
) const

Overload which takes the kernel arguments as a parameter pack rather than a single variable.
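
As a rough sketch (the kernel is hypothetical), this overload is equivalent to a triple-chevron launch on this stream, but takes the kernel, launch configuration, and arguments as ordinary function parameters:

Example

__global__ void spin(int iterations);  // hypothetical kernel

// Equivalent to: spin<<<dim3(4), dim3(128), 0, stream>>>(1000);
stream.launchKernel(spin, dim3(4), dim3(128), /*dynamicSMem=*/0, 1000);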

◆ recordBlockingEvent()

BlockingEvent sp::Stream::recordBlockingEvent ( std::string &  name) const

Record an event that the host can synchronise with.

Like recordEvent(), but the returned event is a BlockingEvent, allowing the host to synchronise with it using BlockingEvent::synchronize(). This is more expensive than a regular Event, and should only be used if you need the host-synchronisation feature.

Parameters
name   An optional name for the event. This name may appear in profilers, debuggers, or other tools.
See also
cudaEventRecord
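
A short sketch of synchronising the host with a single point in the stream (both kernels are hypothetical, and the returned type is assumed to be sp::BlockingEvent; a named std::string is used for the event name):

Example

sp::Stream stream = gpu.createStream();
produceResults<<<8, 256, 0, stream>>>();                       // hypothetical kernel
std::string eventName = "results-ready";
sp::BlockingEvent done = stream.recordBlockingEvent(eventName);
moreWork<<<8, 256, 0, stream>>>();                             // hypothetical kernel; keeps the stream busy
// Block the host only until the recorded point, not until the whole stream drains.
done.synchronize();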

◆ recordEvent()

Event sp::Stream::recordEvent ( std::string &  name) const

Create an event representing the completion of all tasks currently queued on this stream.

Enqueuing the returned event on another stream will make that stream wait for this stream to complete all tasks that were enqueued at the moment this function was called.

Parameters
name   An optional name for the event. This name may appear in profilers, debuggers, or other tools.
See also
cudaEventRecord

◆ synchronize()

const Stream & sp::Stream::synchronize ( ) const

Wait for all work on this stream to finish.

See also
cudaStreamSynchronize

◆ zeroMemory() [1/3]

template<typename T >
const Stream & sp::Stream::zeroMemory ( __device T *  dst,
size_t  elements 
) const

Enqueue a device memset on the stream that sets elements values starting at dst to 0.

Parameters
dst   Device memory pointer to begin at
elements   The number of elements to be zeroed
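
A minimal sketch (the accumulate kernel is hypothetical):

Example

sp::Stream stream = gpu.createStream();
auto counters = gpu.allocateMemory<unsigned int>(1024);
// Zero all 1024 counters on the device before the kernel starts incrementing them.
stream.zeroMemory(counters.get(), 1024);
accumulate<<<4, 256, 0, stream>>>(counters.get());  // hypothetical kernel
stream.synchronize();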

◆ zeroMemory() [2/3]

template<typename T >
const Stream & sp::Stream::zeroMemory ( __flat T *  dst,
size_t  elements 
) const

Enqueue a host or device memset on the stream that sets elements values starting at dst to 0.

Parameters
dst   Host or device memory pointer to begin at
elements   The number of elements to be zeroed

◆ zeroMemory() [3/3]

template<typename T >
const Stream & sp::Stream::zeroMemory ( T *  dst,
size_t  elements 
) const

Enqueue a host memset on the stream that sets elements values starting at dst to 0.

Parameters
dst   Host memory pointer to begin at
elements   The number of elements to be zeroed