libSCALE  0.2.0
A modern C++ CUDA API
sp::Stream Class Reference

Represents a CUDA stream. More...

#include <Stream.hpp>

Inheritance diagram for sp::Stream (inherits sp::RAIIObject< StreamAllocator, int >).

Public Member Functions

Device & getDevice () const
 Get a reference to the device this stream is associated with. More...
 
int getPriority () const
 Query the stream's priority. More...
 
bool isIdle () const
 Return true if the stream has no ongoing GPU jobs. More...
 
const Stream & synchronize () const
 Wait for all work on this stream to finish. More...
 
Event recordEvent (std::string &name) const
 Create an event representing the completion of all tasks currently queued on this stream. More...
 
Event recordEvent () const
 
BlockingEvent recordBlockingEvent (std::string &name) const
 Record an event that the host can synchronise with. More...
 
BlockingEvent recordBlockingEvent () const
 
template<typename... KernelArgTypes>
const Stream & launchKernel (void kernelFunction(KernelArgTypes...), dim3 gridDim, dim3 blockDim, int dynamicSMem, void **kernelArgs) const
 
template<typename... KernelArgTypes>
const Stream & launchKernel (void kernelFunction(KernelArgTypes...), dim3 gridDim, dim3 blockDim, int dynamicSMem, KernelArgTypes... kernelArgs) const
 Overload which takes the kernel arguments as a parameter pack rather than a single variable. More...
 
const Stream & launchHostFunc (const std::function< void()> &fn) const
 Enqueue a host function (or closure) on the stream. More...
 
const Stream & awaitEvent (const Event &e) const
 Enqueue a wait operation for the completion of the given event. More...
 
const Stream & memset (__device void *dst, uint8_t value, size_t count) const
 
const Stream & memset16 (__device void *dst, uint16_t value, size_t count) const
 
const Stream & memset32 (__device void *dst, uint32_t value, size_t count) const
 
const Stream & memset (__flat void *dst, uint8_t value, size_t count) const
 
const Stream & memset16 (__flat void *dst, uint16_t value, size_t count) const
 
const Stream & memset32 (__flat void *dst, uint32_t value, size_t count) const
 
const Stream & memset (void *dst, uint8_t value, size_t count) const
 
const Stream & memset16 (void *dst, uint16_t value, size_t count) const
 
const Stream & memset32 (void *dst, uint32_t value, size_t count) const
 
template<typename T >
const Stream & setMemory (__device T *dst, T value, size_t elements) const
 
template<typename T >
const Stream & setMemory (__flat T *dst, T value, size_t elements) const
 
template<typename T >
const Stream & setMemory (T *dst, T value, size_t elements) const
 
template<typename T >
const Stream & zeroMemory (__device T *dst, size_t elements) const
 Enqueue a device memset on the stream that sets elements values starting at dst to 0. More...
 
template<typename T >
const Stream & zeroMemory (__flat T *dst, size_t elements) const
 Enqueue a host or device memset on the stream that sets elements values starting at dst to 0. More...
 
template<typename T >
const Stream & zeroMemory (T *dst, size_t elements) const
 Enqueue a host memset on the stream that sets elements values starting at dst to 0. More...
 
const Stream & memset2D (__device void *dst, size_t dstPitch, uint8_t value, size_t w, size_t h) const
 
const Stream & memset2D16 (__device void *dst, size_t dstPitch, uint16_t value, size_t w, size_t h) const
 
const Stream & memset2D32 (__device void *dst, size_t dstPitch, uint32_t value, size_t w, size_t h) const
 
const Stream & memset2D (__flat void *dst, size_t dstPitch, uint8_t value, size_t w, size_t h) const
 
const Stream & memset2D16 (__flat void *dst, size_t dstPitch, uint16_t value, size_t w, size_t h) const
 
const Stream & memset2D32 (__flat void *dst, size_t dstPitch, uint32_t value, size_t w, size_t h) const
 
const Stream & memset2D (void *dst, size_t dstPitch, uint8_t value, size_t w, size_t h) const
 
const Stream & memset2D16 (void *dst, size_t dstPitch, uint16_t value, size_t w, size_t h) const
 
const Stream & memset2D32 (void *dst, size_t dstPitch, uint32_t value, size_t w, size_t h) const
 
template<typename T >
const Stream & setMemory2D (__device T *dst, size_t dstPitch, T value, size_t w, size_t h) const
 
template<typename T >
const Stream & setMemory2D (__flat T *dst, size_t dstPitch, T value, size_t w, size_t h) const
 
template<typename T >
const Stream & setMemory2D (T *dst, size_t dstPitch, T value, size_t w, size_t h) const
 
const Stream & memcpy (void *dst, const void *src, size_t count) const
 
const Stream & memcpy (__device void *dst, const void *src, size_t count) const
 
const Stream & memcpy (void *dst, __device const void *src, size_t count) const
 
const Stream & memcpy (__device void *dst, __device const void *src, size_t count) const
 
const Stream & memcpy (__flat void *dst, __flat const void *src, size_t count) const
 
const Stream & memcpy (__flat void *dst, const void *src, size_t count) const
 
const Stream & memcpy (__flat void *dst, __device const void *src, size_t count) const
 
const Stream & memcpy (void *dst, __flat const void *src, size_t count) const
 
const Stream & memcpy (__device void *dst, __flat const void *src, size_t count) const
 
template<typename T >
const Stream & copyMemory (T *dst, const T *src, size_t count) const
 Enqueue a host-to-host copy. More...
 
template<typename T >
const Stream & copyMemory (__device T *dst, const T *src, size_t count) const
 Enqueue a host-to-device copy. Handles types larger than 1 byte automatically. More...
 
template<typename T >
const Stream & copyMemory (T *dst, __device const T *src, size_t count) const
 Enqueue a device-to-host copy. Handles types larger than 1 byte automatically. More...
 
template<typename T >
const Stream & copyMemory (__device T *dst, __device const T *src, size_t count) const
 Enqueue a device-to-device copy. More...
 
template<typename T >
const Stream & copyMemory (__flat T *dst, __flat const T *src, size_t count) const
 Flat address space copy. More...
 
template<typename T >
const Stream & copyMemory (__flat T *dst, const T *src, size_t count) const
 
template<typename T >
const Stream & copyMemory (__flat T *dst, __device const T *src, size_t count) const
 
template<typename T >
const Stream & copyMemory (T *dst, __flat const T *src, size_t count) const
 
template<typename T >
const Stream & copyMemory (__device T *dst, __flat const T *src, size_t count) const
 
- Public Member Functions inherited from sp::RAIIObject< StreamAllocator, int >
 RAIIObject (const CTorArgs &... args)
 Allocate a new object and take ownership of it. More...
 
 RAIIObject (const APIType &obj, bool own=true)
 Wrap an existing object. More...
 
const APIType get () const
 Get the underlying C API object (eg. cudaStream_t) More...
 
APIType get ()
 Get the underlying C API object (eg. cudaStream_t) More...
 
APIType operator* ()
 
const APIType operator* () const
 
 operator APIType () const
 Implicitly convert to the C API type, so you can just pass this object to the C library whence it came. More...
 
 operator APIType ()
 

Friends

class Device
 

Additional Inherited Members

- Protected Types inherited from sp::RAIIObject< StreamAllocator, int >
using APIType = typename AllocType::APIType
 The C API type. Something like cudaStream_t. More...
 
using UnderlyingType = std::remove_pointer_t< APIType >
 

Detailed Description

Represents a CUDA stream.

This object will transparently convert to a cudaStream_t. This means you can use it in kernel launches just like a normal CUDA stream.
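
As an illustration (a minimal sketch, not taken from the library's own samples: the fill kernel is hypothetical), the same Stream object can be passed both to the triple-chevron launch syntax and to C API calls that expect a cudaStream_t:

Example

__global__ void fill(char *dst, char value, size_t n) { // hypothetical kernel
    size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) dst[i] = value;
}

void demo(sp::Device &gpu) {
    sp::Stream stream = gpu.createStream();
    auto devicePointer = gpu.allocateMemory<char>(256);
    // The Stream converts implicitly to cudaStream_t, so it slots straight into the launch configuration...
    fill<<<1, 256, 0, stream>>>(devicePointer.get(), 'x', 256);
    // ...and into any CUDA C API call that takes a cudaStream_t.
    cudaStreamSynchronize(stream);
}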

Member Function Documentation

◆ awaitEvent()

const Stream & sp::Stream::awaitEvent ( const Event &  e) const

Enqueue a wait operation for the completion of the given event.

See also
cudaStreamWaitEvent()
recordEvent()
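
For instance (a hedged sketch: both kernels are hypothetical, and the event type is assumed to be sp::Event as returned by recordEvent()), this is how one stream can be made to wait for work recorded on another:

Example

__global__ void produce();  // hypothetical kernel that writes some device data
__global__ void consume();  // hypothetical kernel that reads it

sp::Stream producerStream = gpu.createStream();
sp::Stream consumerStream = gpu.createStream();
produce<<<1, 256, 0, producerStream>>>();
sp::Event ready = producerStream.recordEvent();
// Work queued on consumerStream after this call will not start until everything
// queued on producerStream before recordEvent() has finished.
consumerStream.awaitEvent(ready);
consume<<<1, 256, 0, consumerStream>>>();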

◆ copyMemory() [1/5]

template<typename T >
const Stream & sp::Stream::copyMemory ( __device T *  dst,
__device const T *  src,
size_t  count 
) const

Enqueue a device-to-device copy.

Handles types larger than 1 byte automatically.

This can do copies between devices as well as copies within the same device.
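
A brief sketch (assuming, as the wording above suggests, that count is an element count rather than a byte count; gpu is an sp::Device as in the other examples on this page):

Example

sp::Stream stream = gpu.createStream();
auto src = gpu.allocateMemory<float>(1024);
auto dst = gpu.allocateMemory<float>(1024);
stream.setMemory(src.get(), 1.0f, 1024);        // fill the source buffer with 1.0f
stream.copyMemory(dst.get(), src.get(), 1024);  // device-to-device copy of 1024 floats
stream.synchronize();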

◆ copyMemory() [2/5]

template<typename T >
const Stream & sp::Stream::copyMemory ( __device T *  dst,
const T *  src,
size_t  count 
) const

Enqueue a host-to-device copy. Handles types larger than 1 byte automatically.

◆ copyMemory() [3/5]

template<typename T >
const Stream & sp::Stream::copyMemory ( __flat T *  dst,
__flat const T *  src,
size_t  count 
) const

Flat address space copy.

Use this overload when the source/destination locations aren't known.

◆ copyMemory() [4/5]

template<typename T >
const Stream & sp::Stream::copyMemory ( T *  dst,
__device const T *  src,
size_t  count 
) const

Enqueue a device-to-host copy. Handles types larger than 1 byte automatically.

◆ copyMemory() [5/5]

template<typename T >
const Stream & sp::Stream::copyMemory ( T *  dst,
const T *  src,
size_t  count 
) const

Enqueue a host-to-host copy.

Handles types larger than 1 byte automatically.

The following example includes usage of all four variants of this function:

Example

sp::Stream stream = gpu.createStream();
std::string origin = "aaaaaaaaaa";
std::string result = "aaaaaaaaaa";
auto devicePointer1 = gpu.allocateMemory<char>(origin.size());
auto devicePointer2 = gpu.allocateMemory<char>(origin.size());
// Move the data around
stream.copyMemory(devicePointer1.get(), origin.data(), origin.size());
conditionalFill<<<1, 256, 0, stream>>>(devicePointer1.get(), 'b', 'a', origin.size());
stream.copyMemory(devicePointer2.get(), devicePointer1.get(), origin.size());
conditionalFill<<<1, 256, 0, stream>>>(devicePointer2.get(), 'c', 'b', origin.size());
stream.copyMemory(result.data(), devicePointer2.get(), origin.size());
stream.synchronize();
verify(result, "cccccccccc", 10);

If you need to queue up a host-host memcpy on a stream, this will have lower overhead than using launchHostFunc(). This version is also sometimes useful to call implicitly via overload resolution in templated code.
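
For example (a sketch with hypothetical host buffers; count is again treated as an element count), a host-to-host copy can be queued directly instead of wrapping a memcpy in launchHostFunc():

Example

std::vector<int> staging(1024, 42);
std::vector<int> destination(1024);
// Queue the host-to-host copy on the stream; it runs in stream order, after the work already queued.
stream.copyMemory(destination.data(), staging.data(), staging.size());
stream.launchHostFunc([&]() {
    // By the time this runs, destination has been populated.
    processOnHost(destination);  // hypothetical host function
});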

◆ getDevice()

Device & sp::Stream::getDevice ( ) const

Get a reference to the device this stream is associated with.

◆ getPriority()

int sp::Stream::getPriority ( ) const

Query the stream's priority.

See also
cudaStreamGetPriority

◆ isIdle()

bool sp::Stream::isIdle ( ) const

Return true if the stream has no ongoing GPU jobs.

See also
cudaStreamQuery
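
A small sketch of the polling pattern this enables (the kernel and the host-side work are hypothetical):

Example

sp::Stream stream = gpu.createStream();
longRunningKernel<<<64, 256, 0, stream>>>();  // hypothetical kernel
// Do useful host work while the GPU is still busy, instead of blocking in synchronize().
while (!stream.isIdle()) {
    doSomeHostWork();                         // hypothetical host function
}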

◆ launchHostFunc()

const Stream & sp::Stream::launchHostFunc ( const std::function< void()> &  fn) const

Enqueue a host function (or closure) on the stream.

Adds a host-side function call to the stream's work queue. This is significantly cheaper than synchronising the stream and then calling the desired host function: you can keep queuing work beyond the host function, so there is no nasty synchronous stall on the host while you launch something else on the GPU, and the overhead of an asynchronously-queued host function is less than the cost of a full synchronisation.

The following example allocates a device buffer, uses a GPU kernel to populate it, copies the string back to a host-side std::string, and passes the host-side buffer to a verify() function to do some host-side processing on it.

All stream interactions shown are asynchronous: control will most likely reach the end of this example code before the GPU has done anything.

Example

void foo() {
    sp::Stream stream = gpu.createStream();
    // Use a string as a host-side buffer/destination
    std::string buffer = "aaaaaaaaaa";
    // Allocate appropriate memory on the device and pass a pointer to that memory to the kernel
    auto devicePointer = gpu.allocateMemory<char>(buffer.size());
    fill<<<1, 128, 0, stream>>>(devicePointer.get(), 'b', buffer.size()); // GPU memory now "bbbbbbbbbb"
    // Queue a copy from the GPU to our host buffer
    stream.copyMemory(buffer.data(), devicePointer.get(), buffer.size());
    // Queue a host-side call that uses the data in the buffer, which should be filled once this is called
    stream.launchHostFunc([&]() {
        verify(buffer, "bbbbbbbbbb", 10);
    });
    // Could queue more stuff here, and it would occur right after the above host function runs, with far less
    // delay than if we had done `cudaStreamSynchronise(stream); someHostFunc(); someMoreKernels<<<...>>>(...);`.
}
Note
On NVIDIA targets, the given lambda is subject to the same restrictions documented for cudaLaunchHostFunc (it isn't allowed to use most CUDA APIs).
See also
cudaLaunchHostFunc

◆ launchKernel()

template<typename... KernelArgTypes>
const Stream & sp::Stream::launchKernel ( void  kernelFunction(KernelArgTypes...),
dim3  gridDim,
dim3  blockDim,
int  dynamicSMem,
KernelArgTypes...  kernelArgs 
) const

Overload which takes the kernel arguments as a parameter pack rather than a single variable.
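
As a rough sketch (the kernel is hypothetical), this overload is equivalent to a triple-chevron launch on this stream, but takes the kernel, launch configuration, and arguments as ordinary function parameters:

Example

__global__ void spin(int iterations);  // hypothetical kernel

// Equivalent to: spin<<<dim3(4), dim3(128), 0, stream>>>(1000);
stream.launchKernel(spin, dim3(4), dim3(128), /*dynamicSMem=*/0, 1000);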

◆ recordBlockingEvent()

BlockingEvent sp::Stream::recordBlockingEvent ( std::string &  name) const

Record an event that the host can synchronise with.

Like recordEvent(), but the returned event is a BlockingEvent, allowing the host to synchronise with it using BlockingEvent::synchronize(). This is more expensive than a regular Event, and should only be used if you need the host-synchronisation feature.

Parameters
name   An optional name for the event. This name may appear in profilers, debuggers, or other tools.
See also
cudaEventRecord
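
A short sketch of synchronising the host with a single point in the stream (both kernels are hypothetical, and the returned type is assumed to be sp::BlockingEvent; a named std::string is used for the event name):

Example

sp::Stream stream = gpu.createStream();
produceResults<<<8, 256, 0, stream>>>();                       // hypothetical kernel
std::string eventName = "results-ready";
sp::BlockingEvent done = stream.recordBlockingEvent(eventName);
moreWork<<<8, 256, 0, stream>>>();                             // hypothetical kernel; keeps the stream busy
// Block the host only until the recorded point, not until the whole stream drains.
done.synchronize();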

◆ recordEvent()

Event sp::Stream::recordEvent ( std::string &  name) const

Create an event representing the completion of all tasks currently queued on this stream.

Enqueuing the returned event on another stream will make that stream wait for this stream to complete all tasks that were enqueued at the moment this function was called.

Parameters
name   An optional name for the event. This name may appear in profilers, debuggers, or other tools.
See also
cudaEventRecord

◆ synchronize()

const Stream & sp::Stream::synchronize ( ) const

Wait for all work on this stream to finish.

See also
cudaStreamSynchronize

◆ zeroMemory() [1/3]

template<typename T >
const Stream & sp::Stream::zeroMemory ( __device T *  dst,
size_t  elements 
) const

Enqueue a device memset on the stream that sets elements values starting at dst to 0.

Parameters
dst   Device memory pointer to begin at
elements   The number of elements to be zeroed
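
A minimal sketch (the accumulate kernel is hypothetical):

Example

sp::Stream stream = gpu.createStream();
auto counters = gpu.allocateMemory<unsigned int>(1024);
// Zero all 1024 counters on the device before the kernel starts incrementing them.
stream.zeroMemory(counters.get(), 1024);
accumulate<<<4, 256, 0, stream>>>(counters.get());  // hypothetical kernel
stream.synchronize();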

◆ zeroMemory() [2/3]

template<typename T >
const Stream & sp::Stream::zeroMemory ( __flat T *  dst,
size_t  elements 
) const

Enqueue a host or device memset on the stream that sets elements values starting at dst to 0.

Parameters
dst   Host or device memory pointer to begin at
elements   The number of elements to be zeroed

◆ zeroMemory() [3/3]

template<typename T >
const Stream & sp::Stream::zeroMemory ( T *  dst,
size_t  elements 
) const

Enqueue a host memset on the stream that sets elements values starting at dst to 0.

Parameters
dst   Host memory pointer to begin at
elements   The number of elements to be zeroed