Kernel functor for performing element-wise operations on a Tensor. More...

#include <ElementWise.hpp>

Inheritance diagram for BoundEltWise< VectorSize, WarpSpreading, Indexer, OutputExprs >:

Public Types
using	Grid = EltwiseGrid

typedef __type_pack_element< 0, OutputExprs... >::Expr	FirstExpr

Public Types inherited from sp::Kernel< BoundEltWise< VectorSize, WarpSpreading, Indexer, OutputExprs... >, EltwiseGrid >
using	Grid = EltwiseGrid

Public Member Functions
__device__ int	getWarpsPerGroup ()
	Get the size of the warpgroup. More...

__device__ int	groupIdx ()
	Get the index of the warpgroup this thread is in. More...

__device__ int	groupThreadIdx ()
	Get the ID of this thread within its warpgroup. More...

__device__ int	getPhase1BatchesPerBlock ()
	Get the number of strip batches processed by each block, at each step, during phase 1. More...

__device__ int	getStripBatchSize ()
	Get the size of a strip batch (also equal to the number of warpgroups in the block). More...

__device__	BoundEltWise (const OutputExprs &... alignedWork, Indexer indexer, int warpsPerGroup, int stripBatchesPerBlock, int stepsPerPhase2Strip)

__device__ void	handleFallout (sp::Strip< Rank > strip)
	Process the fallout elements for the strip. Only one warp per strip should execute this function. More...

__device__ void	processStripBatch (sp::Strip< Rank > strip)
	Process a strip batch. More...

__device__ void	processStrip (sp::Strip< Rank > strip)
	Process a single strip using the entire GPU. More...

template<bool Complete>
__device__ void	phase1 ()

__device__ void	phase2 ()

__device__ void	run ()

Public Member Functions inherited from sp::Kernel< BoundEltWise< VectorSize, WarpSpreading, Indexer, OutputExprs... >, EltwiseGrid >
__device__ void	operator() ()

__device__ bool	isLastBlockHere (__device int *atomicCounter)
	A handy mechanism for determining if a block is the last one to reach a certain point. More...

__host__ void	launch (sp::Vec< int, Grid::GridRank > numBlocks, sp::Vec< int, Grid::BlockRank > numThreads, sp::Stream &stream, int dynamicSMem=0)
	Do the kernel launch. More...

__host__ void	launch (sp::Vec< int, Grid::GridRank > numBlocks, sp::Vec< int, Grid::BlockRank > numThreads, cudaStream_t stream, int dynamicSMem=0)
	Evil legacy wrapper that allows using bare streams. More...

Public Attributes
std::tuple< OutputExprs... >	alignedWork
	The expressions to evaluate. More...

Indexer	indexer

int	warpsPerGroup
	How many warps in each warpgroup? Used to spread warps over multiple strips to handle small strips efficiently. More...

int	stripBatchesPerBlock
	How many complete strip batches are assigned to each block. More...

int	stepsPerPhase2Strip
	How many steps each thread takes for each strip during phase 2. More...

Static Public Attributes
constexpr static int	Rank = FirstExpr::Rank

constexpr static int	StripRank = Indexer::StripRank

Detailed Description

template<int VectorSize, bool WarpSpreading, typename Indexer, typename... OutputExprs>
struct BoundEltWise< VectorSize, WarpSpreading, Indexer, OutputExprs >

Kernel functor for performing element-wise operations on a Tensor.

Technically, this does a copy from a set of TensorExprs into writeable TensorLikes- this can express any element-wise operation by using a suitable TensorExpr which does the operation lazily on reading. This structure also allows more general operations such as rearranging data in memory, repeating along a new dimension, etc, in the same kernel.

This kernel uses a highly optimized execution strategy to minimize latency and overhead, while maximizing parallelism- typical applications are memory bandwidth-bound.

Template Parameters

WarpSpreading	Whether or not warpgroup spreading is in use. TODO ???
Indexer	The indexer type to use.

See also: Indexers.hpp

Template Parameters

OutputExprs The expressions to evaluate in parallel.

See also: OutputExpr.hpp

Member Function Documentation

◆ getPhase1BatchesPerBlock()

template<int VectorSize, bool WarpSpreading, typename Indexer , typename... OutputExprs>

__device__ int BoundEltWise< VectorSize, WarpSpreading, Indexer, OutputExprs >::getPhase1BatchesPerBlock ( )

Get the number of strip batches processed by each block, at each step, during phase 1.

◆ getStripBatchSize()

template<int VectorSize, bool WarpSpreading, typename Indexer , typename... OutputExprs>

__device__ int BoundEltWise< VectorSize, WarpSpreading, Indexer, OutputExprs >::getStripBatchSize ( )

Get the size of a strip batch (also equal to the number of warpgroups in the block).

◆ getWarpsPerGroup()

template<int VectorSize, bool WarpSpreading, typename Indexer , typename... OutputExprs>

__device__ int BoundEltWise< VectorSize, WarpSpreading, Indexer, OutputExprs >::getWarpsPerGroup ( )

Get the size of the warpgroup.

◆ groupIdx()

template<int VectorSize, bool WarpSpreading, typename Indexer , typename... OutputExprs>

__device__ int BoundEltWise< VectorSize, WarpSpreading, Indexer, OutputExprs >::groupIdx ( )

Get the index of the warpgroup this thread is in.

◆ groupThreadIdx()

template<int VectorSize, bool WarpSpreading, typename Indexer , typename... OutputExprs>

__device__ int BoundEltWise< VectorSize, WarpSpreading, Indexer, OutputExprs >::groupThreadIdx ( )

Get the ID of this thread within its warpgroup.

◆ handleFallout()

template<int VectorSize, bool WarpSpreading, typename Indexer , typename... OutputExprs>

__device__ void BoundEltWise< VectorSize, WarpSpreading, Indexer, OutputExprs >::handleFallout ( sp::Strip< Rank > strip )

Process the fallout elements for the strip. Only one warp per strip should execute this function.

◆ processStrip()

template<int VectorSize, bool WarpSpreading, typename Indexer , typename... OutputExprs>

__device__ void BoundEltWise< VectorSize, WarpSpreading, Indexer, OutputExprs >::processStrip ( sp::Strip< Rank > strip )

Process a single strip using the entire GPU.

Parameters

strip The strip.

◆ processStripBatch()

template<int VectorSize, bool WarpSpreading, typename Indexer , typename... OutputExprs>

__device__ void BoundEltWise< VectorSize, WarpSpreading, Indexer, OutputExprs >::processStripBatch ( sp::Strip< Rank > strip )

Process a strip batch.

This function, when called by a whole block, will process one strip batch (each warpgroup will process a single strip).

Parameters

strip The strip that this thread is participating in processing.

Member Data Documentation

◆ alignedWork

template<int VectorSize, bool WarpSpreading, typename Indexer , typename... OutputExprs>

std::tuple<OutputExprs...> BoundEltWise< VectorSize, WarpSpreading, Indexer, OutputExprs >::alignedWork

The expressions to evaluate.

◆ stepsPerPhase2Strip

template<int VectorSize, bool WarpSpreading, typename Indexer , typename... OutputExprs>

int BoundEltWise< VectorSize, WarpSpreading, Indexer, OutputExprs >::stepsPerPhase2Strip

How many steps each thread takes for each strip during phase 2.

◆ stripBatchesPerBlock

template<int VectorSize, bool WarpSpreading, typename Indexer , typename... OutputExprs>

int BoundEltWise< VectorSize, WarpSpreading, Indexer, OutputExprs >::stripBatchesPerBlock

How many complete strip batches are assigned to each block.

◆ warpsPerGroup

template<int VectorSize, bool WarpSpreading, typename Indexer , typename... OutputExprs>

int BoundEltWise< VectorSize, WarpSpreading, Indexer, OutputExprs >::warpsPerGroup

How many warps in each warpgroup? Used to spread warps over multiple strips to handle small strips efficiently.

Public Types

Public Member Functions

Public Attributes

Static Public Attributes

Detailed Description

Member Function Documentation

◆ getPhase1BatchesPerBlock()

◆ getStripBatchSize()

◆ getWarpsPerGroup()

◆ groupIdx()

◆ groupThreadIdx()

◆ handleFallout()

◆ processStrip()

◆ processStripBatch()

Member Data Documentation

◆ alignedWork

◆ stepsPerPhase2Strip

◆ stripBatchesPerBlock

◆ warpsPerGroup