Speclib  0.1.2
The library for writing better CUDA libraries
BoundEltWise< VectorSize, WarpSpreading, Indexer, OutputExprs > Struct Template Reference

Kernel functor for performing element-wise operations on a Tensor. More...

#include <ElementWise.hpp>

Inheritance diagram for BoundEltWise< VectorSize, WarpSpreading, Indexer, OutputExprs >:
[legend]

Public Types

using Grid = EltwiseGrid
 
typedef __type_pack_element< 0, OutputExprs... >::Expr FirstExpr
 
- Public Types inherited from sp::Kernel< BoundEltWise< VectorSize, WarpSpreading, Indexer, OutputExprs... >, EltwiseGrid >
using Grid = EltwiseGrid
 

Public Member Functions

__device__ int getWarpsPerGroup ()
 Get the size of the warpgroup. More...
 
__device__ int groupIdx ()
 Get the index of the warpgroup this thread is in. More...
 
__device__ int groupThreadIdx ()
 Get the ID of this thread within its warpgroup. More...
 
__device__ int getPhase1BatchesPerBlock ()
 Get the number of strip batches processed by each block, at each step, during phase 1. More...
 
__device__ int getStripBatchSize ()
 Get the size of a strip batch (also equal to the number of warpgroups in the block). More...
 
__device__ BoundEltWise (const OutputExprs &... alignedWork, Indexer indexer, int warpsPerGroup, int stripBatchesPerBlock, int stepsPerPhase2Strip)
 
__device__ void handleFallout (sp::Strip< Rank > strip)
 Process the fallout elements for the strip. Only one warp per strip should execute this function. More...
 
__device__ void processStripBatch (sp::Strip< Rank > strip)
 Process a strip batch. More...
 
__device__ void processStrip (sp::Strip< Rank > strip)
 Process a single strip using the entire GPU. More...
 
template<bool Complete>
__device__ void phase1 ()
 
__device__ void phase2 ()
 
__device__ void run ()
 
- Public Member Functions inherited from sp::Kernel< BoundEltWise< VectorSize, WarpSpreading, Indexer, OutputExprs... >, EltwiseGrid >
__device__ void operator() ()
 
__device__ bool isLastBlockHere (__device int *atomicCounter)
 A handy mechanism for determining if a block is the last one to reach a certain point. More...
 
__host__ void launch (sp::Vec< int, Grid::GridRank > numBlocks, sp::Vec< int, Grid::BlockRank > numThreads, sp::Stream &stream, int dynamicSMem=0)
 Do the kernel launch. More...
 
__host__ void launch (sp::Vec< int, Grid::GridRank > numBlocks, sp::Vec< int, Grid::BlockRank > numThreads, cudaStream_t stream, int dynamicSMem=0)
 Evil legacy wrapper that allows using bare streams. More...
 

Public Attributes

std::tuple< OutputExprs... > alignedWork
 The expressions to evaluate. More...
 
Indexer indexer
 
int warpsPerGroup
 How many warps in each warpgroup? Used to spread warps over multiple strips to handle small strips efficiently. More...
 
int stripBatchesPerBlock
 How many complete strip batches are assigned to each block. More...
 
int stepsPerPhase2Strip
 How many steps each thread takes for each strip during phase 2. More...
 

Static Public Attributes

constexpr static int Rank = FirstExpr::Rank
 
constexpr static int StripRank = Indexer::StripRank
 

Detailed Description

template<int VectorSize, bool WarpSpreading, typename Indexer, typename... OutputExprs>
struct BoundEltWise< VectorSize, WarpSpreading, Indexer, OutputExprs >

Kernel functor for performing element-wise operations on a Tensor.

Technically, this does a copy from a set of TensorExprs into writeable TensorLikes- this can express any element-wise operation by using a suitable TensorExpr which does the operation lazily on reading. This structure also allows more general operations such as rearranging data in memory, repeating along a new dimension, etc, in the same kernel.

This kernel uses a highly optimized execution strategy to minimize latency and overhead, while maximizing parallelism- typical applications are memory bandwidth-bound.

Template Parameters
WarpSpreadingWhether or not warpgroup spreading is in use. TODO ???
IndexerThe indexer type to use.
See also
Indexers.hpp
Template Parameters
OutputExprsThe expressions to evaluate in parallel.
See also
OutputExpr.hpp

Member Function Documentation

◆ getPhase1BatchesPerBlock()

template<int VectorSize, bool WarpSpreading, typename Indexer , typename... OutputExprs>
__device__ int BoundEltWise< VectorSize, WarpSpreading, Indexer, OutputExprs >::getPhase1BatchesPerBlock ( )

Get the number of strip batches processed by each block, at each step, during phase 1.

◆ getStripBatchSize()

template<int VectorSize, bool WarpSpreading, typename Indexer , typename... OutputExprs>
__device__ int BoundEltWise< VectorSize, WarpSpreading, Indexer, OutputExprs >::getStripBatchSize ( )

Get the size of a strip batch (also equal to the number of warpgroups in the block).

◆ getWarpsPerGroup()

template<int VectorSize, bool WarpSpreading, typename Indexer , typename... OutputExprs>
__device__ int BoundEltWise< VectorSize, WarpSpreading, Indexer, OutputExprs >::getWarpsPerGroup ( )

Get the size of the warpgroup.

◆ groupIdx()

template<int VectorSize, bool WarpSpreading, typename Indexer , typename... OutputExprs>
__device__ int BoundEltWise< VectorSize, WarpSpreading, Indexer, OutputExprs >::groupIdx ( )

Get the index of the warpgroup this thread is in.

◆ groupThreadIdx()

template<int VectorSize, bool WarpSpreading, typename Indexer , typename... OutputExprs>
__device__ int BoundEltWise< VectorSize, WarpSpreading, Indexer, OutputExprs >::groupThreadIdx ( )

Get the ID of this thread within its warpgroup.

◆ handleFallout()

template<int VectorSize, bool WarpSpreading, typename Indexer , typename... OutputExprs>
__device__ void BoundEltWise< VectorSize, WarpSpreading, Indexer, OutputExprs >::handleFallout ( sp::Strip< Rank >  strip)

Process the fallout elements for the strip. Only one warp per strip should execute this function.

◆ processStrip()

template<int VectorSize, bool WarpSpreading, typename Indexer , typename... OutputExprs>
__device__ void BoundEltWise< VectorSize, WarpSpreading, Indexer, OutputExprs >::processStrip ( sp::Strip< Rank >  strip)

Process a single strip using the entire GPU.

Parameters
stripThe strip.

◆ processStripBatch()

template<int VectorSize, bool WarpSpreading, typename Indexer , typename... OutputExprs>
__device__ void BoundEltWise< VectorSize, WarpSpreading, Indexer, OutputExprs >::processStripBatch ( sp::Strip< Rank >  strip)

Process a strip batch.

This function, when called by a whole block, will process one strip batch (each warpgroup will process a single strip).

Parameters
stripThe strip that this thread is participating in processing.

Member Data Documentation

◆ alignedWork

template<int VectorSize, bool WarpSpreading, typename Indexer , typename... OutputExprs>
std::tuple<OutputExprs...> BoundEltWise< VectorSize, WarpSpreading, Indexer, OutputExprs >::alignedWork

The expressions to evaluate.

◆ stepsPerPhase2Strip

template<int VectorSize, bool WarpSpreading, typename Indexer , typename... OutputExprs>
int BoundEltWise< VectorSize, WarpSpreading, Indexer, OutputExprs >::stepsPerPhase2Strip

How many steps each thread takes for each strip during phase 2.

◆ stripBatchesPerBlock

template<int VectorSize, bool WarpSpreading, typename Indexer , typename... OutputExprs>
int BoundEltWise< VectorSize, WarpSpreading, Indexer, OutputExprs >::stripBatchesPerBlock

How many complete strip batches are assigned to each block.

◆ warpsPerGroup

template<int VectorSize, bool WarpSpreading, typename Indexer , typename... OutputExprs>
int BoundEltWise< VectorSize, WarpSpreading, Indexer, OutputExprs >::warpsPerGroup

How many warps in each warpgroup? Used to spread warps over multiple strips to handle small strips efficiently.