cheshirekow  v0.1.0
Bitonic Sorting

Classes

class  mpblocks::cuda::bitonic::Sorter< KeyType, ValueType >
 A utility class for calculating properties of the bitonic sort kernels. More...
 

Functions

template<typename KeyType , typename ValueType >
__device__ void mpblocks::cuda::bitonic::compareSwap (KeyType &keyA, ValueType &valA, KeyType &keyB, ValueType &valB, Direction dir)
 implements a "comparator": compares to keys and swaps them if they are not in the desired order More...
 
template<typename KeyType >
__device__ void mpblocks::cuda::bitonic::compareSwap (KeyType &keyA, KeyType &keyB, Direction dir)
 compares to keys and swaps them if they are not in the desired order More...
 
template<typename KeyType , typename ValueType >
__global__ void mpblocks::cuda::bitonic::mergeGlobal (KeyType *d_DstKey, ValueType *d_DstVal, KeyType *d_SrcKey, ValueType *d_SrcVal, uint_t arrayLength, uint_t size, uint_t stride, Direction dir)
 sorts a bitonic series, this kernel is for a stride >= SHARED_SIZE_LIMIT More...
 
template<typename KeyType >
__global__ void mpblocks::cuda::bitonic::mergeGlobal (KeyType *d_DstKey, KeyType *d_SrcKey, uint_t arrayLength, uint_t size, uint_t stride, Direction dir)
 sorts a bitonic series, this kernel is for a stride >= SHARED_SIZE_LIMIT More...
 
template<typename KeyType , typename ValueType >
__global__ void mpblocks::cuda::bitonic::mergeShared (KeyType *d_DstKey, ValueType *d_DstVal, KeyType *d_SrcKey, ValueType *d_SrcVal, uint_t arrayLength, uint_t sharedLength, uint_t size, Direction dir)
 sorts a bitonic series, this kernel is for size > SHARED_SIZE_LIMIT and for a stride in [1, SHARED_SIZE_LIMIT/2] More...
 
template<typename KeyType >
__global__ void mpblocks::cuda::bitonic::mergeShared (KeyType *d_DstKey, KeyType *d_SrcKey, uint_t arrayLength, uint_t sharedLength, uint_t size, Direction dir)
 sorts a bitonic series, this kernel is for size > SHARED_SIZE_LIMIT and for a stride in [1, SHARED_SIZE_LIMIT/2] More...
 
template<typename KeyType >
__global__ void mpblocks::cuda::bitonic::prepare (KeyType *d_SrcKey, KeyType init, uint_t arrayLength)
 used when arrayLength is not a power of two, it writes to all values of d_SrcKey (which is an offset from of the actual source buffer) More...
 
template<typename KeyType , typename ValueType >
uint_t mpblocks::cuda::bitonic::sort (KeyType *d_DstKey, ValueType *d_DstVal, KeyType *d_SrcKey, ValueType *d_SrcVal, uint_t arrayLength, uint_t sharedLength, Direction dir, uint_t globalThread)
 kernel launcher, sorts an array of key/value pairs using the bitonic sort algorithm More...
 
template<typename KeyType >
uint_t mpblocks::cuda::bitonic::sort (KeyType *d_DstKey, KeyType *d_SrcKey, uint_t arrayLength, uint_t sharedLength, Direction dir, uint_t globalThread)
 kernel launcher, sorts an array of key/value pairs using the bitonic sort algorithm More...
 
template<typename KeyType , typename ValueType >
__global__ void mpblocks::cuda::bitonic::sortShared (KeyType *d_DstKey, ValueType *d_DstVal, KeyType *d_SrcKey, ValueType *d_SrcVal, uint_t arrayLength, Direction dir)
 single kernel (unified) bitonic sort More...
 
template<typename KeyType >
__global__ void mpblocks::cuda::bitonic::sortShared (KeyType *d_DstKey, KeyType *d_SrcKey, uint_t arrayLength, Direction dir)
 single kernel (unified) bitonic sort More...
 
template<typename KeyType , typename ValueType >
__global__ void mpblocks::cuda::bitonic::sortSharedInc (KeyType *d_DstKey, ValueType *d_DstVal, KeyType *d_SrcKey, ValueType *d_SrcVal, uint_t sharedLength)
 bottom level of the bitonic sort More...
 
template<typename KeyType >
__global__ void mpblocks::cuda::bitonic::sortSharedInc (KeyType *d_DstKey, KeyType *d_SrcKey, uint_t sharedLength)
 bottom level of the bitonic sort More...
 

Detailed Description

Contains all the GPU (CUDA) kernels used to implement the bitonic sort algorithm, as well as a driver class for launching sort operations

See Also
http://www.iti.fh-flensburg.de/lang/algorithmen/sortieren/bitonic/bitonicen.htm

Function Documentation

template<typename KeyType , typename ValueType >
__device__ void mpblocks::cuda::bitonic::compareSwap ( KeyType &  keyA,
ValueType &  valA,
KeyType &  keyB,
ValueType &  valB,
Direction  dir 
)
inline

implements a "comparator": compares to keys and swaps them if they are not in the desired order

there is no validation of this fact but it is required that the less-than operator (<) is defined for KeyType

Parameters
keyAthe first key to compare
valAvalue associated with the first key
keyBthe second key to compare
valBvalue associted with the second key
dirthe direction to sort, 1=ascending

Definition at line 52 of file kernels.cu.hpp.

template<typename KeyType >
__device__ void mpblocks::cuda::bitonic::compareSwap ( KeyType &  keyA,
KeyType &  keyB,
Direction  dir 
)
inline

compares to keys and swaps them if they are not in the desired order

there is no validation of this fact but it is required that the less-than operator (<) is defined for KeyType

Parameters
keyAthe first key to compare
keyBthe second key to compare
dirthe direction to sort, 1=ascending

Definition at line 80 of file kernels.cu.hpp.

template<typename KeyType , typename ValueType >
__global__ void mpblocks::cuda::bitonic::mergeGlobal ( KeyType *  d_DstKey,
ValueType *  d_DstVal,
KeyType *  d_SrcKey,
ValueType *  d_SrcVal,
uint_t  arrayLength,
uint_t  size,
uint_t  stride,
Direction  dir 
)

sorts a bitonic series, this kernel is for a stride >= SHARED_SIZE_LIMIT

Parameters
[out]d_DstKeyarray of sorted keys
[out]d_DstValarray of sorted values
[in]d_SrcKeybitonic array with split at size/2
[in]d_SrcValvalues associated with d_SrcKey
[in]arrayLengththe length of each array to sort
[in]dirwhether we should sort in ascending or descending

If A is an ascending sorted array and B is a descending sorted array, then [A,B] is a bitonic array. Here we merge A and B into a single sorted array

Note: usually this kernel is called with d_Dst... = d_Src... as it just merges results already stored in the destination buffer

Note: this kernel is used when the stride is too large to copy everything into shared memory, each thread just copies the two values they need to compare into global memory, performs the comparison/swap and then writes the results back to global memory

Definition at line 558 of file kernels.cu.hpp.

template<typename KeyType >
__global__ void mpblocks::cuda::bitonic::mergeGlobal ( KeyType *  d_DstKey,
KeyType *  d_SrcKey,
uint_t  arrayLength,
uint_t  size,
uint_t  stride,
Direction  dir 
)

sorts a bitonic series, this kernel is for a stride >= SHARED_SIZE_LIMIT

Parameters
[out]d_DstKeyarray of sorted keys
[in]d_SrcKeybitonic array with split at size/2
[in]arrayLengththe length of each array to sort
[in]dirwhether we should sort in ascending or descending

If A is an ascending sorted array and B is a descending sorted array, then [A,B] is a bitonic array. Here we merge A and B into a single sorted array

Note: usually this kernel is called with d_Dst... = d_Src... as it just merges results already stored in the destination buffer

Note: this kernel is used when the stride is too large to copy everything into shared memory, each thread just copies the two values they need to compare into global memory, performs the comparison/swap and then writes the results back to global memory

Definition at line 630 of file kernels.cu.hpp.

template<typename KeyType , typename ValueType >
__global__ void mpblocks::cuda::bitonic::mergeShared ( KeyType *  d_DstKey,
ValueType *  d_DstVal,
KeyType *  d_SrcKey,
ValueType *  d_SrcVal,
uint_t  arrayLength,
uint_t  sharedLength,
uint_t  size,
Direction  dir 
)

sorts a bitonic series, this kernel is for size > SHARED_SIZE_LIMIT and for a stride in [1, SHARED_SIZE_LIMIT/2]

Parameters
[out]d_DstKeyarray of sorted keys
[out]d_DstValarray of sorted values
[in]d_SrcKeybitonic array with split at size/2
[in]d_SrcValvalues associated with d_SrcKey
[in]arrayLengththe length of each array to sort
[in]sizethe stride betwen two elements a comparator works on
[in]dirwhether we should sort in ascending or descending

If A is an ascending sorted array and B is a descending sorted array, then [A,B] is a bitonic array. Here we merge A and B into a single sorted array

Note: usually this kernel is called with d_Dst... = d_Src... as it just merges results already stored in the destination buffer

Note: this kernel is used when the stride is small enough to copy everything it needs into global memory. It is assumed that the stride is actually SHARED_SIZE_LIMIT/2 (exactly) and this kernel will perform all iterations for strides smaller than the initial until stride = 0;

Definition at line 699 of file kernels.cu.hpp.

template<typename KeyType >
__global__ void mpblocks::cuda::bitonic::mergeShared ( KeyType *  d_DstKey,
KeyType *  d_SrcKey,
uint_t  arrayLength,
uint_t  sharedLength,
uint_t  size,
Direction  dir 
)

sorts a bitonic series, this kernel is for size > SHARED_SIZE_LIMIT and for a stride in [1, SHARED_SIZE_LIMIT/2]

Parameters
[out]d_DstKeyarray of sorted keys
[in]d_SrcKeybitonic array with split at size/2
[in]arrayLengththe length of each array to sort
[in]sizethe stride betwen two elements a comparator works on
[in]dirwhether we should sort in ascending or descending

If A is an ascending sorted array and B is a descending sorted array, then [A,B] is a bitonic array. Here we merge A and B into a single sorted array

Note: usually this kernel is called with d_Dst... = d_Src... as it just merges results already stored in the destination buffer

Note: this kernel is used when the stride is small enough to copy everything it needs into global memory. It is assumed that the stride is actually SHARED_SIZE_LIMIT/2 (exactly) and this kernel will perform all iterations for strides smaller than the initial until stride = 0;

Definition at line 792 of file kernels.cu.hpp.

template<typename KeyType >
__global__ void mpblocks::cuda::bitonic::prepare ( KeyType *  d_SrcKey,
KeyType  init,
uint_t  arrayLength 
)

used when arrayLength is not a power of two, it writes to all values of d_SrcKey (which is an offset from of the actual source buffer)

Parameters
[in]d_SrcKeyoffset with buffer where padding starts
[in]initvalue to write to all the overflow keys
[in]arrayLengthnumber of values to write

Definition at line 1147 of file kernels.cu.hpp.

template<typename KeyType , typename ValueType >
uint_t mpblocks::cuda::bitonic::sort ( KeyType *  d_DstKey,
ValueType *  d_DstVal,
KeyType *  d_SrcKey,
ValueType *  d_SrcVal,
uint_t  arrayLength,
uint_t  sharedLength,
Direction  dir,
uint_t  globalThread 
)

kernel launcher, sorts an array of key/value pairs using the bitonic sort algorithm

Parameters
[out]d_DstKeyarray of sorted keys
[out]d_DstValarray of sorted values
[in]d_SrcKeyarray of unsorted keys
[in]d_SrcValarray of unsorted values
[in]arrayLengththe length of each array to sort
[in]sharedLengthnumber of elements to store in shared arrays
[in]dirwhether we should sort ascending or descending
[in]globalThreadnumber of threads per block for global merge
See Also
http://www.iti.fh-flensburg.de/lang/algorithmen/sortieren/bitonic/bitonicen.htm

Definition at line 870 of file kernels.cu.hpp.

template<typename KeyType >
uint_t mpblocks::cuda::bitonic::sort ( KeyType *  d_DstKey,
KeyType *  d_SrcKey,
uint_t  arrayLength,
uint_t  sharedLength,
Direction  dir,
uint_t  globalThread 
)

kernel launcher, sorts an array of key/value pairs using the bitonic sort algorithm

Parameters
[out]d_DstKeyarray of sorted keys
[in]d_SrcKeyarray of unsorted keys
[in]arrayLengththe length of each array to sort
[in]sharedLengthnumber of elements to store in shared arrays
[in]dirwhether we should sort ascending or descending
[in]globalThreadnumber of threads per block for global merge
See Also
http://www.iti.fh-flensburg.de/lang/algorithmen/sortieren/bitonic/bitonicen.htm

Definition at line 1012 of file kernels.cu.hpp.

template<typename KeyType , typename ValueType >
__global__ void mpblocks::cuda::bitonic::sortShared ( KeyType *  d_DstKey,
ValueType *  d_DstVal,
KeyType *  d_SrcKey,
ValueType *  d_SrcVal,
uint_t  arrayLength,
Direction  dir 
)

single kernel (unified) bitonic sort

If the entire array to be sorted fits in shared memory, then we can perform the entire operation with only one kernel call (that's this kernel). If the entire array does not fit in shared memory then some of the comparator networks will have a stride large enough to cross block boundaries, so we have to divide and conquor (see the other kernels for this method)

Definition at line 108 of file kernels.cu.hpp.

template<typename KeyType >
__global__ void mpblocks::cuda::bitonic::sortShared ( KeyType *  d_DstKey,
KeyType *  d_SrcKey,
uint_t  arrayLength,
Direction  dir 
)

single kernel (unified) bitonic sort

If the entire array to be sorted fits in shared memory, then we can perform the entire operation with only one kernel call (that's this kernel). If the entire array does not fit in shared memory then some of the comparator networks will have a stride large enough to cross block boundaries, so we have to divide and conquor (see the other kernels for this method)

Definition at line 191 of file kernels.cu.hpp.

template<typename KeyType , typename ValueType >
__global__ void mpblocks::cuda::bitonic::sortSharedInc ( KeyType *  d_DstKey,
ValueType *  d_DstVal,
KeyType *  d_SrcKey,
ValueType *  d_SrcVal,
uint_t  sharedLength 
)

bottom level of the bitonic sort

Since this kernel works in shared memory we'd like to use it as much as possible, so what we do is divide up the entire array that we want to sort and only sort it in sections that are small enough to fit in shared memory. However, we sort every other block in a different direction. As a result, each pair of results forms a bitonic series. We can then efficiently merge each pair of blocks into a sorted series, which we continue doing until the entire array is sorted.

Note: the next stage (Bitonic merge) accepts both ascending | descending and descending | ascending bitonic series

Definition at line 283 of file kernels.cu.hpp.

template<typename KeyType >
__global__ void mpblocks::cuda::bitonic::sortSharedInc ( KeyType *  d_DstKey,
KeyType *  d_SrcKey,
uint_t  sharedLength 
)

bottom level of the bitonic sort

Since this kernel works in shared memory we'd like to use it as much as possible, so what we do is divide up the entire array that we want to sort and only sort it in sections that are small enough to fit in shared memory. However, we sort every other block in a different direction. As a result, each pair of results forms a bitonic series. We can then efficiently merge each pair of blocks into a sorted series, which we continue doing until the entire array is sorted.

Note: the next stage (Bitonic merge) accepts both ascending | descending and descending | ascending bitonic series

Definition at line 421 of file kernels.cu.hpp.