Classes
class	mpblocks::cuda::bitonic::Sorter< KeyType, ValueType >
	A utility class for calculating properties of the bitonic sort kernels. More...

Functions
template<typename KeyType , typename ValueType >
__device__ void	mpblocks::cuda::bitonic::compareSwap (KeyType &keyA, ValueType &valA, KeyType &keyB, ValueType &valB, Direction dir)
	implements a "comparator": compares to keys and swaps them if they are not in the desired order More...

template<typename KeyType >
__device__ void	mpblocks::cuda::bitonic::compareSwap (KeyType &keyA, KeyType &keyB, Direction dir)
	compares to keys and swaps them if they are not in the desired order More...

template<typename KeyType , typename ValueType >
__global__ void	mpblocks::cuda::bitonic::mergeGlobal (KeyType d_DstKey, ValueType d_DstVal, KeyType d_SrcKey, ValueType d_SrcVal, uint_t arrayLength, uint_t size, uint_t stride, Direction dir)
	sorts a bitonic series, this kernel is for a stride >= SHARED_SIZE_LIMIT More...

template<typename KeyType >
__global__ void	mpblocks::cuda::bitonic::mergeGlobal (KeyType d_DstKey, KeyType d_SrcKey, uint_t arrayLength, uint_t size, uint_t stride, Direction dir)
	sorts a bitonic series, this kernel is for a stride >= SHARED_SIZE_LIMIT More...

template<typename KeyType , typename ValueType >
__global__ void	mpblocks::cuda::bitonic::mergeShared (KeyType d_DstKey, ValueType d_DstVal, KeyType d_SrcKey, ValueType d_SrcVal, uint_t arrayLength, uint_t sharedLength, uint_t size, Direction dir)
	sorts a bitonic series, this kernel is for size > SHARED_SIZE_LIMIT and for a stride in [1, SHARED_SIZE_LIMIT/2] More...

template<typename KeyType >
__global__ void	mpblocks::cuda::bitonic::mergeShared (KeyType d_DstKey, KeyType d_SrcKey, uint_t arrayLength, uint_t sharedLength, uint_t size, Direction dir)
	sorts a bitonic series, this kernel is for size > SHARED_SIZE_LIMIT and for a stride in [1, SHARED_SIZE_LIMIT/2] More...

template<typename KeyType >
__global__ void	mpblocks::cuda::bitonic::prepare (KeyType *d_SrcKey, KeyType init, uint_t arrayLength)
	used when arrayLength is not a power of two, it writes to all values of d_SrcKey (which is an offset from of the actual source buffer) More...

template<typename KeyType , typename ValueType >
uint_t	mpblocks::cuda::bitonic::sort (KeyType d_DstKey, ValueType d_DstVal, KeyType d_SrcKey, ValueType d_SrcVal, uint_t arrayLength, uint_t sharedLength, Direction dir, uint_t globalThread)
	kernel launcher, sorts an array of key/value pairs using the bitonic sort algorithm More...

template<typename KeyType >
uint_t	mpblocks::cuda::bitonic::sort (KeyType d_DstKey, KeyType d_SrcKey, uint_t arrayLength, uint_t sharedLength, Direction dir, uint_t globalThread)
	kernel launcher, sorts an array of key/value pairs using the bitonic sort algorithm More...

template<typename KeyType , typename ValueType >
__global__ void	mpblocks::cuda::bitonic::sortShared (KeyType d_DstKey, ValueType d_DstVal, KeyType d_SrcKey, ValueType d_SrcVal, uint_t arrayLength, Direction dir)
	single kernel (unified) bitonic sort More...

template<typename KeyType >
__global__ void	mpblocks::cuda::bitonic::sortShared (KeyType d_DstKey, KeyType d_SrcKey, uint_t arrayLength, Direction dir)
	single kernel (unified) bitonic sort More...

template<typename KeyType , typename ValueType >
__global__ void	mpblocks::cuda::bitonic::sortSharedInc (KeyType d_DstKey, ValueType d_DstVal, KeyType d_SrcKey, ValueType d_SrcVal, uint_t sharedLength)
	bottom level of the bitonic sort More...

template<typename KeyType >
__global__ void	mpblocks::cuda::bitonic::sortSharedInc (KeyType d_DstKey, KeyType d_SrcKey, uint_t sharedLength)
	bottom level of the bitonic sort More...

Detailed Description

Contains all the GPU (CUDA) kernels used to implement the bitonic sort algorithm, as well as a driver class for launching sort operations

See Also: http://www.iti.fh-flensburg.de/lang/algorithmen/sortieren/bitonic/bitonicen.htm

Function Documentation

template<typename KeyType , typename ValueType >

__device__ void mpblocks::cuda::bitonic::compareSwap	(	KeyType &	keyA,
		ValueType &	valA,
		KeyType &	keyB,
		ValueType &	valB,
		Direction	dir
	)

inline

implements a "comparator": compares to keys and swaps them if they are not in the desired order

there is no validation of this fact but it is required that the less-than operator (<) is defined for KeyType

Parameters

keyA	the first key to compare
valA	value associated with the first key
keyB	the second key to compare
valB	value associted with the second key
dir	the direction to sort, 1=ascending

Definition at line 52 of file kernels.cu.hpp.

template<typename KeyType >

__device__ void mpblocks::cuda::bitonic::compareSwap	(	KeyType &	keyA,
		KeyType &	keyB,
		Direction	dir
	)

inline

compares to keys and swaps them if they are not in the desired order

there is no validation of this fact but it is required that the less-than operator (<) is defined for KeyType

Parameters

keyA	the first key to compare
keyB	the second key to compare
dir	the direction to sort, 1=ascending

Definition at line 80 of file kernels.cu.hpp.

template<typename KeyType , typename ValueType >

__global__ void mpblocks::cuda::bitonic::mergeGlobal	(	KeyType *	d_DstKey,
		ValueType *	d_DstVal,
		KeyType *	d_SrcKey,
		ValueType *	d_SrcVal,
		uint_t	arrayLength,
		uint_t	size,
		uint_t	stride,
		Direction	dir
	)

sorts a bitonic series, this kernel is for a stride >= SHARED_SIZE_LIMIT

Parameters

[out]	d_DstKey	array of sorted keys
[out]	d_DstVal	array of sorted values
[in]	d_SrcKey	bitonic array with split at size/2
[in]	d_SrcVal	values associated with d_SrcKey
[in]	arrayLength	the length of each array to sort
[in]	dir	whether we should sort in ascending or descending

If A is an ascending sorted array and B is a descending sorted array, then [A,B] is a bitonic array. Here we merge A and B into a single sorted array

Note: usually this kernel is called with d_Dst... = d_Src... as it just merges results already stored in the destination buffer

Note: this kernel is used when the stride is too large to copy everything into shared memory, each thread just copies the two values they need to compare into global memory, performs the comparison/swap and then writes the results back to global memory

Definition at line 558 of file kernels.cu.hpp.

template<typename KeyType >

__global__ void mpblocks::cuda::bitonic::mergeGlobal	(	KeyType *	d_DstKey,
		KeyType *	d_SrcKey,
		uint_t	arrayLength,
		uint_t	size,
		uint_t	stride,
		Direction	dir
	)

sorts a bitonic series, this kernel is for a stride >= SHARED_SIZE_LIMIT

Parameters

[out]	d_DstKey	array of sorted keys
[in]	d_SrcKey	bitonic array with split at size/2
[in]	arrayLength	the length of each array to sort
[in]	dir	whether we should sort in ascending or descending

If A is an ascending sorted array and B is a descending sorted array, then [A,B] is a bitonic array. Here we merge A and B into a single sorted array

Note: usually this kernel is called with d_Dst... = d_Src... as it just merges results already stored in the destination buffer

Note: this kernel is used when the stride is too large to copy everything into shared memory, each thread just copies the two values they need to compare into global memory, performs the comparison/swap and then writes the results back to global memory

Definition at line 630 of file kernels.cu.hpp.

template<typename KeyType , typename ValueType >

__global__ void mpblocks::cuda::bitonic::mergeShared	(	KeyType *	d_DstKey,
		ValueType *	d_DstVal,
		KeyType *	d_SrcKey,
		ValueType *	d_SrcVal,
		uint_t	arrayLength,
		uint_t	sharedLength,
		uint_t	size,
		Direction	dir
	)

sorts a bitonic series, this kernel is for size > SHARED_SIZE_LIMIT and for a stride in [1, SHARED_SIZE_LIMIT/2]

Parameters

[out]	d_DstKey	array of sorted keys
[out]	d_DstVal	array of sorted values
[in]	d_SrcKey	bitonic array with split at size/2
[in]	d_SrcVal	values associated with d_SrcKey
[in]	arrayLength	the length of each array to sort
[in]	size	the stride betwen two elements a comparator works on
[in]	dir	whether we should sort in ascending or descending

If A is an ascending sorted array and B is a descending sorted array, then [A,B] is a bitonic array. Here we merge A and B into a single sorted array

Note: usually this kernel is called with d_Dst... = d_Src... as it just merges results already stored in the destination buffer

Note: this kernel is used when the stride is small enough to copy everything it needs into global memory. It is assumed that the stride is actually SHARED_SIZE_LIMIT/2 (exactly) and this kernel will perform all iterations for strides smaller than the initial until stride = 0;

Definition at line 699 of file kernels.cu.hpp.

template<typename KeyType >

__global__ void mpblocks::cuda::bitonic::mergeShared	(	KeyType *	d_DstKey,
		KeyType *	d_SrcKey,
		uint_t	arrayLength,
		uint_t	sharedLength,
		uint_t	size,
		Direction	dir
	)

sorts a bitonic series, this kernel is for size > SHARED_SIZE_LIMIT and for a stride in [1, SHARED_SIZE_LIMIT/2]

Parameters

[out]	d_DstKey	array of sorted keys
[in]	d_SrcKey	bitonic array with split at size/2
[in]	arrayLength	the length of each array to sort
[in]	size	the stride betwen two elements a comparator works on
[in]	dir	whether we should sort in ascending or descending

If A is an ascending sorted array and B is a descending sorted array, then [A,B] is a bitonic array. Here we merge A and B into a single sorted array

Note: usually this kernel is called with d_Dst... = d_Src... as it just merges results already stored in the destination buffer

Note: this kernel is used when the stride is small enough to copy everything it needs into global memory. It is assumed that the stride is actually SHARED_SIZE_LIMIT/2 (exactly) and this kernel will perform all iterations for strides smaller than the initial until stride = 0;

Definition at line 792 of file kernels.cu.hpp.

template<typename KeyType >

__global__ void mpblocks::cuda::bitonic::prepare	(	KeyType *	d_SrcKey,
		KeyType	init,
		uint_t	arrayLength
	)

used when arrayLength is not a power of two, it writes to all values of d_SrcKey (which is an offset from of the actual source buffer)

Parameters

[in]	d_SrcKey	offset with buffer where padding starts
[in]	init	value to write to all the overflow keys
[in]	arrayLength	number of values to write

Definition at line 1147 of file kernels.cu.hpp.

template<typename KeyType , typename ValueType >

uint_t mpblocks::cuda::bitonic::sort	(	KeyType *	d_DstKey,
		ValueType *	d_DstVal,
		KeyType *	d_SrcKey,
		ValueType *	d_SrcVal,
		uint_t	arrayLength,
		uint_t	sharedLength,
		Direction	dir,
		uint_t	globalThread
	)

kernel launcher, sorts an array of key/value pairs using the bitonic sort algorithm

Parameters

[out]	d_DstKey	array of sorted keys
[out]	d_DstVal	array of sorted values
[in]	d_SrcKey	array of unsorted keys
[in]	d_SrcVal	array of unsorted values
[in]	arrayLength	the length of each array to sort
[in]	sharedLength	number of elements to store in shared arrays
[in]	dir	whether we should sort ascending or descending
[in]	globalThread	number of threads per block for global merge

See Also: http://www.iti.fh-flensburg.de/lang/algorithmen/sortieren/bitonic/bitonicen.htm

Definition at line 870 of file kernels.cu.hpp.

template<typename KeyType >

uint_t mpblocks::cuda::bitonic::sort	(	KeyType *	d_DstKey,
		KeyType *	d_SrcKey,
		uint_t	arrayLength,
		uint_t	sharedLength,
		Direction	dir,
		uint_t	globalThread
	)

kernel launcher, sorts an array of key/value pairs using the bitonic sort algorithm

Parameters

[out]	d_DstKey	array of sorted keys
[in]	d_SrcKey	array of unsorted keys
[in]	arrayLength	the length of each array to sort
[in]	sharedLength	number of elements to store in shared arrays
[in]	dir	whether we should sort ascending or descending
[in]	globalThread	number of threads per block for global merge

See Also: http://www.iti.fh-flensburg.de/lang/algorithmen/sortieren/bitonic/bitonicen.htm

Definition at line 1012 of file kernels.cu.hpp.

template<typename KeyType , typename ValueType >

__global__ void mpblocks::cuda::bitonic::sortShared	(	KeyType *	d_DstKey,
		ValueType *	d_DstVal,
		KeyType *	d_SrcKey,
		ValueType *	d_SrcVal,
		uint_t	arrayLength,
		Direction	dir
	)

single kernel (unified) bitonic sort

If the entire array to be sorted fits in shared memory, then we can perform the entire operation with only one kernel call (that's this kernel). If the entire array does not fit in shared memory then some of the comparator networks will have a stride large enough to cross block boundaries, so we have to divide and conquor (see the other kernels for this method)

Definition at line 108 of file kernels.cu.hpp.

template<typename KeyType >

__global__ void mpblocks::cuda::bitonic::sortShared	(	KeyType *	d_DstKey,
		KeyType *	d_SrcKey,
		uint_t	arrayLength,
		Direction	dir
	)

single kernel (unified) bitonic sort

If the entire array to be sorted fits in shared memory, then we can perform the entire operation with only one kernel call (that's this kernel). If the entire array does not fit in shared memory then some of the comparator networks will have a stride large enough to cross block boundaries, so we have to divide and conquor (see the other kernels for this method)

Definition at line 191 of file kernels.cu.hpp.

template<typename KeyType , typename ValueType >

__global__ void mpblocks::cuda::bitonic::sortSharedInc	(	KeyType *	d_DstKey,
		ValueType *	d_DstVal,
		KeyType *	d_SrcKey,
		ValueType *	d_SrcVal,
		uint_t	sharedLength
	)

bottom level of the bitonic sort

Since this kernel works in shared memory we'd like to use it as much as possible, so what we do is divide up the entire array that we want to sort and only sort it in sections that are small enough to fit in shared memory. However, we sort every other block in a different direction. As a result, each pair of results forms a bitonic series. We can then efficiently merge each pair of blocks into a sorted series, which we continue doing until the entire array is sorted.

Note: the next stage (Bitonic merge) accepts both ascending | descending and descending | ascending bitonic series

Definition at line 283 of file kernels.cu.hpp.

template<typename KeyType >

__global__ void mpblocks::cuda::bitonic::sortSharedInc	(	KeyType *	d_DstKey,
		KeyType *	d_SrcKey,
		uint_t	sharedLength
	)

bottom level of the bitonic sort

Since this kernel works in shared memory we'd like to use it as much as possible, so what we do is divide up the entire array that we want to sort and only sort it in sections that are small enough to fit in shared memory. However, we sort every other block in a different direction. As a result, each pair of results forms a bitonic series. We can then efficiently merge each pair of blocks into a sorted series, which we continue doing until the entire array is sorted.

Note: the next stage (Bitonic merge) accepts both ascending | descending and descending | ascending bitonic series

Definition at line 421 of file kernels.cu.hpp.

Classes

Functions

Detailed Description

Function Documentation