#ifndef MPBLOCKS_CUDA_BITONIC_KERNELS_CU_HPP
#define MPBLOCKS_CUDA_BITONIC_KERNELS_CU_HPP

#define UMUL(a, b)    __umul24((a), (b))
#define UMAD(a, b, c) ( UMUL((a), (b)) + (c) )
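UMUL and UMAD wrap the __umul24 intrinsic: a 24-bit integer multiply that was cheaper than a full 32-bit multiply on early CUDA hardware and is sufficient for thread-index arithmetic. A minimal sketch of the typical use; the kernel below is illustrative, not part of this header:

__global__ void scaleExample(float* data, float factor, unsigned int n)
{
    // global element index: blockIdx.x * blockDim.x + threadIdx.x,
    // computed with the 24-bit multiply-add macro defined above
    unsigned int idx = UMAD(blockIdx.x, blockDim.x, threadIdx.x);
    if( idx < n )
        data[idx] *= factor;
}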
template <typename KeyType, typename ValueType>
__device__ void compareSwap( KeyType& keyA, ValueType& valA,
                             KeyType& keyB, ValueType& valB, Direction dir )
{
    // swap the key/value pairs if they are out of order with respect to dir
    if( (keyA > keyB) == dir )
    {
        KeyType   tempKey   = keyA;  keyA = keyB;  keyB = tempKey;
        ValueType tempValue = valA;  valA = valB;  valB = tempValue;
    }
}
template <typename KeyType>
__device__ void compareSwap( KeyType& keyA, KeyType& keyB, Direction dir )
{
    if( (keyA > keyB) == dir )
    {
        KeyType tempKey = keyA;  keyA = keyB;  keyB = tempKey;
    }
}
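Neither comparator shows the Direction type it branches on; the (keyA > keyB) == dir test only makes sense if the enum converts to 0 or 1. A minimal sketch under that assumption (the actual values in the header may differ):

enum Direction
{
    Descending = 0,   // comparator swaps when keyA > keyB evaluates false
    Ascending  = 1    // comparator swaps when keyA > keyB evaluates true
};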
template <typename KeyType, typename ValueType>
__global__ void sortShared( KeyType* d_DstKey, ValueType* d_DstVal,
                            KeyType* d_SrcKey, ValueType* d_SrcVal,
                            uint_t arrayLength, Direction dir )
{
    // "array" is the kernel's extern __shared__ scratch buffer (declaration
    // elided); it is carved into a key array followed by a value array
    KeyType*   s_key = (KeyType*)array;
    ValueType* s_val = (ValueType*)&s_key[arrayLength];

    // ... load of the first element elided; each thread handles two elements
    //     arrayLength/2 apart
    s_key[threadIdx.x + (arrayLength / 2)] = d_SrcKey[(arrayLength / 2)];
    s_val[threadIdx.x + (arrayLength / 2)] = d_SrcVal[(arrayLength / 2)];

    for( uint_t size = 2; size < arrayLength; size <<= 1 )
    {
        // sort direction of this thread's comparator at this step
        Direction ddd = (Direction)( ((unsigned int)dir)
                                     ^ ( (threadIdx.x & (size / 2)) != 0 ) );
        for( uint_t stride = size / 2; stride > 0; stride >>= 1 )
        {
            // ... sync and pos computation elided
            compareSwap( s_key[pos + 0],      s_val[pos + 0],
                         s_key[pos + stride], s_val[pos + stride],
                         ddd );
        }
    }

    // final merge of the full, now bitonic, tile in the requested direction
    for( uint_t stride = arrayLength / 2; stride > 0; stride >>= 1 )
    {
        // ... sync and pos computation elided
        compareSwap( s_key[pos + 0],      s_val[pos + 0],
                     s_key[pos + stride], s_val[pos + stride],
                     dir );
    }

    // ... write the sorted tile back to global memory
    d_DstKey[(arrayLength / 2)] = s_key[threadIdx.x + (arrayLength / 2)];
    d_DstVal[(arrayLength / 2)] = s_val[threadIdx.x + (arrayLength / 2)];
}
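The inner-loop lines that compute pos are elided above, but the same mapping appears explicitly in mergeGlobal further down: pos = 2*i - (i & (stride - 1)) for comparator index i. The standalone host program below (illustrative only) prints the element pairs each comparator touches for an 8-element tile, showing that every pass covers disjoint pairs exactly stride apart:

#include <cstdio>

int main()
{
    const unsigned int arrayLength = 8;
    for( unsigned int stride = arrayLength / 2; stride > 0; stride >>= 1 )
    {
        std::printf("stride %u:", stride);
        for( unsigned int i = 0; i < arrayLength / 2; ++i )
        {
            // same index mapping used by the kernels: comparator i works on
            // elements pos and pos + stride
            unsigned int pos = 2 * i - (i & (stride - 1));
            std::printf("  (%u,%u)", pos, pos + stride);
        }
        std::printf("\n");
    }
    return 0;
}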
template <typename KeyType>
__global__ void sortShared( KeyType* d_DstKey, KeyType* d_SrcKey,
                            uint_t arrayLength, Direction dir )
{
    // key-only variant: shared memory holds just the key array
    KeyType* s_key = (KeyType*)array;

    // ... load of the first element elided
    s_key[threadIdx.x + (arrayLength / 2)] = d_SrcKey[(arrayLength / 2)];

    for( uint_t size = 2; size < arrayLength; size <<= 1 )
    {
        // sort direction of this thread's comparator at this step
        Direction ddd = (Direction)( ((unsigned int)dir)
                                     ^ ( (threadIdx.x & (size / 2)) != 0 ) );
        for( uint_t stride = size / 2; stride > 0; stride >>= 1 )
        {
            // ... sync, compute pos, compare/swap s_key[pos] and s_key[pos + stride]
        }
    }

    // final merge of the full tile in the requested direction
    for( uint_t stride = arrayLength / 2; stride > 0; stride >>= 1 )
    {
        // ... sync, compute pos, compare/swap s_key[pos] and s_key[pos + stride]
    }

    // ... write the sorted tile back to global memory
    d_DstKey[(arrayLength / 2)] = s_key[threadIdx.x + (arrayLength / 2)];
}
template <typename KeyType, typename ValueType>
__global__ void sortSharedInc( KeyType* d_DstKey, ValueType* d_DstVal,
                               KeyType* d_SrcKey, ValueType* d_SrcVal,
                               uint_t sharedLength )
{
    // shared memory is carved into a key array followed by a value array
    KeyType*   s_key = (KeyType*)array;
    ValueType* s_val = (ValueType*)&s_key[sharedLength];

    // ... load of the first element elided; each thread handles two elements
    //     sharedLength/2 apart
    s_key[threadIdx.x + (sharedLength / 2)] = d_SrcKey[(sharedLength / 2)];
    s_val[threadIdx.x + (sharedLength / 2)] = d_SrcVal[(sharedLength / 2)];

    for( uint_t size = 2; size < sharedLength; size <<= 1 )
    {
        // ... per-comparator direction (computation elided)
        for( uint_t stride = size / 2; stride > 0; stride >>= 1 )
        {
            // ... sync and pos computation elided
            compareSwap( s_key[pos + 0],      s_val[pos + 0],
                         s_key[pos + stride], s_val[pos + stride],
                         ddd );
        }
    }

    // ... final merge of the tile (direction computation elided)
    for( uint_t stride = sharedLength / 2; stride > 0; stride >>= 1 )
    {
        // ... sync and pos computation elided
        compareSwap( s_key[pos + 0],      s_val[pos + 0],
                     s_key[pos + stride], s_val[pos + stride],
                     ddd );
    }

    // ... write the sorted tile back to global memory
    d_DstKey[(sharedLength / 2)] = s_key[threadIdx.x + (sharedLength / 2)];
    d_DstVal[(sharedLength / 2)] = s_val[threadIdx.x + (sharedLength / 2)];
}
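sortSharedInc takes no dir argument: each block sorts its sharedLength-sized tile in a direction derived from its position in the full array, so consecutive tiles are sorted in opposite directions and each adjacent pair forms one bitonic sequence of length 2*sharedLength for the merge phase. The exact expression is elided above; the host-side illustration below (a hypothetical helper, assuming the direction simply alternates per tile) shows the intended pattern:

#include <cstdio>

// hypothetical: direction of tile b when the overall sort is ascending
static int tileDirection(unsigned int b) { return (b & 1) ? 0 : 1; }

int main()
{
    for( unsigned int b = 0; b < 4; ++b )
        std::printf("tile %u sorted %s\n", b,
                    tileDirection(b) ? "ascending" : "descending");
    return 0;
}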
template <typename KeyType>
__global__ void sortSharedInc( KeyType* d_DstKey, KeyType* d_SrcKey,
                               uint_t sharedLength )
{
    // key-only variant: shared memory holds just the key array
    KeyType* s_key = (KeyType*)array;

    // ... load of the first element elided
    s_key[threadIdx.x + (sharedLength / 2)] = d_SrcKey[(sharedLength / 2)];

    for( uint_t size = 2; size < sharedLength; size <<= 1 )
    {
        // ... per-comparator direction (computation elided)
        for( uint_t stride = size / 2; stride > 0; stride >>= 1 )
        {
            // ... sync, compute pos, compare/swap s_key[pos] and s_key[pos + stride]
        }
    }

    // ... final merge of the tile (direction computation elided)
    for( uint_t stride = sharedLength / 2; stride > 0; stride >>= 1 )
    {
        // ... sync, compute pos, compare/swap s_key[pos] and s_key[pos + stride]
    }

    // ... write the sorted tile back to global memory
    d_DstKey[(sharedLength / 2)] = s_key[threadIdx.x + (sharedLength / 2)];
}
template <typename KeyType, typename ValueType>
__global__ void mergeGlobal( KeyType* d_DstKey, ValueType* d_DstVal,
                             KeyType* d_SrcKey, ValueType* d_SrcVal,
                             uint_t arrayLength, uint_t size, uint_t stride,
                             Direction dir )
{
    // ... global_comparatorI is this thread's global comparator index
    uint_t comparatorI = global_comparatorI & (arrayLength / 2 - 1);

    // merge direction of the bitonic subsequence this comparator belongs to
    Direction ddd = (Direction)( ((unsigned int)dir)
                                 ^ ( (comparatorI & (size / 2)) != 0 ) );

    uint_t pos = 2 * global_comparatorI - (global_comparatorI & (stride - 1));

    KeyType   keyA = d_SrcKey[pos + 0];
    ValueType valA = d_SrcVal[pos + 0];
    KeyType   keyB = d_SrcKey[pos + stride];
    ValueType valB = d_SrcVal[pos + stride];

    compareSwap( keyA, valA, keyB, valB, ddd );

    d_DstKey[pos + 0]      = keyA;
    d_DstVal[pos + 0]      = valA;
    d_DstKey[pos + stride] = keyB;
    d_DstVal[pos + stride] = valB;
}
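Both merge kernels reduce the comparator index modulo arrayLength/2 with a bitwise AND, which is only equivalent to the modulo when arrayLength is a power of two; that is exactly what the isPow2 asserts in the launchers below guarantee. A tiny standalone check of the identity:

#include <cassert>

int main()
{
    unsigned int half = 8;   // stands in for arrayLength / 2, a power of two
    for( unsigned int i = 0; i < 64; ++i )
        assert( (i & (half - 1)) == (i % half) );   // mask == modulo for powers of two
    return 0;
}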
template <typename KeyType>
__global__ void mergeGlobal( KeyType* d_DstKey, KeyType* d_SrcKey,
                             uint_t arrayLength, uint_t size, uint_t stride,
                             Direction dir )
{
    uint_t comparatorI = global_comparatorI & (arrayLength / 2 - 1);
    Direction ddd = (Direction)( ((unsigned int)dir)
                                 ^ ( (comparatorI & (size / 2)) != 0 ) );
    uint_t pos = 2 * global_comparatorI - (global_comparatorI & (stride - 1));

    KeyType keyA = d_SrcKey[pos + 0];
    KeyType keyB = d_SrcKey[pos + stride];
    compareSwap( keyA, keyB, ddd );
    d_DstKey[pos + 0]      = keyA;
    d_DstKey[pos + stride] = keyB;
}
template <typename KeyType, typename ValueType>
__global__ void mergeShared( KeyType* d_DstKey, ValueType* d_DstVal,
                             KeyType* d_SrcKey, ValueType* d_SrcVal,
                             uint_t arrayLength, uint_t sharedLength,
                             uint_t size, Direction dir )
{
    // shared memory is carved into a key array followed by a value array
    KeyType*   s_key = (KeyType*)array;
    ValueType* s_val = (ValueType*)&s_key[sharedLength];

    // ... load of the first element elided; each thread handles two elements
    //     sharedLength/2 apart
    s_key[threadIdx.x + (sharedLength / 2)] = d_SrcKey[(sharedLength / 2)];
    s_val[threadIdx.x + (sharedLength / 2)] = d_SrcVal[(sharedLength / 2)];

    // ... global_comparatorI is this thread's global comparator index
    uint_t comparatorI = global_comparatorI & ((arrayLength / 2) - 1);

    // merge direction of the bitonic subsequence this tile belongs to
    Direction ddd = (Direction)( ((unsigned int)dir)
                                 ^ ( (comparatorI & (size / 2)) != 0 ) );

    // finish the merge: all remaining strides fit within the shared tile
    for( uint_t stride = sharedLength / 2; stride > 0; stride >>= 1 )
    {
        // ... sync and pos computation elided
        compareSwap( s_key[pos + 0],      s_val[pos + 0],
                     s_key[pos + stride], s_val[pos + stride],
                     ddd );
    }

    // ... write the merged tile back to global memory
    d_DstKey[(sharedLength / 2)] = s_key[threadIdx.x + (sharedLength / 2)];
    d_DstVal[(sharedLength / 2)] = s_val[threadIdx.x + (sharedLength / 2)];
}
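mergeShared exists so that, once the stride drops below sharedLength, the remaining strides of a merge step can be finished in a single kernel launch out of shared memory instead of one global-memory launch per stride. The standalone sketch below (hypothetical sizes, illustrative only) counts the launches implied by the launcher's loop structure, assuming all sub-tile strides are folded into one mergeShared launch:

#include <cstdio>

int main()
{
    unsigned int arrayLength  = 1u << 20;   // example sizes, illustrative only
    unsigned int sharedLength = 1u << 10;

    unsigned int globalLaunches = 0, sharedLaunches = 0;
    for( unsigned int size = 2 * sharedLength; size <= arrayLength; size <<= 1 )
    {
        for( unsigned int stride = size / 2; stride >= sharedLength; stride >>= 1 )
            ++globalLaunches;                // one mergeGlobal launch per large stride
        ++sharedLaunches;                    // one mergeShared launch finishes the step
    }
    std::printf("mergeGlobal launches: %u, mergeShared launches: %u\n",
                globalLaunches, sharedLaunches);
    return 0;
}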
template <typename KeyType>
__global__ void mergeShared( KeyType* d_DstKey, KeyType* d_SrcKey,
                             uint_t arrayLength, uint_t sharedLength,
                             uint_t size, Direction dir )
{
    // key-only variant: shared memory holds just the key array
    KeyType* s_key = (KeyType*)array;

    // ... load of the first element elided
    s_key[threadIdx.x + (sharedLength / 2)] = d_SrcKey[(sharedLength / 2)];

    // ... global_comparatorI is this thread's global comparator index
    uint_t comparatorI = global_comparatorI & ((arrayLength / 2) - 1);
    Direction ddd = (Direction)( ((unsigned int)dir)
                                 ^ ( (comparatorI & (size / 2)) != 0 ) );

    // finish the merge: all remaining strides fit within the shared tile
    for( uint_t stride = sharedLength / 2; stride > 0; stride >>= 1 )
    {
        // ... sync, compute pos, compare/swap s_key[pos] and s_key[pos + stride]
    }

    // ... write the merged tile back to global memory
    d_DstKey[(sharedLength / 2)] = s_key[threadIdx.x + (sharedLength / 2)];
}
template <typename KeyType, typename ValueType>
uint_t sort( KeyType* d_DstKey, ValueType* d_DstVal,
             KeyType* d_SrcKey, ValueType* d_SrcVal,
             uint_t arrayLength, uint_t sharedLength,
             Direction dir, uint_t globalThread )
{
    assert( isPow2(arrayLength) );
    assert( isPow2(sharedLength) );

    // small arrays: a single block sorts the whole array in shared memory
    if( arrayLength <= sharedLength )
    {
        uint_t blockCount  = 1;
        uint_t threadCount = arrayLength / 2;
        uint_t sharedMem   = arrayLength * ( sizeof(KeyType)
                                             + sizeof(ValueType) );
        sortShared<<<blockCount, threadCount, sharedMem>>>(
            d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength, dir);
    }
    else
    {
        // bottom level: sort each sharedLength-sized tile in shared memory,
        // alternating direction so adjacent tiles form bitonic sequences
        uint_t blockCount  = arrayLength / sharedLength;
        uint_t threadCount = sharedLength / 2;
        uint_t sharedMem   = sharedLength * ( sizeof(KeyType)
                                              + sizeof(ValueType) );
        sortSharedInc<<<blockCount, threadCount, sharedMem>>>(
            d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, sharedLength);

        // merge phase: double the sorted-run length until it spans the array
        for( uint_t size = 2 * sharedLength; size <= arrayLength; size <<= 1 )
        {
            for( unsigned stride = size / 2; stride > 0; stride >>= 1 )
            {
                if( stride >= sharedLength )
                {
                    // stride too large for shared memory: merge in global memory
                    uint_t threadCount = std::min(globalThread, arrayLength / 2);
                    uint_t blockCount  = arrayLength / (2 * threadCount);
                    mergeGlobal<<<blockCount, threadCount>>>(
                        d_DstKey, d_DstVal, d_DstKey, d_DstVal,
                        arrayLength, size, stride, dir);
                }
                else
                {
                    // remaining strides fit in shared memory; mergeShared
                    // finishes this merge step in a single launch
                    mergeShared<<<blockCount, threadCount, sharedMem>>>(
                        d_DstKey, d_DstVal, d_DstKey, d_DstVal,
                        arrayLength, sharedLength, size, dir);
                    // ...
                }
            }
        }
    }
    // ... (return value elided)
}
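A hedged host-side usage sketch of the key/value launcher above; the Ascending constant, the unqualified call, and the chosen sharedLength/globalThread values are assumptions for illustration, not taken from the header:

#include <cuda_runtime.h>

int main()
{
    const unsigned int n = 1u << 16;        // arrayLength must be a power of two
    float* d_srcKey;  int* d_srcVal;
    float* d_dstKey;  int* d_dstVal;
    cudaMalloc(&d_srcKey, n * sizeof(float));
    cudaMalloc(&d_srcVal, n * sizeof(int));
    cudaMalloc(&d_dstKey, n * sizeof(float));
    cudaMalloc(&d_dstVal, n * sizeof(int));

    // ... fill d_srcKey / d_srcVal ...

    // sharedLength: elements per shared-memory tile; globalThread: cap on
    // threads per block for the global merge kernels
    sort(d_dstKey, d_dstVal, d_srcKey, d_srcVal, n, 1024u, Ascending, 256u);
    cudaDeviceSynchronize();

    cudaFree(d_srcKey); cudaFree(d_srcVal);
    cudaFree(d_dstKey); cudaFree(d_dstVal);
    return 0;
}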
template <typename KeyType>
uint_t sort( KeyType* d_DstKey, KeyType* d_SrcKey,
             uint_t arrayLength, uint_t sharedLength,
             Direction dir, uint_t globalThread )
{
    assert( isPow2(arrayLength) );
    assert( isPow2(sharedLength) );

    // small arrays: a single block sorts the whole array in shared memory
    if( arrayLength <= sharedLength )
    {
        uint_t blockCount  = 1;
        uint_t threadCount = arrayLength / 2;
        uint_t sharedMem   = arrayLength * sizeof(KeyType);
        sortShared<<<blockCount, threadCount, sharedMem>>>(
            d_DstKey, d_SrcKey, arrayLength, dir);
    }
    else
    {
        // bottom level: sort each sharedLength-sized tile in shared memory
        uint_t blockCount  = arrayLength / sharedLength;
        uint_t threadCount = sharedLength / 2;
        uint_t sharedMem   = sharedLength * sizeof(KeyType);
        sortSharedInc<<<blockCount, threadCount, sharedMem>>>(
            d_DstKey, d_SrcKey, sharedLength);

        // merge phase: double the sorted-run length until it spans the array
        for( uint_t size = 2 * sharedLength; size <= arrayLength; size <<= 1 )
        {
            for( unsigned stride = size / 2; stride > 0; stride >>= 1 )
            {
                if( stride >= sharedLength )
                {
                    uint_t threadCount = std::min(globalThread, arrayLength / 2);
                    uint_t blockCount  = arrayLength / (2 * threadCount);
                    mergeGlobal<<<blockCount, threadCount>>>(
                        d_DstKey, d_DstKey,
                        arrayLength, size, stride, dir);
                }
                else
                {
                    mergeShared<<<blockCount, threadCount, sharedMem>>>(
                        d_DstKey, d_DstKey,
                        arrayLength, sharedLength, size, dir);
                    // ...
                }
            }
        }
    }
    // ... (return value elided)
}
template <typename KeyType>
__global__ void prepare( KeyType* d_SrcKey, KeyType init, uint_t arrayLength )
{
    // global thread index (computation assumed, not shown in the source lines)
    uint_t tid = blockIdx.x * blockDim.x + threadIdx.x;
    if( tid < arrayLength )
        d_SrcKey[tid] = init;
}
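prepare supports inputs whose length is not a power of two: the tail of the buffer is filled with a sentinel key so the padding sorts to the end, and the sorter is then run on the rounded-up length. A hedged sketch; nextPow2 and padKeyTail are illustrative helpers, not part of the header:

#include <limits>

unsigned int nextPow2(unsigned int x)
{
    unsigned int p = 1;
    while( p < x ) p <<= 1;
    return p;
}

void padKeyTail(float* d_key, unsigned int n)
{
    unsigned int nPadded = nextPow2(n);
    unsigned int tail    = nPadded - n;
    if( tail > 0 )
    {
        unsigned int threads = 256;
        unsigned int blocks  = (tail + threads - 1) / threads;
        // d_key + n is the offset pointer mentioned in the summary below;
        // every padded slot receives the sentinel key
        prepare<<<blocks, threads>>>(d_key + n,
                                     std::numeric_limits<float>::max(), tail);
    }
}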
__global__ void sortShared(KeyType *d_DstKey, ValueType *d_DstVal, KeyType *d_SrcKey, ValueType *d_SrcVal, uint_t arrayLength, Direction dir)
single kernel (unified) bitonic sort
__global__ void sortSharedInc(KeyType *d_DstKey, ValueType *d_DstVal, KeyType *d_SrcKey, ValueType *d_SrcVal, uint_t sharedLength)
bottom level of the bitonic sort
__global__ void prepare(KeyType *d_SrcKey, KeyType init, uint_t arrayLength)
used when arrayLength is not a power of two; it writes to all values of d_SrcKey (which is an offset...
bool isPow2(T x)
returns true if the parameter is an exact power of two
T dividePow2(T x, T y)
returns x/y if x and y are both powers of two, otherwise the result is undefined
__global__ void mergeGlobal(KeyType *d_DstKey, ValueType *d_DstVal, KeyType *d_SrcKey, ValueType *d_SrcVal, uint_t arrayLength, uint_t size, uint_t stride, Direction dir)
sorts a bitonic series; this kernel is for a stride >= sharedLength (the shared-memory tile size)
void checkLastError(const std::string &msg="checkLastError")
wraps getLastError
__device__ void compareSwap(KeyType &keyA, ValueType &valA, KeyType &keyB, ValueType &valB, Direction dir)
implements a "comparator": compares two keys and swaps them if they are not in the desired order ...
uint_t sort(KeyType *d_DstKey, ValueType *d_DstVal, KeyType *d_SrcKey, ValueType *d_SrcVal, uint_t arrayLength, uint_t sharedLength, Direction dir, uint_t globalThread)
kernel launcher, sorts an array of key/value pairs using the bitonic sort algorithm ...
Direction
specifies values for the direction the sorter should sort the keys
__global__ void mergeShared(KeyType *d_DstKey, ValueType *d_DstVal, KeyType *d_SrcKey, ValueType *d_SrcVal, uint_t arrayLength, uint_t sharedLength, uint_t size, Direction dir)
sorts a bitonic series; this kernel is for size > sharedLength and for a stride in [1...