cheshirekow  v0.1.0
kernels.cu.hpp
Go to the documentation of this file.
1 /*
2  * \file kernels.cu.hpp
3  *
4  * \date Sep 3, 2011
5  * \author Josh Bialkowski (jbialk@mit.edu)
6  * \brief
7  *
8  * Note: adapted from the NVIDIA SDK bitonicSort.cu which references
9  * http://www.iti.fh-flensburg.de/lang/algorithmen/sortieren/bitonic/bitonicen.htm
10  */
11 
12 
13 
14 #ifndef MPBLOCKS_CUDA_BITONIC_KERNELS_CU_HPP
15 #define MPBLOCKS_CUDA_BITONIC_KERNELS_CU_HPP
16 
17 #include <cassert>
18 #include <algorithm>
19 
20 
21 
22 
23 
24 // Map to single instructions on G8x / G9x / G100
25 // note: check hardware and see if we should change this, __umul24 is a single
26 // instruction multiply with 24 bit precision,
27 // note: it's only used to multiply block index times block dimension to get
28 // the block offset of a thread index, so it should be fine unless we need
29 // more than 2^24 = 16,777,216 threads
30 #define UMUL(a, b) __umul24((a), (b))
31 #define UMAD(a, b, c) ( UMUL((a), (b)) + (c) )
32 
33 
34 
35 namespace mpblocks {
36 namespace cuda {
37 namespace bitonic {
38 
39 
40 typedef unsigned int uint_t;
41 
42 
45 
/// Comparator primitive for key/value pairs: compares two keys and, when
/// they are out of the desired order, swaps both the keys and their
/// associated values in place.
///
/// \param keyA  first key (may be swapped with keyB)
/// \param valA  value travelling with keyA
/// \param keyB  second key
/// \param valB  value travelling with keyB
/// \param dir   desired ordering; the pair is swapped when
///              (keyA > keyB) == dir
template <typename KeyType, typename ValueType>
__device__ inline void compareSwap(
        KeyType&   keyA,
        ValueType& valA,
        KeyType&   keyB,
        ValueType& valB,
        Direction  dir )
{
    if( (keyA > keyB) == dir )
    {
        // exchange keys and values together so pairs stay associated
        KeyType   tempKey   = keyA;  keyA = keyB;  keyB = tempKey;
        ValueType tempValue = valA;  valA = valB;  valB = tempValue;
    }
}
68 
69 
70 
71 
73 
/// Comparator primitive, key-only overload: compares two keys and swaps
/// them in place when they are out of the desired order.
///
/// \param keyA  first key (may be swapped with keyB)
/// \param keyB  second key
/// \param dir   desired ordering; the pair is swapped when
///              (keyA > keyB) == dir
template <typename KeyType>
__device__ inline void compareSwap(
        KeyType&  keyA,
        KeyType&  keyB,
        Direction dir )
{
    if( (keyA > keyB) == dir )
    {
        KeyType tempKey = keyA;  keyA = keyB;  keyB = tempKey;
    }
}
92 
93 
94 
95 
97 
/// Single kernel (unified) bitonic sort: sorts one arrayLength-sized tile of
/// key/value pairs entirely in shared memory.
///
/// Launch configuration: one block per tile, arrayLength/2 threads per
/// block, and arrayLength*(sizeof(KeyType)+sizeof(ValueType)) bytes of
/// dynamic shared memory. arrayLength must be a power of two (asserted by
/// the host launcher).
///
/// \param d_DstKey     output array of keys
/// \param d_DstVal     output array of values
/// \param d_SrcKey     input array of keys
/// \param d_SrcVal     input array of values
/// \param arrayLength  number of elements per tile
/// \param dir          direction to sort in
template <typename KeyType, typename ValueType>
__global__ void sortShared(
    KeyType   *d_DstKey,
    ValueType *d_DstVal,
    KeyType   *d_SrcKey,
    ValueType *d_SrcVal,
    uint_t     arrayLength,
    Direction  dir
)
{
    // raw dynamic shared memory block, sized by the launcher to hold
    // arrayLength keys followed by arrayLength values
    extern __shared__ unsigned int array[];

    // carve the raw block into a key array and a value array
    KeyType*   s_key = (KeyType*)array;
    ValueType* s_val = (ValueType*)&s_key[arrayLength];

    // offset the global pointers to this thread's first element
    d_SrcKey += blockIdx.x * arrayLength + threadIdx.x;
    d_SrcVal += blockIdx.x * arrayLength + threadIdx.x;
    d_DstKey += blockIdx.x * arrayLength + threadIdx.x;
    d_DstVal += blockIdx.x * arrayLength + threadIdx.x;

    // each thread stages two elements: one from each half of the tile
    s_key[threadIdx.x +                 0] = d_SrcKey[                0];
    s_val[threadIdx.x +                 0] = d_SrcVal[                0];
    s_key[threadIdx.x + (arrayLength / 2)] = d_SrcKey[(arrayLength / 2)];
    s_val[threadIdx.x + (arrayLength / 2)] = d_SrcVal[(arrayLength / 2)];

    // build progressively larger bitonic sequences (size = 2, 4, ...,
    // arrayLength/2); after this loop the whole tile is one bitonic series
    for(uint_t size = 2; size < arrayLength; size <<= 1)
    {
        // alternate merge direction between adjacent size-length runs:
        // size is a power of two, so (threadIdx.x & (size/2)) tests one bit
        Direction ddd = (Direction)(
                ((unsigned int)dir) ^ ( (threadIdx.x & (size / 2)) != 0 ));

        for(uint_t stride = size / 2; stride > 0; stride >>= 1)
        {
            // reads below depend on the previous iteration's writes
            __syncthreads();
            uint_t pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
            compareSwap(
                    s_key[pos +      0], s_val[pos +      0],
                    s_key[pos + stride], s_val[pos + stride],
                    ddd );
        }
    }

    // final merge with ddd == dir for every comparator, yielding a fully
    // sorted tile
    for(uint_t stride = arrayLength / 2; stride > 0; stride >>= 1)
    {
        __syncthreads();
        uint_t pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
        compareSwap(
                s_key[pos +      0], s_val[pos +      0],
                s_key[pos + stride], s_val[pos + stride],
                dir );
    }

    // make the last merge pass visible before copying back out
    __syncthreads();
    d_DstKey[                0] = s_key[threadIdx.x +                 0];
    d_DstVal[                0] = s_val[threadIdx.x +                 0];
    d_DstKey[(arrayLength / 2)] = s_key[threadIdx.x + (arrayLength / 2)];
    d_DstVal[(arrayLength / 2)] = s_val[threadIdx.x + (arrayLength / 2)];
}
175 
176 
177 
178 
180 
/// Single kernel (unified) bitonic sort, key-only overload: sorts one
/// arrayLength-sized tile of keys entirely in shared memory.
///
/// Launch configuration: one block per tile, arrayLength/2 threads per
/// block, and arrayLength*sizeof(KeyType) bytes of dynamic shared memory.
/// arrayLength must be a power of two (asserted by the host launcher).
///
/// \param d_DstKey     output array of keys
/// \param d_SrcKey     input array of keys
/// \param arrayLength  number of elements per tile
/// \param dir          direction to sort in
template <typename KeyType>
__global__ void sortShared(
    KeyType *d_DstKey,
    KeyType *d_SrcKey,
    uint_t   arrayLength,
    Direction dir
)
{
    // raw dynamic shared memory block, sized by the launcher to hold
    // arrayLength keys
    extern __shared__ unsigned int array[];

    // interpret the raw block as the key array
    KeyType* s_key = (KeyType*)array;

    // offset the global pointers to this thread's first element
    d_SrcKey += blockIdx.x * arrayLength + threadIdx.x;
    d_DstKey += blockIdx.x * arrayLength + threadIdx.x;

    // each thread stages two elements: one from each half of the tile
    s_key[threadIdx.x +                 0] = d_SrcKey[                0];
    s_key[threadIdx.x + (arrayLength / 2)] = d_SrcKey[(arrayLength / 2)];

    // build progressively larger bitonic sequences (size = 2, 4, ...,
    // arrayLength/2); after this loop the whole tile is one bitonic series
    for(uint_t size = 2; size < arrayLength; size <<= 1)
    {
        // alternate merge direction between adjacent size-length runs
        Direction ddd = (Direction)(
                ((unsigned int)dir) ^ ( (threadIdx.x & (size / 2)) != 0 ));

        for(uint_t stride = size / 2; stride > 0; stride >>= 1)
        {
            // reads below depend on the previous iteration's writes
            __syncthreads();
            uint_t pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
            compareSwap(
                    s_key[pos +      0],
                    s_key[pos + stride],
                    ddd );
        }
    }

    // final merge with ddd == dir for every comparator, yielding a fully
    // sorted tile
    for(uint_t stride = arrayLength / 2; stride > 0; stride >>= 1)
    {
        __syncthreads();
        uint_t pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
        compareSwap(
                s_key[pos +      0],
                s_key[pos + stride],
                dir );
    }

    // make the last merge pass visible before copying back out
    __syncthreads();
    d_DstKey[                0] = s_key[threadIdx.x +                 0];
    d_DstKey[(arrayLength / 2)] = s_key[threadIdx.x + (arrayLength / 2)];
}
249 
250 
251 
252 
253 
254 
255 
256 
257 
258 
259 
260 
261 
262 
263 
264 
265 
266 
268 
/// Bottom level of the multi-kernel bitonic sort: sorts each
/// sharedLength-sized block of key/value pairs in shared memory, with
/// adjacent blocks sorted in opposite directions so that every pair of
/// contiguous blocks forms a bitonic series for the subsequent merge
/// kernels.
///
/// Launch configuration: arrayLength/sharedLength blocks, sharedLength/2
/// threads per block, sharedLength*(sizeof(KeyType)+sizeof(ValueType))
/// bytes of dynamic shared memory.
///
/// \param d_DstKey      output array of keys
/// \param d_DstVal      output array of values
/// \param d_SrcKey      input array of keys
/// \param d_SrcVal      input array of values
/// \param sharedLength  number of elements each block sorts
template <typename KeyType, typename ValueType>
__global__ void sortSharedInc(
    KeyType   *d_DstKey,
    ValueType *d_DstVal,
    KeyType   *d_SrcKey,
    ValueType *d_SrcVal,
    uint_t     sharedLength
){
    // raw dynamic shared memory block: sharedLength keys followed by
    // sharedLength values
    extern __shared__ unsigned int array[];

    KeyType*   s_key = (KeyType*)array;
    ValueType* s_val = (ValueType*)&s_key[sharedLength];

    // offset the global pointers to this thread's first element
    d_SrcKey += blockIdx.x * sharedLength + threadIdx.x;
    d_SrcVal += blockIdx.x * sharedLength + threadIdx.x;
    d_DstKey += blockIdx.x * sharedLength + threadIdx.x;
    d_DstVal += blockIdx.x * sharedLength + threadIdx.x;

    // each thread stages two elements: one from each half of the block
    s_key[threadIdx.x +                  0] = d_SrcKey[                 0];
    s_val[threadIdx.x +                  0] = d_SrcVal[                 0];
    s_key[threadIdx.x + (sharedLength / 2)] = d_SrcKey[(sharedLength / 2)];
    s_val[threadIdx.x + (sharedLength / 2)] = d_SrcVal[(sharedLength / 2)];

    // first half of the comparator network: build progressively larger
    // bitonic sequences (size = 2, 4, ..., sharedLength/2); after this loop
    // the block's data is a single bitonic series
    for(uint_t size = 2; size < sharedLength; size <<= 1)
    {
        // size is a power of two, so (threadIdx.x & (size/2)) tests a single
        // bit of the thread id: direction alternates between adjacent
        // size-length runs
        Direction ddd = (Direction)( (threadIdx.x & (size / 2)) != 0 );

        // stride is the distance (in elements) between the two keys each
        // comparator touches
        for(uint_t stride = size / 2; stride > 0; stride >>= 1)
        {
            // barrier so reads see the previous iteration's writes
            __syncthreads();

            // map this thread onto the left element of its comparator for
            // the current stride
            uint_t pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));

            compareSwap(
                    s_key[pos +      0], s_val[pos +      0],
                    s_key[pos + stride], s_val[pos + stride],
                    ddd );
        }
    }

    // final merge of the bitonic series into a sorted block. The direction
    // depends only on the block parity (blockIdx.x & 1): this is the single
    // difference from sortShared, and it makes odd/even blocks sort in
    // opposite directions so contiguous block pairs form bitonic serieses
    Direction ddd = (Direction)( blockIdx.x & 1 );

    // stride starts at half the block size and halves each step
    for(uint_t stride = sharedLength / 2; stride > 0; stride >>= 1)
    {
        // each pass consumes the previous pass's results
        __syncthreads();

        uint_t pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));

        compareSwap(
                s_key[pos +      0], s_val[pos +      0],
                s_key[pos + stride], s_val[pos + stride],
                ddd );
    }

    // one last barrier before publishing to global memory
    __syncthreads();

    d_DstKey[                 0] = s_key[threadIdx.x +                  0];
    d_DstVal[                 0] = s_val[threadIdx.x +                  0];
    d_DstKey[(sharedLength / 2)] = s_key[threadIdx.x + (sharedLength / 2)];
    d_DstVal[(sharedLength / 2)] = s_val[threadIdx.x + (sharedLength / 2)];
}
401 
402 
403 
404 
406 
/// Bottom level of the multi-kernel bitonic sort, key-only overload: sorts
/// each sharedLength-sized block of keys in shared memory, with adjacent
/// blocks sorted in opposite directions so that every pair of contiguous
/// blocks forms a bitonic series for the subsequent merge kernels.
///
/// Launch configuration: arrayLength/sharedLength blocks, sharedLength/2
/// threads per block, sharedLength*sizeof(KeyType) bytes of dynamic shared
/// memory.
///
/// \param d_DstKey      output array of keys
/// \param d_SrcKey      input array of keys
/// \param sharedLength  number of elements each block sorts
template <typename KeyType>
__global__ void sortSharedInc(
    KeyType *d_DstKey,
    KeyType *d_SrcKey,
    uint_t   sharedLength
){
    // raw dynamic shared memory block holding sharedLength keys
    extern __shared__ unsigned int array[];

    KeyType* s_key = (KeyType*)array;

    // offset the global pointers to this thread's first element
    d_SrcKey += blockIdx.x * sharedLength + threadIdx.x;
    d_DstKey += blockIdx.x * sharedLength + threadIdx.x;

    // each thread stages two elements: one from each half of the block
    s_key[threadIdx.x +                  0] = d_SrcKey[                 0];
    s_key[threadIdx.x + (sharedLength / 2)] = d_SrcKey[(sharedLength / 2)];

    // first half of the comparator network: build progressively larger
    // bitonic sequences; after this loop the block is one bitonic series
    for(uint_t size = 2; size < sharedLength; size <<= 1)
    {
        // size is a power of two, so (threadIdx.x & (size/2)) tests a single
        // bit: direction alternates between adjacent size-length runs
        Direction ddd = (Direction)( (threadIdx.x & (size / 2)) != 0 );

        // stride is the distance between the two keys each comparator
        // touches
        for(uint_t stride = size / 2; stride > 0; stride >>= 1)
        {
            // barrier so reads see the previous iteration's writes
            __syncthreads();

            uint_t pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));

            compareSwap(
                    s_key[pos +      0],
                    s_key[pos + stride],
                    ddd );
        }
    }

    // final merge of the bitonic series into a sorted block; direction
    // depends only on block parity so odd/even blocks sort in opposite
    // directions (the single difference from the sortShared kernel)
    Direction ddd = (Direction)( blockIdx.x & 1 );

    for(uint_t stride = sharedLength / 2; stride > 0; stride >>= 1)
    {
        // each pass consumes the previous pass's results
        __syncthreads();

        uint_t pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));

        compareSwap(
                s_key[pos +      0],
                s_key[pos + stride],
                ddd );
    }

    // one last barrier before publishing to global memory
    __syncthreads();

    d_DstKey[                 0] = s_key[threadIdx.x +                  0];
    d_DstKey[(sharedLength / 2)] = s_key[threadIdx.x + (sharedLength / 2)];
}
530 
531 
532 
533 
535 
/// One step of a bitonic merge performed in global memory; used when the
/// comparator stride is too large for a block's shared memory (stride >=
/// sharedLength), so each thread handles exactly one comparator and reads /
/// writes global memory directly.
///
/// \param d_DstKey     output array of keys (may alias d_SrcKey)
/// \param d_DstVal     output array of values (may alias d_SrcVal)
/// \param d_SrcKey     input array of keys
/// \param d_SrcVal     input array of values
/// \param arrayLength  total number of elements being sorted
/// \param size         current merge subsequence length
/// \param stride       distance between the two elements of a comparator
/// \param dir          overall sort direction
template <typename KeyType, typename ValueType>
__global__ void mergeGlobal(
    KeyType   *d_DstKey,
    ValueType *d_DstVal,
    KeyType   *d_SrcKey,
    ValueType *d_SrcVal,
    uint_t     arrayLength,
    uint_t     size,
    uint_t     stride,
    Direction  dir
){
    // global index of the comparator this thread implements
    uint_t global_comparatorI = blockIdx.x * blockDim.x + threadIdx.x;

    // index of the comparator within its own size-length subsequence
    // (arrayLength is a power of two, so the mask extracts the low bits)
    uint_t comparatorI = global_comparatorI & (arrayLength / 2 - 1);

    // direction for this comparator: flipped relative to dir for
    // comparators in the "descending" half of each size-length run
    Direction ddd = (Direction)(
            ((unsigned int)dir)
            ^( (comparatorI & (size / 2)) != 0 )
            );

    // position of the left element of this comparator in the global array
    uint_t pos = 2 * global_comparatorI - (global_comparatorI & (stride - 1));

    // fetch the two key/value pairs this comparator works on
    KeyType   keyA = d_SrcKey[pos +      0];
    ValueType valA = d_SrcVal[pos +      0];
    KeyType   keyB = d_SrcKey[pos + stride];
    ValueType valB = d_SrcVal[pos + stride];

    // order the pair according to ddd
    compareSwap(
            keyA, valA,
            keyB, valB,
            ddd );

    // write the (potentially swapped) results back to global memory
    d_DstKey[pos +      0] = keyA;
    d_DstVal[pos +      0] = valA;
    d_DstKey[pos + stride] = keyB;
    d_DstVal[pos + stride] = valB;
}
604 
605 
606 
607 
609 
/// One step of a bitonic merge performed in global memory, key-only
/// overload; used when the comparator stride is too large for shared
/// memory (stride >= sharedLength), so each thread handles exactly one
/// comparator directly in global memory.
///
/// \param d_DstKey     output array of keys (may alias d_SrcKey)
/// \param d_SrcKey     input array of keys
/// \param arrayLength  total number of elements being sorted
/// \param size         current merge subsequence length
/// \param stride       distance between the two elements of a comparator
/// \param dir          overall sort direction
template <typename KeyType>
__global__ void mergeGlobal(
    KeyType *d_DstKey,
    KeyType *d_SrcKey,
    uint_t   arrayLength,
    uint_t   size,
    uint_t   stride,
    Direction dir
){
    // global index of the comparator this thread implements
    uint_t global_comparatorI = blockIdx.x * blockDim.x + threadIdx.x;

    // index of the comparator within its own size-length subsequence
    uint_t comparatorI = global_comparatorI & (arrayLength / 2 - 1);

    // direction for this comparator: flipped relative to dir for
    // comparators in the "descending" half of each size-length run
    Direction ddd = (Direction)(
            ((unsigned int)dir)
            ^( (comparatorI & (size / 2)) != 0 )
            );

    // position of the left element of this comparator in the global array
    uint_t pos = 2 * global_comparatorI - (global_comparatorI & (stride - 1));

    // fetch the two keys this comparator works on
    KeyType keyA = d_SrcKey[pos +      0];
    KeyType keyB = d_SrcKey[pos + stride];

    // order the pair according to ddd
    compareSwap(
            keyA,
            keyB,
            ddd );

    // write the (potentially swapped) results back to global memory
    d_DstKey[pos +      0] = keyA;
    d_DstKey[pos + stride] = keyB;
}
670 
671 
672 
673 
676 
/// Tail of a bitonic merge performed in shared memory; used once the
/// comparator stride fits within a block (stride < sharedLength). A single
/// launch runs all remaining strides (sharedLength/2 down to 1) for the
/// current merge size.
///
/// Launch configuration: arrayLength/sharedLength blocks, sharedLength/2
/// threads per block, sharedLength*(sizeof(KeyType)+sizeof(ValueType))
/// bytes of dynamic shared memory.
///
/// \param d_DstKey      output array of keys (may alias d_SrcKey)
/// \param d_DstVal      output array of values (may alias d_SrcVal)
/// \param d_SrcKey      input array of keys
/// \param d_SrcVal      input array of values
/// \param arrayLength   total number of elements being sorted
/// \param sharedLength  number of elements staged per block
/// \param size          current merge subsequence length
/// \param dir           overall sort direction
template <typename KeyType, typename ValueType>
__global__ void mergeShared(
    KeyType   *d_DstKey,
    ValueType *d_DstVal,
    KeyType   *d_SrcKey,
    ValueType *d_SrcVal,
    uint_t     arrayLength,
    uint_t     sharedLength,
    uint_t     size,
    Direction  dir
){
    // raw dynamic shared memory block: sharedLength keys followed by
    // sharedLength values
    extern __shared__ unsigned int array[];

    KeyType*   s_key = (KeyType*)array;
    ValueType* s_val = (ValueType*)&s_key[sharedLength];

    // offset the global pointers to this thread's first element
    d_SrcKey += blockIdx.x * sharedLength + threadIdx.x;
    d_SrcVal += blockIdx.x * sharedLength + threadIdx.x;
    d_DstKey += blockIdx.x * sharedLength + threadIdx.x;
    d_DstVal += blockIdx.x * sharedLength + threadIdx.x;

    // each thread stages two elements: one from each half of the block
    s_key[threadIdx.x +                  0] = d_SrcKey[                 0];
    s_val[threadIdx.x +                  0] = d_SrcVal[                 0];
    s_key[threadIdx.x + (sharedLength / 2)] = d_SrcKey[(sharedLength / 2)];
    s_val[threadIdx.x + (sharedLength / 2)] = d_SrcVal[(sharedLength / 2)];

    // index of this comparator within its size-length subsequence
    uint_t comparatorI = UMAD(blockIdx.x, blockDim.x, threadIdx.x)
                       & ((arrayLength / 2) - 1);

    // direction for this block's portion of the merge: flipped relative to
    // dir when the comparator lies in the "descending" half of the run
    Direction ddd = (Direction)(
            ((unsigned int)dir)
            ^( (comparatorI & (size / 2)) != 0 )
            );

    // run every remaining stride of the merge inside shared memory
    for(uint_t stride = sharedLength / 2; stride > 0; stride >>= 1)
    {
        // each pass consumes the previous pass's results
        __syncthreads();

        // map this thread onto the left element of its comparator
        uint_t pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));

        compareSwap(
                s_key[pos +      0], s_val[pos +      0],
                s_key[pos + stride], s_val[pos + stride],
                ddd );
    }

    // one last barrier before publishing to global memory
    __syncthreads();

    d_DstKey[                 0] = s_key[threadIdx.x +                  0];
    d_DstVal[                 0] = s_val[threadIdx.x +                  0];
    d_DstKey[(sharedLength / 2)] = s_key[threadIdx.x + (sharedLength / 2)];
    d_DstVal[(sharedLength / 2)] = s_val[threadIdx.x + (sharedLength / 2)];
}
765 
766 
767 
768 
771 
/// Tail of a bitonic merge performed in shared memory, key-only overload;
/// used once the comparator stride fits within a block
/// (stride < sharedLength). A single launch runs all remaining strides
/// (sharedLength/2 down to 1) for the current merge size.
///
/// Launch configuration: arrayLength/sharedLength blocks, sharedLength/2
/// threads per block, sharedLength*sizeof(KeyType) bytes of dynamic shared
/// memory.
///
/// \param d_DstKey      output array of keys (may alias d_SrcKey)
/// \param d_SrcKey      input array of keys
/// \param arrayLength   total number of elements being sorted
/// \param sharedLength  number of elements staged per block
/// \param size          current merge subsequence length
/// \param dir           overall sort direction
template <typename KeyType>
__global__ void mergeShared(
    KeyType *d_DstKey,
    KeyType *d_SrcKey,
    uint_t   arrayLength,
    uint_t   sharedLength,
    uint_t   size,
    Direction dir
){
    // raw dynamic shared memory block holding sharedLength keys
    extern __shared__ unsigned int array[];

    KeyType* s_key = (KeyType*)array;

    // offset the global pointers to this thread's first element
    d_SrcKey += blockIdx.x * sharedLength + threadIdx.x;
    d_DstKey += blockIdx.x * sharedLength + threadIdx.x;

    // each thread stages two elements: one from each half of the block
    s_key[threadIdx.x +                  0] = d_SrcKey[                 0];
    s_key[threadIdx.x + (sharedLength / 2)] = d_SrcKey[(sharedLength / 2)];

    // index of this comparator within its size-length subsequence
    uint_t comparatorI = UMAD(blockIdx.x, blockDim.x, threadIdx.x)
                       & ((arrayLength / 2) - 1);

    // direction for this block's portion of the merge: flipped relative to
    // dir when the comparator lies in the "descending" half of the run
    Direction ddd = (Direction)(
            ((unsigned int)dir)
            ^( (comparatorI & (size / 2)) != 0 )
            );

    // run every remaining stride of the merge inside shared memory
    for(uint_t stride = sharedLength / 2; stride > 0; stride >>= 1)
    {
        // each pass consumes the previous pass's results
        __syncthreads();

        uint_t pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));

        compareSwap(
                s_key[pos +      0],
                s_key[pos + stride],
                ddd );
    }

    // one last barrier before publishing to global memory
    __syncthreads();

    d_DstKey[                 0] = s_key[threadIdx.x +                  0];
    d_DstKey[(sharedLength / 2)] = s_key[threadIdx.x + (sharedLength / 2)];
}
849 
850 
851 
852 
855 
/// Kernel launcher: sorts an array of key/value pairs with the bitonic sort
/// algorithm.
///
/// Preconditions: arrayLength and sharedLength are powers of two
/// (asserted). Returns the number of threads used per block, or 0 if
/// there is nothing to sort.
///
/// \param d_DstKey      pointer to start of output array for keys
/// \param d_DstVal      pointer to start of output array for values
/// \param d_SrcKey      pointer to start of input array for keys
/// \param d_SrcVal      pointer to start of input array for values
/// \param arrayLength   the size of the array to sort
/// \param sharedLength  number of shared memory elements per kernel
/// \param dir           whether to sort ascending or descending
/// \param globalThread  how many threads to use in the global merge
template <typename KeyType, typename ValueType>
uint_t sort(
    KeyType   *d_DstKey,
    ValueType *d_DstVal,
    KeyType   *d_SrcKey,
    ValueType *d_SrcVal,
    uint_t     arrayLength,
    uint_t     sharedLength,
    Direction  dir,
    uint_t     globalThread
){
    uint_t ret;

    // nothing to sort
    if(arrayLength < 2)
        return 0;

    // only power-of-two lengths are handled; a future version could pad to
    // the next power of two with "infinite" sentinel keys
    assert( isPow2(arrayLength) );
    assert( isPow2(sharedLength) );

    if(arrayLength <= sharedLength)
    {
        // the whole problem fits in one block's shared memory: a single
        // launch of the unified sort kernel finishes the job
        uint_t blockCount  = 1;
        uint_t threadCount = arrayLength / 2;
        uint_t sharedMem   = arrayLength * ( sizeof(KeyType)
                                             + sizeof(ValueType) );

        sortShared<<<blockCount, threadCount, sharedMem>>>(
                d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength, dir);
        cuda::checkLastError("bitonic::sortShared");

        ret = threadCount;
    }
    else
    {
        // both lengths are powers of two with arrayLength > sharedLength,
        // so this division is exact (and itself a power of two)
        uint_t blockCount = dividePow2(arrayLength, sharedLength);

        // one thread per comparator: half the block size
        uint_t threadCount = sharedLength / 2;

        // shared bytes = elements per block times bytes per element
        uint_t sharedMem = sharedLength * ( sizeof(KeyType)
                                            + sizeof(ValueType) );

        // sort each block-sized chunk; adjacent chunks are sorted in
        // opposite directions so each pair forms a bitonic series
        sortSharedInc<<<blockCount, threadCount, sharedMem>>>(
                d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, sharedLength);
        cuda::checkLastError("bitonic::sortSharedInc");

        // iteratively merge the chunk results entirely in global/shared
        // memory: size doubles from 2*sharedLength up to arrayLength
        for(uint_t size = 2 * sharedLength; size <= arrayLength; size <<= 1)
        {
            // stride starts at half the size and halves each step
            for(unsigned stride = size / 2; stride > 0; stride >>= 1)
            {
                if(stride >= sharedLength)
                {
                    // comparators span block boundaries: perform exactly
                    // one merge step in global memory
                    uint_t threadCount = std::min(globalThread, arrayLength/2);
                    uint_t blockCount  = arrayLength / (2*threadCount);
                    mergeGlobal<<<blockCount, threadCount>>>(
                            d_DstKey, d_DstVal, d_DstKey, d_DstVal,
                            arrayLength, size, stride, dir);
                    cuda::checkLastError("bitonic::mergeGlobal");
                }
                else
                {
                    // the remaining strides all fit within one block: one
                    // launch of the shared-memory merge kernel consumes
                    // them, hence the break out of the stride loop
                    mergeShared<<<blockCount, threadCount, sharedMem>>>(
                            d_DstKey, d_DstVal, d_DstKey, d_DstVal,
                            arrayLength, sharedLength, size, dir);
                    cuda::checkLastError("bitonic::mergeShared");
                    break;
                }
            }
        }

        ret = threadCount;
    }

    return ret;
}
993 
994 
995 
996 
999 
/// Kernel launcher, key-only overload: sorts an array of keys with the
/// bitonic sort algorithm.
///
/// Preconditions: arrayLength and sharedLength are powers of two
/// (asserted). Returns the number of threads used per block, or 0 if
/// there is nothing to sort.
///
/// \param d_DstKey      pointer to start of output array for keys
/// \param d_SrcKey      pointer to start of input array for keys
/// \param arrayLength   the size of the array to sort
/// \param sharedLength  number of shared memory elements per kernel
/// \param dir           whether to sort ascending or descending
/// \param globalThread  how many threads to use in the global merge
template <typename KeyType>
uint_t sort(
    KeyType *d_DstKey,
    KeyType *d_SrcKey,
    uint_t   arrayLength,
    uint_t   sharedLength,
    Direction dir,
    uint_t   globalThread
){
    uint_t ret;

    // nothing to sort
    if(arrayLength < 2)
        return 0;

    // only power-of-two lengths are handled; a future version could pad to
    // the next power of two with "infinite" sentinel keys
    assert( isPow2(arrayLength) );
    assert( isPow2(sharedLength) );

    if(arrayLength <= sharedLength)
    {
        // the whole problem fits in one block's shared memory: a single
        // launch of the unified sort kernel finishes the job
        uint_t blockCount  = 1;
        uint_t threadCount = arrayLength / 2;
        uint_t sharedMem   = arrayLength * ( sizeof(KeyType) );

        sortShared<<<blockCount, threadCount, sharedMem>>>(
                d_DstKey, d_SrcKey, arrayLength, dir);
        cuda::checkLastError("bitonic::sortShared");

        ret = threadCount;
    }
    else
    {
        // both lengths are powers of two with arrayLength > sharedLength,
        // so this division is exact (and itself a power of two)
        uint_t blockCount = dividePow2(arrayLength, sharedLength);

        // one thread per comparator: half the block size
        uint_t threadCount = sharedLength / 2;

        // shared bytes = elements per block times bytes per element
        uint_t sharedMem = sharedLength * ( sizeof(KeyType) );

        // sort each block-sized chunk; adjacent chunks are sorted in
        // opposite directions so each pair forms a bitonic series
        sortSharedInc<<<blockCount, threadCount, sharedMem>>>(
                d_DstKey, d_SrcKey, sharedLength);
        cuda::checkLastError("bitonic::sortSharedInc");

        // iteratively merge the chunk results: size doubles from
        // 2*sharedLength up to arrayLength
        for(uint_t size = 2 * sharedLength; size <= arrayLength; size <<= 1)
        {
            // stride starts at half the size and halves each step
            for(unsigned stride = size / 2; stride > 0; stride >>= 1)
            {
                if(stride >= sharedLength)
                {
                    // comparators span block boundaries: perform exactly
                    // one merge step in global memory
                    uint_t threadCount = std::min(globalThread, arrayLength/2);
                    uint_t blockCount  = arrayLength / (2*threadCount);
                    mergeGlobal<<<blockCount, threadCount>>>(
                            d_DstKey, d_DstKey,
                            arrayLength, size, stride, dir);
                    cuda::checkLastError("bitonic::mergeGlobal");
                }
                else
                {
                    // the remaining strides all fit within one block: one
                    // launch of the shared-memory merge kernel consumes
                    // them, hence the break out of the stride loop
                    mergeShared<<<blockCount, threadCount, sharedMem>>>(
                            d_DstKey, d_DstKey,
                            arrayLength, sharedLength, size, dir);
                    cuda::checkLastError("bitonic::mergeShared");
                    break;
                }
            }
        }

        ret = threadCount;
    }

    return ret;
}
1134 
1135 
1136 
1137 
1140 
/// Used when arrayLength is not a power of two: writes the sentinel value
/// `init` into every element of d_SrcKey in [0, arrayLength).
///
/// One element per thread; threads whose flat id falls past the end do
/// nothing, so any grid that covers arrayLength threads is valid.
///
/// \param d_SrcKey     array to fill (offset already applied by caller)
/// \param init         sentinel value to write into each slot
/// \param arrayLength  number of elements to fill
template <typename KeyType>
__global__ void prepare( KeyType* d_SrcKey,
                         KeyType  init,
                         uint_t   arrayLength )
{
    // flat global thread id with tail guard
    uint_t tid = blockIdx.x * blockDim.x + threadIdx.x;
    if(tid < arrayLength)
        d_SrcKey[tid] = init;
}
1156 
1157 
1158 
1159 
1160 
1161 
1162 
1163 
1164 
1165 
1166 } // namespace bitonic
1167 } // namespace cuda
1168 } // namespace mpblocks
1169 
1170 
1171 
1172 #endif
__global__ void sortShared(KeyType *d_DstKey, ValueType *d_DstVal, KeyType *d_SrcKey, ValueType *d_SrcVal, uint_t arrayLength, Direction dir)
single kernel (unified) bitonic sort
Definition: kernels.cu.hpp:108
int x
Definition: fakecuda.h:44
#define __global__
Definition: fakecuda.h:33
__global__ void sortSharedInc(KeyType *d_DstKey, ValueType *d_DstVal, KeyType *d_SrcKey, ValueType *d_SrcVal, uint_t sharedLength)
bottom level of the bitonic sort
Definition: kernels.cu.hpp:283
int init()
calls FCGX_Init
__global__ void prepare(KeyType *d_SrcKey, KeyType init, uint_t arrayLength)
used when arrayLength is not a power of two, it writes to all values of d_SrcKey (which is an offset...
#define __shared__
Definition: fakecuda.h:36
bool isPow2(T x)
returns true if the parameter is an exact power of two
Definition: powersOfTwo.h:71
T dividePow2(T x, T y)
returns x/y if x and y are both powers of two, otherwise the result is undefined
Definition: powersOfTwo.h:321
__global__ void mergeGlobal(KeyType *d_DstKey, ValueType *d_DstVal, KeyType *d_SrcKey, ValueType *d_SrcVal, uint_t arrayLength, uint_t size, uint_t stride, Direction dir)
sorts a bitonic series, this kernel is for a stride >= SHARED_SIZE_LIMIT
Definition: kernels.cu.hpp:558
void checkLastError(const std::string &msg="checkLastError")
wraps getLastError
Dim3 threadIdx
__device__ void compareSwap(KeyType &keyA, ValueType &valA, KeyType &keyB, ValueType &valB, Direction dir)
implements a "comparator": compares to keys and swaps them if they are not in the desired order ...
Definition: kernels.cu.hpp:52
Dim3 blockIdx
#define UMAD(a, b, c)
Definition: kernels.cu.hpp:31
#define __device__
Definition: fakecuda.h:34
uint_t sort(KeyType *d_DstKey, ValueType *d_DstVal, KeyType *d_SrcKey, ValueType *d_SrcVal, uint_t arrayLength, uint_t sharedLength, Direction dir, uint_t globalThread)
kernel launcher, sorts an array of key/value pairs using the bitonic sort algorithm ...
Definition: kernels.cu.hpp:870
Dim3 blockDim
Direction
specifies values for the direction the sorter should sort the keys
Definition: Direction.h:37
__global__ void mergeShared(KeyType *d_DstKey, ValueType *d_DstVal, KeyType *d_SrcKey, ValueType *d_SrcVal, uint_t arrayLength, uint_t sharedLength, uint_t size, Direction dir)
sorts a bitonic series, this kernel is for size > SHARED_SIZE_LIMIT and for a stride in [1...
Definition: kernels.cu.hpp:699
void __syncthreads()