cheshirekow  v0.1.0
PointSet.cu.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2012 Josh Bialkowski (jbialk@mit.edu)
3  *
4  * This file is part of mpblocks.
5  *
6  * mpblocks is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * mpblocks is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with mpblocks. If not, see <http://www.gnu.org/licenses/>.
18  */
25 #ifndef MPBLOCKS_DUBINS_CURVES_CUDA2_POINTSET_HPP_
26 #define MPBLOCKS_DUBINS_CURVES_CUDA2_POINTSET_HPP_
27 
28 #include <map>
29 #include <string>
31 
32 namespace mpblocks {
33 namespace dubins {
34 namespace curves_cuda {
35 
36 template <typename Format_t>
38  if (m_buf) delete[] m_buf;
39 
40  m_buf = new float[rows * cols];
41 
42  m_rows = rows;
43  m_cols = cols;
44 }
45 
46 template <typename Format_t>
48  : m_sorter(-std::numeric_limits<Format_t>::max(),
49  std::numeric_limits<Format_t>::max()) {
50  m_g_in = 0;
51  m_g_out = 0;
52  m_g_sorted = 0;
53  m_params.r = r;
54 
56  m_nSM = 0;
57 
58  deallocate();
59 
60  try {
61  config();
62  allocate(n);
63  } catch (const std::exception& ex) {
64  std::cerr << "Error in constructing dubins CUDA PointSet: " << ex.what()
65  << "\nNote: point set is unallocated\n";
66  }
67 }
68 
69 template <typename Format_t>
71  deallocate();
72 }
73 
74 template <typename Format_t>
76  if (m_g_in) {
77  cuda::free(m_g_in);
78  m_g_in = 0;
79  }
80 
81  if (m_g_out) {
82  cuda::free(m_g_out);
83  m_g_out = 0;
84  }
85 
86  if (m_g_sorted) {
87  cuda::free(m_g_sorted);
88  m_g_sorted = 0;
89  }
90 
91  m_dbAlloc = 0;
92  m_dbAlloc2 = 0;
93  m_dbSize = 0;
94 }
95 
96 template <typename Format_t>
98  deallocate();
99 
100  m_dbAlloc = n;
101  m_dbAlloc2 = cuda::nextPow2(n);
102 
103  m_g_in = cuda::mallocPitchT<Format_t>(m_pitchIn, m_dbAlloc, 3);
104  std::cout << "allocated m_g_in for " << m_dbAlloc
105  << " object with pitch: " << m_pitchIn << "\n";
106 
107  m_g_out = cuda::mallocPitchT<Format_t>(m_pitchOut, m_dbAlloc2, 2);
108  std::cout << "allocated m_g_out for " << m_dbAlloc
109  << " object with pitch: " << m_pitchOut << "\n";
110 
111  m_g_sorted = cuda::mallocPitchT<Format_t>(m_pitchOut, m_dbAlloc2, 2);
112  std::cout << "allocated m_g_sorted for " << m_dbAlloc
113  << " object with pitch: " << m_pitchOut << "\n";
114 }
115 
116 template <typename Format_t>
117 void PointSet<Format_t>::set_r(Format_t r) {
118  m_params.r = r;
119 }
120 
121 template <typename Format_t>
122 void PointSet<Format_t>::clear(bool clearmem) {
123  m_dbSize = 0;
124  if (clearmem) {
125  cuda::memset2DT(m_g_in, m_pitchIn, 0, m_dbAlloc, 3);
126  cuda::memset2DT(m_g_out, m_pitchOut, 0, m_dbAlloc2, 2);
127  cuda::memset2DT(m_g_sorted, m_pitchOut, 0, m_dbAlloc2, 2);
128  }
129 }
130 
131 template <typename Format_t>
133  int devId = cuda::getDevice();
134  config(devId);
135 }
136 
137 template <typename Format_t>
138 void PointSet<Format_t>::config(int devId) {
139  cuda::DeviceProp devProps(devId);
141  uint_t maxRegs = 0;
142 
143  attr.getFrom(&kernels::distance_to_set<Format_t>);
144  maxRegs = std::max(maxRegs, (uint_t)attr.numRegs);
145 
146  attr.getFrom(&kernels::distance_to_set_with_id<Format_t>);
147  maxRegs = std::max(maxRegs, (uint_t)attr.numRegs);
148 
149  attr.getFrom(&kernels::distance_to_set_debug<Format_t>);
150  maxRegs = std::max(maxRegs, (uint_t)attr.numRegs);
151 
152  attr.getFrom(&kernels::distance_from_set<Format_t>);
153  maxRegs = std::max(maxRegs, (uint_t)attr.numRegs);
154 
155  attr.getFrom(&kernels::distance_from_set_with_id<Format_t>);
156  maxRegs = std::max(maxRegs, (uint_t)attr.numRegs);
157 
158  attr.getFrom(&kernels::distance_from_set_debug<Format_t>);
159  maxRegs = std::max(maxRegs, (uint_t)attr.numRegs);
160 
161  // the maximum number of threads we can put into a block is given by the
162  // number of registers on each SM divided by the number of registers that
163  // are used by each thread in the kernel
164  uint_t threadCount_max = (uint_t)devProps.regsPerBlock / maxRegs;
165 
166  // make sure that the number of threads per block computed as above doesn't
167  // exceed the max per-block for the architectture
168  m_threadsPerBlock =
169  std::min(threadCount_max, (uint_t)devProps.maxThreadsPerBlock);
170 
171  // get the number of multiprocessors
172  m_nSM = devProps.multiProcessorCount;
173 
174  // configure the sorter
175  m_sorter.config(devId);
176 }
177 
178 template <typename Format_t>
180  threads = cuda::intDivideRoundUp(m_dbSize, m_nSM);
181  if (threads > m_threadsPerBlock) threads = m_threadsPerBlock;
182  blocks = cuda::intDivideRoundUp(m_dbSize, threads);
183 }
184 
185 template <typename Format_t>
186 int PointSet<Format_t>::insert(Format_t q[3]) {
187  cuda::memcpy2DT(m_g_in + m_dbSize, m_pitchIn, q, sizeof(Format_t), 1, 3,
188  cudaMemcpyHostToDevice);
189 
190  m_dbSize++;
191  return m_dbSize - 1;
192 }
193 
194 template <typename Format_t>
196  ResultBlock<Format_t>& out) {
197  m_params.set_q(q);
198 
199  uint_t blocks, threads;
200  computeGrid(blocks, threads);
201 
202  size_t pitchIn = m_pitchIn / sizeof(Format_t);
203  size_t pitchOut = m_pitchOut / sizeof(Format_t);
204 
205  const int DEBUG_ROWS = 4 * 11 + 4 * 13;
206 
207  switch (out.rows()) {
208  case 1: {
209  // call the kernel
210  kernels::distance_to_set<Format_t><<<blocks,threads>>>(
211  m_params,
212  m_g_in,
213  pitchIn,
214  m_g_out,
215  pitchOut,
216  m_dbSize
217  );
219 
220  // retrieve results
221  cuda::memcpy2DT(out.ptr(), out.pitch(), m_g_out, m_pitchOut, out.cols(),
222  2, cudaMemcpyDeviceToHost);
223 
224  break;
225  }
226 
227  case 2: {
228  // call the kernel
229  kernels::distance_to_set_with_id<Format_t><<<blocks,threads>>>(
230  m_params,
231  m_g_in,
232  pitchIn,
233  m_g_out,
234  pitchOut,
235  m_dbSize
236  );
237 
239 
240  // retrieve results
241  cuda::memcpy2DT(out.ptr(), out.pitch(), m_g_out, m_pitchOut, out.cols(),
242  2, cudaMemcpyDeviceToHost);
243  break;
244  }
245 
246  case DEBUG_ROWS: {
247  // allocate output storage
248  size_t pitch = 1;
249  Format_t* g_out =
250  cuda::mallocPitchT<Format_t>(pitch, m_dbSize, DEBUG_ROWS);
251 
252  // call the kernel
253  kernels::distance_to_set_debug<Format_t><<<blocks,threads>>>(
254  m_params,
255  m_g_in,
256  pitchIn,
257  g_out,
258  pitch/sizeof(Format_t),
259  m_dbSize
260  );
262 
263  // retrieve results
264  cuda::memcpy2DT(out.ptr(), out.pitch(), g_out, pitch, out.cols(),
265  DEBUG_ROWS, cudaMemcpyDeviceToHost);
266 
267  // free output storage
268  cuda::free(g_out);
269  break;
270  }
271 
272  default:
273  utility::ex() << "PointSet::distance_to_set: "
274  "Valid output rows is 1,2, or 24";
275  break;
276  }
277 }
278 
279 template <typename Format_t>
281  ResultBlock<Format_t>& out) {
282  m_params.set_q(q);
283 
284  uint_t blocks, threads;
285  computeGrid(blocks, threads);
286 
287  size_t pitchIn = m_pitchIn / sizeof(Format_t);
288  size_t pitchOut = m_pitchOut / sizeof(Format_t);
289 
290  switch (out.rows()) {
291  case 1: {
292  // call the kernel
293  kernels::distance_from_set<Format_t><<<blocks,threads>>>(
294  m_params,
295  m_g_in,
296  pitchIn,
297  m_g_out,
298  pitchOut,
299  m_dbSize
300  );
302 
303  // retrieve results
305  out.ptr(), out.pitch(),
306  m_g_out, m_pitchOut,
307  out.cols(), 2,
308  cudaMemcpyDeviceToHost );
309  break;
310  }
311 
312  case 2: {
313  // call the kernel
314  kernels::distance_from_set_with_id<Format_t><<<blocks,threads>>>(
315  m_params,
316  m_g_in,
317  pitchIn,
318  m_g_out,
319  pitchOut,
320  m_dbSize
321  );
323 
324  // retrieve results
325  cuda::memcpy2DT(out.ptr(), out.pitch(), m_g_out, m_pitchOut, out.cols(), 2,
326  cudaMemcpyDeviceToHost);
327  break;
328  }
329 
330  case 24: {
331  // allocate output storage
332  size_t pitch = 1;
333  Format_t* g_out = cuda::mallocPitchT<Format_t>(pitch, m_dbSize, 24);
334 
335  // call the kernel
336  kernels::distance_from_set_debug<Format_t><<<blocks,threads>>>(
337  m_params,
338  m_g_in,
339  pitchIn,
340  g_out,
341  pitch/sizeof(Format_t),
342  m_dbSize
343  );
345 
346  // retrieve results
347  cuda::memcpy2DT(out.ptr(), out.pitch(), g_out, pitch, out.cols(), 24,
348  cudaMemcpyDeviceToHost);
349 
350  // free output storage
351  cuda::free(g_out);
352  break;
353  }
354 
355  default:
356  utility::ex() << "PointSet::distance_from_set: "
357  "Valid output rows is 1,2, or 24";
358  break;
359  }
360 }
361 
362 template <typename Format_t>
364  ResultBlock<Format_t>& out) {
365  m_params.set_q(q);
366 
367  uint_t blocks, threads;
368  computeGrid(blocks, threads);
369 
370  size_t pitchIn = m_pitchIn / sizeof(Format_t);
371  size_t pitchOut = m_pitchOut / sizeof(Format_t);
372 
373  // call the kernel to calculate distances to children
374  kernels::distance_to_set_with_id<Format_t><<<blocks,threads>>>(
375  m_params,
376  m_g_in,
377  pitchIn,
378  m_g_out,
379  pitchOut,
380  m_dbSize
381  );
383 
384  Format_t* unsortedKeys = m_g_out;
385  Format_t* unsortedVals = m_g_out + pitchOut;
386  Format_t* sortedKeys = m_g_sorted;
387  Format_t* sortedVals = m_g_sorted + pitchOut;
388 
389  // call the kernel to sort the results
390  m_sorter.sort(sortedKeys, sortedVals, unsortedKeys, unsortedVals, m_dbSize,
393 
394  // fetch the k smallest
395  cuda::memcpy2DT(out.ptr(), out.pitch(), m_g_sorted, m_pitchOut, out.cols(), 2,
396  cudaMemcpyDeviceToHost);
397 }
398 
399 template <typename Format_t>
401  ResultBlock<Format_t>& out) {
402  m_params.set_q(q);
403 
404  uint_t blocks, threads;
405  computeGrid(blocks, threads);
406 
407  size_t pitchIn = m_pitchIn / sizeof(Format_t);
408  size_t pitchOut = m_pitchOut / sizeof(Format_t);
409 
410  // call the kernel to calculate distances to children
411  kernels::distance_from_set_with_id<Format_t><<<blocks,threads>>>(
412  m_params,
413  m_g_in,
414  pitchIn,
415  m_g_out,
416  pitchOut,
417  m_dbSize
418  );
420 
421  Format_t* unsortedKeys = m_g_out;
422  Format_t* unsortedVals = m_g_out + pitchOut;
423  Format_t* sortedKeys = m_g_sorted;
424  Format_t* sortedVals = m_g_sorted + pitchOut;
425 
426  // call the kernel to sort the results
427  m_sorter.sort(sortedKeys, sortedVals, unsortedKeys, unsortedVals, m_dbSize,
430 
431  // fetch the k smallest
432  cuda::memcpy2DT(out.ptr(), out.pitch(), m_g_sorted, m_pitchOut, out.cols(), 2,
433  cudaMemcpyDeviceToHost);
434 }
435 
436 template <typename Format_t>
438  ResultBlock<Format_t>& out) {
439  uint_t blocks, threads;
440  computeGrid(blocks, threads);
441 
442  size_t pitchIn = m_pitchIn / sizeof(Format_t);
443  size_t pitchOut = m_pitchOut / sizeof(Format_t);
444 
446  params.set_q(q);
447 
448  // call the kernel to calculate distances to children
449  kernels::group_distance_to_set<Format_t><<<blocks,threads>>>(
450  params,
451  m_g_in,
452  pitchIn,
453  m_g_out,
454  pitchOut,
455  m_dbSize
456  );
458 
459  // fetch the distances
460  cuda::memcpy2DT(out.ptr(), out.pitch(), m_g_out, m_pitchOut, out.cols(), 1,
461  cudaMemcpyDeviceToHost);
462 }
463 
464 template <typename Format_t>
466  ResultBlock<Format_t>& out) {
467  uint_t blocks, threads;
468  computeGrid(blocks, threads);
469 
470  size_t pitchIn = m_pitchIn / sizeof(Format_t);
471  size_t pitchOut = m_pitchOut / sizeof(Format_t);
472 
474  params.set_q(q);
475 
476  // call the kernel to calculate distances to children
477  kernels::group_distance_to_set_with_id<Format_t><<<blocks,threads>>>(
478  params,
479  m_g_in,
480  pitchIn,
481  m_g_out,
482  pitchOut,
483  m_dbSize
484  );
486 
487  Format_t* unsortedKeys = m_g_out;
488  Format_t* unsortedVals = m_g_out + pitchOut;
489  Format_t* sortedKeys = m_g_sorted;
490  Format_t* sortedVals = m_g_sorted + pitchOut;
491 
492  // call the kernel to sort the results
493  m_sorter.sort(sortedKeys, sortedVals, unsortedKeys, unsortedVals, m_dbSize,
496 
497  // fetch the k smallest
498  cuda::memcpy2DT(out.ptr(), out.pitch(), m_g_sorted, m_pitchOut, out.cols(), 2,
499  cudaMemcpyDeviceToHost);
500 }
501 
502 template <typename Format_t>
504  map["distance_to_set"].getFrom(&kernels::distance_to_set<Format_t>);
505  map["distance_to_set_with_id"].getFrom(
506  &kernels::distance_to_set_with_id<Format_t>);
507  map["distance_to_set_debug"].getFrom(
508  &kernels::distance_to_set_debug<Format_t>);
509  map["distance_from_set"].getFrom(&kernels::distance_from_set<Format_t>);
510  map["distance_from_set_with_id"].getFrom(
511  &kernels::distance_from_set_with_id<Format_t>);
512  map["distance_from_set_debug"].getFrom(
513  &kernels::distance_from_set_debug<Format_t>);
514  map["euclidean_to_set"].getFrom(&kernels::group_distance_to_set<Format_t>);
515  map["euclidean_to_set_with_id"].getFrom(
516  &kernels::group_distance_to_set<Format_t>);
517 
518  // Sorter_t::get_fattr(map);
519 }
520 
521 } // curves
522 } // dubins
523 } // mpblocks
524 
525 #endif // MPBLOCKS_DUBINS_CURVES_CUDA2_POINTSET_HPP_
void distance_to_set(Format_t q[3], ResultBlock< Format_t > &out)
batch compute distance to point set
Format_t * m_g_sorted
output for sorted results
Definition: PointSet.h:101
void allocate(uint_t n)
reallocates device storage for a point set of size n, also resets the database
Definition: PointSet.cu.hpp:97
ExceptionStream< std::runtime_error > ex
void clear(bool clearmem=false)
clear the database and reset input iterator
static void get_fattr(fattrMap_t &)
retrieve kernel attributes into the map, intended only for printing out statistics ...
void deallocate()
deallocate and zero out pointers
Definition: PointSet.cu.hpp:75
std::map< std::string, cuda::FuncAttributes > fattrMap_t
Definition: PointSet.h:88
void deviceSynchronize()
blocks the host thread until kernels are done executing
void group_distance_to_set(Format_t q[3], ResultBlock< Format_t > &out)
batch compute euclidean distances
Format_t * m_g_out
kernel output buffer
Definition: PointSet.h:100
T intDivideRoundUp(T x, T y)
integer divide with round up
Definition: powersOfTwo.h:270
void nearest_parents(Format_t q[3], ResultBlock< Format_t > &out)
return k nearest parents of q
void free(void *devPtr)
wraps cudaFree
void set_r(Format_t r)
set the radius
void distance_from_set(Format_t q[3], ResultBlock< Format_t > &out)
batch compute distance from point set
void memcpy2DT(T *dst, size_t dpitchBytes, const T *src, size_t spitchBytes, size_t widthObs, size_t height, MemcpyKind kind)
wraps cudaMemcpy2D
Definition: wrap.hpp:62
void nearest_children(Format_t q[3], ResultBlock< Format_t > &out)
return k nearest children of q
void computeGrid(uint_t &blocks, uint_t &threads)
compute the grid size given the current configuration and size of the point set
Params< Format_t > m_params
query parameters
Definition: PointSet.h:92
int insert(Format_t q[3])
insert a new state into the point set, and return it's id
void getFrom(T *entry)
Definition: wrap.hpp:35
ExceptionStream< std::runtime_error > ex
void memset2DT(T *devPtr, size_t pitchBytes, int value, size_t widthObjs, size_t height)
wraps cudaMemset2D
Definition: wrap.hpp:88
uint_t m_threadsPerBlock
maximum threads per block
Definition: PointSet.h:105
uint_t m_nSM
number of multiprocessors
Definition: PointSet.h:106
PointSet(uint_t n=10, Format_t r=1)
Definition: PointSet.cu.hpp:47
void group_distance_neighbors(Format_t q[3], ResultBlock< Format_t > &out)
find k euclidean nearest neighbors
Format_t * m_g_in
kernel input buffer
Definition: PointSet.h:99
void allocate(uint_t rows, uint_t cols)
Definition: PointSet.cu.hpp:37
void config()
retreives device properties of the current device, used to calculate kernel peramaters, call once after setting the cuda device and before launching any kernels
sort should be ascending, i.e. a[i] < a[j], i < j
Definition: Direction.h:40
T nextPow2(T x)
returns the smallest power of two that is not less than x
Definition: powersOfTwo.h:102
int getDevice()
wraps cudaGetDevice