cheshirekow  v0.1.0
PointSet.cu.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2012 Josh Bialkowski (jbialk@mit.edu)
3  *
4  * This file is part of mpblocks.
5  *
6  * mpblocks is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * mpblocks is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with mpblocks. If not, see <http://www.gnu.org/licenses/>.
18  */
27 #ifndef MPBLOCKS_CUDANN_POINTSET_CU_HPP_
28 #define MPBLOCKS_CUDANN_POINTSET_CU_HPP_
29 
30 
31 #include <map>
32 #include <string>
33 #include <limits>
34 #include <iostream>
35 
36 #include <mpblocks/cuda.hpp>
40 
41 
42 namespace mpblocks {
43 namespace cudaNN {
44 
45 
46 template <typename Format_t, unsigned int NDim, bool Enable>
47 struct SE3Attr
48 {
49  static void maxRegs(unsigned int &maxRegs){}
50 };
51 
52 template <typename Format_t, unsigned int NDim>
53 struct SE3Attr<Format_t,NDim,true>
54 {
55  static void maxRegs(unsigned int &maxRegs)
56  {
57  typedef unsigned int uint_t;
59 
60  attr.getFrom( &kernels::se3_distance<Format_t,NDim> );
61  maxRegs = std::max(maxRegs, (uint_t)attr.numRegs);
62  }
63 };
64 
65 
66 
67 template <typename Format_t, unsigned int NDim, bool Enable>
68 struct R2S1Attr
69 {
70  static void maxRegs(unsigned int &maxRegs){}
71 };
72 
73 template <typename Format_t, unsigned int NDim>
74 struct R2S1Attr<Format_t,NDim,true>
75 {
76  static void maxRegs(unsigned int &maxRegs)
77  {
78  typedef unsigned int uint_t;
80 
81  attr.getFrom( &kernels::r2s1_distance<Format_t,NDim> );
82  maxRegs = std::max(maxRegs, (uint_t)attr.numRegs);
83  }
84 };
85 
86 
87 template <typename Format_t, unsigned int NDim>
89 {
90  cuda::DeviceProp devProps(devId);
92  uint_t maxRegs = 0;
93 
94  typedef QueryPoint<Format_t,NDim> QP;
96  typedef unsigned int uint;
97  typedef void (*point_dist_fn)(QP,Format_t*,uint,Format_t*,uint,uint);
98  point_dist_fn euclidean_dist_fn =
99  &kernels::euclidean_distance<Format_t,NDim>;
100 
101  attr.getFrom( euclidean_dist_fn );
102  maxRegs = std::max(maxRegs, (uint_t)attr.numRegs);
103 
104  SE3Attr<Format_t, NDim,(NDim>=7)>::maxRegs(maxRegs);
105  R2S1Attr<Format_t,NDim,(NDim>=3)>::maxRegs(maxRegs);
106 
107  // the maximum number of threads we can put into a block is given by the
108  // number of registers on each SM divided by the number of registers that
109  // are used by each thread in the kernel
110  uint_t threadCount_max = (uint_t)devProps.regsPerBlock / maxRegs;
111 
112  // make sure that the number of threads per block computed as above doesn't
113  // exceed the max per-block for the architectture
114  m_threadsPerBlock = std::min( threadCount_max,
115  (uint_t)devProps.maxThreadsPerBlock);
116 
117  // get the number of multiprocessors
118  m_nSM = devProps.multiProcessorCount;
119 
120  // configure the sorter
121  m_sorter.config(devId);
122 }
123 
124 
125 template <typename Format_t, unsigned int NDim>
127  EuclideanTag tag, const Format_t q[NDim], Result_t& out )
128 {
129  distance(tag,q,m_dbSize,out);
130 }
131 
132 
133 template <typename Format_t, unsigned int NDim>
135  EuclideanTag, const Format_t q[NDim], uint_t size, Result_t& out )
136 {
137  uint_t blocks,threads;
138  computeGrid(blocks,threads);
139 
140  size_t pitchIn = m_pitchIn/sizeof(Format_t);
141  size_t pitchOut = m_pitchOut/sizeof(Format_t);
143  std::copy(q,q+NDim,query.data);
144 
145  // call the kernel
146  kernels::euclidean_distance<Format_t,NDim><<<blocks,threads>>>(
147  query,
148  m_g_in,
149  pitchIn,
150  m_g_out,
151  pitchOut,
152  size
153  );
155 
156  // retrieve results
158  out.ptr(), out.pitch(),
159  m_g_out, m_pitchOut,
160  out.cols(), 1,
161  cudaMemcpyDeviceToHost );
162 }
163 
164 
165 template <typename Format_t, unsigned int NDim>
167  EuclideanTag tag, const Format_t q[NDim], Result_t& out )
168 {
169  nearest(tag,q,m_dbSize,out);
170 }
171 
172 
173 template <typename Format_t, unsigned int NDim>
175  EuclideanTag, const Format_t q[NDim], uint_t size, Result_t& out )
176 {
177  uint_t blocks,threads;
178  computeGrid(blocks,threads);
179 
180  size_t pitchIn = m_pitchIn/sizeof(Format_t);
181  size_t pitchOut = m_pitchOut/sizeof(Format_t);
183  std::copy(q,q+NDim,query.data);
184 
185  // call the kernel to calculate distances to children
186  kernels::euclidean_distance<Format_t,NDim><<<blocks,threads>>>(
187  query,
188  m_g_in,
189  pitchIn,
190  m_g_out,
191  pitchOut,
192  size
193  );
195 
196  Format_t* unsortedKeys = m_g_out;
197  Format_t* unsortedVals = m_g_out + pitchOut;
198  Format_t* sortedKeys = m_g_sorted;
199  Format_t* sortedVals = m_g_sorted + pitchOut;
200 
201  // call the kernel to sort the results
202  m_sorter.sort(
203  sortedKeys, sortedVals,
204  unsortedKeys, unsortedVals,
205  size,
208 
209  // fetch the k smallest
211  out.ptr(), out.pitch(),
212  m_g_sorted, m_pitchOut,
213  out.cols(), 2,
214  cudaMemcpyDeviceToHost );
215 }
216 
217 
218 template <typename Format_t, unsigned int NDim,bool Enabled>
219 struct SE3Kernel
220 {
221  static void dispatch( Format_t weight,
222  const QueryPoint<Format_t,NDim>& q,
223  Format_t* g_in,
224  unsigned int pitchIn,
225  Format_t* g_out,
226  unsigned int pitchOut,
227  unsigned int n,
228  unsigned int blocks,
229  unsigned int threads )
230  {
231  std::cerr << "CANNOT CALL SE3 KERNEL IF NDIM != 7\n";
232  assert( false );
233  }
234 };
235 
236 
237 
238 
239 template <typename Format_t, unsigned int NDim>
240 struct SE3Kernel<Format_t,NDim,true>
241 {
242  static void dispatch(
243  Format_t weight,
244  const QueryPoint<Format_t,NDim>& q,
245  Format_t* g_in,
246  unsigned int pitchIn,
247  Format_t* g_out,
248  unsigned int pitchOut,
249  unsigned int n,
250  unsigned int blocks,
251  unsigned int threads)
252  {
253  // call the kernel
254  kernels::se3_distance<Format_t,NDim><<<blocks,threads>>>
255  (weight,q,g_in,pitchIn,g_out,pitchOut,n );
257  }
258 };
259 
260 
261 template <typename Format_t, unsigned int NDim>
263  SE3Tag params, const Format_t q[NDim], Result_t& out )
264 {
265  distance(params,q,m_dbSize,out);
266 }
267 
268 
269 template <typename Format_t, unsigned int NDim>
271  SE3Tag params, const Format_t q[NDim], uint_t size, Result_t& out )
272 {
273  uint_t blocks,threads;
274  computeGrid(blocks,threads);
275 
276  size_t pitchIn = m_pitchIn/sizeof(Format_t);
277  size_t pitchOut = m_pitchOut/sizeof(Format_t);
279  std::copy(q,q+NDim,query.data);
280  Format_t w = params.w;
281 
282  SE3Kernel<Format_t,NDim,(NDim>=7)>::dispatch(
283  w,
284  query,
285  m_g_in,
286  pitchIn,
287  m_g_out,
288  pitchOut,
289  size,
290  blocks, threads);
291 
292 
293  // retrieve results
295  out.ptr(), out.pitch(),
296  m_g_out, m_pitchOut,
297  out.cols(), 1,
298  cudaMemcpyDeviceToHost );
299 }
300 
301 
302 template <typename Format_t, unsigned int NDim>
304  SE3Tag params, const Format_t q[NDim], Result_t& out )
305 {
306  nearest(params,q,m_dbSize,out);
307 }
308 
309 
310 template <typename Format_t, unsigned int NDim>
312  SE3Tag params, const Format_t q[NDim], uint_t size, Result_t& out )
313 {
314  uint_t blocks,threads;
315  computeGrid(blocks,threads);
316 
317 
318  size_t pitchIn = m_pitchIn/sizeof(Format_t);
319  size_t pitchOut = m_pitchOut/sizeof(Format_t);
321  std::copy(q,q+NDim,query.data);
322  Format_t w = params.w;
323 
324  // call the kernel to calculate distances to children
325  SE3Kernel<Format_t,NDim,(NDim>=7)>::dispatch(
326  w,
327  query,
328  m_g_in,
329  pitchIn,
330  m_g_out,
331  pitchOut,
332  size,
333  blocks, threads );
334 
335  Format_t* unsortedKeys = m_g_out;
336  Format_t* unsortedVals = m_g_out + pitchOut;
337  Format_t* sortedKeys = m_g_sorted;
338  Format_t* sortedVals = m_g_sorted + pitchOut;
339 
340  // call the kernel to sort the results
341  m_sorter.sort(
342  sortedKeys, sortedVals,
343  unsortedKeys, unsortedVals,
344  size,
347 
348  // fetch the k smallest
350  out.ptr(), out.pitch(),
351  m_g_sorted, m_pitchOut,
352  out.cols(), 2,
353  cudaMemcpyDeviceToHost );
354 
356 }
357 
358 
359 
360 
361 
362 
363 template <typename Format_t, unsigned int NDim,bool Enabled>
365 {
366  static void dispatch( Format_t weight,
367  const QueryPoint<Format_t,NDim>& q,
368  Format_t* g_in,
369  unsigned int pitchIn,
370  Format_t* g_out,
371  unsigned int pitchOut,
372  unsigned int n,
373  unsigned int blocks,
374  unsigned int threads )
375  {
376  std::cerr << "CANNOT CALL R2S1 KERNEL IF NDIM < 3 (" << NDim << ")\n";
377  assert( false );
378  }
379 };
380 
381 
382 
383 
384 template <typename Format_t, unsigned int NDim>
385 struct R2S1Kernel<Format_t,NDim,true>
386 {
387  static void dispatch(
388  Format_t weight,
389  const QueryPoint<Format_t,NDim>& q,
390  Format_t* g_in,
391  unsigned int pitchIn,
392  Format_t* g_out,
393  unsigned int pitchOut,
394  unsigned int n,
395  unsigned int blocks,
396  unsigned int threads)
397  {
398  // call the kernel
399  kernels::r2s1_distance<Format_t,NDim><<<blocks,threads>>>
400  (weight,q,g_in,pitchIn,g_out,pitchOut,n );
402  }
403 };
404 
405 
406 template <typename Format_t, unsigned int NDim>
408  R2S1Tag params, const Format_t q[NDim], Result_t& out )
409 {
410  distance(params,q,m_dbSize,out);
411 }
412 
413 
414 
415 template <typename Format_t, unsigned int NDim>
417  R2S1Tag params, const Format_t q[NDim], uint_t size, Result_t& out )
418 {
419  uint_t blocks,threads;
420  computeGrid(blocks,threads);
421 
422  size_t pitchIn = m_pitchIn/sizeof(Format_t);
423  size_t pitchOut = m_pitchOut/sizeof(Format_t);
425  std::copy(q,q+NDim,query.data);
426  Format_t w = params.w;
427 
428  R2S1Kernel<Format_t,NDim,(NDim>=3)>::dispatch(
429  w,
430  query,
431  m_g_in,
432  pitchIn,
433  m_g_out,
434  pitchOut,
435  size,
436  blocks, threads);
437 
438 
439  // retrieve results
441  out.ptr(), out.pitch(),
442  m_g_out, m_pitchOut,
443  out.cols(), 1,
444  cudaMemcpyDeviceToHost );
445 }
446 
447 
448 template <typename Format_t, unsigned int NDim>
450  R2S1Tag params, const Format_t q[NDim], Result_t& out )
451 {
452  nearest(params,q,m_dbSize,out);
453 }
454 
455 template <typename Format_t, unsigned int NDim>
457  R2S1Tag params, const Format_t q[NDim], uint_t size, Result_t& out )
458 {
459  uint_t blocks,threads;
460  computeGrid(blocks,threads);
461 
462 
463  size_t pitchIn = m_pitchIn/sizeof(Format_t);
464  size_t pitchOut = m_pitchOut/sizeof(Format_t);
466  std::copy(q,q+NDim,query.data);
467  Format_t w = params.w;
468 
469  // call the kernel to calculate distances to children
470  R2S1Kernel<Format_t,NDim,(NDim>=3)>::dispatch(
471  w,
472  query,
473  m_g_in,
474  pitchIn,
475  m_g_out,
476  pitchOut,
477  size,
478  blocks, threads );
479 
480  Format_t* unsortedKeys = m_g_out;
481  Format_t* unsortedVals = m_g_out + pitchOut;
482  Format_t* sortedKeys = m_g_sorted;
483  Format_t* sortedVals = m_g_sorted + pitchOut;
484 
485  // call the kernel to sort the results
486  m_sorter.sort(
487  sortedKeys, sortedVals,
488  unsortedKeys, unsortedVals,
489  size,
492 
493  // fetch the k smallest
495  out.ptr(), out.pitch(),
496  m_g_sorted, m_pitchOut,
497  out.cols(), 2,
498  cudaMemcpyDeviceToHost );
499 
501 }
502 
503 
504 
505 
506 
507 
508 
509 } //< namespace cudaNN
510 } //< namespace mpblocks
511 
512 
513 #endif // POINTSET_HPP_
static void maxRegs(unsigned int &maxRegs)
Definition: PointSet.cu.hpp:76
static void maxRegs(unsigned int &maxRegs)
Definition: PointSet.cu.hpp:55
static void dispatch(Format_t weight, const QueryPoint< Format_t, NDim > &q, Format_t *g_in, unsigned int pitchIn, Format_t *g_out, unsigned int pitchOut, unsigned int n, unsigned int blocks, unsigned int threads)
Format_t * ptr() const
Definition: PointSet.h:65
void deviceSynchronize()
blocks the host thread until kernels are done executing
static void dispatch(Format_t weight, const QueryPoint< Format_t, NDim > &q, Format_t *g_in, unsigned int pitchIn, Format_t *g_out, unsigned int pitchOut, unsigned int n, unsigned int blocks, unsigned int threads)
void memcpy2DT(T *dst, size_t dpitchBytes, const T *src, size_t spitchBytes, size_t widthObs, size_t height, MemcpyKind kind)
wraps cudaMemcpy2D
Definition: wrap.hpp:62
uint_t cols() const
Definition: PointSet.h:66
void distance(EuclideanTag, const Format_t q[NDim], Result_t &out)
batch compute distance to point set
void config()
retreives device properties of the current device, used to calculate kernel peramaters, call once after setting the cuda device and before launching any kernels
Definition: PointSet.hpp:210
static void maxRegs(unsigned int &maxRegs)
Definition: PointSet.cu.hpp:49
void getFrom(T *entry)
Definition: wrap.hpp:35
static void dispatch(Format_t weight, const QueryPoint< Format_t, NDim > &q, Format_t *g_in, unsigned int pitchIn, Format_t *g_out, unsigned int pitchOut, unsigned int n, unsigned int blocks, unsigned int threads)
static void maxRegs(unsigned int &maxRegs)
Definition: PointSet.cu.hpp:70
void nearest(EuclideanTag, const Format_t q[NDim], Result_t &out)
return k nearest children of q, k is columns of out
Char8_t * copy(const Char8_t *s)
static void dispatch(Format_t weight, const QueryPoint< Format_t, NDim > &q, Format_t *g_in, unsigned int pitchIn, Format_t *g_out, unsigned int pitchOut, unsigned int n, unsigned int blocks, unsigned int threads)
sort should be ascending, i.e. a[i] < a[j], i < j
Definition: Direction.h:40
uint_t pitch() const
Definition: PointSet.h:67