27 #ifndef MPBLOCKS_CUDANN_POINTSET_CU_HPP_
28 #define MPBLOCKS_CUDANN_POINTSET_CU_HPP_
46 template <
typename Format_t,
unsigned int NDim,
bool Enable>
52 template <
typename Format_t,
unsigned int NDim>
57 typedef unsigned int uint_t;
60 attr.
getFrom( &kernels::se3_distance<Format_t,NDim> );
61 maxRegs = std::max(maxRegs, (uint_t)attr.numRegs);
67 template <
typename Format_t,
unsigned int NDim,
bool Enable>
73 template <
typename Format_t,
unsigned int NDim>
78 typedef unsigned int uint_t;
81 attr.
getFrom( &kernels::r2s1_distance<Format_t,NDim> );
82 maxRegs = std::max(maxRegs, (uint_t)attr.numRegs);
87 template <
typename Format_t,
unsigned int NDim>
96 typedef unsigned int uint;
97 typedef void (*point_dist_fn)(QP,Format_t*,
uint,Format_t*,
uint,
uint);
98 point_dist_fn euclidean_dist_fn =
99 &kernels::euclidean_distance<Format_t,NDim>;
101 attr.
getFrom( euclidean_dist_fn );
102 maxRegs = std::max(maxRegs, (
uint_t)attr.numRegs);
110 uint_t threadCount_max = (
uint_t)devProps.regsPerBlock / maxRegs;
114 m_threadsPerBlock = std::min( threadCount_max,
115 (
uint_t)devProps.maxThreadsPerBlock);
118 m_nSM = devProps.multiProcessorCount;
121 m_sorter.config(devId);
125 template <
typename Format_t,
unsigned int NDim>
129 distance(tag,q,m_dbSize,out);
133 template <
typename Format_t,
unsigned int NDim>
138 computeGrid(blocks,threads);
140 size_t pitchIn = m_pitchIn/
sizeof(Format_t);
141 size_t pitchOut = m_pitchOut/
sizeof(Format_t);
146 kernels::euclidean_distance<Format_t,NDim><<<blocks,threads>>>(
161 cudaMemcpyDeviceToHost );
165 template <
typename Format_t,
unsigned int NDim>
169 nearest(tag,q,m_dbSize,out);
173 template <
typename Format_t,
unsigned int NDim>
178 computeGrid(blocks,threads);
180 size_t pitchIn = m_pitchIn/
sizeof(Format_t);
181 size_t pitchOut = m_pitchOut/
sizeof(Format_t);
186 kernels::euclidean_distance<Format_t,NDim><<<blocks,threads>>>(
196 Format_t* unsortedKeys = m_g_out;
197 Format_t* unsortedVals = m_g_out + pitchOut;
198 Format_t* sortedKeys = m_g_sorted;
199 Format_t* sortedVals = m_g_sorted + pitchOut;
203 sortedKeys, sortedVals,
204 unsortedKeys, unsortedVals,
212 m_g_sorted, m_pitchOut,
214 cudaMemcpyDeviceToHost );
218 template <
typename Format_t,
unsigned int NDim,
bool Enabled>
224 unsigned int pitchIn,
226 unsigned int pitchOut,
229 unsigned int threads )
231 std::cerr <<
"CANNOT CALL SE3 KERNEL IF NDIM != 7\n";
239 template <
typename Format_t,
unsigned int NDim>
246 unsigned int pitchIn,
248 unsigned int pitchOut,
251 unsigned int threads)
254 kernels::se3_distance<Format_t,NDim><<<blocks,threads>>>
255 (weight,q,g_in,pitchIn,g_out,pitchOut,n );
261 template <
typename Format_t,
unsigned int NDim>
265 distance(params,q,m_dbSize,out);
269 template <
typename Format_t,
unsigned int NDim>
274 computeGrid(blocks,threads);
276 size_t pitchIn = m_pitchIn/
sizeof(Format_t);
277 size_t pitchOut = m_pitchOut/
sizeof(Format_t);
280 Format_t w = params.
w;
298 cudaMemcpyDeviceToHost );
302 template <
typename Format_t,
unsigned int NDim>
306 nearest(params,q,m_dbSize,out);
310 template <
typename Format_t,
unsigned int NDim>
315 computeGrid(blocks,threads);
318 size_t pitchIn = m_pitchIn/
sizeof(Format_t);
319 size_t pitchOut = m_pitchOut/
sizeof(Format_t);
322 Format_t w = params.
w;
335 Format_t* unsortedKeys = m_g_out;
336 Format_t* unsortedVals = m_g_out + pitchOut;
337 Format_t* sortedKeys = m_g_sorted;
338 Format_t* sortedVals = m_g_sorted + pitchOut;
342 sortedKeys, sortedVals,
343 unsortedKeys, unsortedVals,
351 m_g_sorted, m_pitchOut,
353 cudaMemcpyDeviceToHost );
363 template <
typename Format_t,
unsigned int NDim,
bool Enabled>
369 unsigned int pitchIn,
371 unsigned int pitchOut,
374 unsigned int threads )
376 std::cerr <<
"CANNOT CALL R2S1 KERNEL IF NDIM < 3 (" << NDim <<
")\n";
384 template <
typename Format_t,
unsigned int NDim>
391 unsigned int pitchIn,
393 unsigned int pitchOut,
396 unsigned int threads)
399 kernels::r2s1_distance<Format_t,NDim><<<blocks,threads>>>
400 (weight,q,g_in,pitchIn,g_out,pitchOut,n );
406 template <
typename Format_t,
unsigned int NDim>
410 distance(params,q,m_dbSize,out);
415 template <
typename Format_t,
unsigned int NDim>
420 computeGrid(blocks,threads);
422 size_t pitchIn = m_pitchIn/
sizeof(Format_t);
423 size_t pitchOut = m_pitchOut/
sizeof(Format_t);
426 Format_t w = params.
w;
444 cudaMemcpyDeviceToHost );
448 template <
typename Format_t,
unsigned int NDim>
452 nearest(params,q,m_dbSize,out);
455 template <
typename Format_t,
unsigned int NDim>
460 computeGrid(blocks,threads);
463 size_t pitchIn = m_pitchIn/
sizeof(Format_t);
464 size_t pitchOut = m_pitchOut/
sizeof(Format_t);
467 Format_t w = params.
w;
480 Format_t* unsortedKeys = m_g_out;
481 Format_t* unsortedVals = m_g_out + pitchOut;
482 Format_t* sortedKeys = m_g_sorted;
483 Format_t* sortedVals = m_g_sorted + pitchOut;
487 sortedKeys, sortedVals,
488 unsortedKeys, unsortedVals,
496 m_g_sorted, m_pitchOut,
498 cudaMemcpyDeviceToHost );
513 #endif // POINTSET_HPP_
static void dispatch(Format_t weight, const QueryPoint< Format_t, NDim > &q, Format_t *g_in, unsigned int pitchIn, Format_t *g_out, unsigned int pitchOut, unsigned int n, unsigned int blocks, unsigned int threads)
void deviceSynchronize()
blocks the host thread until kernels are done executing
void memcpy2DT(T *dst, size_t dpitchBytes, const T *src, size_t spitchBytes, size_t widthObs, size_t height, MemcpyKind kind)
wraps cudaMemcpy2D
void distance(EuclideanTag, const Format_t q[NDim], Result_t &out)
batch compute distance to point set
void config()
retreives device properties of the current device, used to calculate kernel peramaters, call once after setting the cuda device and before launching any kernels
static void maxRegs(unsigned int &maxRegs)
static void maxRegs(unsigned int &maxRegs)
void nearest(EuclideanTag, const Format_t q[NDim], Result_t &out)
return k nearest children of q, k is columns of out
Char8_t * copy(const Char8_t *s)
static void dispatch(Format_t weight, const QueryPoint< Format_t, NDim > &q, Format_t *g_in, unsigned int pitchIn, Format_t *g_out, unsigned int pitchOut, unsigned int n, unsigned int blocks, unsigned int threads)
sort should be ascending, i.e. a[i] < a[j], i < j