25 #ifndef MPBLOCKS_DUBINS_CURVES_CUDA2_POINTSET_HPP_
26 #define MPBLOCKS_DUBINS_CURVES_CUDA2_POINTSET_HPP_
34 namespace curves_cuda {
36 template <
typename Format_t>
38 if (m_buf)
delete[] m_buf;
40 m_buf =
new float[rows * cols];
46 template <
typename Format_t>
48 : m_sorter(-std::numeric_limits<Format_t>::max(),
49 std::numeric_limits<Format_t>::max()) {
63 }
catch (
const std::exception&
ex) {
64 std::cerr <<
"Error in constructing dubins CUDA PointSet: " << ex.what()
65 <<
"\nNote: point set is unallocated\n";
69 template <
typename Format_t>
74 template <
typename Format_t>
96 template <
typename Format_t>
103 m_g_in = cuda::mallocPitchT<Format_t>(m_pitchIn, m_dbAlloc, 3);
104 std::cout <<
"allocated m_g_in for " << m_dbAlloc
105 <<
" object with pitch: " << m_pitchIn <<
"\n";
107 m_g_out = cuda::mallocPitchT<Format_t>(m_pitchOut, m_dbAlloc2, 2);
108 std::cout <<
"allocated m_g_out for " << m_dbAlloc
109 <<
" object with pitch: " << m_pitchOut <<
"\n";
111 m_g_sorted = cuda::mallocPitchT<Format_t>(m_pitchOut, m_dbAlloc2, 2);
112 std::cout <<
"allocated m_g_sorted for " << m_dbAlloc
113 <<
" object with pitch: " << m_pitchOut <<
"\n";
116 template <
typename Format_t>
121 template <
typename Format_t>
131 template <
typename Format_t>
137 template <
typename Format_t>
143 attr.
getFrom(&kernels::distance_to_set<Format_t>);
144 maxRegs = std::max(maxRegs, (
uint_t)attr.numRegs);
146 attr.
getFrom(&kernels::distance_to_set_with_id<Format_t>);
147 maxRegs = std::max(maxRegs, (
uint_t)attr.numRegs);
149 attr.
getFrom(&kernels::distance_to_set_debug<Format_t>);
150 maxRegs = std::max(maxRegs, (
uint_t)attr.numRegs);
152 attr.
getFrom(&kernels::distance_from_set<Format_t>);
153 maxRegs = std::max(maxRegs, (
uint_t)attr.numRegs);
155 attr.
getFrom(&kernels::distance_from_set_with_id<Format_t>);
156 maxRegs = std::max(maxRegs, (
uint_t)attr.numRegs);
158 attr.
getFrom(&kernels::distance_from_set_debug<Format_t>);
159 maxRegs = std::max(maxRegs, (
uint_t)attr.numRegs);
164 uint_t threadCount_max = (
uint_t)devProps.regsPerBlock / maxRegs;
169 std::min(threadCount_max, (
uint_t)devProps.maxThreadsPerBlock);
172 m_nSM = devProps.multiProcessorCount;
175 m_sorter.config(devId);
178 template <
typename Format_t>
181 if (threads > m_threadsPerBlock) threads = m_threadsPerBlock;
185 template <
typename Format_t>
187 cuda::memcpy2DT(m_g_in + m_dbSize, m_pitchIn, q,
sizeof(Format_t), 1, 3,
188 cudaMemcpyHostToDevice);
194 template <
typename Format_t>
200 computeGrid(blocks, threads);
202 size_t pitchIn = m_pitchIn /
sizeof(Format_t);
203 size_t pitchOut = m_pitchOut /
sizeof(Format_t);
205 const int DEBUG_ROWS = 4 * 11 + 4 * 13;
207 switch (out.
rows()) {
210 kernels::distance_to_set<Format_t><<<blocks,threads>>>(
222 2, cudaMemcpyDeviceToHost);
229 kernels::distance_to_set_with_id<Format_t><<<blocks,threads>>>(
242 2, cudaMemcpyDeviceToHost);
250 cuda::mallocPitchT<Format_t>(pitch, m_dbSize, DEBUG_ROWS);
253 kernels::distance_to_set_debug<Format_t><<<blocks,threads>>>(
258 pitch/
sizeof(Format_t),
265 DEBUG_ROWS, cudaMemcpyDeviceToHost);
274 "Valid output rows is 1,2, or 24";
279 template <
typename Format_t>
285 computeGrid(blocks, threads);
287 size_t pitchIn = m_pitchIn /
sizeof(Format_t);
288 size_t pitchOut = m_pitchOut /
sizeof(Format_t);
290 switch (out.
rows()) {
293 kernels::distance_from_set<Format_t><<<blocks,threads>>>(
308 cudaMemcpyDeviceToHost );
314 kernels::distance_from_set_with_id<Format_t><<<blocks,threads>>>(
326 cudaMemcpyDeviceToHost);
333 Format_t* g_out = cuda::mallocPitchT<Format_t>(pitch, m_dbSize, 24);
336 kernels::distance_from_set_debug<Format_t><<<blocks,threads>>>(
341 pitch/
sizeof(Format_t),
348 cudaMemcpyDeviceToHost);
357 "Valid output rows is 1,2, or 24";
362 template <
typename Format_t>
368 computeGrid(blocks, threads);
370 size_t pitchIn = m_pitchIn /
sizeof(Format_t);
371 size_t pitchOut = m_pitchOut /
sizeof(Format_t);
374 kernels::distance_to_set_with_id<Format_t><<<blocks,threads>>>(
384 Format_t* unsortedKeys = m_g_out;
385 Format_t* unsortedVals = m_g_out + pitchOut;
386 Format_t* sortedKeys = m_g_sorted;
387 Format_t* sortedVals = m_g_sorted + pitchOut;
390 m_sorter.sort(sortedKeys, sortedVals, unsortedKeys, unsortedVals, m_dbSize,
396 cudaMemcpyDeviceToHost);
399 template <
typename Format_t>
405 computeGrid(blocks, threads);
407 size_t pitchIn = m_pitchIn /
sizeof(Format_t);
408 size_t pitchOut = m_pitchOut /
sizeof(Format_t);
411 kernels::distance_from_set_with_id<Format_t><<<blocks,threads>>>(
421 Format_t* unsortedKeys = m_g_out;
422 Format_t* unsortedVals = m_g_out + pitchOut;
423 Format_t* sortedKeys = m_g_sorted;
424 Format_t* sortedVals = m_g_sorted + pitchOut;
427 m_sorter.sort(sortedKeys, sortedVals, unsortedKeys, unsortedVals, m_dbSize,
433 cudaMemcpyDeviceToHost);
436 template <
typename Format_t>
440 computeGrid(blocks, threads);
442 size_t pitchIn = m_pitchIn /
sizeof(Format_t);
443 size_t pitchOut = m_pitchOut /
sizeof(Format_t);
449 kernels::group_distance_to_set<Format_t><<<blocks,threads>>>(
461 cudaMemcpyDeviceToHost);
464 template <
typename Format_t>
468 computeGrid(blocks, threads);
470 size_t pitchIn = m_pitchIn /
sizeof(Format_t);
471 size_t pitchOut = m_pitchOut /
sizeof(Format_t);
477 kernels::group_distance_to_set_with_id<Format_t><<<blocks,threads>>>(
487 Format_t* unsortedKeys = m_g_out;
488 Format_t* unsortedVals = m_g_out + pitchOut;
489 Format_t* sortedKeys = m_g_sorted;
490 Format_t* sortedVals = m_g_sorted + pitchOut;
493 m_sorter.sort(sortedKeys, sortedVals, unsortedKeys, unsortedVals, m_dbSize,
499 cudaMemcpyDeviceToHost);
502 template <
typename Format_t>
504 map[
"distance_to_set"].getFrom(&kernels::distance_to_set<Format_t>);
505 map[
"distance_to_set_with_id"].getFrom(
506 &kernels::distance_to_set_with_id<Format_t>);
507 map[
"distance_to_set_debug"].getFrom(
508 &kernels::distance_to_set_debug<Format_t>);
509 map[
"distance_from_set"].getFrom(&kernels::distance_from_set<Format_t>);
510 map[
"distance_from_set_with_id"].getFrom(
511 &kernels::distance_from_set_with_id<Format_t>);
512 map[
"distance_from_set_debug"].getFrom(
513 &kernels::distance_from_set_debug<Format_t>);
514 map[
"euclidean_to_set"].getFrom(&kernels::group_distance_to_set<Format_t>);
515 map[
"euclidean_to_set_with_id"].getFrom(
516 &kernels::group_distance_to_set<Format_t>);
525 #endif // MPBLOCKS_DUBINS_CURVES_CUDA2_POINTSET_HPP_
void distance_to_set(Format_t q[3], ResultBlock< Format_t > &out)
batch compute distance to point set
Format_t * m_g_sorted
output for sorted results
void allocate(uint_t n)
reallocates device storage for a point set of size n, also resets the database
ExceptionStream< std::runtime_error > ex
void clear(bool clearmem=false)
clear the database and reset input iterator
static void get_fattr(fattrMap_t &)
retrieve kernel attributes into the map, intended only for printing out statistics ...
void deallocate()
deallocate and zero out pointers
std::map< std::string, cuda::FuncAttributes > fattrMap_t
void deviceSynchronize()
blocks the host thread until kernels are done executing
void group_distance_to_set(Format_t q[3], ResultBlock< Format_t > &out)
batch compute euclidean distances
Format_t * m_g_out
kernel output buffer
T intDivideRoundUp(T x, T y)
integer divide with round up
void nearest_parents(Format_t q[3], ResultBlock< Format_t > &out)
return k nearest parents of q
void free(void *devPtr)
wraps cudaFree
void set_r(Format_t r)
set the radius
void set_q(Format_t q_in[3])
void distance_from_set(Format_t q[3], ResultBlock< Format_t > &out)
batch compute distance from point set
void memcpy2DT(T *dst, size_t dpitchBytes, const T *src, size_t spitchBytes, size_t widthObs, size_t height, MemcpyKind kind)
wraps cudaMemcpy2D
void nearest_children(Format_t q[3], ResultBlock< Format_t > &out)
return k nearest children of q
void computeGrid(uint_t &blocks, uint_t &threads)
compute the grid size given the current configuration and size of the point set
Params< Format_t > m_params
query parameters
int insert(Format_t q[3])
insert a new state into the point set, and return it's id
ExceptionStream< std::runtime_error > ex
void memset2DT(T *devPtr, size_t pitchBytes, int value, size_t widthObjs, size_t height)
wraps cudaMemset2D
uint_t m_threadsPerBlock
maximum threads per block
uint_t m_nSM
number of multiprocessors
PointSet(uint_t n=10, Format_t r=1)
void group_distance_neighbors(Format_t q[3], ResultBlock< Format_t > &out)
find k euclidean nearest neighbors
Format_t * m_g_in
kernel input buffer
void allocate(uint_t rows, uint_t cols)
void config()
retreives device properties of the current device, used to calculate kernel peramaters, call once after setting the cuda device and before launching any kernels
sort should be ascending, i.e. a[i] < a[j], i < j
T nextPow2(T x)
returns the smallest power of two that is not less than x
int getDevice()
wraps cudaGetDevice