dubins_2curves__cuda_2_point_set_8cu_8hpp_source.html

 /*

  *  Copyright (C) 2012 Josh Bialkowski (jbialk@mit.edu)

  *

  *  This file is part of mpblocks.

  *

  *  mpblocks is free software: you can redistribute it and/or modify

  *  it under the terms of the GNU General Public License as published by

  *  the Free Software Foundation, either version 3 of the License, or

  *  (at your option) any later version.

  *

  *  mpblocks is distributed in the hope that it will be useful,

  *  but WITHOUT ANY WARRANTY; without even the implied warranty of

  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

  *  GNU General Public License for more details.

  *

  *  You should have received a copy of the GNU General Public License

  *  along with mpblocks.  If not, see <http://www.gnu.org/licenses/>.

  */

 #ifndef MPBLOCKS_DUBINS_CURVES_CUDA2_POINTSET_HPP_

 #define MPBLOCKS_DUBINS_CURVES_CUDA2_POINTSET_HPP_


 #include <map>

 #include <string>

 #include <mpblocks/util/exception_stream.hpp>


 namespace    mpblocks {

 namespace      dubins {

 namespace curves_cuda {


 /// (Re)allocates the host-side result buffer for `rows * cols` values and
 /// records the new dimensions.
 ///
 /// The replacement buffer is acquired before the old one is released, so if
 /// `new` throws the block keeps its previous buffer and dimensions instead of
 /// being left holding a pointer that has already been deleted.
 ///
 /// @param rows  number of rows the block should hold
 /// @param cols  number of columns the block should hold
 template <typename Format_t>
 void ResultBlock<Format_t>::allocate(uint_t rows, uint_t cols) {
   // widen before multiplying so a large rows*cols cannot wrap in uint_t
   const size_t count = static_cast<size_t>(rows) * static_cast<size_t>(cols);

   // NOTE(review): the element type is hard-coded to float exactly as in the
   // original, although the class is templated on Format_t -- confirm against
   // the declaration of m_buf
   float* fresh = new float[count];

   delete[] m_buf;  // delete[] on a null pointer is a no-op
   m_buf = fresh;

   m_rows = rows;
   m_cols = cols;
 }


 /// Constructs a point set with device storage for `n` states and radius `r`.
 ///
 /// The sorter is given the lowest and highest representable Format_t values.
 /// Failures raised by config() or allocate() are reported on std::cerr and
 /// swallowed, so construction always completes; in that case the point set is
 /// left unallocated.
 ///
 /// @param n  number of states to allocate device storage for
 /// @param r  radius stored in the query parameters
 template <typename Format_t>
 PointSet<Format_t>::PointSet(uint_t n, Format_t r)
     : m_sorter(-std::numeric_limits<Format_t>::max(),
                std::numeric_limits<Format_t>::max()) {
   // null the device pointers first: deallocate() below frees any pointer
   // that is non-null
   m_g_in = 0;
   m_g_out = 0;
   m_g_sorted = 0;

   // the pitches are otherwise only written by allocate(); give them a
   // defined value in case allocation fails and the failure is swallowed below
   m_pitchIn = 0;
   m_pitchOut = 0;

   m_params.r = r;

   m_threadsPerBlock = 0;
   m_nSM = 0;

   // zeroes the size / capacity counters
   deallocate();

   try {
     config();
     allocate(n);
   } catch (const std::exception& ex) {
     std::cerr << "Error in constructing dubins CUDA PointSet: " << ex.what()
               << "\nNote: point set is unallocated\n";
   }
 }


 /// Releases all device storage owned by the point set.
 ///
 /// deallocate() goes through the cuda:: wrappers, which the constructor
 /// already treats as able to throw std::exception. An exception must not
 /// escape a destructor, so failures are reported on std::cerr and dropped.
 template <typename Format_t>
 PointSet<Format_t>::~PointSet() {
   try {
     deallocate();
   } catch (const std::exception& ex) {
     std::cerr << "Error in destroying dubins CUDA PointSet: " << ex.what()
               << "\n";
   } catch (...) {
     std::cerr << "Error in destroying dubins CUDA PointSet: unknown error\n";
   }
 }


 /// Frees every device buffer that is currently held, nulls its pointer, and
 /// resets the size and capacity counters to zero.
 template <typename Format_t>
 void PointSet<Format_t>::deallocate() {
   // visit the buffers in the order input, output, sorted output
   Format_t** const buffers[] = {&m_g_in, &m_g_out, &m_g_sorted};
   const unsigned int nBuffers = sizeof(buffers) / sizeof(buffers[0]);

   for (unsigned int i = 0; i < nBuffers; ++i) {
     Format_t*& devPtr = *buffers[i];
     if (devPtr) {
       cuda::free(devPtr);
       devPtr = 0;
     }
   }

   // nothing is stored and nothing is allocated any more
   m_dbSize = 0;
   m_dbAlloc2 = 0;
   m_dbAlloc = 0;
 }


 /// Reallocates device storage for a point set of `n` states; any existing
 /// storage is released first and the database is reset.
 ///
 /// Three pitched buffers are created: the input buffer (3 rows of `n`
 /// columns) and the output and sorted-output buffers (2 rows each, with the
 /// column count rounded up by cuda::nextPow2). Each allocation is logged to
 /// std::cout.
 ///
 /// If any allocation throws, everything acquired so far is released again so
 /// the point set is left fully unallocated, and the exception is rethrown.
 ///
 /// @param n  number of states to make room for
 template <typename Format_t>
 void PointSet<Format_t>::allocate(uint_t n) {
   deallocate();

   try {
     m_dbAlloc = n;
     m_dbAlloc2 = cuda::nextPow2(n);

     m_g_in = cuda::mallocPitchT<Format_t>(m_pitchIn, m_dbAlloc, 3);
     std::cout << "allocated m_g_in for " << m_dbAlloc
               << " object with pitch: " << m_pitchIn << "\n";

     // the output buffers are m_dbAlloc2 columns wide, so report that count
     m_g_out = cuda::mallocPitchT<Format_t>(m_pitchOut, m_dbAlloc2, 2);
     std::cout << "allocated m_g_out for " << m_dbAlloc2
               << " object with pitch: " << m_pitchOut << "\n";

     m_g_sorted = cuda::mallocPitchT<Format_t>(m_pitchOut, m_dbAlloc2, 2);
     std::cout << "allocated m_g_sorted for " << m_dbAlloc2
               << " object with pitch: " << m_pitchOut << "\n";
   } catch (...) {
     // do not keep a half-allocated set with non-zero capacity counters
     deallocate();
     throw;
   }
 }


 /// Stores a new radius in the query parameters.
 ///
 /// @param r  the radius to store
 template <typename Format_t>
 void PointSet<Format_t>::set_r(Format_t r) {
   this->m_params.r = r;
 }


 /// Empties the database by resetting the stored-point count, optionally
 /// zeroing the device buffers as well.
 ///
 /// @param clearmem  when true, the input, output and sorted-output buffers
 ///                  are also memset to zero over their allocated extent
 template <typename Format_t>
 void PointSet<Format_t>::clear(bool clearmem) {
   m_dbSize = 0;

   // the common case: forget the points but leave device memory untouched
   if (!clearmem) return;

   cuda::memset2DT(m_g_in, m_pitchIn, 0, m_dbAlloc, 3);
   cuda::memset2DT(m_g_out, m_pitchOut, 0, m_dbAlloc2, 2);
   cuda::memset2DT(m_g_sorted, m_pitchOut, 0, m_dbAlloc2, 2);
 }


 /// Configures the point set for whichever device cuda::getDevice() reports,
 /// by forwarding that device id to config(int).
 template <typename Format_t>
 void PointSet<Format_t>::config() {
   this->config(cuda::getDevice());
 }


 /// Reads the properties of device `devId` and derives the launch limits used
 /// by computeGrid(), then configures the sorter for the same device.
 ///
 /// m_threadsPerBlock becomes the smaller of the device's per-block thread
 /// limit and the number of threads whose registers fit in one block, using
 /// the largest register count among all kernels this class launches with the
 /// shared grid. The group_distance_* kernels are included because they are
 /// launched with the same computeGrid() result as the Dubins kernels.
 ///
 /// @param devId  id of the CUDA device to query
 template <typename Format_t>
 void PointSet<Format_t>::config(int devId) {
   cuda::DeviceProp devProps(devId);
   cuda::FuncAttributes attr;
   uint_t maxRegs = 0;

   attr.getFrom(&kernels::distance_to_set<Format_t>);
   maxRegs = std::max(maxRegs, (uint_t)attr.numRegs);

   attr.getFrom(&kernels::distance_to_set_with_id<Format_t>);
   maxRegs = std::max(maxRegs, (uint_t)attr.numRegs);

   attr.getFrom(&kernels::distance_to_set_debug<Format_t>);
   maxRegs = std::max(maxRegs, (uint_t)attr.numRegs);

   attr.getFrom(&kernels::distance_from_set<Format_t>);
   maxRegs = std::max(maxRegs, (uint_t)attr.numRegs);

   attr.getFrom(&kernels::distance_from_set_with_id<Format_t>);
   maxRegs = std::max(maxRegs, (uint_t)attr.numRegs);

   attr.getFrom(&kernels::distance_from_set_debug<Format_t>);
   maxRegs = std::max(maxRegs, (uint_t)attr.numRegs);

   attr.getFrom(&kernels::group_distance_to_set<Format_t>);
   maxRegs = std::max(maxRegs, (uint_t)attr.numRegs);

   attr.getFrom(&kernels::group_distance_to_set_with_id<Format_t>);
   maxRegs = std::max(maxRegs, (uint_t)attr.numRegs);

   // start from the per-block thread limit of the architecture
   uint_t threadCount_max = (uint_t)devProps.maxThreadsPerBlock;

   // the registers available to a block, divided by the registers each thread
   // needs, bounds the block size as well; skip the bound (rather than divide
   // by zero) if no register usage was reported
   if (maxRegs > 0) {
     threadCount_max =
         std::min(threadCount_max, (uint_t)devProps.regsPerBlock / maxRegs);
   }

   m_threadsPerBlock = threadCount_max;

   // get the number of multiprocessors
   m_nSM = devProps.multiProcessorCount;

   // configure the sorter
   m_sorter.config(devId);
 }


 /// Computes the launch configuration for the current point count.
 ///
 /// The points are spread evenly over the multiprocessors, the block size is
 /// capped at m_threadsPerBlock, and enough blocks are requested to cover
 /// every stored point. Both outputs are always at least 1, so an empty set or
 /// an unconfigured device (m_nSM or m_threadsPerBlock still 0) no longer
 /// causes an integer division by zero.
 ///
 /// NOTE(review): with an empty set this yields a 1x1 launch; this assumes the
 /// kernels ignore threads beyond the point count they are passed -- confirm.
 ///
 /// @param[out] blocks   number of blocks to launch
 /// @param[out] threads  number of threads per block
 template <typename Format_t>
 void PointSet<Format_t>::computeGrid(uint_t& blocks, uint_t& threads) {
   // treat an unconfigured device as having a single multiprocessor
   const uint_t nSM = (m_nSM > 0) ? m_nSM : 1;

   threads = cuda::intDivideRoundUp(m_dbSize, nSM);
   if (threads > m_threadsPerBlock) threads = m_threadsPerBlock;
   if (threads == 0) threads = 1;  // empty set or m_threadsPerBlock == 0

   blocks = cuda::intDivideRoundUp(m_dbSize, threads);
   if (blocks == 0) blocks = 1;  // keep the launch configuration valid
 }


 /// Inserts a new state into the point set and returns its id.
 ///
 /// The three components of `q` are copied from the host into column
 /// m_dbSize of the three rows of the input buffer. The id is the column
 /// index the state was written to.
 ///
 /// The set is checked for free capacity first: without that check a full (or
 /// unallocated) set would be written past its allocated columns.
 ///
 /// @param q  host array holding the three components of the state
 /// @return   id of the inserted state; -1 if the set is full and the
 ///           exception stream did not throw
 template <typename Format_t>
 int PointSet<Format_t>::insert(Format_t q[3]) {
   if (m_dbSize >= m_dbAlloc) {
     // NOTE(review): utility::ex() is presumed to throw std::runtime_error
     // once the message is complete, as at its other uses in this file
     utility::ex() << "PointSet::insert: "
                      "point set is full, cannot insert";
     return -1;
   }

   // source is 3 contiguous values on the host; destination is one column
   // spanning 3 pitched rows on the device
   cuda::memcpy2DT(m_g_in + m_dbSize, m_pitchIn, q, sizeof(Format_t), 1, 3,
                   cudaMemcpyHostToDevice);

   m_dbSize++;
   return m_dbSize - 1;
 }


 /// Batch-computes the distance to the point set for query state `q`.
 ///
 /// The number of rows in `out` selects the kernel:
 ///   - 1 row:  kernels::distance_to_set, one row is copied back
 ///   - 2 rows: kernels::distance_to_set_with_id, two rows are copied back
 ///   - 96 rows (4*11 + 4*13): kernels::distance_to_set_debug, run on a
 ///     temporary device buffer that is freed before returning
 /// Any other row count is reported through utility::ex().
 ///
 /// In every case `out.cols()` columns are copied to the host.
 ///
 /// @param q    host array holding the three components of the query state
 /// @param out  host result block; its row count picks the kernel
 template <typename Format_t>
 void PointSet<Format_t>::distance_to_set(Format_t q[3],
                                          ResultBlock<Format_t>& out) {
   m_params.set_q(q);

   uint_t blocks, threads;
   computeGrid(blocks, threads);

   // the kernels take pitches in elements, the copy wrappers take bytes
   size_t pitchIn = m_pitchIn / sizeof(Format_t);
   size_t pitchOut = m_pitchOut / sizeof(Format_t);

   const int DEBUG_ROWS = 4 * 11 + 4 * 13;

   switch (out.rows()) {
     case 1: {
       // call the kernel
       kernels::distance_to_set<Format_t><<<blocks,threads>>>(
               m_params,
               m_g_in,
               pitchIn,
               m_g_out,
               pitchOut,
               m_dbSize
               );
       cuda::deviceSynchronize();

       // retrieve results: `out` has a single row, so copy exactly one row;
       // copying two would write past the end of the host buffer
       cuda::memcpy2DT(out.ptr(), out.pitch(), m_g_out, m_pitchOut, out.cols(),
                       1, cudaMemcpyDeviceToHost);
       break;
     }

     case 2: {
       // call the kernel
       kernels::distance_to_set_with_id<Format_t><<<blocks,threads>>>(
               m_params,
               m_g_in,
               pitchIn,
               m_g_out,
               pitchOut,
               m_dbSize
               );
       cuda::deviceSynchronize();

       // retrieve results
       cuda::memcpy2DT(out.ptr(), out.pitch(), m_g_out, m_pitchOut, out.cols(),
                       2, cudaMemcpyDeviceToHost);
       break;
     }

     case DEBUG_ROWS: {
       // allocate output storage
       size_t pitch = 1;
       Format_t* g_out =
           cuda::mallocPitchT<Format_t>(pitch, m_dbSize, DEBUG_ROWS);

       try {
         // call the kernel
         kernels::distance_to_set_debug<Format_t><<<blocks,threads>>>(
                 m_params,
                 m_g_in,
                 pitchIn,
                 g_out,
                 pitch/sizeof(Format_t),
                 m_dbSize
                 );
         cuda::deviceSynchronize();

         // retrieve results
         cuda::memcpy2DT(out.ptr(), out.pitch(), g_out, pitch, out.cols(),
                         DEBUG_ROWS, cudaMemcpyDeviceToHost);
       } catch (...) {
         // do not leak the temporary buffer if a wrapper throws
         cuda::free(g_out);
         throw;
       }

       // free output storage
       cuda::free(g_out);
       break;
     }

     default:
       // 96 == DEBUG_ROWS; the message previously advertised 24
       utility::ex() << "PointSet::distance_to_set: "
                        "Valid output rows is 1,2, or 96";
       break;
   }
 }


 /// Batch-computes the distance from the point set for query state `q`.
 ///
 /// The number of rows in `out` selects the kernel:
 ///   - 1 row:   kernels::distance_from_set, one row is copied back
 ///   - 2 rows:  kernels::distance_from_set_with_id, two rows are copied back
 ///   - 24 rows: kernels::distance_from_set_debug, run on a temporary device
 ///     buffer that is freed before returning
 /// Any other row count is reported through utility::ex().
 ///
 /// In every case `out.cols()` columns are copied to the host.
 ///
 /// NOTE(review): distance_to_set uses 4*11 + 4*13 = 96 debug rows while this
 /// function uses 24 -- confirm which count distance_from_set_debug writes.
 ///
 /// @param q    host array holding the three components of the query state
 /// @param out  host result block; its row count picks the kernel
 template <typename Format_t>
 void PointSet<Format_t>::distance_from_set(Format_t q[3],
                                            ResultBlock<Format_t>& out) {
   m_params.set_q(q);

   uint_t blocks, threads;
   computeGrid(blocks, threads);

   // the kernels take pitches in elements, the copy wrappers take bytes
   size_t pitchIn = m_pitchIn / sizeof(Format_t);
   size_t pitchOut = m_pitchOut / sizeof(Format_t);

   switch (out.rows()) {
     case 1: {
       // call the kernel
       kernels::distance_from_set<Format_t><<<blocks,threads>>>(
               m_params,
               m_g_in,
               pitchIn,
               m_g_out,
               pitchOut,
               m_dbSize
               );
       cuda::deviceSynchronize();

       // retrieve results: `out` has a single row, so copy exactly one row;
       // copying two would write past the end of the host buffer
       cuda::memcpy2DT(out.ptr(), out.pitch(), m_g_out, m_pitchOut, out.cols(),
                       1, cudaMemcpyDeviceToHost);
       break;
     }

     case 2: {
       // call the kernel
       kernels::distance_from_set_with_id<Format_t><<<blocks,threads>>>(
               m_params,
               m_g_in,
               pitchIn,
               m_g_out,
               pitchOut,
               m_dbSize
               );
       cuda::deviceSynchronize();

       // retrieve results
       cuda::memcpy2DT(out.ptr(), out.pitch(), m_g_out, m_pitchOut, out.cols(),
                       2, cudaMemcpyDeviceToHost);
       break;
     }

     case 24: {
       // allocate output storage
       size_t pitch = 1;
       Format_t* g_out = cuda::mallocPitchT<Format_t>(pitch, m_dbSize, 24);

       try {
         // call the kernel
         kernels::distance_from_set_debug<Format_t><<<blocks,threads>>>(
                 m_params,
                 m_g_in,
                 pitchIn,
                 g_out,
                 pitch/sizeof(Format_t),
                 m_dbSize
                 );
         cuda::deviceSynchronize();

         // retrieve results
         cuda::memcpy2DT(out.ptr(), out.pitch(), g_out, pitch, out.cols(), 24,
                         cudaMemcpyDeviceToHost);
       } catch (...) {
         // do not leak the temporary buffer if a wrapper throws
         cuda::free(g_out);
         throw;
       }

       // free output storage
       cuda::free(g_out);
       break;
     }

     default:
       utility::ex() << "PointSet::distance_from_set: "
                        "Valid output rows is 1,2, or 24";
       break;
   }
 }


 /// Finds the nearest children of query state `q`.
 ///
 /// Runs kernels::distance_to_set_with_id over the stored points, sorts the
 /// two output rows ascending by the first row, and copies the leading
 /// `out.cols()` columns of both sorted rows to the host.
 ///
 /// @param q    host array holding the three components of the query state
 /// @param out  host result block receiving two rows of `out.cols()` entries
 ///             (presumably distances, then ids -- confirm against the kernel)
 template <typename Format_t>
 void PointSet<Format_t>::nearest_children(Format_t q[3],
                                           ResultBlock<Format_t>& out) {
   m_params.set_q(q);

   uint_t nBlocks, nThreads;
   computeGrid(nBlocks, nThreads);

   // row strides in elements rather than bytes
   const size_t strideIn = m_pitchIn / sizeof(Format_t);
   const size_t strideOut = m_pitchOut / sizeof(Format_t);

   // evaluate the distance kernel for every stored point
   kernels::distance_to_set_with_id<Format_t><<<nBlocks, nThreads>>>(
       m_params, m_g_in, strideIn, m_g_out, strideOut, m_dbSize);
   cuda::deviceSynchronize();

   // row 0 of each buffer holds the sort keys, row 1 the attached values
   Format_t* keysIn = m_g_out;
   Format_t* valsIn = keysIn + strideOut;
   Format_t* keysOut = m_g_sorted;
   Format_t* valsOut = keysOut + strideOut;

   // order the results by key, smallest first
   m_sorter.sort(keysOut, valsOut, keysIn, valsIn, m_dbSize,
                 cuda::bitonic::Ascending);
   cuda::deviceSynchronize();

   // the first out.cols() columns of the sorted buffer are the k smallest
   cuda::memcpy2DT(out.ptr(), out.pitch(), m_g_sorted, m_pitchOut, out.cols(),
                   2, cudaMemcpyDeviceToHost);
 }


 /// Finds the nearest parents of query state `q`.
 ///
 /// Runs kernels::distance_from_set_with_id over the stored points, sorts the
 /// two output rows ascending by the first row, and copies the leading
 /// `out.cols()` columns of both sorted rows to the host.
 ///
 /// @param q    host array holding the three components of the query state
 /// @param out  host result block receiving two rows of `out.cols()` entries
 ///             (presumably distances, then ids -- confirm against the kernel)
 template <typename Format_t>
 void PointSet<Format_t>::nearest_parents(Format_t q[3],
                                          ResultBlock<Format_t>& out) {
   m_params.set_q(q);

   uint_t nBlocks, nThreads;
   computeGrid(nBlocks, nThreads);

   // row strides in elements rather than bytes
   const size_t strideIn = m_pitchIn / sizeof(Format_t);
   const size_t strideOut = m_pitchOut / sizeof(Format_t);

   // evaluate the distance kernel for every stored point
   kernels::distance_from_set_with_id<Format_t><<<nBlocks, nThreads>>>(
       m_params, m_g_in, strideIn, m_g_out, strideOut, m_dbSize);
   cuda::deviceSynchronize();

   // row 0 of each buffer holds the sort keys, row 1 the attached values
   Format_t* keysIn = m_g_out;
   Format_t* valsIn = keysIn + strideOut;
   Format_t* keysOut = m_g_sorted;
   Format_t* valsOut = keysOut + strideOut;

   // order the results by key, smallest first
   m_sorter.sort(keysOut, valsOut, keysIn, valsIn, m_dbSize,
                 cuda::bitonic::Ascending);
   cuda::deviceSynchronize();

   // the first out.cols() columns of the sorted buffer are the k smallest
   cuda::memcpy2DT(out.ptr(), out.pitch(), m_g_sorted, m_pitchOut, out.cols(),
                   2, cudaMemcpyDeviceToHost);
 }


 /// Batch-computes Euclidean distances between query state `q` and the stored
 /// points.
 ///
 /// Uses a local EuclideanParams built from `q` (the member query parameters
 /// are not touched), runs kernels::group_distance_to_set, and copies one row
 /// of `out.cols()` values to the host.
 ///
 /// @param q    host array holding the three components of the query state
 /// @param out  host result block receiving one row of `out.cols()` entries
 template <typename Format_t>
 void PointSet<Format_t>::group_distance_to_set(Format_t q[3],
                                                ResultBlock<Format_t>& out) {
   // query parameters for the Euclidean kernels
   EuclideanParams<Format_t> query;
   query.set_q(q);

   uint_t nBlocks, nThreads;
   computeGrid(nBlocks, nThreads);

   // row strides in elements rather than bytes
   const size_t strideIn = m_pitchIn / sizeof(Format_t);
   const size_t strideOut = m_pitchOut / sizeof(Format_t);

   // evaluate the distance kernel for every stored point
   kernels::group_distance_to_set<Format_t><<<nBlocks, nThreads>>>(
       query, m_g_in, strideIn, m_g_out, strideOut, m_dbSize);
   cuda::deviceSynchronize();

   // only the first output row is fetched
   cuda::memcpy2DT(out.ptr(), out.pitch(), m_g_out, m_pitchOut, out.cols(), 1,
                   cudaMemcpyDeviceToHost);
 }


 /// Finds the Euclidean nearest neighbors of query state `q`.
 ///
 /// Uses a local EuclideanParams built from `q`, runs
 /// kernels::group_distance_to_set_with_id, sorts the two output rows
 /// ascending by the first row, and copies the leading `out.cols()` columns
 /// of both sorted rows to the host.
 ///
 /// @param q    host array holding the three components of the query state
 /// @param out  host result block receiving two rows of `out.cols()` entries
 ///             (presumably distances, then ids -- confirm against the kernel)
 template <typename Format_t>
 void PointSet<Format_t>::group_distance_neighbors(Format_t q[3],
                                                   ResultBlock<Format_t>& out) {
   // query parameters for the Euclidean kernels
   EuclideanParams<Format_t> query;
   query.set_q(q);

   uint_t nBlocks, nThreads;
   computeGrid(nBlocks, nThreads);

   // row strides in elements rather than bytes
   const size_t strideIn = m_pitchIn / sizeof(Format_t);
   const size_t strideOut = m_pitchOut / sizeof(Format_t);

   // evaluate the distance kernel for every stored point
   kernels::group_distance_to_set_with_id<Format_t><<<nBlocks, nThreads>>>(
       query, m_g_in, strideIn, m_g_out, strideOut, m_dbSize);
   cuda::deviceSynchronize();

   // row 0 of each buffer holds the sort keys, row 1 the attached values
   Format_t* keysIn = m_g_out;
   Format_t* valsIn = keysIn + strideOut;
   Format_t* keysOut = m_g_sorted;
   Format_t* valsOut = keysOut + strideOut;

   // order the results by key, smallest first
   m_sorter.sort(keysOut, valsOut, keysIn, valsIn, m_dbSize,
                 cuda::bitonic::Ascending);
   cuda::deviceSynchronize();

   // the first out.cols() columns of the sorted buffer are the k smallest
   cuda::memcpy2DT(out.ptr(), out.pitch(), m_g_sorted, m_pitchOut, out.cols(),
                   2, cudaMemcpyDeviceToHost);
 }


 /// Retrieves the function attributes of every kernel this class launches
 /// into `map`, keyed by a descriptive name.
 ///
 /// Entries are created on demand by operator[]; existing entries with the
 /// same key are overwritten by getFrom().
 ///
 /// @param map  map from kernel name to its attributes, filled in place
 template <typename Format_t>
 void PointSet<Format_t>::get_fattr(fattrMap_t& map) {
   map["distance_to_set"].getFrom(&kernels::distance_to_set<Format_t>);
   map["distance_to_set_with_id"].getFrom(
       &kernels::distance_to_set_with_id<Format_t>);
   map["distance_to_set_debug"].getFrom(
       &kernels::distance_to_set_debug<Format_t>);
   map["distance_from_set"].getFrom(&kernels::distance_from_set<Format_t>);
   map["distance_from_set_with_id"].getFrom(
       &kernels::distance_from_set_with_id<Format_t>);
   map["distance_from_set_debug"].getFrom(
       &kernels::distance_from_set_debug<Format_t>);
   map["euclidean_to_set"].getFrom(&kernels::group_distance_to_set<Format_t>);

   // the "_with_id" entry must describe the "_with_id" kernel; it previously
   // repeated the attributes of the plain group_distance_to_set kernel
   map["euclidean_to_set_with_id"].getFrom(
       &kernels::group_distance_to_set_with_id<Format_t>);

   // Sorter_t::get_fattr(map);
 }


 } // curves

 } // dubins

 } // mpblocks


 #endif // MPBLOCKS_DUBINS_CURVES_CUDA2_POINTSET_HPP_

mpblocks::dubins::curves_cuda::PointSet::distance_to_set
void distance_to_set(Format_t q[3], ResultBlock< Format_t > &out)
batch compute distance to point set
Definition: PointSet.cu.hpp:195

mpblocks::dubins::curves_cuda::PointSet::m_g_sorted
Format_t * m_g_sorted
output for sorted results
Definition: PointSet.h:101

mpblocks::dubins::curves_cuda::PointSet::allocate
void allocate(uint_t n)
reallocates device storage for a point set of size n, also resets the database
Definition: PointSet.cu.hpp:97

mpblocks::cuda::DeviceProp
Definition: wrap.h:48

mpblocks::dubins::curves_cuda::ResultBlock::rows
uint_t rows() const
Definition: PointSet.h:69

mpblocks::dubins::curves_cuda::PointSet::~PointSet
~PointSet()
Definition: PointSet.cu.hpp:70

mpblocks::utility::ex
ExceptionStream< std::runtime_error > ex
Definition: exception_stream.hpp:27

mpblocks::dubins::curves_cuda::PointSet::clear
void clear(bool clearmem=false)
clear the database and reset input iterator
Definition: PointSet.cu.hpp:122

mpblocks::dubins::curves_cuda::PointSet::get_fattr
static void get_fattr(fattrMap_t &)
retrieve kernel attributes into the map, intended only for printing out statistics ...
Definition: PointSet.cu.hpp:503

mpblocks::dubins::curves_cuda::PointSet::deallocate
void deallocate()
deallocate and zero out pointers
Definition: PointSet.cu.hpp:75

mpblocks::dubins::curves_cuda::PointSet::fattrMap_t
std::map< std::string, cuda::FuncAttributes > fattrMap_t
Definition: PointSet.h:88

mpblocks::cuda::deviceSynchronize
void deviceSynchronize()
blocks the host thread until kernels are done executing

mpblocks::cuda::bitonic::uint_t
unsigned int uint_t
Definition: kernels.cu.hpp:40

mpblocks::dubins::curves_cuda::PointSet::group_distance_to_set
void group_distance_to_set(Format_t q[3], ResultBlock< Format_t > &out)
batch compute euclidean distances
Definition: PointSet.cu.hpp:437

mpblocks::dubins::curves_cuda::ResultBlock::cols
uint_t cols() const
Definition: PointSet.h:70

mpblocks::dubins::curves_cuda::PointSet::m_g_out
Format_t * m_g_out
kernel output buffer
Definition: PointSet.h:100

mpblocks::cuda::intDivideRoundUp
T intDivideRoundUp(T x, T y)
integer divide with round up
Definition: powersOfTwo.h:270

mpblocks::dubins::curves_cuda::PointSet::nearest_parents
void nearest_parents(Format_t q[3], ResultBlock< Format_t > &out)
return k nearest parents of q
Definition: PointSet.cu.hpp:400

mpblocks::cuda::free
void free(void *devPtr)
wraps cudaFree

mpblocks::dubins::curves_cuda::EuclideanParams
Definition: Params.h:54

mpblocks::dubins::curves_cuda::PointSet::set_r
void set_r(Format_t r)
set the radius
Definition: PointSet.cu.hpp:117

mpblocks::dubins::curves_cuda::EuclideanParams::set_q
void set_q(Format_t q_in[3])
Definition: Params.h:58

mpblocks::dubins::curves_cuda::PointSet::distance_from_set
void distance_from_set(Format_t q[3], ResultBlock< Format_t > &out)
batch compute distance from point set
Definition: PointSet.cu.hpp:280

mpblocks::dubins::curves_cuda::PointSet::uint_t
unsigned int uint_t
Definition: PointSet.h:87

mpblocks::cuda::memcpy2DT
void memcpy2DT(T *dst, size_t dpitchBytes, const T *src, size_t spitchBytes, size_t widthObs, size_t height, MemcpyKind kind)
wraps cudaMemcpy2D
Definition: wrap.hpp:62

mpblocks::dubins::curves_cuda::PointSet::nearest_children
void nearest_children(Format_t q[3], ResultBlock< Format_t > &out)
return k nearest children of q
Definition: PointSet.cu.hpp:363

mpblocks::dubins::curves_cuda::PointSet::computeGrid
void computeGrid(uint_t &blocks, uint_t &threads)
compute the grid size given the current configuration and size of the point set
Definition: PointSet.cu.hpp:179

mpblocks::dubins::curves_cuda::PointSet::m_params
Params< Format_t > m_params
query parameters
Definition: PointSet.h:92

mpblocks::dubins::curves_cuda::PointSet::insert
int insert(Format_t q[3])
insert a new state into the point set, and return its id
Definition: PointSet.cu.hpp:186

mpblocks::dubins::curves_cuda::ResultBlock::ptr
Format_t * ptr() const
Definition: PointSet.h:68

mpblocks::cuda::FuncAttributes::getFrom
void getFrom(T *entry)
Definition: wrap.hpp:35

mpblocks::dubins::curves_cuda::ResultBlock
Definition: PointSet.h:40

mpblocks::cuda::ex
ExceptionStream< std::runtime_error > ex
Definition: ExceptionStream.h:119

mpblocks::cuda::memset2DT
void memset2DT(T *devPtr, size_t pitchBytes, int value, size_t widthObjs, size_t height)
wraps cudaMemset2D
Definition: wrap.hpp:88

mpblocks::dubins::curves_cuda::PointSet::m_threadsPerBlock
uint_t m_threadsPerBlock
maximum threads per block
Definition: PointSet.h:105

mpblocks::dubins::curves_cuda::PointSet::m_nSM
uint_t m_nSM
number of multiprocessors
Definition: PointSet.h:106

mpblocks::dubins::curves_cuda::PointSet::PointSet
PointSet(uint_t n=10, Format_t r=1)
Definition: PointSet.cu.hpp:47

exception_stream.hpp

mpblocks::dubins::curves_cuda::PointSet::group_distance_neighbors
void group_distance_neighbors(Format_t q[3], ResultBlock< Format_t > &out)
find k euclidean nearest neighbors
Definition: PointSet.cu.hpp:465

mpblocks::cuda::FuncAttributes
Definition: wrap.h:38

mpblocks::dubins::curves_cuda::PointSet::m_g_in
Format_t * m_g_in
kernel input buffer
Definition: PointSet.h:99

mpblocks::dubins::curves_cuda::ResultBlock::allocate
void allocate(uint_t rows, uint_t cols)
Definition: PointSet.cu.hpp:37

mpblocks::dubins::curves_cuda::PointSet::config
void config()
retrieves device properties of the current device, used to calculate kernel parameters; call once after setting the cuda device and before launching any kernels
Definition: PointSet.cu.hpp:132

mpblocks::dubins::curves_cuda::ResultBlock::uint_t
unsigned int uint_t
Definition: PointSet.h:43

mpblocks::cuda::bitonic::Ascending
sort should be ascending, i.e. a[i] < a[j], i < j
Definition: Direction.h:40

mpblocks::cuda::nextPow2
T nextPow2(T x)
returns the smallest power of two that is not less than x
Definition: powersOfTwo.h:102

mpblocks::cuda::getDevice
int getDevice()
wraps cudaGetDevice

mpblocks::dubins::curves_cuda::ResultBlock::pitch
uint_t pitch() const
Definition: PointSet.h:71