cheshirekow  v0.1.0
se3.cu.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2012 Josh Bialkowski (jbialk@mit.edu)
3  *
4  * This file is part of openbook.
5  *
6  * openbook is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * openbook is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with openbook. If not, see <http://www.gnu.org/licenses/>.
18  */
27 #ifndef MPBLOCKS_CUDANN_KERNELS_SE3_CU_HPP_
28 #define MPBLOCKS_CUDANN_KERNELS_SE3_CU_HPP_
29 
30 #include <mpblocks/cuda/linalg2.h>
33 
34 namespace mpblocks {
35 namespace cudaNN {
36 namespace kernels {
37 
38 namespace linalg = cuda::linalg2;
39 
40 
// Kernel se3_pseudo_distance (its definition is attributed to se3.cu.hpp:42).
// NOTE(review): listing lines 42, 44 and 69-70 are absent from this extract,
// so the declarator, the parameter `q`, and the declarations of t0/t1/q0/q1
// are not visible here.
//
// Each thread handles one data point idx = blockIdx.x*blockDim.x + threadIdx.x:
//  - q.data[0..2] -> t0 and q.data[3..6] -> q0              (the query)
//  - g_in[k*pitchIn + idx], k = 0..2 -> t1 and k = 3..6 -> q1 (the data point)
//  - d = norm_squared(t1-t0) + weight*(1 - dot(q0,q1)^2)
//  - g_out[0*pitchOut + idx] = d and g_out[1*pitchOut + idx] = idx
// Nothing is read or written unless NDim >= 7.
//
// NOTE(review): threads with idx >= n return before the __syncthreads() calls
// below, so a partially filled last block reaches those barriers with only
// some of its threads -- confirm whether the barriers are needed at all.
41 template< typename Format_t, unsigned int NDim>
43  Format_t weight,
45  Format_t* g_in,
46  unsigned int pitchIn,
47  Format_t* g_out,
48  unsigned int pitchOut,
49  unsigned int n
50  )
51 {
52  using namespace linalg;
53 
54  int threadId = threadIdx.x;
55  int blockId = blockIdx.x;
56  int N = blockDim.x;
57 
58  // which data point we work on
59  int idx = blockId * N + threadId;
60 
61  // valid data points are indexed 0..n-1, so a thread whose idx is greater
62  // than or equal to n is a left-over thread and just bails
63  // @todo is this OK with non-power of
64  // two array sizes and the fact that we syncthreads after this point?
65  if( idx >= n )
66  return;
67 
68  // compose the query object
71 
72  // read in the query point q0, no synchronization between reads
73  if( NDim >= 7 )
74  {
75  set<0>(t0) = q.data[0];
76  set<1>(t0) = q.data[1];
77  set<2>(t0) = q.data[2];
78  set<0>(q0) = q.data[3];
79  set<1>(q0) = q.data[4];
80  set<2>(q0) = q.data[5];
81  set<3>(q0) = q.data[6];
82 
83  set<0>(t1) = g_in[0*pitchIn + idx];
84  __syncthreads();
85  set<1>(t1) = g_in[1*pitchIn + idx];
86  __syncthreads();
87  set<2>(t1) = g_in[2*pitchIn + idx];
88  __syncthreads();
89  set<0>(q1) = g_in[3*pitchIn + idx];
90  __syncthreads();
91  set<1>(q1) = g_in[4*pitchIn + idx];
92  __syncthreads();
93  set<2>(q1) = g_in[5*pitchIn + idx];
94  __syncthreads();
95  set<3>(q1) = g_in[6*pitchIn + idx];
96  __syncthreads();
97 
98  // now compute the distance for this point
99  Format_t dq = linalg::dot(q0,q1);
100  Format_t d = linalg::norm_squared(t1-t0) + weight*(1-dq*dq);
101  __syncthreads();
102  g_out[0*pitchOut + idx] = d;
103  __syncthreads();
104  g_out[1*pitchOut + idx] = idx;
105  }
106 }
107 
108 
109 
// Kernel se3_distance (its definition is attributed to se3.cu.hpp:111).
// NOTE(review): listing lines 111, 113 and 138-139 are absent from this
// extract, so the declarator, the parameter `q`, and the declarations of
// t0/t1/q0/q1 are not visible here.
//
// Each thread handles one data point idx = blockIdx.x*blockDim.x + threadIdx.x:
//  - q.data[0..2] -> t0 and q.data[3..6] -> q0              (the query)
//  - g_in[k*pitchIn + idx], k = 0..2 -> t1 and k = 3..6 -> q1 (the data point)
//  - arg = 2*dot(q0,q1)^2 - 1, clamped before being passed to acosf
//  - d = sqrtf(norm_squared(t1-t0)) + weight*acosf(arg)
//  - g_out[0*pitchOut + idx] = d and g_out[1*pitchOut + idx] = idx
// Nothing is read or written unless NDim >= 7.
//
// NOTE(review): fmaxf/fminf/sqrtf/acosf are the single-precision functions, so
// the result is computed in float even if Format_t is a wider type.
// NOTE(review): threads with idx >= n return before the __syncthreads() calls
// below, so a partially filled last block reaches those barriers with only
// some of its threads -- confirm whether the barriers are needed at all.
110 template< typename Format_t, unsigned int NDim>
112  Format_t weight,
114  Format_t* g_in,
115  unsigned int pitchIn,
116  Format_t* g_out,
117  unsigned int pitchOut,
118  unsigned int n
119  )
120 {
121  using namespace linalg;
122 
123  int threadId = threadIdx.x;
124  int blockId = blockIdx.x;
125  int N = blockDim.x;
126 
127  // which data point we work on
128  int idx = blockId * N + threadId;
129 
130  // valid data points are indexed 0..n-1, so a thread whose idx is greater
131  // than or equal to n is a left-over thread and just bails
132  // @todo is this OK with non-power of
133  // two array sizes and the fact that we syncthreads after this point?
134  if( idx >= n )
135  return;
136 
137  // compose the query object
140 
141  // read in the query point q0, no synchronization between reads
142  if( NDim >= 7 )
143  {
144  set<0>(t0) = q.data[0];
145  set<1>(t0) = q.data[1];
146  set<2>(t0) = q.data[2];
147  set<0>(q0) = q.data[3];
148  set<1>(q0) = q.data[4];
149  set<2>(q0) = q.data[5];
150  set<3>(q0) = q.data[6];
151 
152  set<0>(t1) = g_in[0*pitchIn + idx];
153  __syncthreads();
154  set<1>(t1) = g_in[1*pitchIn + idx];
155  __syncthreads();
156  set<2>(t1) = g_in[2*pitchIn + idx];
157  __syncthreads();
158  set<0>(q1) = g_in[3*pitchIn + idx];
159  __syncthreads();
160  set<1>(q1) = g_in[4*pitchIn + idx];
161  __syncthreads();
162  set<2>(q1) = g_in[5*pitchIn + idx];
163  __syncthreads();
164  set<3>(q1) = g_in[6*pitchIn + idx];
165  __syncthreads();
166 
167  // now compute the distance for this point
168  Format_t dq = linalg::dot(q0,q1);
169  Format_t arg = 2*dq*dq - 1;
170  arg = fmaxf(-0.999999999999999999999999999f,
171  fminf(arg, 0.9999999999999999999999999f));
172  Format_t d =
173  sqrtf(linalg::norm_squared(t1-t0)) + weight*acosf( arg );
174  __syncthreads();
175  g_out[0*pitchOut + idx] = d;
176  __syncthreads();
177  g_out[1*pitchOut + idx] = idx;
178  }
179 }
180 
181 
182 
183 
184 
185 
186 
187 
188 
189 
190 
191 
192 
193 } // kernels
194 } // cudaNN
195 } // mpblocks
196 
197 
198 #endif
199 
__device__ __host__ Scalar norm_squared(const RValue< Scalar, ROWS, COLS, Exp > &M)
compute the squared norm
Definition: Norm.h:130
__global__ void se3_distance(Format_t weight, QueryPoint< Format_t, NDim > query, Format_t *g_in, unsigned int pitchIn, Format_t *g_out, unsigned int pitchOut, unsigned int n)
weighted distance between elements of se3, points are interpreted as 3 values of position and 4 value...
Definition: se3.cu.hpp:111
__global__ void se3_pseudo_distance(Format_t weight, QueryPoint< Format_t, NDim > query, Format_t *g_in, unsigned int pitchIn, Format_t *g_out, unsigned int pitchOut, unsigned int n)
weighted distance between elements of se3, points are interpreted as 3 values of position and 4 value...
Definition: se3.cu.hpp:42
int x
Definition: fakecuda.h:44
#define __global__
Definition: fakecuda.h:33
Dim3 threadIdx
Dim3 blockIdx
Dim3 blockDim
__device__ __host__ Scalar dot(const RValue< Scalar, ROWS, 1, ExpA > &A, const RValue< Scalar, ROWS, 1, ExpB > &B)
compute the dot product
Definition: Dot.h:84
void __syncthreads()