cheshirekow  v0.1.0
so3.cu.hpp
/*
 * Copyright (C) 2012 Josh Bialkowski (jbialk@mit.edu)
 *
 * This file is part of openbook.
 *
 * openbook is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * openbook is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with openbook. If not, see <http://www.gnu.org/licenses/>.
 */
#ifndef MPBLOCKS_CUDANN_KERNELS_SO3_CU_HPP_
#define MPBLOCKS_CUDANN_KERNELS_SO3_CU_HPP_

#include <mpblocks/cuda/linalg2.h>

namespace mpblocks {
namespace cudaNN   {
namespace kernels  {

namespace linalg = cuda::linalg2;

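/// pseudo-distance between two unit quaternions: returns 1 - (q0 . q1)
/// with the dot product clamped to [-1,1]; cheaper than the geodesic
/// distance below, but note that it does not take the absolute value of
/// the dot product, so the antipodal representations q and -q of the
/// same rotation come out maximally far apart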
template< typename Scalar >
__device__
Scalar so3_pseudo_distance(
        const linalg::Matrix<Scalar,4,1>& q0,
        const linalg::Matrix<Scalar,4,1>& q1 )
{
    Scalar dq = linalg::dot(q0,q1);
    dq = fmaxf(-1, fminf(dq, 1));
    return 1 - dq;
}

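/// geodesic distance between the rotations represented by unit
/// quaternions q0 and q1: acos(2*(q0 . q1)^2 - 1), which lies in [0,pi]
/// and is insensitive to the sign of either quaternion (the double cover
/// of SO(3))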
template< typename Scalar >
__device__
Scalar so3_distance(
        const linalg::Matrix<Scalar,4,1>& q0,
        const linalg::Matrix<Scalar,4,1>& q1 )
{
    Scalar dot = linalg::dot(q0,q1);
    Scalar arg = 2*dot*dot - 1;
    // clamp so that rounding cannot push the argument out of acos's
    // domain (note: literals with this many nines collapse to exactly
    // +/-1.0f in single precision, so the clamp only guards the case
    // where |arg| exceeds 1)
    arg = fmaxf(-0.999999999999999999999999999f,
          fminf(arg, 0.9999999999999999999999999f));
    return acosf(arg);
}

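/// compile-time switch between the true and pseudo distance; the primary
/// template (Pseudo = false) uses the geodesic distance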
template< typename Scalar, bool Pseudo >
struct so3_distance_fn
{
    static __device__ Scalar compute(
            const linalg::Matrix<Scalar,4,1>& q0,
            const linalg::Matrix<Scalar,4,1>& q1 )
    {
        return so3_distance(q0,q1);
    }
};

template< typename Scalar >
struct so3_distance_fn<Scalar,true>
{
    static __device__ Scalar compute(
            const linalg::Matrix<Scalar,4,1>& q0,
            const linalg::Matrix<Scalar,4,1>& q1 )
    {
        return so3_pseudo_distance(q0,q1);
    }
};
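// usage from a kernel templated on 'Pseudo', e.g.:
//   Scalar d = so3_distance_fn<Scalar,Pseudo>::compute(q0,q1);
// the selection is resolved at compile time so the branch costs nothing
// at runtime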
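/// batch distance kernel: one thread per stored quaternion, computing the
/// distance from the query quaternion q to each of the n points in g_in
///
/// g_in and g_out are column-pitched: coordinate r of point i lives at
/// g_in[r*pitchIn + i]; the result rows are g_out[0*pitchOut + i]
/// (distance) and g_out[1*pitchOut + i] (the point's index)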
template< bool Pseudo, typename Scalar, unsigned int NDim>
__global__ void so3_distance(
        QueryPoint<Scalar,NDim> q,
        Scalar* g_in,
        unsigned int pitchIn,
        Scalar* g_out,
        unsigned int pitchOut,
        unsigned int n
        )
{
    using namespace linalg;

    int threadId = threadIdx.x;
    int blockId  = blockIdx.x;
    int N        = blockDim.x;

    // which data point we work on
    int idx = blockId * N + threadId;

    // if our idx is beyond the number of data points then we are a
    // left-over thread so just bail
    // @todo the early return is only safe if whole warps exit together;
    // with an n that is not a multiple of the block size, some threads
    // skip the __syncthreads() calls below
    if( idx >= n )
        return;

    // compose the query object
    Matrix<Scalar,4,1> q0, q1;

    // read in the query point q0, no synchronization between reads
    if( NDim >= 4 )
    {
        set<0>(q0) = q.data[0];
        set<1>(q0) = q.data[1];
        set<2>(q0) = q.data[2];
        set<3>(q0) = q.data[3];

        set<0>(q1) = g_in[0*pitchIn + idx];
        __syncthreads();
        set<1>(q1) = g_in[1*pitchIn + idx];
        __syncthreads();
        set<2>(q1) = g_in[2*pitchIn + idx];
        __syncthreads();
        set<3>(q1) = g_in[3*pitchIn + idx];
        __syncthreads();

        // now compute the distance for this point
        Scalar d = so3_distance_fn<Scalar,Pseudo>::compute(q0,q1);
        __syncthreads();
        g_out[0*pitchOut + idx] = d;
        __syncthreads();
        g_out[1*pitchOut + idx] = idx;
    }
}

// labels whether each coordinate's box constraint is inactive (OFF) or
// active at its lower (MIN) or upper (MAX) bound; the enum's name was
// lost from this listing, "ConstraintSpec" is a stand-in
enum ConstraintSpec
{
    OFF = 0,
    MIN = 1,
    MAX = 2
};
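// with four coordinates and three states each there are 3^4 = 81
// constraint specifications; the kernel below runs two threads per
// specification, one for each sign of the multiplier lambda, hence
// 81*2 = 162 active threads per block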
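/// distance-to-hyperrectangle kernel: each block handles one of the 16
/// children of a 4-d query hyperrectangle (bit i of childIdx picks the
/// lower or upper half of axis i), and each thread evaluates one
/// candidate solution of
///     minimize  dist(q,x)  subject to  |x| = 1,  min <= x <= max
/// the block then reduces the feasible candidates to a single minimum,
/// written to g_out[childIdx]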
template< bool Pseudo, typename Scalar, unsigned int NDim>
__global__ void so3_distance(
        // the query parameter's type was lost from this listing; a
        // rectangle query carrying min/max corners and a query point is
        // assumed from the usage below
        RectangleQuery<Scalar,NDim> query,
        Scalar* g_out
        )
{
    using namespace linalg;

    __shared__ Scalar med[4];
    __shared__ Scalar min[4];
    __shared__ Scalar max[4];
    __shared__ Scalar dist[192];

    int idx           = threadIdx.x;
    int constraintIdx = idx % 81;
    int sign          = idx > 80 ? -1 : 1;
    int childIdx      = blockIdx.x;

    // all threads initialize their output block
    dist[idx] = 1000;
    __syncthreads();

    // the first four threads for this block build up the child hyper
    // rectangle for this block
    if( idx < 4 )
    {
        med[idx] = Scalar(0.5)*( query.min[idx] + query.max[idx] );

        if( childIdx & (0x01 << idx) )
        {
            min[idx] = med[idx];
            max[idx] = query.max[idx];
        }
        else
        {
            min[idx] = query.min[idx];
            max[idx] = med[idx];
        }
    }
    __syncthreads();

    // if our idx is greater than the number of output points then we are
    // a left-over thread so just bail
    if( idx >= 81*2 )
        return;

    // compose the query matrices
    Matrix<Scalar,4,1> q;

    // the if statement just avoids useless cuda warnings
    if( NDim >= 4 )
    {
        set<0>(q) = query.point[0];
        set<1>(q) = query.point[1];
        set<2>(q) = query.point[2];
        set<3>(q) = query.point[3];

        // compute the specification for this thread
        int  cnst   = constraintIdx;
        char spec_0 = cnst % 3;  cnst /= 3;
        char spec_1 = cnst % 3;  cnst /= 3;
        char spec_2 = cnst % 3;  cnst /= 3;
        char spec_3 = cnst % 3;  cnst /= 3;
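        // i.e. constraintIdx is read as a 4-digit base-3 number, one
        // digit per coordinate; e.g. constraintIdx = 5 decodes to
        // spec_0 = 2 (MAX), spec_1 = 1 (MIN), spec_2 = 0 (OFF),
        // spec_3 = 0 (OFF), covering all 81 combinations of active
        // constraints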

        // build up the lambda calculation for this thread
        Scalar den = 1;
        Scalar num = 0;

        if( spec_0 == MAX )
            den -= max[0]*max[0];
        else if( spec_0 == MIN )
            den -= min[0]*min[0];
        else
            num += get<0>(q)*get<0>(q);

        if( spec_1 == MAX )
            den -= max[1]*max[1];
        else if( spec_1 == MIN )
            den -= min[1]*min[1];
        else
            num += get<1>(q)*get<1>(q);

        if( spec_2 == MAX )
            den -= max[2]*max[2];
        else if( spec_2 == MIN )
            den -= min[2]*min[2];
        else
            num += get<2>(q)*get<2>(q);

        if( spec_3 == MAX )
            den -= max[3]*max[3];
        else if( spec_3 == MIN )
            den -= min[3]*min[3];
        else
            num += get<3>(q)*get<3>(q);

        bool feasible = true;

        if( den < 1e-8 )
            feasible = false;

        Scalar lambda2 = num / (4*den);
        Scalar lambda  = std::sqrt(lambda2);
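        // lambda is the multiplier of the unit-norm constraint: for each
        // unconstrained (OFF) coordinate, stationarity gives
        // x_i = -q_i/(2*sign*lambda), and substituting into
        // sum_i x_i^2 = 1, with the coordinates pinned at MIN or MAX
        // moved to the right hand side (den), yields
        // lambda^2 = num/(4*den)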

        if( idx == 0 )
        {
            lambda   = -Scalar(1.0)/(Scalar(2.0)*sign);
            feasible = true;
        }

        Matrix<Scalar,4,1> x;

        if( spec_0 == MAX )
            set<0>(x) = max[0];
        else if( spec_0 == MIN )
            set<0>(x) = min[0];
        else
            set<0>(x) = -get<0>(q) / (2*sign*lambda);

        if( get<0>(x) > max[0] || get<0>(x) < min[0] )
            feasible = false;

        if( spec_1 == MAX )
            set<1>(x) = max[1];
        else if( spec_1 == MIN )
            set<1>(x) = min[1];
        else
            set<1>(x) = -get<1>(q) / (2*sign*lambda);

        if( get<1>(x) > max[1] || get<1>(x) < min[1] )
            feasible = false;

        if( spec_2 == MAX )
            set<2>(x) = max[2];
        else if( spec_2 == MIN )
            set<2>(x) = min[2];
        else
            set<2>(x) = -get<2>(q) / (2*sign*lambda);

        if( get<2>(x) > max[2] || get<2>(x) < min[2] )
            feasible = false;

        if( spec_3 == MAX )
            set<3>(x) = max[3];
        else if( spec_3 == MIN )
            set<3>(x) = min[3];
        else
            set<3>(x) = -get<3>(q) / (2*sign*lambda);

        if( get<3>(x) > max[3] || get<3>(x) < min[3] )
            feasible = false;

        // now compute the distance for this point
        Scalar d = so3_distance_fn<Scalar,Pseudo>::compute(q,x);
        if( idx == 0 )
            d = 0;
        if( !feasible )
            d = 1000;

        dist[idx] = d;
        __syncthreads();

        // only the first warp does the reduction
        if( idx < 32 )
        {
            // start by taking the min across the six 32-thread groups
            Scalar dist_i = dist[idx];
            Scalar dist_j = 1000;

            for(int j=1; j < 6; j++)
            {
                dist_j = dist[idx + j*32];
                if( dist_j < dist_i )
                    dist_i = dist_j;
            }

            // write out our min, no need to sync b/c warp synchronous
            dist[idx] = dist_i;
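            // note: relying on implicit warp-synchronous execution like
            // this is only safe on pre-Volta hardware; on newer
            // architectures each step needs a __syncwarp()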

            // now perform the reduction
            for(int j=16; j > 0; j /= 2 )
            {
                dist_j = dist[idx + j];
                if( dist_j < dist_i )
                    dist_i = dist_j;
                dist[idx] = dist_i;
            }

            if( idx == 0 )
                g_out[childIdx] = dist_i;
        }
    }
}
} // kernels
} // cudaNN
} // mpblocks


#endif // MPBLOCKS_CUDANN_KERNELS_SO3_CU_HPP_
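For orientation, a minimal host-side launch of the point-to-point kernel might look like the sketch below. The QueryPoint layout (a plain data[4] array) is inferred from the kernel body; the include path, the tight pitch, and names such as n and threads are illustrative assumptions, not part of this header.

#include <cuda_runtime.h>
#include <mpblocks/cudann/kernels/so3.cu.hpp>  // include path assumed

using namespace mpblocks::cudaNN;

int main()
{
    const unsigned int n     = 1024;   // number of stored quaternions
    const unsigned int pitch = n;      // rows packed tightly

    // four coordinate rows in, two result rows (distance, index) out
    float *g_in, *g_out;
    cudaMalloc(&g_in,  4*pitch*sizeof(float));
    cudaMalloc(&g_out, 2*pitch*sizeof(float));
    // ... fill g_in with unit quaternions, coordinate r of point i at
    //     g_in[r*pitch + i] ...

    QueryPoint<float,4> q;             // assumed to hold 'float data[4]'
    q.data[0] = 1; q.data[1] = 0; q.data[2] = 0; q.data[3] = 0;

    const unsigned int threads = 256;
    const unsigned int blocks  = (n + threads - 1)/threads;
    kernels::so3_distance<false,float,4><<<blocks,threads>>>(
            q, g_in, pitch, g_out, pitch, n);
    cudaDeviceSynchronize();

    cudaFree(g_in);
    cudaFree(g_out);
    return 0;
}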