cheshirekow  v0.1.0
euclidean.cu.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2012 Josh Bialkowski (jbialk@mit.edu)
3  *
4  * This file is part of openbook.
5  *
6  * openbook is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * openbook is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with openbook. If not, see <http://www.gnu.org/licenses/>.
18  */
27 #ifndef MPBLOCKS_CUDANN_KERNELS_EUCLIDEAN_CU_HPP_
28 #define MPBLOCKS_CUDANN_KERNELS_EUCLIDEANCU_HPP_
29 
30 #include <mpblocks/cuda/linalg2.h>
33 
34 namespace mpblocks {
35 namespace cudaNN {
36 namespace kernels {
37 
38 namespace linalg = cuda::linalg2;
39 
40 
41 template< typename Scalar, int NDim >
46 {
47  return linalg::norm_squared(q1-q0);
48 }
49 
50 template< typename Scalar, int NDim >
55 {
56  return linalg::norm(q1-q0);
57 }
58 
59 
60 template< typename Format_t, unsigned int NDim>
63  Format_t* g_in,
64  unsigned int pitchIn,
65  Format_t* g_out,
66  unsigned int pitchOut,
67  unsigned int n
68  )
69 {
70  int threadId = threadIdx.x;
71  int blockId = blockIdx.x;
72  int N = blockDim.x;
73 
74  // which data point we work on
75  int idx = blockId * N + threadId;
76 
77  // if our idx is greater than the number of data points then we are a
78  // left-over thread so just bail
79  // @todo is this OK with non-power of
80  // two array sizes and the fact that we syncthreads after this point?
81  if( idx > n )
82  return;
83 
84  // compose the query object
86 
87  // read in the query point q0, no synchronization between reads
88  read(q,q0);
89 
90  // read in the target point q1, we synchronize between reads so that
91  // reads are coallesced for maximum throughput
92  read(g_in,pitchIn,idx,q1);
93 
94  // now compute the distance for this point
95  Format_t d = linalg::norm_squared(q1-q0);
96  __syncthreads();
97  g_out[0*pitchOut + idx] = d;
98  __syncthreads();
99  g_out[1*pitchOut + idx] = idx;
100 }
101 
102 
103 template <int Arg,int i,bool final>
105 {
107 };
108 
109 template <int Arg,int i>
110 struct NextPow2_iter<Arg,i,true>
111 {
112  enum{ value = i };
113 };
114 
115 template <int Arg>
116 struct NextPow2
117 {
119 };
120 
121 
122 template< bool Pseudo, typename Scalar, unsigned int NDim>
125  Scalar* g_out
126  )
127 {
128  using namespace linalg;
129 
130  __shared__ Scalar med[NDim];
131  __shared__ Scalar min[NDim];
132  __shared__ Scalar max[NDim];
133  __shared__ Scalar dist[NDim];
134 
135  int idx = threadIdx.x;
136  int childIdx = blockIdx.x;
137 
138  // the first NDim threads for this block build up the child hyper
139  // rectangle for this block
140  if( idx < NDim )
141  {
142  med[idx] = Scalar(0.5)*( query.min[idx] + query.max[idx] );
143  }
144  __syncthreads();
145 
146  if( idx < NDim )
147  {
148  if( childIdx & (0x01 << idx ) )
149  {
150  min[idx] = med[idx];
151  max[idx] = query.max[idx];
152  }
153  else
154  {
155  min[idx] = query.min[idx];
156  max[idx] = med[idx];
157  }
158  }
159  __syncthreads();
160 
161  // each thread reads in his data
162  Scalar q_i = query.point[idx];
163  __syncthreads();
164 
165  Scalar min_i = min[idx];
166  __syncthreads();
167 
168  Scalar max_i = max[idx];
169  __syncthreads();
170 
171  Scalar dist_i = 0;
172  if( q_i < min_i )
173  dist_i = min_i - q_i;
174  if( q_i > max_i )
175  dist_i = q_i - max_i;
176 
177  dist_i *= dist_i;
178  dist[idx] = dist_i;
179  __syncthreads();
180 
181  for(int s=NextPow2<NDim>::value; s>0; s>>=1 )
182  {
183  if(idx < s && idx+s < NDim)
184  dist[idx] += dist[idx+s];
185  __syncthreads();
186  }
187 
188  if( idx ==0 )
189  {
190  if(Pseudo)
191  g_out[childIdx] = dist[0];
192  else
193  g_out[childIdx] = std::sqrt(dist[0]);
194  }
195 }
196 
197 
198 
199 
200 
201 
202 
203 } // kernels
204 } // cudaNN
205 } // mpblocks
206 
207 
208 #endif
209 
__device__ __host__ Scalar norm_squared(const RValue< Scalar, ROWS, COLS, Exp > &M)
compute the norm
Definition: Norm.h:130
int x
Definition: fakecuda.h:44
#define __global__
Definition: fakecuda.h:33
#define __shared__
Definition: fakecuda.h:36
Dim3 threadIdx
__device__ __host__ Scalar norm(const RValue< Scalar, ROWS, COLS, Exp > &M)
compute the norm
Definition: Norm.h:140
Dim3 blockIdx
#define __device__
Definition: fakecuda.h:34
Dim3 blockDim
__global__ void euclidean_distance(QueryPoint< Format_t, NDim > query, Format_t *g_in, unsigned int pitchIn, Format_t *g_out, unsigned int pitchOut, unsigned int n)
vector norm (2-norm) between two points
void __syncthreads()
__device__ void read(Format_t *g_data, unsigned int pitch, int idx, linalg::Matrix< Format_t, NDim, 1 > &v)
Definition: Reader.cu.hpp:79
__device__ Scalar euclidean_pseudo_distance(const linalg::Matrix< Scalar, NDim, 1 > &q0, const linalg::Matrix< Scalar, NDim, 1 > &q1)