cheshirekow  v0.1.0
kernels.cu.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2012 Josh Bialkowski (jbialk@mit.edu)
3  *
4  * This file is part of openbook.
5  *
6  * openbook is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * openbook is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with openbook. If not, see <http://www.gnu.org/licenses/>.
18  */
27 #ifndef MPBLOCKS_DUBINS_CURVES_CUDA2_KERNELS_CU_HPP_
28 #define MPBLOCKS_DUBINS_CURVES_CUDA2_KERNELS_CU_HPP_
29 
30 
31 namespace mpblocks {
32 namespace dubins {
33 namespace curves_cuda {
34 namespace kernels {
35 
36 namespace linalg = cuda::linalg2;
37 
38 
39 
40 
41 template< SolutionId Id, typename Format_t >
45  const Format_t r,
47 {
49  if( soln.f && soln.d < best.d )
50  {
51  best.d = soln.d;
52  best.id = Id;
53  }
54 }
55 
56 template< SolutionId Id, typename Format_t >
60  const Format_t r,
61  Format_t& best )
62 {
64  if( soln.f && soln.d < best )
65  best = soln.d;
66 }
67 
68 
69 
70 
71 
72 
73 
74 
75 
76 
77 
78 template< typename Format_t >
81  Result<Format_t>& result,
82  int off,
83  int pitch,
84  int idx,
85  Format_t* g_out)
86 {
87  // DebugCurved has 11 elements
88  // 3 x 2ea center points
89  // 3 x 1ea distances
90  // 1 total distance
91  // 1 feasible
92  off *= 11;
93 
94  #pragma unroll
95  for(int i=0; i < 3; i++)
96  {
97  #pragma unroll
98  for(int j=0; j < 2; j++)
99  {
100  const int k = i*2 + j;
101  __syncthreads();
102  g_out[ (off + k)*pitch +idx ] = soln.c[i][j];
103  }
104  }
105 
106  #pragma unroll
107  for(int i=0; i < 3; i++)
108  {
109  const int k = 6+i;
110  __syncthreads();
111  g_out[ (off + k)*pitch + idx ] = soln.l[i];
112  }
113 
114  __syncthreads();
115  g_out[ (off + 9)*pitch + idx ] = result.d;
116  __syncthreads();
117  g_out[ (off + 10)*pitch + idx ] = result.f ? 1 : 0;
118 }
119 
120 
121 
122 
123 template< typename Format_t >
126  Result<Format_t>& result,
127  int off,
128  int pitch,
129  int idx,
130  Format_t* g_out)
131 {
132  // DebugStraight has 13 elements
133  // 2 x 2ea center points
134  // 2 x 2ea tangent points
135  // 3 x 1ea distances
136  // 1 total distance
137  // 1 feasible
138  // In addition, the DebugStraight block is written after the DebugCurved
139  // block. g_out is moved to point to the DebugStraight block but the
140  // first Enum value corresponding to a straight output is four, not zero
141  off = (off - 4)*13;
142 
143  #pragma unroll
144  for(int i=0; i < 2; i++)
145  {
146  #pragma unroll
147  for(int j=0; j < 2; j++)
148  {
149  const int k = i*2 + j;
150  __syncthreads();
151  g_out[ (off + k)*pitch +idx ] = soln.c[i][j];
152  }
153  }
154 
155  #pragma unroll
156  for(int i=0; i < 2; i++)
157  {
158  #pragma unroll
159  for(int j=0; j < 2; j++)
160  {
161  const int k = 4 + i*2 + j;
162  __syncthreads();
163  g_out[ (off + k)*pitch +idx ] = soln.c[i][j];
164  }
165  }
166 
167  #pragma unroll
168  for(int i=0; i < 3; i++)
169  {
170  const int k = 8+i;
171  __syncthreads();
172  g_out[ (off + k)*pitch + idx ] = soln.l[i];
173  }
174 
175  __syncthreads();
176  g_out[ (off + 11)*pitch + idx ] = result.d;
177  __syncthreads();
178  g_out[ (off + 12)*pitch + idx ] = result.f ? 1 : 0;
179 }
180 
181 
182 
183 
184 
185 
186 
187 
188 
189 
190 
191 
192 template< typename Format_t>
194  Params<Format_t> p,
195  Format_t* g_in,
196  unsigned int pitchIn,
197  Format_t* g_out,
198  unsigned int pitchOut,
199  unsigned int n
200  )
201 {
202  using namespace linalg;
203 
204  int threadId = threadIdx.x;
205  int blockId = blockIdx.x;
206  int N = blockDim.x;
207 
208  // which data point we work on
209  int idx = blockId * N + threadId;
210 
211  // if our idx is greater than the number of data points then we are a
212  // left-over thread so just bail
213  if( idx > n )
214  return;
215 
216  // compose the query object
218  set<0>( q0 ) = p.q[0];
219  set<1>( q0 ) = p.q[1];
220  set<2>( q0 ) = p.q[2];
221  Format_t r = p.r;
222 
223  // read in the target point q1, we synchronize between reads so that
224  // reads are coallesced for maximum throughput
225  set<0>(q1) = g_in[0*pitchIn + idx];
226  __syncthreads();
227  set<1>(q1) = g_in[1*pitchIn + idx];
228  __syncthreads();
229  set<2>(q1) = g_in[2*pitchIn + idx];
230  __syncthreads();
231 
232  // best solution found
233  Format_t dBest;
234 
235  // now compute the distance for each of the solvers, and then record
236  // the minimum one
237 
238  // the solutions with straight segments are always feasible so let's start
239  // with one of them
241 
242  applySolver<LSR, Format_t>(q0,q1,r,dBest);
243  applySolver<RSR, Format_t>(q0,q1,r,dBest);
244  applySolver<RSL, Format_t>(q0,q1,r,dBest);
245  applySolver<RLRa,Format_t>(q0,q1,r,dBest);
246  applySolver<RLRb,Format_t>(q0,q1,r,dBest);
247  applySolver<LRLa,Format_t>(q0,q1,r,dBest);
248  applySolver<LRLb,Format_t>(q0,q1,r,dBest);
249 
250  __syncthreads();
251  g_out[0*pitchOut + idx] = dBest;
252 }
253 
254 
255 
256 
257 
258 
259 
260 
261 
262 
263 
264 template< typename Format_t >
266  Params<Format_t> p,
267  Format_t* g_in,
268  unsigned int pitchIn,
269  Format_t* g_out,
270  unsigned int pitchOut,
271  unsigned int n
272  )
273 {
274  using namespace cuda::linalg2;
275 
276  int threadId = threadIdx.x;
277  int blockId = blockIdx.x;
278  int N = blockDim.x;
279 
280  // which data point we work on
281  int idx = blockId * N + threadId;
282 
283  // if our idx is greater than the number of data points then we are a
284  // left-over thread so just bail
285  if( idx > n )
286  return;
287 
288  // compose the query object
290  set<0>( q0 ) = p.q[0];
291  set<1>( q0 ) = p.q[1];
292  set<2>( q0 ) = p.q[2];
293  Format_t r = p.r;
294 
295  // read in the target point q1, we synchronize between reads so that
296  // reads are coallesced for maximum throughput
297  set<0>(q1) = g_in[0*pitchIn + idx];
298  __syncthreads();
299  set<1>(q1) = g_in[1*pitchIn + idx];
300  __syncthreads();
301  set<2>(q1) = g_in[2*pitchIn + idx];
302  __syncthreads();
303 
304  // best solution found
305  Format_t dBest;
306 
307  // now compute the distance for each of the solvers, and then record
308  // the minimum one
309 
310  // the solutions with straight segments are always feasible so let's start
311  // with one of them
313 
314  applySolver<LSR, Format_t>(q1,q0,r,dBest);
315  applySolver<RSR, Format_t>(q1,q0,r,dBest);
316  applySolver<RSL, Format_t>(q1,q0,r,dBest);
317  applySolver<RLRa,Format_t>(q1,q0,r,dBest);
318  applySolver<RLRb,Format_t>(q1,q0,r,dBest);
319  applySolver<LRLa,Format_t>(q1,q0,r,dBest);
320  applySolver<LRLb,Format_t>(q1,q0,r,dBest);
321 
322  __syncthreads();
323  g_out[0*pitchOut + idx] = dBest;
324 }
325 
326 
327 
328 
329 
330 
331 
332 
333 
334 
335 
336 template< typename Format_t>
338  Params<Format_t> p,
339  Format_t* g_in,
340  unsigned int pitchIn,
341  Format_t* g_out,
342  unsigned int pitchOut,
343  unsigned int n
344  )
345 {
346  using namespace cuda::linalg2;
347 
348  int threadId = threadIdx.x;
349  int blockId = blockIdx.x;
350  int N = blockDim.x;
351 
352  // which data point we work on
353  int idx = blockId * N + threadId;
354 
355  // if our idx is greater than the number of data points then we are a
356  // left-over thread so just bail
357  if( idx > n )
358  return;
359 
360  // compose the query object
362  set<0>( q0 ) = p.q[0];
363  set<1>( q0 ) = p.q[1];
364  set<2>( q0 ) = p.q[2];
365  Format_t r = p.r;
366 
367  // read in the target point q1, we synchronize between reads so that
368  // reads are coallesced for maximum throughput
369  set<0>(q1) = g_in[0*pitchIn + idx];
370  __syncthreads();
371  set<1>(q1) = g_in[1*pitchIn + idx];
372  __syncthreads();
373  set<2>(q1) = g_in[2*pitchIn + idx];
374  __syncthreads();
375 
376  // now compute the distance for each of the solvers, and then record
377  // the minimum one
378 
379  // the solutions with straight segments are always feasible so let's start
380  // with one of them
382  DistanceAndId<Format_t> dBest(soln.d,LSL);
383 
384  applySolver<LSR, Format_t>(q0,q1,r,dBest);
385  applySolver<RSR, Format_t>(q0,q1,r,dBest);
386  applySolver<RSL, Format_t>(q0,q1,r,dBest);
387  applySolver<RLRa,Format_t>(q0,q1,r,dBest);
388  applySolver<RLRb,Format_t>(q0,q1,r,dBest);
389  applySolver<LRLa,Format_t>(q0,q1,r,dBest);
390  applySolver<LRLb,Format_t>(q0,q1,r,dBest);
391 
392  typedef typename PackedStorage<sizeof(Format_t)>::Result Unsigned;
393  Unsigned pack = (idx << 4 ) | dBest.id;
394  Unsigned* out = reinterpret_cast<Unsigned*>(g_out + pitchOut);
395 
396  __syncthreads();
397  g_out[idx] = dBest.d;
398  __syncthreads();
399  out[idx] = pack;
400 }
401 
402 
403 
404 
405 
406 
407 
408 
409 
410 
411 
412 template< typename Format_t >
414  Params<Format_t> p,
415  Format_t* g_in,
416  unsigned int pitchIn,
417  Format_t* g_out,
418  unsigned int pitchOut,
419  unsigned int n
420  )
421 {
422  using namespace cuda::linalg2;
423 
424  int threadId = threadIdx.x;
425  int blockId = blockIdx.x;
426  int N = blockDim.x;
427 
428  // which data point we work on
429  int idx = blockId * N + threadId;
430 
431  // if our idx is greater than the number of data points then we are a
432  // left-over thread so just bail
433  if( idx > n )
434  return;
435 
436  // compose the query object
438  set<0>( q0 ) = p.q[0];
439  set<1>( q0 ) = p.q[1];
440  set<2>( q0 ) = p.q[2];
441  Format_t r = p.r;
442 
443  // read in the target point q1, we synchronize between reads so that
444  // reads are coallesced for maximum throughput
445  set<0>(q1) = g_in[0*pitchIn + idx];
446  __syncthreads();
447  set<1>(q1) = g_in[1*pitchIn + idx];
448  __syncthreads();
449  set<2>(q1) = g_in[2*pitchIn + idx];
450  __syncthreads();
451 
452  // now compute the distance for each of the solvers, and then record
453  // the minimum one
454 
455  // the solutions with straight segments are always feasible so let's start
456  // with one of them
458  DistanceAndId<Format_t> dBest(soln.d,LSL);
459 
460  applySolver<LSR, Format_t>(q1,q0,r,dBest);
461  applySolver<RSR, Format_t>(q1,q0,r,dBest);
462  applySolver<RSL, Format_t>(q1,q0,r,dBest);
463  applySolver<RLRa,Format_t>(q1,q0,r,dBest);
464  applySolver<RLRb,Format_t>(q1,q0,r,dBest);
465  applySolver<LRLa,Format_t>(q1,q0,r,dBest);
466  applySolver<LRLb,Format_t>(q1,q0,r,dBest);
467 
468  typedef typename PackedStorage<sizeof(Format_t)>::Result Unsigned;
469  Unsigned pack = (idx << 4 ) | dBest.id;
470  Unsigned* out = reinterpret_cast<Unsigned*>(g_out + pitchOut);
471 
472  __syncthreads();
473  g_out[idx] = dBest.d;
474  __syncthreads();
475  out[idx] = pack;
476 }
477 
478 
479 
480 
481 
482 
483 
484 
485 
486 
487 
488 template< typename Format_t>
490  Params<Format_t> p,
491  Format_t* g_in,
492  unsigned int pitchIn,
493  Format_t* g_out,
494  unsigned int pitchOut,
495  unsigned int n
496  )
497 {
498  using namespace cuda::linalg2;
499 
500  int threadId = threadIdx.x;
501  int blockId = blockIdx.x;
502  int N = blockDim.x;
503 
504  // which data point we work on
505  int idx = blockId * N + threadId;
506 
507  // if our idx is greater than the number of data points then we are a
508  // left-over thread so just bail
509  if( idx > n )
510  return;
511 
512  // compose the query object
514  set<0>( q0 ) = p.q[0];
515  set<1>( q0 ) = p.q[1];
516  set<2>( q0 ) = p.q[2];
517  Format_t r = p.r;
518 
519  // read in the target point q1, we synchronize between reads so that
520  // reads are coallesced for maximum throughput
521  set<0>(q1) = g_in[0*pitchIn + idx];
522  __syncthreads();
523  set<1>(q1) = g_in[1*pitchIn + idx];
524  __syncthreads();
525  set<2>(q1) = g_in[2*pitchIn + idx];
526  __syncthreads();
527 
528  // storage for the solution
529  Result<Format_t> soln;
530 
531  {
532  DebugCurved<Format_t> debug;
533 
534  // now compute the distance for each of the solve_debugrs
535  soln = Solver<LRLa,Format_t>::solve_debug(q0,q1,r,debug);
536  writeSolution(debug,soln,LRLa,pitchOut,idx,g_out);
537 
538  soln = Solver<LRLb,Format_t>::solve_debug(q0,q1,r,debug);
539  writeSolution(debug,soln,LRLb,pitchOut,idx,g_out);
540 
541  soln = Solver<RLRa,Format_t>::solve_debug(q0,q1,r,debug);
542  writeSolution(debug,soln,RLRa,pitchOut,idx,g_out);
543 
544  soln = Solver<RLRb,Format_t>::solve_debug(q0,q1,r,debug);
545  writeSolution(debug,soln,RLRb,pitchOut,idx,g_out);
546  }
547 
548  // there are 4 curved solutions, each one 11 elements, and pitchOut cols
549  // so we advance the write head by
550  g_out += 44*pitchOut;
551  {
553  soln = Solver<LSL,Format_t>::solve_debug(q0,q1,r,debug);
554  writeSolution(debug,soln,LSL,pitchOut,idx,g_out);
555 
556  soln = Solver<RSL,Format_t>::solve_debug(q0,q1,r,debug);
557  writeSolution(debug,soln,RSL,pitchOut,idx,g_out);
558 
559  soln = Solver<RSR,Format_t>::solve_debug(q0,q1,r,debug);
560  writeSolution(debug,soln,RSR,pitchOut,idx,g_out);
561 
562  soln = Solver<LSR,Format_t>::solve_debug(q0,q1,r,debug);
563  writeSolution(debug,soln,LSR,pitchOut,idx,g_out);
564  }
565 
566 
567 }
568 
569 
570 
571 
572 
573 
574 
575 
576 
577 
578 
579 template< typename Format_t >
581  Params<Format_t> p,
582  Format_t* g_in,
583  unsigned int pitchIn,
584  Format_t* g_out,
585  unsigned int pitchOut,
586  unsigned int n
587  )
588 {
589  using namespace cuda::linalg2;
590 
591  int threadId = threadIdx.x;
592  int blockId = blockIdx.x;
593  int N = blockDim.x;
594 
595  // which data point we work on
596  int idx = blockId * N + threadId;
597 
598  // if our idx is greater than the number of data points then we are a
599  // left-over thread so just bail
600  if( idx > n )
601  return;
602 
603  // compose the query object
605  set<0>( q0 ) = p.q[0];
606  set<1>( q0 ) = p.q[1];
607  set<2>( q0 ) = p.q[2];
608  Format_t r = p.r;
609 
610  // read in the target point q1, we synchronize between reads so that
611  // reads are coallesced for maximum throughput
612  set<0>(q1) = g_in[0*pitchIn + idx];
613  __syncthreads();
614  set<1>(q1) = g_in[1*pitchIn + idx];
615  __syncthreads();
616  set<2>(q1) = g_in[2*pitchIn + idx];
617  __syncthreads();
618 
619  // storage for the solution
620  Result<Format_t> soln;
621 
622  {
623  DebugCurved<Format_t> debug;
624 
625  // now compute the distance for each of the solvers
626  soln = Solver<LRLa,Format_t>::solve_debug(q1,q0,r,debug);
627  writeSolution(debug,soln,LRLa,pitchOut,idx,g_out);
628 
629  soln = Solver<LRLb,Format_t>::solve_debug(q1,q0,r,debug);
630  writeSolution(debug,soln,LRLb,pitchOut,idx,g_out);
631 
632  soln = Solver<RLRa,Format_t>::solve_debug(q1,q0,r,debug);
633  writeSolution(debug,soln,RLRa,pitchOut,idx,g_out);
634 
635  soln = Solver<RLRb,Format_t>::solve_debug(q1,q0,r,debug);
636  writeSolution(debug,soln,RLRb,pitchOut,idx,g_out);
637  }
638 
639  // there are 4 curved solutions, each one 11 elements, and pitchOut cols
640  // so we advance the write head by
641  g_out += 44*pitchOut;
642  {
644  soln = Solver<LSL,Format_t>::solve_debug(q1,q0,r,debug);
645  writeSolution(debug,soln,LSL,pitchOut,idx,g_out);
646 
647  soln = Solver<RSL,Format_t>::solve_debug(q1,q0,r,debug);
648  writeSolution(debug,soln,RSL,pitchOut,idx,g_out);
649 
650  soln = Solver<RSR,Format_t>::solve_debug(q1,q0,r,debug);
651  writeSolution(debug,soln,RSR,pitchOut,idx,g_out);
652 
653  soln = Solver<LSR,Format_t>::solve_debug(q1,q0,r,debug);
654  writeSolution(debug,soln,LSR,pitchOut,idx,g_out);
655  }
656 }
657 
658 
659 
660 
661 
662 
663 
664 
665 
666 
667 
668 template< typename Format_t >
671  Format_t* g_in,
672  unsigned int pitchIn,
673  Format_t* g_out,
674  unsigned int pitchOut,
675  unsigned int n
676  )
677 {
678  using namespace cuda::linalg2;
679 
680  int threadId = threadIdx.x;
681  int blockId = blockIdx.x;
682  int N = blockDim.x;
683 
684  // which data point we work on
685  int idx = blockId * N + threadId;
686 
687  // if our idx is greater than the number of data points then we are a
688  // left-over thread so just bail
689  if( idx > n )
690  return;
691 
692  // compose the query object
693  Matrix<Format_t,3,1> q0, q1, diff;
694  set<0>( q0 ) = p.q[0];
695  set<1>( q0 ) = p.q[1];
696  set<2>( q0 ) = p.q[2];
697 
698  // read in the target point q1, we synchronize between reads so that
699  // reads are coallesced for maximum throughput
700  set<0>( q1 ) = g_in[0*pitchIn + idx];
701  __syncthreads();
702  set<1>( q1 ) = g_in[1*pitchIn + idx];
703  __syncthreads();
704  set<2>( q1 ) = g_in[2*pitchIn + idx];
705  __syncthreads();
706 
707  // find the difference
708  diff = q1 - q0;
709 
710  // correct the rotation
711  const Format_t _PI = static_cast<Format_t>(M_PI);
712  if( get<2>( diff ) > _PI )
713  set<2>( diff ) -= 2*_PI;
714  if( get<2>( diff ) < _PI )
715  set<2>( diff ) += 2*_PI;
716 
717  Format_t dist2 = norm_squared(diff);
718 
719  __syncthreads();
720  g_out[0*pitchOut + idx] = dist2;
721 }
722 
723 
724 
725 
726 
727 
728 
729 
730 
731 
732 
733 template< typename Format_t>
736  Format_t* g_in,
737  unsigned int pitchIn,
738  Format_t* g_out,
739  unsigned int pitchOut,
740  unsigned int n
741  )
742 {
743  using namespace cuda::linalg2;
744 
745  int threadId = threadIdx.x;
746  int blockId = blockIdx.x;
747  int N = blockDim.x;
748 
749  // which data point we work on
750  int idx = blockId * N + threadId;
751 
752  // if our idx is greater than the number of data points then we are a
753  // left-over thread so just bail
754  if( idx > n )
755  return;
756 
757  // compose the query object
758  Matrix<Format_t,3,1> q0, q1, diff;
759  set<0>( q0 ) = p.q[0];
760  set<1>( q0 ) = p.q[1];
761  set<2>( q0 ) = p.q[2];
762 
763  // read in the target point q1, we synchronize between reads so that
764  // reads are coallesced for maximum throughput
765  set<0>( q1 ) = g_in[0*pitchIn + idx];
766  __syncthreads();
767  set<1>( q1 ) = g_in[1*pitchIn + idx];
768  __syncthreads();
769  set<2>( q1 ) = g_in[2*pitchIn + idx];
770  __syncthreads();
771 
772  // find the difference
773  diff = q1 - q0;
774 
775  // correct the rotation
776  const Format_t _PI = static_cast<Format_t>(M_PI);
777  if( get<2>( diff ) > _PI )
778  set<2>( diff ) -= 2*_PI;
779  if( get<2>( diff ) < _PI )
780  set<2>( diff ) += 2*_PI;
781 
782  Format_t dist2 = norm_squared(diff);
783 
784  typedef typename PackedStorage<sizeof(Format_t)>::Result Unsigned;
785  Unsigned* out = reinterpret_cast<Unsigned*>(g_out + pitchOut);
786 
787  __syncthreads();
788  g_out[idx] = dist2;
789  __syncthreads();
790  out[idx] = idx;
791 }
792 
793 
794 
795 
796 
797 } // kernels
798 } // curves
799 } // dubins
800 } // mpblocks
801 
802 
803 #endif
804 
__device__ __host__ Scalar norm_squared(const RValue< Scalar, ROWS, COLS, Exp > &M)
compute the norm
Definition: Norm.h:130
Format_t d
distance
Definition: result.h:45
int x
Definition: fakecuda.h:44
Format_t d
distance
Definition: result.h:83
#define __global__
Definition: fakecuda.h:33
__global__ void group_distance_to_set(EuclideanParams< Format_t > p, Format_t *g_in, unsigned int pitchIn, Format_t *g_out, unsigned int pitchOut, unsigned int n)
batch-compute the euclidean distance from a single dubins state to a batch of many dubins states ...
Definition: kernels.cu.hpp:669
__device__ void applySolver(const linalg::Matrix< Format_t, 3, 1 > &q0, const linalg::Matrix< Format_t, 3, 1 > &q1, const Format_t r, DistanceAndId< Format_t > &best)
Definition: kernels.cu.hpp:42
__global__ void distance_from_set(Params< Format_t > p, Format_t *g_in, unsigned int pitchIn, Format_t *g_out, unsigned int pitchOut, unsigned int n)
batch-compute the distance from a batch of many dubins states to a single dubins state ...
Definition: kernels.cu.hpp:265
Format_t r
turning radius
Definition: Params.h:43
__global__ void distance_to_set_debug(Params< Format_t > p, Format_t *g_in, unsigned int pitchIn, Format_t *g_out, unsigned int pitchOut, unsigned int n)
batch-compute the distance from a single dubins state to a batch of many dubins states ...
Definition: kernels.cu.hpp:489
bool f
is feasible
Definition: result.h:46
Encapsulates the solution distance along with a feasibility bit for a particular primitive solution...
Definition: result.h:44
Dim3 threadIdx
Dim3 blockIdx
static Result< Format_t > solve(const Vector3d_t &q0, const Vector3d_t &q1, const Format_t r)
basic interface returns only the total distance
#define __device__
Definition: fakecuda.h:34
__device__ void writeSolution(DebugCurved< Format_t > &soln, Result< Format_t > &result, int off, int pitch, int idx, Format_t *g_out)
Definition: kernels.cu.hpp:79
Dim3 blockDim
Format_t q[3]
the query state
Definition: Params.h:44
__global__ void distance_from_set_debug(Params< Format_t > p, Format_t *g_in, unsigned int pitchIn, Format_t *g_out, unsigned int pitchOut, unsigned int n)
batch-compute the distance from a batch of many dubins states to a single dubins state ...
Definition: kernels.cu.hpp:580
Format_t q[3]
the query state
Definition: Params.h:56
__global__ void group_distance_to_set_with_id(EuclideanParams< Format_t > p, Format_t *g_in, unsigned int pitchIn, Format_t *g_out, unsigned int pitchOut, unsigned int n)
batch-compute the euclidean distance from a single dubins state to a batch of many dubins states ...
Definition: kernels.cu.hpp:734
__global__ void distance_to_set_with_id(Params< Format_t > p, Format_t *g_in, unsigned int pitchIn, Format_t *g_out, unsigned int pitchOut, unsigned int n)
batch-compute the distance from a single dubins state to a batch of many dubins states ...
Definition: kernels.cu.hpp:337
__global__ void distance_to_set(Params< Format_t > p, Format_t *g_in, unsigned int pitchIn, Format_t *g_out, unsigned int pitchOut, unsigned int n)
batch-compute the distance from a single dubins state to a batch of many dubins states ...
Definition: kernels.cu.hpp:193
void __syncthreads()
__global__ void distance_from_set_with_id(Params< Format_t > p, Format_t *g_in, unsigned int pitchIn, Format_t *g_out, unsigned int pitchOut, unsigned int n)
batch-compute the distance from a batch of many dubins states to a single dubins state ...
Definition: kernels.cu.hpp:413
interface for different solutions
Definition: Solution.h:58
Encapsulates a solution distance along with the id of the path type, identifying the nature of the th...
Definition: result.h:82