FiberVISH 0.2
Fish - The Fiber Bundle API for the Vish Visualization Shell
TensorFromPointCloud_CL_state.hpp
1#pragma once
2
3#include <memcore/stringlist.hpp>
4#include <memcore/RefPtr.hpp>
5
6#include <ocean/plankton/VObjectState.hpp>
7#include <eagle/FixedArray.hpp>
8#include <bone/FishField.hpp>
9
// local work size used in sorting
// warning: this value is shared with the bitonic sorting source. If you change this, also change it in the .cl file
#define LOCAL_SIZE_LIMIT 512U

// Local work size used in the tensor kernel.
// FIX: declared 'static' -- this is a header (#pragma once), and a non-static
// namespace-scope variable defined here is a multiple-definition / ODR
// violation as soon as the header is included from more than one TU.
// XXX this value could be defined into the GUI (64,128,256,512,1024) - it affects performance!
static size_t tensorlocalWorkSize = 128;
16
17
18// an Insieme-compatible C interface for OpenCL
19#include "lib_icl.h"
20#include "lib_iglcl.h"
21
22using namespace Wizt;
23using namespace std;
24
25typedef cl_uint uint;
30using namespace Fiber;
31
// struct used to send parameters to the device kernel
// (copied verbatim into paramBuffer -- the layout must match the
// corresponding struct in the .cl kernel source)
typedef struct param_t {
	uint4 gridSize; // number of cell per dimension (x,y,z), total number of cell (w)
	float4 min;     // minimum corner of the (epsilon-padded) bounding box; w set to 1 by setSize()
	float4 cellSize; // per-axis cell edge length (2*radius); w set to 1 by setSize()
} param_t;
38
39
40// #define VERBOSE
41const int DebugNum = 128; // number of element per buffer prints to be printed
42
43
// note(Biagio): we need this method because the code only supports
// power-of-two grid dimensions, but to match the radius we prefer equal or
// bigger sizes, i.e. having an equal or smaller number of cells per
// dimension with respect to the ideal one.
// Returns the closest power of two >= x (rounds UP; the previous comment
// said "<=", which contradicts both the code and the "round up" call site).
static unsigned int closestPow2_less(unsigned int x){
	// floor(log2(x)) via repeated shifts, then bump the exponent by one
	// when x was not already an exact power of two -> next power of two >= x
	const int original = static_cast<int>(x);
	int exponent = 0;
	while (x >>= 1)
		++exponent;
	if ((1 << exponent) != original)
		++exponent;
	return 1 << exponent;
}
56
/// Strip every factor of two from L and return the remaining odd factor.
/// Consequently the result is 1 exactly when L is a power of two, and 0
/// when L == 0 (sentinel for invalid input).
/// FIX: removed the unused local 'log2L' (dead variable, compiler warning)
/// and added 'static' -- a non-static function defined in this header would
/// cause multiple-definition link errors (cf. closestPow2_less above).
static unsigned int factorRadix2(unsigned int L){
	if(!L) return 0;
	while((L & 1) == 0)
		L >>= 1; // drop one trailing zero bit
	return L;
}
66
67
69
70/*
71State object used for collecting data to be passed to the task and iterator objects.
72*/
74{
75private:
76 // the following values avoid multiple device, kernel and buffer creations
77 bool relaxedMath; // true if kernel should be compiled with a relaxed math flag
78 bool fpType; // true if kernels use float instead of double
79 bool kernelAlloc, bufferAlloc; // true id currently allocated, false otherwise
80
81 // allocated buffer size (= number of particle)
82 uint allocatedBufferSize;
83
84 // sizes used in kernel invocation
85 uint particleNum, mulParticleNum, pow2ParticleNum;
86
87 // number of cell
88 uint cellNum, mulCellNum;
89
90 // struct used to send parameters to the device
91 param_t param;
92
93 // OpenCL device
94 icl_device* device;
95 uint deviceId;
96
97 // OpenCL buffers
98 icl_buffer* posBuffer;
99 icl_buffer* tensorBuffer;
100 icl_buffer* paramBuffer;
101 icl_buffer* hashBuffer;
102 icl_buffer* indexBuffer;
103 icl_buffer* startBuffer;
104 icl_buffer* endBuffer;
105 icl_buffer* pSortedBuffer;
106
107 // OpenCL kernels
108 icl_kernel* sortLocalKernel;
109 icl_kernel* sortLocal1Kernel;
110 icl_kernel* sortMergeGlobalKernel;
111 icl_kernel* sortMergeLocalKernel;
112 icl_kernel* hashKernel;
113 icl_kernel* memsetUintKernel;
114 icl_kernel* memsetFloatKernel;
115 icl_kernel* startEndKernel;
116 icl_kernel* tensorKernel;
120
121public:
122
123 void setSize(size_t _particleNum, RefPtr<BoundingBox> bb, float radius);
124 void setupDeviceContext();
126
127
128 void setupKernel(bool _relaxedMath, bool _useFloat);
129 void setupBuffer(size_t _particleNum, RefPtr<BoundingBox> bb, float _radius);
130
131
132 void writeFragment(float *);
133 void compute();
134 void readFragment(float *);
135
136 void displayFragment(){} // XXX Note(Biagio): this method should *meet* a OpenGL context in order to display data without extra data moves
137
138 // timing
139 double kernelTime;
140 double overallTime;
141 Timer time;
142
143 /* Transient data structure with per-device OpenCL data */
144 TensorDeviceState(unsigned _deviceId) :
145 kernelAlloc(false), bufferAlloc(false),
146 allocatedBufferSize(0), device(NULL), deviceId(_deviceId),
147 kernelTime(0.0), overallTime(0.0)
148 { setupDeviceContext(); }
149
151 {
152 cout << "*** ~TensorDeviceState ***" << endl;
153
154 if(bufferAlloc) finalizeDeviceBuffers();
155 if(kernelAlloc) finalizeDeviceKernels();
157 }
158
159
160
161private:
162 void initDeviceKernels() {
163 assert(!kernelAlloc);
164 cout << "\n---------------------------------------------------------------------" << endl;
165 cout << "CL create kernels for device " << device->name << endl;
166
167 char kernelParam[128] = "";
168 if(relaxedMath) sprintf(kernelParam, "-cl-fast-relaxed-math is ON");
169
170 //icl_print_device_infos(device);
172
173 // sorting kernel
174 sortLocalKernel = icl_create_kernel(device, "bitonic_sorting_kernel.cl", "bitonicSortLocal", kernelParam, ICL_SOURCE);
175 sortLocal1Kernel = icl_create_kernel(device, "bitonic_sorting_kernel.cl", "bitonicSortLocal1", kernelParam, ICL_SOURCE);
176 sortMergeGlobalKernel = icl_create_kernel(device, "bitonic_sorting_kernel.cl", "bitonicMergeGlobal", kernelParam, ICL_SOURCE);
177 sortMergeLocalKernel = icl_create_kernel(device, "bitonic_sorting_kernel.cl", "bitonicMergeLocal", kernelParam, ICL_SOURCE);
178
179 memsetUintKernel = icl_create_kernel(device, "tensor_from_point_kernel.cl", "memset_uint", kernelParam, ICL_SOURCE);
180 memsetFloatKernel = icl_create_kernel(device, "tensor_from_point_kernel.cl", "memset_float", kernelParam, ICL_SOURCE);
181
182 // particle kernels
183 hashKernel = icl_create_kernel(device, "tensor_from_point_kernel.cl", "calcHash", kernelParam, ICL_SOURCE);
184 startEndKernel = icl_create_kernel(device, "tensor_from_point_kernel.cl", "startEndReorder", kernelParam, ICL_SOURCE);
185 tensorKernel = icl_create_kernel(device, "tensor_from_point_kernel.cl", "tensorAnalysis", kernelParam, ICL_SOURCE);
186
187 /*
188 // particle kernels - double version
189 hashKernel = icl_create_kernel(device, "tensor_from_point_kernel_double.cl", "calcHash", "", ICL_SOURCE);
190 startEndKernel = icl_create_kernel(device, "tensor_from_point_kernel_double.cl", "startEndReorder", "", ICL_SOURCE);
191 tensorKernel = icl_create_kernel(device, "tensor_from_point_kernel_double.cl", "tensorAnalysis", "", ICL_SOURCE);
192 */
193
194 kernelAlloc = true;
195 cout << "CL: kernels initialization done" << endl;
196 cout << "---------------------------------------------------------------------\n" << endl;
197 }
198
199 void finalizeDeviceKernels() {
200 cout << "\n---------------------------------------------------------------------" << endl;
201 cout << "CL: releasing kernels for device " << device->name << endl;
202 assert(kernelAlloc);
203
204 // kernel deallocation
205 icl_release_kernel(sortLocalKernel);
206 icl_release_kernel(sortLocal1Kernel);
207 icl_release_kernel(sortMergeGlobalKernel);
208 icl_release_kernel(sortMergeLocalKernel);
209
210 icl_release_kernel(memsetUintKernel);
211 icl_release_kernel(memsetFloatKernel);
212
213 icl_release_kernel(hashKernel);
214 icl_release_kernel(startEndKernel);
215 icl_release_kernel(tensorKernel);
216
217 /* XXX
218 icl_release_kernel(hashKernelDouble);
219 icl_release_kernel(startEndKernelDouble);
220 icl_release_kernel(tensorKernelDouble);
221 */
222
223 printf("CL: kernels released\n");
224 kernelAlloc = false;
225 cout << "---------------------------------------------------------------------\n" << endl;
226 }
227
228 void initDeviceBuffers(size_t _size, size_t cellNum) {
229 // FIXME: Remove unused _size
230 cout << "\n---------------------------------------------------------------------" << endl;
231 cout << "CL create buffers for device " << device->name << endl;
232
233 assert(!bufferAlloc);
234 cout << "particleNum "<< particleNum << ", pow2ParticleNum " << pow2ParticleNum << ", cellNum " << cellNum << endl;
235
236 posBuffer = icl_create_buffer(device, CL_MEM_READ_ONLY, sizeof(float) * 4 * particleNum); // in
237 pSortedBuffer = icl_create_buffer(device, CL_MEM_READ_WRITE, sizeof(float) * 4 * particleNum);
238 paramBuffer = icl_create_buffer(device, CL_MEM_READ_ONLY, sizeof(param_t) ); // in
239 // buffer of tensor metrics (6 elements - half matrix)
240 tensorBuffer = icl_create_buffer(device, CL_MEM_WRITE_ONLY, sizeof(float) * 6 * particleNum); // out
241 // hash and index buffer buffers are involved in a bitonic sorting
242 hashBuffer = icl_create_buffer(device, CL_MEM_READ_WRITE, sizeof(uint) * pow2ParticleNum);
243 indexBuffer = icl_create_buffer(device, CL_MEM_READ_WRITE, sizeof(uint) * pow2ParticleNum);
244 startBuffer = icl_create_buffer(device, CL_MEM_READ_WRITE, sizeof(uint) * cellNum );
245 endBuffer = icl_create_buffer(device, CL_MEM_READ_WRITE, sizeof(uint) * cellNum );
246 allocatedBufferSize = particleNum;
247 bufferAlloc = true;
248 cout << "---------------------------------------------------------------------\n" << endl;
249 }
250
251 void finalizeDeviceBuffers()
252 {
253 cout << "\n---------------------------------------------------------------------" << endl;
254 cout << endl << "CL: releasing buffers for device " << posBuffer->dev->name << endl;
255 assert(bufferAlloc);
256
257 icl_release_buffer(posBuffer);
258 icl_release_buffer(pSortedBuffer);
259 icl_release_buffer(paramBuffer);
260 icl_release_buffer(tensorBuffer);
261 icl_release_buffer(hashBuffer);
262 icl_release_buffer(indexBuffer);
263 icl_release_buffer(startBuffer);
264 icl_release_buffer(endBuffer);
265
266 allocatedBufferSize = -1;
267 bufferAlloc = false;
268 cout << "---------------------------------------------------------------------\n" << endl;
269 }
270
271
272
273private:
274 /* This method update the param data structure according tot he BBox and radius */
275 void calculateCellSize(RefPtr<BoundingBox> bb, float _radius) {
276 const float epsilon = 0.0005;
277 // the minimum is moved a bit back to avoid overlapping
278 param.min[0] = bb->mincoord()[0] - epsilon;
279 param.min[1] = bb->mincoord()[1] - epsilon;
280 param.min[2] = bb->mincoord()[2] - epsilon;
281 Eagle::PhysicalSpace::tvector diag = bb->diagonal();
282 cout << "BoundingBox (";
283 cout << "min: " << param.min[0] << "," << param.min[1] << "," << param.min[2] ;
284 cout << ", max: " << bb->maxcoord()[0] << "," << bb->maxcoord()[1] << "," << bb->maxcoord()[2] << ")" << endl;
285
286 // we assume that the cell size is the same size of the particle (double its radius)
287 // this means that each particle can cover only a limited number of grid cells (8 in 3 dimensions)
288 float newMax[3];
290 float f_cellNum[3];
291 for(int i=0; i<3; i++) {
292 if(diag[i] <= _radius) diag[i] = 2.f * _radius;
293 f_cellNum[i] = diag[i] / (2.f*_radius);
294 axis_cellNum[i] = (uint) ceil(f_cellNum[i]);
295 // cout << "dim" << i << " float_cellNum:" << f_cellNum <<" cellNum: " << axis_cellNum << endl;
296 // round up the grid size to a power of two - without any changes to the the cellSize
297 // (just more not used cells that, because of the hashing schema, they're not really a overhead)
298 param.gridSize[i] = closestPow2_less(axis_cellNum[i]);
300 param.cellSize[i] = 2.f * _radius;
301 cout << "gridSize*: " << param.gridSize[i] << " " << endl;
302 // new bounding box (for debug)
303 newMax[i] = param.min[i] + param.cellSize[i] * param.gridSize[i];
304 }
305
306 cout << "cell num "
307 << "(" << f_cellNum[0] << "," << f_cellNum[1] << "," << f_cellNum[2] << ") => "
308 << "(" << axis_cellNum[0] << "," << axis_cellNum[1] << "," << axis_cellNum[2] << ") => "
309 << "(" << param.gridSize[0] << "," << param.gridSize[1] << "," << param.gridSize[2] << ") " << endl;
310
311
312 // calculating the number of cell
313 cellNum = param.gridSize[0] * param.gridSize[1] * param.gridSize[2];
314
315 // cell num with size multiple of local work size
316 if(cellNum % tensorlocalWorkSize) mulCellNum = (cellNum / tensorlocalWorkSize + 1) * (tensorlocalWorkSize);
317 else mulCellNum = cellNum;
318 cout << "Extended BoundingBox (min: " << param.min[0] << "," << param.min[1] << "," << param.min[2] << ", max: " << newMax[0] << "," << newMax[1] << "," << newMax[2] << ")" << endl;
319 }
320
321public:
322 friend ostream& operator<<(std::ostream& out, const TensorDeviceState& t)
323 {
324 out << "CL kernel & buffers status:" << endl;
325 out << "relaxed math: " << (t.relaxedMath?"on":"off");
326 out << ", fp type: " << (t.fpType?"float":"double");
327 out << ", device id: " << t.deviceId;
328 out << ", buffer size: " << t.allocatedBufferSize << endl;
329 return out;
330 }
331
332}; // end state struct
333
334
335void TensorDeviceState::setupDeviceContext(){
336 // first device selection
337 if(device == NULL) cout << "First assignment device to the deviceState" << endl;
338
339 // if buffers were allocated, we deallocate them
340 if(bufferAlloc) finalizeDeviceBuffers();
341
342 // idem for kernels
343 if(kernelAlloc) finalizeDeviceKernels();
344
345 // setting up the right kernel
346 device = icl_get_device(deviceId);
347}
348
349/*
350Setup (optionally create) new kernels for the current context device.
351We suppose that the device is correct (via setupDeviceContext).
352*/
353void TensorDeviceState::setupKernel(bool _relaxedMath, bool _useFloat){
354 cout << "---------------------------------------------------------------------" << endl;
355 cout << "Starting setup of kernels for device " << device->name << endl;
356 assert(deviceId >= 0);
357
358 if(kernelAlloc)
359 finalizeDeviceKernels();
360 relaxedMath = _relaxedMath;
361 fpType = _useFloat;
362 initDeviceKernels();
363}
364
/*
Setup (optionally create) new buffers for the current context device.
We suppose that device is correct (via setupDeviceContext)
*/
// NOTE(review): two lines of this listing were lost right after the assert --
// presumably the call that recomputes the sizes from _particleNum/_bb/_radius
// (cf. setSize); as shown, the parameters are otherwise unused. Confirm
// against the original source.
void TensorDeviceState::setupBuffer(size_t _particleNum, RefPtr<BoundingBox> _bb, float _radius) {
	cout << "---------------------------------------------------------------------" << endl;
	cout << "setup buffer for device " << device->name << endl;
	assert(deviceId >=0 );

	// (double _radius, size_t _size, size_t _cellNum)

	cout << "radius "<< _radius<<", size "<< particleNum <<", cell num"<< cellNum << endl;

	// drop previously allocated buffers before re-allocating
	if(bufferAlloc)
		finalizeDeviceBuffers();
	//radius = _radius;
	initDeviceBuffers(particleNum, cellNum);
}
384
385/* Calculate sizes of CL buffers according to the particle and cell number */
387 assert(_radius != 0.0);
388 assert(_particleNum != 0);
389
390 // Calculate particle-related sizes
391
392 // mulParticleNum is the number of particle rounded to the next multiple of localWokSize
393 if(_particleNum % tensorlocalWorkSize) mulParticleNum = (_particleNum / tensorlocalWorkSize + 1) * (tensorlocalWorkSize);
394 else mulParticleNum = _particleNum;
395
396 // pow2ParticleNum is the number of particle rounded to the next power fo two; this is used for the bitonic sorting algorithm
397 pow2ParticleNum = closestPow2_less(mulParticleNum);
398
399 particleNum = _particleNum;
400
401
402 // Calculate cell-related sizes (updates the param data structure)
403 calculateCellSize(_bb,_radius);
404
405 // setup 4th value
406 param.gridSize[3] = cellNum;
407 param.min[3] = 1;
408 param.cellSize[3] = 1;
409
410 // calculate params on the axis aligned bounding box
411 cout << " particleNum " << particleNum << ", multiple " << mulParticleNum << ", pow2 " << pow2ParticleNum << endl;
412 uint globalWorkSize = mulParticleNum; // this to support non multiple of localWorkSize particle num
413 cout << " cellNum " << cellNum << "(mul "<< mulCellNum << ")"<< endl;
414 cout << " CL threads: global " << globalWorkSize << ", tensor local " << tensorlocalWorkSize <<", sorting local " << LOCAL_SIZE_LIMIT << endl;
415 cout << "struct param" << endl;
416 cout << " * gridSize" << param.gridSize << endl;
417 cout << " * min " << param.min << endl;
418 cout << " * cellSize " << param.cellSize << endl;
419}
420
421
/* Perform buffer sorting with bitonic sorting (keys with attached values). */
// NOTE(review): several lines of this listing were lost: the remainder of the
// parameter list (source key/value buffers plus 'batch', 'arrayLength' and
// 'dir' -- see the call site in the compute step), the declarations of
// 'globalWorkSize'/'localWorkSize', and the statements that set
// 'globalWorkSize' before each kernel launch. Restore from the original file.
void TensorDeviceState::sorting(icl_buffer *d_DstKey, icl_buffer *d_DstVal,
{
	// this sorting implementation supports only power-of-two array lengths

	// sorting direction used in the bitonic sorting, 0 is descending, 1 ascending
	dir = (dir != 0); // direction inversion


	// Note(Biagio): there are two versions of bitonic sorting
	// a. one for data fitting the local memory, using a single kernel
	// b. one for data bigger than the local memory, using three kernel
	// More infos are available on the Sorting Networks white paper from NVidia.

	// a) local memory-only version
	if(arrayLength <= LOCAL_SIZE_LIMIT)
	{
		//assert((batch * arrayLength) % LOCAL_SIZE_LIMIT == 0);

		localWorkSize = 32; //LOCAL_SIZE_LIMIT / 2; // IVAN

		//printf("\n => global %u, local %u \n", globalSortSize, localSortSize);
		icl_run_kernel(sortLocalKernel, 1, &globalWorkSize, &localWorkSize, NULL, NULL, 6, // bitonic sort kernel works on half the size
			(size_t)0, static_cast<void*>(d_DstKey),
			(size_t)0, static_cast<void*>(d_DstVal),
			(size_t)0, static_cast<void*>(d_SrcKey),
			(size_t)0, static_cast<void*>(d_SrcVal),
			sizeof(uint), static_cast<void*>(&arrayLength),
			sizeof(uint), static_cast<void*>(&dir)
		);
	}

	// b) global to local memory version
	else {
		// launch bitonicSortLocal1: sorts LOCAL_SIZE_LIMIT-sized chunks in local memory
		localWorkSize = LOCAL_SIZE_LIMIT / 2;
		icl_run_kernel(sortLocal1Kernel, 1, &globalWorkSize, &localWorkSize, NULL, NULL, 4,
			(size_t)0, static_cast<void*>(d_DstKey),
			(size_t)0, static_cast<void*>(d_DstVal),
			(size_t)0, static_cast<void*>(d_SrcKey),
			(size_t)0, static_cast<void*>(d_SrcVal)
		);

		// outer loop: length of the sorted subsequences being merged, doubling each pass
		for(uint size = 2 * LOCAL_SIZE_LIMIT; size <= arrayLength; size <<= 1)
		{
			// inner loop: comparator stride, halved every pass
			for(unsigned stride = size / 2; stride > 0; stride >>= 1)
			{
				if(stride >= LOCAL_SIZE_LIMIT)
				{
					localWorkSize = LOCAL_SIZE_LIMIT / 4;

					// launch bitonicMergeGlobal (strides too large for local memory)
					icl_run_kernel(sortMergeGlobalKernel, 1, &globalWorkSize, &localWorkSize, NULL, NULL, 8,
						(size_t)0, static_cast<void*>(d_DstKey),
						(size_t)0, static_cast<void*>(d_DstVal),
						(size_t)0, static_cast<void*>(d_DstKey),
						(size_t)0, static_cast<void*>(d_DstVal),
						sizeof(uint), static_cast<void*>(&arrayLength),
						sizeof(uint), static_cast<void*>(&size),
						sizeof(uint), static_cast<void*>(&stride),
						sizeof(uint), static_cast<void*>(&dir)
					);
				}
				else
				{
					localWorkSize = LOCAL_SIZE_LIMIT / 2;

					// launch bitonicMergeLocal -- handles all remaining (small)
					// strides in one launch, hence the 'break' below.
					// note: the scalar argument order here is arrayLength, stride,
					// size -- different from the global merge above.
					icl_run_kernel(sortMergeLocalKernel, 1, &globalWorkSize, &localWorkSize, NULL, NULL, 8,
						(size_t)0, static_cast<void*>(d_DstKey),
						(size_t)0, static_cast<void*>(d_DstVal),
						(size_t)0, static_cast<void*>(d_DstKey),
						(size_t)0, static_cast<void*>(d_DstVal),
						sizeof(uint), static_cast<void*>(&arrayLength),
						sizeof(uint), static_cast<void*>(&stride),
						sizeof(uint), static_cast<void*>(&size),
						sizeof(uint), static_cast<void*>(&dir)
					);
					break;
				} // stride else
			} // inner for
		} // outer for
	} // else - bitonic sorting
} // end sorting()
515
516
/*
Tensor computation core algorithm:
1. initialize hash values to a max unsigned
2. calculate the hash for each particle
3. particle sorting by hash
4. clear cell memory
5. for each particle, set the cell start and end, and reorder particle positions'
6. tensor calculation
*/
// NOTE(review): several lines of this listing were lost: the signature line
// (the in-class declaration is 'void compute()'), the declaration of
// 'begin_event' used in step 1, the 'icl_run_kernel(tensorKernel, ...' line
// that opens step 6, and the statements after the final clFinish that
// accumulated 'kernelTime' before it is printed. Restore from the original.
{
	// kernel timing


	// launch sizes: one work-item per particle, rounded up to a multiple of
	// the local work size (pow2-sized range for the sorter's memset)
	const size_t globalWorkSize = mulParticleNum; //particleNum;
	const size_t localWorkSize = tensorlocalWorkSize;
	const size_t pow2part = pow2ParticleNum;

	// sizes must have been set (setSize) before computing
	assert(particleNum);
	assert(cellNum);
	cout << "DEBUG (ls " <<localWorkSize << ", gs "<< globalWorkSize <<", cell "<< cellNum << ")" << endl;
	cout << *this;


	// cout << endl << "CL debug: --- posBuffer (" << endl;
	// icl_out_float_buffer(posBuffer, 128*3);
	// icl_out_float_buffer(posBuffer, particleNum);
	// cout << endl << "CL debug: --- posBuffer )" << endl;


	// wait for the non-blocking uploads issued by writeFragment()
	clFinish(device->queue);

	// for debug

	cout << endl << "CL debug: hash calculation" << endl;

	// 1. initialize hash values to a max unsigned (also the one not used now, because of the sorting step)
	// memset to 0xFFFFFFFFU for cell starts
	// NOTE(review): unlike the step-4 launch below, this call passes
	// &mulCellNum and &pow2ParticleNum but never 'memsetValue' -- verify the
	// argument list against the memset_uint kernel signature in the .cl file.
	uint memsetValue = 0xFFFFFFFFU;
	icl_run_kernel(memsetUintKernel, 1, &pow2part, NULL, NULL, begin_event, 3,
		(size_t)0, static_cast<void*>(hashBuffer),
		sizeof(uint), static_cast<void*>(&mulCellNum), //cellNum,
		sizeof(uint), static_cast<void*>(&pow2ParticleNum)
	);

	// 2. calculating a hash value for each particle
	icl_run_kernel(hashKernel, 1, &globalWorkSize, &localWorkSize, NULL, NULL, 5,
		(size_t)0, static_cast<void*>(posBuffer),
		(size_t)0, static_cast<void*>(hashBuffer),
		(size_t)0, static_cast<void*>(indexBuffer),
		(size_t)0, static_cast<void*>(paramBuffer),
		sizeof(uint), static_cast<void*>(&particleNum)
	);

	// cout << endl << "CL debug: unsorted hashBuffer" << endl;
	// icl_out_uint_buffer(hashBuffer, particleNum);


	cout << endl << "CL debug: sorting" << endl;

	// 3. particle sorting by hash (in-place bitonic sort of hash/index pairs)
	sorting(hashBuffer, indexBuffer, hashBuffer, indexBuffer,
		1, // batch
		pow2ParticleNum, // size
		1 // 0 is descending
	);


	// cout << endl << "CL debug: indexBuffer (extended & sorted)" << endl;
	// icl_out_uint_buffer(tensorState->indexBuffer, 30);
	// cout << endl << "CL debug: hashBuffer (extended & sorted)" << endl;
	// cout << endl << "sorted hash" << endl;
	// icl_out_uint_buffer(hashBuffer, particleNum);
	// cout << endl << "sorted index" << endl;
	// icl_out_uint_buffer(indexBuffer, particleNum);

	// 4. clear cell memory
	// memset to 0xFFFFFFFFU for cell starts
	memsetValue = 0xFFFFFFFFU;
	const size_t mcn = mulCellNum;
	icl_run_kernel(memsetUintKernel, 1, &mcn, NULL, NULL, NULL, 3, // FIXME ??
		(size_t)0, static_cast<void*>(startBuffer),
		sizeof(uint), static_cast<void*>(&cellNum), // WHY WAS mulCellNum ?
		sizeof(uint), static_cast<void*>(&memsetValue)
	);


	// 5. compute starts and ends index for each cell, than reordering positions for better locality
	//icl_run_kernel(startEndKernel, 1, &cellNum, &localWorkSize, NULL, NULL, 8,
	icl_run_kernel(startEndKernel, 1, &globalWorkSize, &localWorkSize, NULL, NULL, 8,
		(size_t)0, static_cast<void*>(hashBuffer),
		(size_t)0, static_cast<void*>(indexBuffer),
		(size_t)0, static_cast<void*>(posBuffer),

		(size_t)0, static_cast<void*>(startBuffer),
		(size_t)0, static_cast<void*>(endBuffer),
		(size_t)0, static_cast<void*>(pSortedBuffer),

		(localWorkSize + 1) * sizeof(uint), NULL, // local memory
		sizeof(uint), static_cast<void*>(&particleNum)
	);


	// cout << endl << "start" << endl;
	// icl_out_uint_buffer(startBuffer, cellNum);
	// cout << endl << "end" << endl;
	// icl_out_uint_buffer(endBuffer, cellNum);

	// debug print - sorted particle
	cout << endl << "CL debug: tensor kernel" << endl;

	// icl_out_float_buffer(tensorState->pSortedBuffer, 3 * particleNum);
	// cout << endl << "t hash" << endl;
	// icl_out_uint_buffer(hashBuffer, particleNum);
	// cout << endl << "t start" << endl;
	// icl_out_uint_buffer(startBuffer, cellNum);
	// cout << endl << "t end" << endl;
	// icl_out_uint_buffer(endBuffer, cellNum);

	// 6. tensor computation
	// NOTE(review): the opening 'icl_run_kernel(tensorKernel, ...' line was
	// lost from this listing; only the argument list below survives.
		(size_t)0, static_cast<void*>(pSortedBuffer), // posBuffer
		(size_t)0, static_cast<void*>(startBuffer),
		(size_t)0, static_cast<void*>(endBuffer),
		(size_t)0, static_cast<void*>(indexBuffer),
		(size_t)0, static_cast<void*>(paramBuffer),
		sizeof(uint), static_cast<void*>(&particleNum),
		(size_t)0, static_cast<void*>(tensorBuffer)
	);

	clFinish(device->queue);
	cout << "YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY TEST" << endl;


	cout << "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXx CL: all kernels launched - kernel time: " << kernelTime << endl;
}
658
659
// Upload the particle positions and the parameter block to the device.
// 'position' must hold particleNum float4 entries (buffer written is
// sizeof(float)*4*particleNum bytes).
void TensorDeviceState::writeFragment(float *position)
{
	// start of the host-side (overall) timing window
	overallTime = time.secs();
	icl_write_buffer(posBuffer, CL_FALSE, sizeof(float) * 4 * particleNum, position, NULL, NULL);
	icl_write_buffer(paramBuffer, CL_FALSE, sizeof(param_t), &param, NULL, NULL);
	// both writes were enqueued non-blocking (CL_FALSE): wait for completion
	clFinish(device->queue); // IVAN

	/*
	cout << "param " << endl
		<< "gridsize " << param.gridSize << endl
		<< "min " << param.min << endl
		<< "cellsize " << param.cellSize << endl;

	printf("--- write fragment %d (\n", particleNum);
	for(unsigned i=0; i< particleNum; i++){
		const int b = i*4;
		float x = position[b], y = position[b+1], z = position[b+2];
		int gridPos_x, gridPos_y, gridPos_z;
		gridPos_x = (int)floor((x - param.min[0]) / param.cellSize[0]);
		gridPos_y = (int)floor((y - param.min[1]) / param.cellSize[1]);
		gridPos_z = (int)floor((z - param.min[2]) / param.cellSize[2]);
		gridPos_x = gridPos_x & (param.gridSize[0]-1);
		gridPos_y = gridPos_y & (param.gridSize[1]-1);
		gridPos_z = gridPos_z & (param.gridSize[2]-1);
		unsigned hash = (gridPos_z * param.gridSize[1] + gridPos_y) * param.gridSize[0] + gridPos_x;

		printf("%d ", hash);
		//printf("%d ", gridPos_x);
		//printf("%.3f ", x);
		//printf("%.3f %.3f %.3f, ", x, y, z);
		//printf("(%f.5 %f.5 %f.5, h %d)", x, y, z, hash);
	}
	printf("--- write fragment )\n");
	*/
}
695
// Download the computed tensors into 'tensor': 6 floats per particle (the
// half-matrix layout noted at the tensorBuffer allocation), blocking read.
void TensorDeviceState::readFragment(float *tensor)
{
	// start of the read-back timing window
	time.restart();
	icl_read_buffer(tensorBuffer, CL_TRUE, sizeof(float) * 6 * particleNum, tensor, NULL, NULL);
	clFinish(device->queue);

	cout << "CL: print TENSOR" << endl;
	// debug dump of the first elements only
	icl_out_float_buffer(tensorBuffer, 120);
	//icl_out_float_buffer(tensorBuffer, 6 * particleNum);
}
706
707
709
713struct TensorDevicePool : public std::vector<TensorDeviceState*>
714{
715private:
716 bool relaxedMath, useFloat;
717
718public:
719 RefPtr<Field> positions;
720 RefPtr<Field> tensorField;
721 double radius;
722
723 StringSelection devices, policies;
724
726 : relaxedMath(false), useFloat(true)
727 {
728 cout << "*** TensorDevicePool ***" << endl;
729 initialize();
730 }
731
733 cout << "*** ~TensorDevicePool ***" << endl;
734 dealloc();
735 }
736
737 /* compile kernel for each device */
738 void setupKernel(bool m, bool f){
739 // is recopmilation required?
740 if(relaxedMath != m || useFloat != f){
741 for(int i=0; i<size(); i++)
742 (*this)[i]->setupKernel(m,f); // call the setupKernel for the actual device
743 }
744 // update vars
745 relaxedMath = m;
746 useFloat = f;
747 }
748
	/* OpenCL device pool initialization. */
	// NOTE(review): several lines were lost from this listing: the statements
	// obtaining 'deviceNum' (device discovery), the per-iteration declarations
	// of 'st' (a string stream) and 'dev' (the i-th icl device), the creation
	// of the TensorDeviceState 't' pushed below, and the declaration of the
	// iterator 'it'. Restore from the original source.
	void initialize()
	{
		cout << "OpenCL device state pool initialization" << endl;

		// create device list
		//dealloc();
		this->clear();
		for(int i=0; i<deviceNum; i++){
			st << i; // important! we add an ID in order to distinguish multiple identical devices (e.g. 2 identical GPUs)
			st << ". " << dev->name;
			st << "/" << dev->vendor;
			devices.push_back(st.str());
			cout << st.str() << endl;

			push_back(t);
		}

		// create scheduling policy list: one fixed entry per device ...
		policies.clear();
		for(it = devices.begin(); it != devices.end(); it++) policies.push_back(*it);
		// ... plus the round-robin policy
		policies.push_back("RoundRobin");

		// first time kernel compilation
		for(int i=0; i<size(); i++)
			(*this)[i]->setupKernel(relaxedMath, useFloat);

		cout << "Done" << endl;
	}
784
785 void dealloc(){
786 for(int i=0; i<size(); i++)
787 delete (*this)[i];
788 clear();
789 }
790
private:
	/* Device list */
	// Non-owning pointer to the pool's device-name list.
	StringSelection* getDeviceList() {
		return &devices;
	}

public:
	/* Supported scheduling policies */
	// Non-owning pointer to the policy-name list (filled by initialize():
	// the device names plus "RoundRobin").
	StringSelection* getSchedulingPolicyList() {
		return &policies;
	}
802
803 friend ostream& operator<<(std::ostream& out, const TensorDevicePool& t)
804 {
805 out << "TensorDevicePool (radius: " << t.radius << ") devices:";
806 for(unsigned i=0; i<t.size(); i++)
807 cout << i << ")"<< endl << * (t[i]) << endl;
808 return out;
809 }
810
811};
812
813
815
public:
	// Strategy interface: pick the device state to use for the next computation.
	virtual TensorDeviceState& next(TensorDevicePool &pool) = 0;
	// virtual dtor: instances are deleted through the base-class pointer
	virtual ~SchedulingPolicy(){}
};
824
private:
	int deviceId; // index of the single device this policy always selects

public:
	// NOTE(review): two lines were lost from this listing here -- presumably
	// the constructor and the 'next(TensorDevicePool&)' signature whose body
	// follows. Restore from the original source.

		return *(pool[deviceId]);
	}
};
836
838private:
839 int deviceId;
840public:
841 RoundRobinSchedulingPolicy() : deviceId(0) {}
842
844 deviceId = deviceId++ % pool.size();
845 return *(pool[deviceId]);
846 }
847};
848
849
constexpr __enable_if_is_duration< _ToDur > ceil(const duration< _Rep, _Period > &__d)
complex< _Tp > log(const complex< _Tp > &)
basic_stringstream< char > stringstream
basic_ostream< char > ostream
valarray< size_t > stride() const
valarray< size_t > size() const
basic_ostream< _CharT, _Traits > & endl(basic_ostream< _CharT, _Traits > &__os)
ostream cout
constexpr void push_back(const value_type &__x)
constexpr void clear() noexcept
constexpr size_type size() const noexcept
An iterator with an optional DataCreator, which is just a class to intercept creation of data along a...
Definition CreativeIterator.hpp:34
double secs() const noexcept
Definition TensorFromPointCloud_CL_state.hpp:837
Scheduling policy interface.
Definition TensorFromPointCloud_CL_state.hpp:819
Definition TensorFromPointCloud_CL_state.hpp:825
Given a fragmented field of curvilinear coordinates, (3D array of coordinates), build a uniform Grid ...
Definition FAQ.dox:2
note: cannot derive from FloatingSkeletonRenderer as long as independent base class TriangleRenderer ...
STL namespace.
Pool of current OpenCL devices.
Definition TensorFromPointCloud_CL_state.hpp:714
Definition TensorFromPointCloud_CL_state.hpp:74
void compute()
Definition TensorFromPointCloud_CL_state.hpp:526
void setSize(size_t _particleNum, RefPtr< BoundingBox > bb, float radius)
icl_kernel *hashKernelDouble; icl_kernel *startEndKernelDouble; icl_kernel *tensorKernelDouble;
Definition TensorFromPointCloud_CL_state.hpp:386
Definition TensorFromPointCloud_CL_state.hpp:33