FiberVISH 0.2
Fish - The Fiber Bundle API for the Vish Visualization Shell
TensorFromPointCloud_CL_state.hpp
1#pragma once
2
3#include <memcore/stringlist.hpp>
4#include <memcore/RefPtr.hpp>
5
6#include <ocean/plankton/VObjectState.hpp>
7#include <eagle/FixedArray.hpp>
8#include <bone/FishField.hpp>
9
// local work size used in sorting
// warning: this value is shared with the bitonic sorting source. If you change this, also change it in the .cl file
#define LOCAL_SIZE_LIMIT 512U

// Local work size used in the tensor kernel.
// FIX: declared 'static' -- this is a header (#pragma once), and a non-static
// namespace-scope variable defined here is a multiple-definition / ODR
// violation as soon as the header is included from more than one TU.
// XXX this value could be defined into the GUI (64,128,256,512,1024) - it affects performance!
static size_t tensorlocalWorkSize = 128;
16
17
18// an Insieme-compatible C interface for OpenCL
19#include "lib_icl.h"
20#include "lib_iglcl.h"
21
22using namespace Wizt;
23using namespace std;
24
25typedef cl_uint uint;
30using namespace Fiber;
31
// struct used to send parameters to the device kernel
// (copied verbatim into paramBuffer -- the layout must match the
// corresponding struct in the .cl kernel source)
typedef struct param_t {
	uint4 gridSize; // number of cell per dimension (x,y,z), total number of cell (w)
	float4 min;     // minimum corner of the (epsilon-padded) bounding box; w set to 1 by setSize()
	float4 cellSize; // per-axis cell edge length (2*radius); w set to 1 by setSize()
} param_t;
38
39
40// #define VERBOSE
41const int DebugNum = 128; // number of element per buffer prints to be printed
42
43
// note(Biagio): we need this method because the code only supports
// power-of-two grid dimensions, but to match the radius we prefer equal or
// bigger sizes, i.e. having an equal or smaller number of cells per
// dimension with respect to the ideal one.
// Returns the closest power of two >= x (rounds UP; the previous comment
// said "<=", which contradicts both the code and the "round up" call site).
static unsigned int closestPow2_less(unsigned int x){
	// floor(log2(x)) via repeated shifts, then bump the exponent by one
	// when x was not already an exact power of two -> next power of two >= x
	const int original = static_cast<int>(x);
	int exponent = 0;
	while (x >>= 1)
		++exponent;
	if ((1 << exponent) != original)
		++exponent;
	return 1 << exponent;
}
56
/// Strip every factor of two from L and return the remaining odd factor.
/// Consequently the result is 1 exactly when L is a power of two, and 0
/// when L == 0 (sentinel for invalid input).
/// FIX: removed the unused local 'log2L' (dead variable, compiler warning)
/// and added 'static' -- a non-static function defined in this header would
/// cause multiple-definition link errors (cf. closestPow2_less above).
static unsigned int factorRadix2(unsigned int L){
	if(!L) return 0;
	while((L & 1) == 0)
		L >>= 1; // drop one trailing zero bit
	return L;
}
66
67
69
70/*
71State object used for collecting data to be passed to the task and iterator objects.
72*/
74{
75private:
76 // the following values avoid multiple device, kernel and buffer creations
77 bool relaxedMath; // true if kernel should be compiled with a relaxed math flag
78 bool fpType; // true if kernels use float instead of double
79 bool kernelAlloc, bufferAlloc; // true id currently allocated, false otherwise
80
81 // allocated buffer size (= number of particle)
82 uint allocatedBufferSize;
83
84 // sizes used in kernel invocation
85 uint particleNum, mulParticleNum, pow2ParticleNum;
86
87 // number of cell
88 uint cellNum, mulCellNum;
89
90 // struct used to send parameters to the device
91 param_t param;
92
93 // OpenCL device
94 icl_device* device;
95 uint deviceId;
96
97 // OpenCL buffers
98 icl_buffer* posBuffer;
99 icl_buffer* tensorBuffer;
100 icl_buffer* paramBuffer;
101 icl_buffer* hashBuffer;
102 icl_buffer* indexBuffer;
103 icl_buffer* startBuffer;
104 icl_buffer* endBuffer;
105 icl_buffer* pSortedBuffer;
106
107 // OpenCL kernels
108 icl_kernel* sortLocalKernel;
109 icl_kernel* sortLocal1Kernel;
110 icl_kernel* sortMergeGlobalKernel;
111 icl_kernel* sortMergeLocalKernel;
112 icl_kernel* hashKernel;
113 icl_kernel* memsetUintKernel;
114 icl_kernel* memsetFloatKernel;
115 icl_kernel* startEndKernel;
116 icl_kernel* tensorKernel;
120
121public:
122
123 void setSize(size_t _particleNum, RefPtr<BoundingBox> bb, float radius);
124 void setupDeviceContext();
126
127
128 void setupKernel(bool _relaxedMath, bool _useFloat);
129 void setupBuffer(size_t _particleNum, RefPtr<BoundingBox> bb, float _radius);
130
131
132 void writeFragment(float *);
133 void compute();
134 void readFragment(float *);
135
136 void displayFragment(){} // XXX Note(Biagio): this method should *meet* a OpenGL context in order to display data without extra data moves
137
138 // timing
139 double kernelTime;
140 double overallTime;
141 Timer time;
142
143 /* Transient data structure with per-device OpenCL data */
144 TensorDeviceState(unsigned _deviceId) :
145 kernelAlloc(false), bufferAlloc(false),
146 allocatedBufferSize(0), device(NULL), deviceId(_deviceId),
147 kernelTime(0.0), overallTime(0.0)
148 { setupDeviceContext(); }
149
151 {
152 cout << "*** ~TensorDeviceState ***" << endl;
153
154 if(bufferAlloc) finalizeDeviceBuffers();
155 if(kernelAlloc) finalizeDeviceKernels();
157 }
158
159
160
161private:
162 void initDeviceKernels() {
163 assert(!kernelAlloc);
164 cout << "\n---------------------------------------------------------------------" << endl;
165 cout << "CL create kernels for device " << device->name << endl;
166
167 char kernelParam[128] = "";
168 if(relaxedMath) sprintf(kernelParam, "-cl-fast-relaxed-math is ON");
169
170 //icl_print_device_infos(device);
172
173 // sorting kernel
174 sortLocalKernel = icl_create_kernel(device, "bitonic_sorting_kernel.cl", "bitonicSortLocal", kernelParam, ICL_SOURCE);
175 sortLocal1Kernel = icl_create_kernel(device, "bitonic_sorting_kernel.cl", "bitonicSortLocal1", kernelParam, ICL_SOURCE);
176 sortMergeGlobalKernel = icl_create_kernel(device, "bitonic_sorting_kernel.cl", "bitonicMergeGlobal", kernelParam, ICL_SOURCE);
177 sortMergeLocalKernel = icl_create_kernel(device, "bitonic_sorting_kernel.cl", "bitonicMergeLocal", kernelParam, ICL_SOURCE);
178
179 memsetUintKernel = icl_create_kernel(device, "tensor_from_point_kernel.cl", "memset_uint", kernelParam, ICL_SOURCE);
180 memsetFloatKernel = icl_create_kernel(device, "tensor_from_point_kernel.cl", "memset_float", kernelParam, ICL_SOURCE);
181
182 // particle kernels
183 hashKernel = icl_create_kernel(device, "tensor_from_point_kernel.cl", "calcHash", kernelParam, ICL_SOURCE);
184 startEndKernel = icl_create_kernel(device, "tensor_from_point_kernel.cl", "startEndReorder", kernelParam, ICL_SOURCE);
185 tensorKernel = icl_create_kernel(device, "tensor_from_point_kernel.cl", "tensorAnalysis", kernelParam, ICL_SOURCE);
186
187 /*
188 // particle kernels - double version
189 hashKernel = icl_create_kernel(device, "tensor_from_point_kernel_double.cl", "calcHash", "", ICL_SOURCE);
190 startEndKernel = icl_create_kernel(device, "tensor_from_point_kernel_double.cl", "startEndReorder", "", ICL_SOURCE);
191 tensorKernel = icl_create_kernel(device, "tensor_from_point_kernel_double.cl", "tensorAnalysis", "", ICL_SOURCE);
192 */
193
194 kernelAlloc = true;
195 cout << "CL: kernels initialization done" << endl;
196 cout << "---------------------------------------------------------------------\n" << endl;
197 }
198
199 void finalizeDeviceKernels() {
200 cout << "\n---------------------------------------------------------------------" << endl;
201 cout << "CL: releasing kernels for device " << device->name << endl;
202 assert(kernelAlloc);
203
204 // kernel deallocation
205 icl_release_kernel(sortLocalKernel);
206 icl_release_kernel(sortLocal1Kernel);
207 icl_release_kernel(sortMergeGlobalKernel);
208 icl_release_kernel(sortMergeLocalKernel);
209
210 icl_release_kernel(memsetUintKernel);
211 icl_release_kernel(memsetFloatKernel);
212
213 icl_release_kernel(hashKernel);
214 icl_release_kernel(startEndKernel);
215 icl_release_kernel(tensorKernel);
216
217 /* XXX
218 icl_release_kernel(hashKernelDouble);
219 icl_release_kernel(startEndKernelDouble);
220 icl_release_kernel(tensorKernelDouble);
221 */
222
223 printf("CL: kernels released\n");
224 kernelAlloc = false;
225 cout << "---------------------------------------------------------------------\n" << endl;
226 }
227
228 void initDeviceBuffers(size_t _size, size_t cellNum) {
229 // FIXME: Remove unused _size
230 cout << "\n---------------------------------------------------------------------" << endl;
231 cout << "CL create buffers for device " << device->name << endl;
232
233 assert(!bufferAlloc);
234 cout << "particleNum "<< particleNum << ", pow2ParticleNum " << pow2ParticleNum << ", cellNum " << cellNum << endl;
235
236 posBuffer = icl_create_buffer(device, CL_MEM_READ_ONLY, sizeof(float) * 4 * particleNum); // in
237 pSortedBuffer = icl_create_buffer(device, CL_MEM_READ_WRITE, sizeof(float) * 4 * particleNum);
238 paramBuffer = icl_create_buffer(device, CL_MEM_READ_ONLY, sizeof(param_t) ); // in
239 // buffer of tensor metrics (6 elements - half matrix)
240 tensorBuffer = icl_create_buffer(device, CL_MEM_WRITE_ONLY, sizeof(float) * 6 * particleNum); // out
241 // hash and index buffer buffers are involved in a bitonic sorting
242 hashBuffer = icl_create_buffer(device, CL_MEM_READ_WRITE, sizeof(uint) * pow2ParticleNum);
243 indexBuffer = icl_create_buffer(device, CL_MEM_READ_WRITE, sizeof(uint) * pow2ParticleNum);
244 startBuffer = icl_create_buffer(device, CL_MEM_READ_WRITE, sizeof(uint) * cellNum );
245 endBuffer = icl_create_buffer(device, CL_MEM_READ_WRITE, sizeof(uint) * cellNum );
246 allocatedBufferSize = particleNum;
247 bufferAlloc = true;
248 cout << "---------------------------------------------------------------------\n" << endl;
249 }
250
251 void finalizeDeviceBuffers()
252 {
253 cout << "\n---------------------------------------------------------------------" << endl;
254 cout << endl << "CL: releasing buffers for device " << posBuffer->dev->name << endl;
255 assert(bufferAlloc);
256
257 icl_release_buffer(posBuffer);
258 icl_release_buffer(pSortedBuffer);
259 icl_release_buffer(paramBuffer);
260 icl_release_buffer(tensorBuffer);
261 icl_release_buffer(hashBuffer);
262 icl_release_buffer(indexBuffer);
263 icl_release_buffer(startBuffer);
264 icl_release_buffer(endBuffer);
265
266 allocatedBufferSize = -1;
267 bufferAlloc = false;
268 cout << "---------------------------------------------------------------------\n" << endl;
269 }
270
271
272
273private:
274 /* This method update the param data structure according tot he BBox and radius */
275 void calculateCellSize(RefPtr<BoundingBox> bb, float _radius) {
276 const float epsilon = 0.0005;
277 // the minimum is moved a bit back to avoid overlapping
278 param.min[0] = bb->mincoord()[0] - epsilon;
279 param.min[1] = bb->mincoord()[1] - epsilon;
280 param.min[2] = bb->mincoord()[2] - epsilon;
281 Eagle::PhysicalSpace::tvector diag = bb->diagonal();
282 cout << "BoundingBox (";
283 cout << "min: " << param.min[0] << "," << param.min[1] << "," << param.min[2] ;
284 cout << ", max: " << bb->maxcoord()[0] << "," << bb->maxcoord()[1] << "," << bb->maxcoord()[2] << ")" << endl;
285
286 // we assume that the cell size is the same size of the particle (double its radius)
287 // this means that each particle can cover only a limited number of grid cells (8 in 3 dimensions)
288 float newMax[3];
290 float f_cellNum[3];
291 for(int i=0; i<3; i++) {
292 if(diag[i] <= _radius) diag[i] = 2.f * _radius;
293 f_cellNum[i] = diag[i] / (2.f*_radius);
294 axis_cellNum[i] = (uint) ceil(f_cellNum[i]);
295 // cout << "dim" << i << " float_cellNum:" << f_cellNum <<" cellNum: " << axis_cellNum << endl;
296 // round up the grid size to a power of two - without any changes to the the cellSize
297 // (just more not used cells that, because of the hashing schema, they're not really a overhead)
298 param.gridSize[i] = closestPow2_less(axis_cellNum[i]);
300 param.cellSize[i] = 2.f * _radius;
301 cout << "gridSize*: " << param.gridSize[i] << " " << endl;
302 // new bounding box (for debug)
303 newMax[i] = param.min[i] + param.cellSize[i] * param.gridSize[i];
304 }
305
306 cout << "cell num "
307 << "(" << f_cellNum[0] << "," << f_cellNum[1] << "," << f_cellNum[2] << ") => "
308 << "(" << axis_cellNum[0] << "," << axis_cellNum[1] << "," << axis_cellNum[2] << ") => "
309 << "(" << param.gridSize[0] << "," << param.gridSize[1] << "," << param.gridSize[2] << ") " << endl;
310
311
312 // calculating the number of cell
313 cellNum = param.gridSize[0] * param.gridSize[1] * param.gridSize[2];
314
315 // cell num with size multiple of local work size
316 if(cellNum % tensorlocalWorkSize) mulCellNum = (cellNum / tensorlocalWorkSize + 1) * (tensorlocalWorkSize);
317 else mulCellNum = cellNum;
318 cout << "Extended BoundingBox (min: " << param.min[0] << "," << param.min[1] << "," << param.min[2] << ", max: " << newMax[0] << "," << newMax[1] << "," << newMax[2] << ")" << endl;
319 }
320
321public:
322 friend ostream& operator<<(std::ostream& out, const TensorDeviceState& t)
323 {
324 out << "CL kernel & buffers status:" << endl;
325 out << "relaxed math: " << (t.relaxedMath?"on":"off");
326 out << ", fp type: " << (t.fpType?"float":"double");
327 out << ", device id: " << t.deviceId;
328 out << ", buffer size: " << t.allocatedBufferSize << endl;
329 return out;
330 }
331
332}; // end state struct
333
334
335void TensorDeviceState::setupDeviceContext(){
336 // first device selection
337 if(device == NULL) cout << "First assignment device to the deviceState" << endl;
338
339 // if buffers were allocated, we deallocate them
340 if(bufferAlloc) finalizeDeviceBuffers();
341
342 // idem for kernels
343 if(kernelAlloc) finalizeDeviceKernels();
344
345 // setting up the right kernel
346 device = icl_get_device(deviceId);
347}
348
349/*
350Setup (optionally create) new kernels for the current context device.
351We suppose that the device is correct (via setupDeviceContext).
352*/
353void TensorDeviceState::setupKernel(bool _relaxedMath, bool _useFloat){
354 cout << "---------------------------------------------------------------------" << endl;
355 cout << "Starting setup of kernels for device " << device->name << endl;
356 assert(deviceId >= 0);
357
358 if(kernelAlloc)
359 finalizeDeviceKernels();
360 relaxedMath = _relaxedMath;
361 fpType = _useFloat;
362 initDeviceKernels();
363}
364
/*
Setup (optionally create) new buffers for the current context device.
We suppose that device is correct (via setupDeviceContext)
*/
// NOTE(review): two lines of this listing were lost right after the assert --
// presumably the call that recomputes the sizes from _particleNum/_bb/_radius
// (cf. setSize); as shown, the parameters are otherwise unused. Confirm
// against the original source.
void TensorDeviceState::setupBuffer(size_t _particleNum, RefPtr<BoundingBox> _bb, float _radius) {
	cout << "---------------------------------------------------------------------" << endl;
	cout << "setup buffer for device " << device->name << endl;
	assert(deviceId >=0 );

	// (double _radius, size_t _size, size_t _cellNum)

	cout << "radius "<< _radius<<", size "<< particleNum <<", cell num"<< cellNum << endl;

	// drop previously allocated buffers before re-allocating
	if(bufferAlloc)
		finalizeDeviceBuffers();
	//radius = _radius;
	initDeviceBuffers(particleNum, cellNum);
}
384
385/* Calculate sizes of CL buffers according to the particle and cell number */
387 assert(_radius != 0.0);
388 assert(_particleNum != 0);
389
390 // Calculate particle-related sizes
391
392 // mulParticleNum is the number of particle rounded to the next multiple of localWokSize
393 if(_particleNum % tensorlocalWorkSize) mulParticleNum = (_particleNum / tensorlocalWorkSize + 1) * (tensorlocalWorkSize);
394 else mulParticleNum = _particleNum;
395
396 // pow2ParticleNum is the number of particle rounded to the next power fo two; this is used for the bitonic sorting algorithm
397 pow2ParticleNum = closestPow2_less(mulParticleNum);
398
399 particleNum = _particleNum;
400
401
402 // Calculate cell-related sizes (updates the param data structure)
403 calculateCellSize(_bb,_radius);
404
405 // setup 4th value
406 param.gridSize[3] = cellNum;
407 param.min[3] = 1;
408 param.cellSize[3] = 1;
409
410 // calculate params on the axis aligned bounding box
411 cout << " particleNum " << particleNum << ", multiple " << mulParticleNum << ", pow2 " << pow2ParticleNum << endl;
412 uint globalWorkSize = mulParticleNum; // this to support non multiple of localWorkSize particle num
413 cout << " cellNum " << cellNum << "(mul "<< mulCellNum << ")"<< endl;
414 cout << " CL threads: global " << globalWorkSize << ", tensor local " << tensorlocalWorkSize <<", sorting local " << LOCAL_SIZE_LIMIT << endl;
415 cout << "struct param" << endl;
416 cout << " * gridSize" << param.gridSize << endl;
417 cout << " * min " << param.min << endl;
418 cout << " * cellSize " << param.cellSize << endl;
419}
420
421
/* Perform buffer sorting with bitonic sorting (keys with attached values). */
// NOTE(review): several lines of this listing were lost: the remainder of the
// parameter list (source key/value buffers plus 'batch', 'arrayLength' and
// 'dir' -- see the call site in the compute step), the declarations of
// 'globalWorkSize'/'localWorkSize', and the statements that set
// 'globalWorkSize' before each kernel launch. Restore from the original file.
void TensorDeviceState::sorting(icl_buffer *d_DstKey, icl_buffer *d_DstVal,
{
	// this sorting implementation supports only power-of-two array lengths

	// sorting direction used in the bitonic sorting, 0 is descending, 1 ascending
	dir = (dir != 0); // direction inversion


	// Note(Biagio): there are two versions of bitonic sorting
	// a. one for data fitting the local memory, using a single kernel
	// b. one for data bigger than the local memory, using three kernel
	// More infos are available on the Sorting Networks white paper from NVidia.

	// a) local memory-only version
	if(arrayLength <= LOCAL_SIZE_LIMIT)
	{
		//assert((batch * arrayLength) % LOCAL_SIZE_LIMIT == 0);

		localWorkSize = 32; //LOCAL_SIZE_LIMIT / 2; // IVAN

		//printf("\n => global %u, local %u \n", globalSortSize, localSortSize);
		icl_run_kernel(sortLocalKernel, 1, &globalWorkSize, &localWorkSize, NULL, NULL, 6, // bitonic sort kernel works on half the size
			(size_t)0, static_cast<void*>(d_DstKey),
			(size_t)0, static_cast<void*>(d_DstVal),
			(size_t)0, static_cast<void*>(d_SrcKey),
			(size_t)0, static_cast<void*>(d_SrcVal),
			sizeof(uint), static_cast<void*>(&arrayLength),
			sizeof(uint), static_cast<void*>(&dir)
		);
	}

	// b) global to local memory version
	else {
		// launch bitonicSortLocal1: sorts LOCAL_SIZE_LIMIT-sized chunks in local memory
		localWorkSize = LOCAL_SIZE_LIMIT / 2;
		icl_run_kernel(sortLocal1Kernel, 1, &globalWorkSize, &localWorkSize, NULL, NULL, 4,
			(size_t)0, static_cast<void*>(d_DstKey),
			(size_t)0, static_cast<void*>(d_DstVal),
			(size_t)0, static_cast<void*>(d_SrcKey),
			(size_t)0, static_cast<void*>(d_SrcVal)
		);

		// outer loop: length of the sorted subsequences being merged, doubling each pass
		for(uint size = 2 * LOCAL_SIZE_LIMIT; size <= arrayLength; size <<= 1)
		{
			// inner loop: comparator stride, halved every pass
			for(unsigned stride = size / 2; stride > 0; stride >>= 1)
			{
				if(stride >= LOCAL_SIZE_LIMIT)
				{
					localWorkSize = LOCAL_SIZE_LIMIT / 4;

					// launch bitonicMergeGlobal (strides too large for local memory)
					icl_run_kernel(sortMergeGlobalKernel, 1, &globalWorkSize, &localWorkSize, NULL, NULL, 8,
						(size_t)0, static_cast<void*>(d_DstKey),
						(size_t)0, static_cast<void*>(d_DstVal),
						(size_t)0, static_cast<void*>(d_DstKey),
						(size_t)0, static_cast<void*>(d_DstVal),
						sizeof(uint), static_cast<void*>(&arrayLength),
						sizeof(uint), static_cast<void*>(&size),
						sizeof(uint), static_cast<void*>(&stride),
						sizeof(uint), static_cast<void*>(&dir)
					);
				}
				else
				{
					localWorkSize = LOCAL_SIZE_LIMIT / 2;

					// launch bitonicMergeLocal -- handles all remaining (small)
					// strides in one launch, hence the 'break' below.
					// note: the scalar argument order here is arrayLength, stride,
					// size -- different from the global merge above.
					icl_run_kernel(sortMergeLocalKernel, 1, &globalWorkSize, &localWorkSize, NULL, NULL, 8,
						(size_t)0, static_cast<void*>(d_DstKey),
						(size_t)0, static_cast<void*>(d_DstVal),
						(size_t)0, static_cast<void*>(d_DstKey),
						(size_t)0, static_cast<void*>(d_DstVal),
						sizeof(uint), static_cast<void*>(&arrayLength),
						sizeof(uint), static_cast<void*>(&stride),
						sizeof(uint), static_cast<void*>(&size),
						sizeof(uint), static_cast<void*>(&dir)
					);
					break;
				} // stride else
			} // inner for
		} // outer for
	} // else - bitonic sorting
} // end sorting()
515
516
/*
Tensor computation core algorithm:
1. initialize hash values to a max unsigned
2. calculate the hash for each particle
3. particle sorting by hash
4. clear cell memory
5. for each particle, set the cell start and end, and reorder particle positions'
6. tensor calculation
*/
// NOTE(review): several lines of this listing were lost: the signature line
// (the in-class declaration is 'void compute()'), the declaration of
// 'begin_event' used in step 1, the 'icl_run_kernel(tensorKernel, ...' line
// that opens step 6, and the statements after the final clFinish that
// accumulated 'kernelTime' before it is printed. Restore from the original.
{
	// kernel timing


	// launch sizes: one work-item per particle, rounded up to a multiple of
	// the local work size (pow2-sized range for the sorter's memset)
	const size_t globalWorkSize = mulParticleNum; //particleNum;
	const size_t localWorkSize = tensorlocalWorkSize;
	const size_t pow2part = pow2ParticleNum;

	// sizes must have been set (setSize) before computing
	assert(particleNum);
	assert(cellNum);
	cout << "DEBUG (ls " <<localWorkSize << ", gs "<< globalWorkSize <<", cell "<< cellNum << ")" << endl;
	cout << *this;


	// cout << endl << "CL debug: --- posBuffer (" << endl;
	// icl_out_float_buffer(posBuffer, 128*3);
	// icl_out_float_buffer(posBuffer, particleNum);
	// cout << endl << "CL debug: --- posBuffer )" << endl;


	// wait for the non-blocking uploads issued by writeFragment()
	clFinish(device->queue);

	// for debug

	cout << endl << "CL debug: hash calculation" << endl;

	// 1. initialize hash values to a max unsigned (also the one not used now, because of the sorting step)
	// memset to 0xFFFFFFFFU for cell starts
	// NOTE(review): unlike the step-4 launch below, this call passes
	// &mulCellNum and &pow2ParticleNum but never 'memsetValue' -- verify the
	// argument list against the memset_uint kernel signature in the .cl file.
	uint memsetValue = 0xFFFFFFFFU;
	icl_run_kernel(memsetUintKernel, 1, &pow2part, NULL, NULL, begin_event, 3,
		(size_t)0, static_cast<void*>(hashBuffer),
		sizeof(uint), static_cast<void*>(&mulCellNum), //cellNum,
		sizeof(uint), static_cast<void*>(&pow2ParticleNum)
	);

	// 2. calculating a hash value for each particle
	icl_run_kernel(hashKernel, 1, &globalWorkSize, &localWorkSize, NULL, NULL, 5,
		(size_t)0, static_cast<void*>(posBuffer),
		(size_t)0, static_cast<void*>(hashBuffer),
		(size_t)0, static_cast<void*>(indexBuffer),
		(size_t)0, static_cast<void*>(paramBuffer),
		sizeof(uint), static_cast<void*>(&particleNum)
	);

	// cout << endl << "CL debug: unsorted hashBuffer" << endl;
	// icl_out_uint_buffer(hashBuffer, particleNum);


	cout << endl << "CL debug: sorting" << endl;

	// 3. particle sorting by hash (in-place bitonic sort of hash/index pairs)
	sorting(hashBuffer, indexBuffer, hashBuffer, indexBuffer,
		1, // batch
		pow2ParticleNum, // size
		1 // 0 is descending
	);


	// cout << endl << "CL debug: indexBuffer (extended & sorted)" << endl;
	// icl_out_uint_buffer(tensorState->indexBuffer, 30);
	// cout << endl << "CL debug: hashBuffer (extended & sorted)" << endl;
	// cout << endl << "sorted hash" << endl;
	// icl_out_uint_buffer(hashBuffer, particleNum);
	// cout << endl << "sorted index" << endl;
	// icl_out_uint_buffer(indexBuffer, particleNum);

	// 4. clear cell memory
	// memset to 0xFFFFFFFFU for cell starts
	memsetValue = 0xFFFFFFFFU;
	const size_t mcn = mulCellNum;
	icl_run_kernel(memsetUintKernel, 1, &mcn, NULL, NULL, NULL, 3, // FIXME ??
		(size_t)0, static_cast<void*>(startBuffer),
		sizeof(uint), static_cast<void*>(&cellNum), // WHY WAS mulCellNum ?
		sizeof(uint), static_cast<void*>(&memsetValue)
	);


	// 5. compute starts and ends index for each cell, than reordering positions for better locality
	//icl_run_kernel(startEndKernel, 1, &cellNum, &localWorkSize, NULL, NULL, 8,
	icl_run_kernel(startEndKernel, 1, &globalWorkSize, &localWorkSize, NULL, NULL, 8,
		(size_t)0, static_cast<void*>(hashBuffer),
		(size_t)0, static_cast<void*>(indexBuffer),
		(size_t)0, static_cast<void*>(posBuffer),

		(size_t)0, static_cast<void*>(startBuffer),
		(size_t)0, static_cast<void*>(endBuffer),
		(size_t)0, static_cast<void*>(pSortedBuffer),

		(localWorkSize + 1) * sizeof(uint), NULL, // local memory
		sizeof(uint), static_cast<void*>(&particleNum)
	);


	// cout << endl << "start" << endl;
	// icl_out_uint_buffer(startBuffer, cellNum);
	// cout << endl << "end" << endl;
	// icl_out_uint_buffer(endBuffer, cellNum);

	// debug print - sorted particle
	cout << endl << "CL debug: tensor kernel" << endl;

	// icl_out_float_buffer(tensorState->pSortedBuffer, 3 * particleNum);
	// cout << endl << "t hash" << endl;
	// icl_out_uint_buffer(hashBuffer, particleNum);
	// cout << endl << "t start" << endl;
	// icl_out_uint_buffer(startBuffer, cellNum);
	// cout << endl << "t end" << endl;
	// icl_out_uint_buffer(endBuffer, cellNum);

	// 6. tensor computation
	// NOTE(review): the opening 'icl_run_kernel(tensorKernel, ...' line was
	// lost from this listing; only the argument list below survives.
		(size_t)0, static_cast<void*>(pSortedBuffer), // posBuffer
		(size_t)0, static_cast<void*>(startBuffer),
		(size_t)0, static_cast<void*>(endBuffer),
		(size_t)0, static_cast<void*>(indexBuffer),
		(size_t)0, static_cast<void*>(paramBuffer),
		sizeof(uint), static_cast<void*>(&particleNum),
		(size_t)0, static_cast<void*>(tensorBuffer)
	);

	clFinish(device->queue);
	cout << "YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY TEST" << endl;


	cout << "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXx CL: all kernels launched - kernel time: " << kernelTime << endl;
}
658
659
// Upload the particle positions and the parameter block to the device.
// 'position' must hold particleNum float4 entries (buffer written is
// sizeof(float)*4*particleNum bytes).
void TensorDeviceState::writeFragment(float *position)
{
	// start of the host-side (overall) timing window
	overallTime = time.secs();
	icl_write_buffer(posBuffer, CL_FALSE, sizeof(float) * 4 * particleNum, position, NULL, NULL);
	icl_write_buffer(paramBuffer, CL_FALSE, sizeof(param_t), &param, NULL, NULL);
	// both writes were enqueued non-blocking (CL_FALSE): wait for completion
	clFinish(device->queue); // IVAN

	/*
	cout << "param " << endl
		<< "gridsize " << param.gridSize << endl
		<< "min " << param.min << endl
		<< "cellsize " << param.cellSize << endl;

	printf("--- write fragment %d (\n", particleNum);
	for(unsigned i=0; i< particleNum; i++){
		const int b = i*4;
		float x = position[b], y = position[b+1], z = position[b+2];
		int gridPos_x, gridPos_y, gridPos_z;
		gridPos_x = (int)floor((x - param.min[0]) / param.cellSize[0]);
		gridPos_y = (int)floor((y - param.min[1]) / param.cellSize[1]);
		gridPos_z = (int)floor((z - param.min[2]) / param.cellSize[2]);
		gridPos_x = gridPos_x & (param.gridSize[0]-1);
		gridPos_y = gridPos_y & (param.gridSize[1]-1);
		gridPos_z = gridPos_z & (param.gridSize[2]-1);
		unsigned hash = (gridPos_z * param.gridSize[1] + gridPos_y) * param.gridSize[0] + gridPos_x;

		printf("%d ", hash);
		//printf("%d ", gridPos_x);
		//printf("%.3f ", x);
		//printf("%.3f %.3f %.3f, ", x, y, z);
		//printf("(%f.5 %f.5 %f.5, h %d)", x, y, z, hash);
	}
	printf("--- write fragment )\n");
	*/
}
695
// Download the computed tensors into 'tensor': 6 floats per particle (the
// half-matrix layout noted at the tensorBuffer allocation), blocking read.
void TensorDeviceState::readFragment(float *tensor)
{
	// start of the read-back timing window
	time.restart();
	icl_read_buffer(tensorBuffer, CL_TRUE, sizeof(float) * 6 * particleNum, tensor, NULL, NULL);
	clFinish(device->queue);

	cout << "CL: print TENSOR" << endl;
	// debug dump of the first elements only
	icl_out_float_buffer(tensorBuffer, 120);
	//icl_out_float_buffer(tensorBuffer, 6 * particleNum);
}
706
707
709
713struct TensorDevicePool : public std::vector<TensorDeviceState*>
714{
715private:
716 bool relaxedMath, useFloat;
717
718public:
719 RefPtr<Field> positions;
720 RefPtr<Field> tensorField;
721 double radius;
722
723 StringSelection devices, policies;
724
726 : relaxedMath(false), useFloat(true)
727 {
728 cout << "*** TensorDevicePool ***" << endl;
729 initialize();
730 }
731
733 cout << "*** ~TensorDevicePool ***" << endl;
734 dealloc();
735 }
736
737 /* compile kernel for each device */
738 void setupKernel(bool m, bool f){
739 // is recopmilation required?
740 if(relaxedMath != m || useFloat != f){
741 for(int i=0; i<size(); i++)
742 (*this)[i]->setupKernel(m,f); // call the setupKernel for the actual device
743 }
744 // update vars
745 relaxedMath = m;
746 useFloat = f;
747 }
748
	/* OpenCL device pool initialization. */
	// NOTE(review): several lines were lost from this listing: the statements
	// obtaining 'deviceNum' (device discovery), the per-iteration declarations
	// of 'st' (a string stream) and 'dev' (the i-th icl device), the creation
	// of the TensorDeviceState 't' pushed below, and the declaration of the
	// iterator 'it'. Restore from the original source.
	void initialize()
	{
		cout << "OpenCL device state pool initialization" << endl;

		// create device list
		//dealloc();
		this->clear();
		for(int i=0; i<deviceNum; i++){
			st << i; // important! we add an ID in order to distinguish multiple identical devices (e.g. 2 identical GPUs)
			st << ". " << dev->name;
			st << "/" << dev->vendor;
			devices.push_back(st.str());
			cout << st.str() << endl;

			push_back(t);
		}

		// create scheduling policy list: one fixed entry per device ...
		policies.clear();
		for(it = devices.begin(); it != devices.end(); it++) policies.push_back(*it);
		// ... plus the round-robin policy
		policies.push_back("RoundRobin");

		// first time kernel compilation
		for(int i=0; i<size(); i++)
			(*this)[i]->setupKernel(relaxedMath, useFloat);

		cout << "Done" << endl;
	}
784
785 void dealloc(){
786 for(int i=0; i<size(); i++)
787 delete (*this)[i];
788 clear();
789 }
790
private:
	/* Device list */
	// Non-owning pointer to the pool's device-name list.
	StringSelection* getDeviceList() {
		return &devices;
	}

public:
	/* Supported scheduling policies */
	// Non-owning pointer to the policy-name list (filled by initialize():
	// the device names plus "RoundRobin").
	StringSelection* getSchedulingPolicyList() {
		return &policies;
	}
802
803 friend ostream& operator<<(std::ostream& out, const TensorDevicePool& t)
804 {
805 out << "TensorDevicePool (radius: " << t.radius << ") devices:";
806 for(unsigned i=0; i<t.size(); i++)
807 cout << i << ")"<< endl << * (t[i]) << endl;
808 return out;
809 }
810
811};
812
813
815
public:
	// Strategy interface: pick the device state to use for the next computation.
	virtual TensorDeviceState& next(TensorDevicePool &pool) = 0;
	// virtual dtor: instances are deleted through the base-class pointer
	virtual ~SchedulingPolicy(){}
};
824
private:
	int deviceId; // index of the single device this policy always selects

public:
	// NOTE(review): two lines were lost from this listing here -- presumably
	// the constructor and the 'next(TensorDevicePool&)' signature whose body
	// follows. Restore from the original source.

		return *(pool[deviceId]);
	}
};
836
838private:
839 int deviceId;
840public:
841 RoundRobinSchedulingPolicy() : deviceId(0) {}
842
844 deviceId = deviceId++ % pool.size();
845 return *(pool[deviceId]);
846 }
847};
848
849
constexpr __enable_if_is_duration< _ToDur > ceil(const duration< _Rep, _Period > &__d)
complex< _Tp > log(const complex< _Tp > &)
basic_stringstream< char > stringstream
basic_ostream< char > ostream
valarray< size_t > stride() const
valarray< size_t > size() const
basic_ostream< _CharT, _Traits > & endl(basic_ostream< _CharT, _Traits > &__os)
ostream cout
constexpr void push_back(const value_type &__x)
constexpr void clear() noexcept
constexpr size_type size() const noexcept
An iterator with an optional DataCreator, which is just a class to intercept creation of data along a...
Definition CreativeIterator.hpp:34
double secs() const noexcept
Definition TensorFromPointCloud_CL_state.hpp:837
Scheduling policy interface.
Definition TensorFromPointCloud_CL_state.hpp:819
Definition TensorFromPointCloud_CL_state.hpp:825
Given a fragmented field of curvilinear coordinates, (3D array of coordinates), build a uniform Grid ...
Definition FAQ.dox:2
note: cannot derive from FloatingSkeletonRenderer as long as independent base class TriangleRenderer ...
STL namespace.
Pool of current OpenCL devices.
Definition TensorFromPointCloud_CL_state.hpp:714
Definition TensorFromPointCloud_CL_state.hpp:74
void compute()
Definition TensorFromPointCloud_CL_state.hpp:526
void setSize(size_t _particleNum, RefPtr< BoundingBox > bb, float radius)
icl_kernel *hashKernelDouble; icl_kernel *startEndKernelDouble; icl_kernel *tensorKernelDouble;
Definition TensorFromPointCloud_CL_state.hpp:386
Definition TensorFromPointCloud_CL_state.hpp:33