Fraxinus  2023.01.05-dev+develop.0da12
An IGT application
cxVNNclAlgorithm.cpp
Go to the documentation of this file.
1 /*=========================================================================
2 This file is part of CustusX, an Image Guided Therapy Application.
3 
4 Copyright (c) SINTEF Department of Medical Technology.
5 All rights reserved.
6 
7 CustusX is released under a BSD 3-Clause license.
8 
9 See Lisence.txt (https://github.com/SINTEFMedtek/CustusX/blob/master/License.txt) for details.
10 =========================================================================*/
11 
12 #include "cxVNNclAlgorithm.h"
13 
14 #include <QFileInfo>
15 #include "cxLogger.h"
16 #include "cxTypeConversions.h"
17 #include "HelperFunctions.hpp"
18 #include <vtkImageData.h>
19 #include <recConfig.h>
20 #include "cxVolumeHelpers.h"
21 
22 namespace cx
23 {
25  mRuntime(new oul::RuntimeMeasurementsManager()),
26  mKernelMeasurementName("vnncl_execute_kernel")
27 {
28  oul::DeviceCriteria criteria;
29  criteria.setTypeCriteria(oul::DEVICE_TYPE_GPU);
30  bool enableProfilling = true;
31  mOulContex = oul::opencl()->createContextPtr(criteria, NULL, enableProfilling);
32  if(enableProfilling)
33  mRuntime->enable();
34 }
35 
37 {}
38 
39 bool VNNclAlgorithm::initCL(QString kernelFilePath, int nMaxPlanes, int nPlanes, int method, int planeMethod, int nStarts, float brightnessWeight, float newnessWeight)
40 {
41  // READ KERNEL FILE
42  report(QString("Kernel path: %1").arg(kernelFilePath));
43  std::string source = oul::readFile(kernelFilePath.toStdString());
44 
45  QFileInfo path(kernelFilePath);
46  path.absolutePath();
47 
48  // BUILD PROGRAM
49  cl::Program clprogram = this->buildCLProgram(source, path.absolutePath().toStdString(), nMaxPlanes, nPlanes, method, planeMethod, nStarts,brightnessWeight, newnessWeight);
50 
51  // CREATE KERNEL
52  mKernel = mOulContex->createKernel(clprogram, "voxel_methods");
53 
54  return true;
55 }
56 
57 cl::Program VNNclAlgorithm::buildCLProgram(std::string program_src, std::string kernelPath, int nMaxPlanes, int nPlanes, int method, int planeMethod, int nStarts, float newnessWeight, float brightnessWeight)
58 {
59  cl::Program retval;
60  cl_int err = 0;
61  VECTOR_CLASS<cl::Device> devices;
62  try
63  {
64  QString define = "-D MAX_PLANES=%1 -D N_PLANES=%2 -D METHOD=%3 -D PLANE_METHOD=%4 -D MAX_MULTISTART_STARTS=%5 -D NEWNESS_FACTOR=%6 -D BRIGHTNESS_FACTOR=%7";
65  define = define.arg(nMaxPlanes).arg(nPlanes).arg(method).arg(planeMethod).arg(nStarts).arg(newnessWeight).arg(brightnessWeight);
66 
67  int programID = mOulContex->createProgramFromString(program_src, "-I " + std::string(kernelPath) + " " + define.toStdString());
68  retval = mOulContex->getProgram(programID);
69 
70  } catch (cl::Error &error)
71  {
72  reportError("Could not build a OpenCL program. Reason: "+QString(error.what()));
73  reportError(qstring_cast(oul::getCLErrorString(error.err())));
74  }
75  return retval;
76 }
77 
78 bool VNNclAlgorithm::initializeFrameBlocks(frameBlock_t* framePointers, int numBlocks, ProcessedUSInputDataPtr inputFrames)
79 {
80  // Compute the size of each frame in bytes
81  Eigen::Array3i dims = inputFrames->getDimensions();
82  size_t frameSize = dims[0] * dims[1];
83  size_t numFrames = dims[2];
84  report(QString("Input dims: (%1, %2, %3)").arg(dims[0]).arg(dims[1]).arg(dims[2]));
85 
86  // Find out how many frames needs to be in each block
87  size_t framesPerBlock = numFrames / numBlocks;
88  report(QString("Frames: %1, Blocks: %2, Frames per block: %3").arg(numFrames).arg(numBlocks).arg(framesPerBlock));
89 
90  // Some blocks will need to contain one extra frame
91  // (numFrames and numBlocks is probably not evenly divisible)
92  size_t numBigBlocks = numFrames % numBlocks;
93 
94  // Allocate the big blocks
95  report(QString("Allocating %1 big blocks outside of OpenCL").arg(numBigBlocks));
96  for (unsigned int block = 0; block < numBigBlocks; block++)
97  {
98  framePointers[block].length = (1 + framesPerBlock) * frameSize;
99  framePointers[block].data = new unsigned char[framePointers[block].length];
100  }
101 
102  // Then the small ones
103  report(QString("Allocating %1 small blocks outside of OpenCL").arg(numBlocks - numBigBlocks));
104  for (int block = numBigBlocks; block < numBlocks; block++)
105  {
106  framePointers[block].length = (framesPerBlock) * frameSize;
107  framePointers[block].data = new unsigned char[framePointers[block].length];
108  }
109 
110  // Now fill them
111  unsigned int frame = 0;
112  for (int block = 0; block < numBlocks; block++)
113  {
114  for (unsigned int frameInThisBlock = 0; frameInThisBlock < framePointers[block].length / frameSize; frameInThisBlock++)
115  {
116  memcpy(&(framePointers[block].data[frameInThisBlock * frameSize]), inputFrames->getFrame(frame), frameSize);
117  frame++;
118  }
119  }
120  return true;
121 }
122 
123 bool VNNclAlgorithm::reconstruct(ProcessedUSInputDataPtr input, vtkImageDataPtr outputData, float radius, int nClosePlanes)
124 {
125  mMeasurementNames.clear();
126 
127  int numBlocks = 10; // FIXME? needs to be the same as the number of input bscans to the voxel_method kernel
128 
129  // Split input US into blocks
130  // Splits and copies data from the processed input in the way the kernel will processes it, which is per frameBlock
131  frameBlock_t* inputBlocks = new frameBlock_t[numBlocks];
132  size_t nPlanes_numberOfInputImages = input->getDimensions()[2];
133  this->initializeFrameBlocks(inputBlocks, numBlocks, input);
134 
135  // Allocate CL memory for each frame block
136  VECTOR_CLASS<cl::Buffer> clBlocks;
137  report("Allocating OpenCL input block buffers");
138  for (int i = 0; i < numBlocks; i++)
139  {
140  //TODO why does the context suddenly contain a "dummy" device?
141  cl::Buffer buffer = mOulContex->createBuffer(mOulContex->getContext(), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, inputBlocks[i].length, inputBlocks[i].data, "block buffer "+QString::number(i).toStdString());
142  clBlocks.push_back(buffer);
143  }
144  // Allocate output memory
145  int *outputDims = outputData->GetDimensions();
146 
147  size_t outputVolumeSize = outputDims[0] * outputDims[1] * outputDims[2] * sizeof(unsigned char);
148 
149  report(QString("Allocating CL output buffer, size %1").arg(outputVolumeSize));
150 
151  cl_ulong globalMemUse = 10 * inputBlocks[0].length + outputVolumeSize + sizeof(float) * 16 * nPlanes_numberOfInputImages + sizeof(cl_uchar) * input->getDimensions()[0] * input->getDimensions()[1];
152  if(isUsingTooMuchMemory(outputVolumeSize, inputBlocks[0].length, globalMemUse))
153  return false;
154 
155  cl::Buffer outputBuffer = mOulContex->createBuffer(mOulContex->getContext(), CL_MEM_WRITE_ONLY, outputVolumeSize, NULL, "output volume buffer");
156 
157  // Fill the plane matrices
158  float *planeMatrices = new float[16 * nPlanes_numberOfInputImages]; //4x4 (matrix) = 16
159  this->fillPlaneMatrices(planeMatrices, input);
160 
161  cl::Buffer clPlaneMatrices = mOulContex->createBuffer(mOulContex->getContext(), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, nPlanes_numberOfInputImages * sizeof(float) * 16, planeMatrices, "plane matrices buffer");
162 
163  // US Probe mask
164  cl::Buffer clMask = mOulContex->createBuffer(mOulContex->getContext(), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
165  sizeof(cl_uchar) * input->getMask()->GetDimensions()[0] * input->getMask()->GetDimensions()[1],
166  input->getMask()->GetScalarPointer(), "mask buffer");
167 
168  double *out_spacing = outputData->GetSpacing();
169  float spacings[2];
170  float f_out_spacings[3];
171  f_out_spacings[0] = out_spacing[0];
172  f_out_spacings[1] = out_spacing[1];
173  f_out_spacings[2] = out_spacing[2];
174 
175 
176  spacings[0] = input->getSpacing()[0];
177  spacings[1] = input->getSpacing()[1];
178 
179  //TODO why 4? because float4 is used??
180  size_t planes_eqs_size = sizeof(cl_float)*4*nPlanes_numberOfInputImages;
181 
182  // Find the optimal local work size
183  size_t local_work_size;
184  unsigned int deviceNumber = 0;
185  cl::Device device = mOulContex->getDevice(deviceNumber);
186  mKernel.getWorkGroupInfo(device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, &local_work_size);
187 
188  size_t close_planes_size = this->calculateSpaceNeededForClosePlanes(mKernel, device, local_work_size, nPlanes_numberOfInputImages, nClosePlanes);
189 
190  this->setKernelArguments(
191  mKernel,
192  outputDims[0],
193  outputDims[1],
194  outputDims[2],
195  f_out_spacings[0],
196  f_out_spacings[1],
197  f_out_spacings[2],
198  input->getDimensions()[0],
199  input->getDimensions()[1],
200  spacings[0],
201  spacings[1],
202  clBlocks,
203  outputBuffer,
204  clPlaneMatrices,
205  clMask,
206  planes_eqs_size,
207  close_planes_size,
208  radius);
209 
210  report(QString("Using %1 as local workgroup size").arg(local_work_size));
211 
212  // We will divide the work into cubes of CUBE_DIM^3 voxels. The global work size is the total number of voxels divided by that.
213  int cube_dim = 4;
214  int cube_dim_pow3 = cube_dim * cube_dim * cube_dim;
215  // Global work items:
216  size_t global_work_size = (((outputDims[0] + cube_dim) * (outputDims[1] + cube_dim) * (outputDims[2] + cube_dim)) / cube_dim_pow3); // = number of cubes = number of kernels to run
217 
218  // Round global_work_size up to nearest multiple of local_work_size
219  if (global_work_size % local_work_size)
220  global_work_size = ((global_work_size / local_work_size) + 1) * local_work_size; // ceil(...)
221 
222  unsigned int queueNumber = 0;
223  cl::CommandQueue queue = mOulContex->getQueue(queueNumber);
224  this->measureAndExecuteKernel(queue, mKernel, global_work_size, local_work_size, mKernelMeasurementName);
225  this->measureAndReadBuffer(queue, outputBuffer, outputVolumeSize, outputData->GetScalarPointer(), "vnncl_read_buffer");
226  setDeepModified(outputData);
227  // Cleaning up
228  report(QString("Done, freeing GPU memory"));
229  this->freeFrameBlocks(inputBlocks, numBlocks);
230  delete[] inputBlocks;
231 
232  inputBlocks = NULL;
233 
234  return true;
235 }
236 
238 {
239  std::vector<TimedPosition> vecPosition = input->getFrames();
240 
241  // Sanity check on the number of frames
242  if (input->getDimensions()[2] != vecPosition.end() - vecPosition.begin())
243  {
244  reportError(QString("Number of frames %1 != %2 dimension 2 of US input").arg(input->getDimensions()[2]).arg(vecPosition.end() - vecPosition.begin()));
245  return;
246  }
247 
248  int i = 0;
249  for (std::vector<TimedPosition>::iterator it = vecPosition.begin(); it != vecPosition.end(); ++it)
250  {
251  Transform3D pos = it->mPos;
252 
253  // Now store the result in the output
254  for (int j = 0; j < 16; j++)
255  {
256  planeMatrices[i++] = pos(j / 4, j % 4);
257  }
258  }
259 }
260 
262 {
263  if(on)
264  mRuntime->enable();
265  else
266  mRuntime->disable();
267 }
268 
270 {
271  double totalExecutionTime = -1;
272  std::set<std::string>::iterator it;
273  for(it = mMeasurementNames.begin(); it != mMeasurementNames.end(); ++it)
274  {
275  oul::RuntimeMeasurement measurement = mRuntime->getTiming(*it);
276  totalExecutionTime += measurement.getSum();
277  }
278  return totalExecutionTime;
279 }
280 
282 {
283 // double kernelExecutionTime = -1;
284  return mRuntime->getTiming(mKernelMeasurementName).getSum();
285 
286 }
287 
288 void VNNclAlgorithm::freeFrameBlocks(frameBlock_t *framePointers, int numBlocks)
289 {
290  for (int i = 0; i < numBlocks; i++)
291  {
292  delete[] framePointers[i].data;
293  }
294 }
295 
296 void VNNclAlgorithm::setKernelArguments(
297  cl::Kernel kernel,
298  int volume_xsize,
299  int volume_ysize,
300  int volume_zsize,
301  float volume_xspacing,
302  float volume_yspacing,
303  float volume_zspacing,
304  int in_xsize,
305  int in_ysize,
306  float in_xspacing,
307  float in_yspacing,
308  std::vector<cl::Buffer>& blocks,
309  cl::Buffer out_volume,
310  cl::Buffer plane_matrices,
311  cl::Buffer mask,
312  size_t plane_eqs_size,
313  size_t close_planes_size,
314  float radius)
315 {
316  int arg = 0;
317  kernel.setArg(arg++, volume_xsize);
318  kernel.setArg(arg++, volume_ysize);
319  kernel.setArg(arg++, volume_zsize);
320  kernel.setArg(arg++, volume_xspacing);
321  kernel.setArg(arg++, volume_yspacing);
322  kernel.setArg(arg++, volume_zspacing);
323  kernel.setArg(arg++, in_xsize);
324  kernel.setArg(arg++, in_ysize);
325  kernel.setArg(arg++, in_xspacing);
326  kernel.setArg(arg++, in_yspacing);
327  for (int i = 0; i < blocks.size(); i++)
328  {
329  kernel.setArg(arg++, blocks[i]);
330  }
331  kernel.setArg(arg++, out_volume);
332  kernel.setArg(arg++, plane_matrices);
333  kernel.setArg(arg++, mask);
334  kernel.setArg<cl::LocalSpaceArg>(arg++, cl::__local(plane_eqs_size));
335  kernel.setArg<cl::LocalSpaceArg>(arg++, cl::__local(close_planes_size));
336  kernel.setArg(arg++, radius);
337 }
338 
339 size_t VNNclAlgorithm::calculateSpaceNeededForClosePlanes(cl::Kernel kernel, cl::Device device, size_t local_work_size, size_t nPlanes_numberOfInputImages, int nClosePlanes)
340 {
341  // Find out how much local memory the device has
342  size_t dev_local_mem_size;
343  dev_local_mem_size = device.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
344 
345  // Find the maximum work group size
346  size_t max_work_size;
347  kernel.getWorkGroupInfo(device, CL_KERNEL_WORK_GROUP_SIZE, &max_work_size);
348 
349  // Now find the largest multiple of the preferred work group size that will fit into local mem
350  size_t constant_local_mem = sizeof(cl_float) * 4 * nPlanes_numberOfInputImages;
351 
352  size_t varying_local_mem = (sizeof(cl_float) + sizeof(cl_short) + sizeof(cl_uchar) + sizeof(cl_uchar)) * (nClosePlanes + 1); //see _close_plane struct in kernels.cl
353  report(QString("Device has %1 bytes of local memory").arg(dev_local_mem_size));
354  dev_local_mem_size -= constant_local_mem + 128; //Hmmm? 128?
355 
356  // How many work items can the local mem support?
357  size_t maxItems = dev_local_mem_size / varying_local_mem;
358  // And what is the biggest multiple of local_work_size that fits into that?
359  int multiple = maxItems / local_work_size;
360 
361  if(multiple == 0)
362  {
363  // If the maximum amount of work items is smaller than the preferred multiple, we end up here.
364  // This means that the local memory use is so big we can't even fit into the preferred multiple, and
365  // have use a sub-optimal local work size.
366  local_work_size = std::min(max_work_size, maxItems);
367  }
368  else
369  {
370  // Otherwise, we make it fit into the local work size.
371  local_work_size = std::min(max_work_size, multiple * local_work_size);
372  }
373 
374  size_t close_planes_size = varying_local_mem*local_work_size;
375 
376  return close_planes_size;
377 }
378 
379 bool VNNclAlgorithm::isUsingTooMuchMemory(size_t outputVolumeSize, size_t inputBlocksLength, cl_ulong globalMemUse)
380 {
381  bool usingTooMuchMemory = false;
382 
383  unsigned int deviceNumber = 0;
384  cl_ulong maxAllocSize = mOulContex->getDevice(deviceNumber).getInfo<CL_DEVICE_MAX_MEM_ALLOC_SIZE>();
385  cl_ulong globalMemSize = mOulContex->getDevice(deviceNumber).getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>();
386  if (maxAllocSize < outputVolumeSize)
387  {
388  reportError(QString("Output volume size too large! %1 > %2\n").arg(outputVolumeSize).arg(maxAllocSize));
389  usingTooMuchMemory = true;
390  }
391 
392  if (maxAllocSize < inputBlocksLength)
393  {
394  reportError(QString("Input blocks too large! %1 > %2\n").arg(inputBlocksLength).arg(maxAllocSize));
395  usingTooMuchMemory = true;
396  }
397 
398  if (globalMemSize < globalMemUse)
399  {
400  reportError(QString("Using too much global memory! %1 > %2").arg(globalMemUse).arg(globalMemSize));
401  usingTooMuchMemory = true;
402  }
403 
404  report(QString("Using %1 of %2 global memory").arg(globalMemUse).arg(globalMemSize));
405  return usingTooMuchMemory;
406 }
407 
408 void VNNclAlgorithm::measureAndExecuteKernel(cl::CommandQueue queue, cl::Kernel kernel, size_t global_work_size, size_t local_work_size, std::string measurementName)
409 {
410  this->startProfiling(measurementName, queue);
411  mOulContex->executeKernel(queue, kernel, global_work_size, local_work_size);
412  this->stopProfiling(measurementName, queue);
413 }
414 
415 void VNNclAlgorithm::measureAndReadBuffer(cl::CommandQueue queue, cl::Buffer outputBuffer, size_t outputVolumeSize, void *outputData, std::string measurementName)
416 {
417  this->startProfiling(measurementName, queue);
418  mOulContex->readBuffer(queue, outputBuffer, outputVolumeSize, outputData);
419  this->stopProfiling(measurementName, queue);
420 }
421 
422 void VNNclAlgorithm::startProfiling(std::string name, cl::CommandQueue queue) {
423  if(!mRuntime->isEnabled())
424  return;
425 
426  mRuntime->startCLTimer(name, queue);
427  mMeasurementNames.insert(name);
428 }
429 
430 void VNNclAlgorithm::stopProfiling(std::string name, cl::CommandQueue queue) {
431  if(!mRuntime->isEnabled())
432  return;
433 
434  mRuntime->stopCLTimer(name, queue);
435  mRuntime->printAll();
436 }
437 
438 
439 
440 } /* namespace cx */
QString qstring_cast(const T &val)
void reportError(QString msg)
Definition: cxLogger.cpp:71
virtual bool initCL(QString kernelFile, int nMaxPlanes, int nPlanes, int method, int planeMethod, int nStarts, float brightnessWeight, float newnessWeight)
Transform3D Transform3D
Transform3D is a representation of an affine 3D transform.
virtual void fillPlaneMatrices(float *planeMatrices, ProcessedUSInputDataPtr input)
virtual cl::Program buildCLProgram(std::string program_src, std::string kernelPath, int nMaxPlanes, int nPlanes, int method, int planeMethod, int nStarts, float brightnessWeight, float newnessWeight)
virtual bool reconstruct(ProcessedUSInputDataPtr input, vtkImageDataPtr outputData, float radius, int nClosePlanes)
virtual bool initializeFrameBlocks(frameBlock_t *framePointers, int numBlocks, ProcessedUSInputDataPtr inputFrames)
void setProfiling(bool on)
void report(QString msg)
Definition: cxLogger.cpp:69
void setDeepModified(vtkImageDataPtr image)
RealScalar length() const
boost::shared_ptr< class ProcessedUSInputData > ProcessedUSInputDataPtr
virtual void freeFrameBlocks(frameBlock_t *framePointers, int numBlocks)
vtkSmartPointer< class vtkImageData > vtkImageDataPtr
Namespace for all CustusX production code.