cxVNNclAlgorithm.cpp
/*=========================================================================
This file is part of CustusX, an Image Guided Therapy Application.

Copyright (c) 2008-2014, SINTEF Department of Medical Technology
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice,
   this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its contributors
   may be used to endorse or promote products derived from this software
   without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
=========================================================================*/

#include "cxVNNclAlgorithm.h"

#include <QFileInfo>
#include "cxLogger.h"
#include "cxTypeConversions.h"
#include "HelperFunctions.hpp"
#include <vtkImageData.h>
#include <recConfig.h>
#include "cxVolumeHelpers.h"

namespace cx
{
VNNclAlgorithm::VNNclAlgorithm() :
    mRuntime(new oul::RuntimeMeasurementsManager()),
    mKernelMeasurementName("vnncl_execute_kernel")
{
    oul::DeviceCriteria criteria;
    criteria.setTypeCriteria(oul::DEVICE_TYPE_GPU);
    bool enableProfiling = true;
    mOulContex = oul::opencl()->createContextPtr(criteria, NULL, enableProfiling);
    if(enableProfiling)
        mRuntime->enable();
}

VNNclAlgorithm::~VNNclAlgorithm()
{}

bool VNNclAlgorithm::initCL(QString kernelFilePath, int nMaxPlanes, int nPlanes, int method, int planeMethod, int nStarts, float brightnessWeight, float newnessWeight)
{
    // READ KERNEL FILE
    report(QString("Kernel path: %1").arg(kernelFilePath));
    std::string source = oul::readFile(kernelFilePath.toStdString());

    QFileInfo path(kernelFilePath);
    path.absolutePath();

    // BUILD PROGRAM
    cl::Program clprogram = this->buildCLProgram(source, path.absolutePath().toStdString(), nMaxPlanes, nPlanes, method, planeMethod, nStarts, brightnessWeight, newnessWeight);

    // CREATE KERNEL
    mKernel = mOulContex->createKernel(clprogram, "voxel_methods");

    return true;
}

cl::Program VNNclAlgorithm::buildCLProgram(std::string program_src, std::string kernelPath, int nMaxPlanes, int nPlanes, int method, int planeMethod, int nStarts, float brightnessWeight, float newnessWeight)
{
    cl::Program retval;
    cl_int err = 0;
    VECTOR_CLASS<cl::Device> devices;
    try
    {
        QString define = "-D MAX_PLANES=%1 -D N_PLANES=%2 -D METHOD=%3 -D PLANE_METHOD=%4 -D MAX_MULTISTART_STARTS=%5 -D NEWNESS_FACTOR=%6 -D BRIGHTNESS_FACTOR=%7";
        define = define.arg(nMaxPlanes).arg(nPlanes).arg(method).arg(planeMethod).arg(nStarts).arg(newnessWeight).arg(brightnessWeight);
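        // Example of the resulting OpenCL compiler options (illustrative values only):
        // with nMaxPlanes=8, nPlanes=200, method=0, planeMethod=0, nStarts=16,
        // newnessWeight=0 and brightnessWeight=1, the define string expands to
        // "-D MAX_PLANES=8 -D N_PLANES=200 -D METHOD=0 -D PLANE_METHOD=0 -D MAX_MULTISTART_STARTS=16 -D NEWNESS_FACTOR=0 -D BRIGHTNESS_FACTOR=1"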

        int programID = mOulContex->createProgramFromString(program_src, "-I " + std::string(kernelPath) + " " + define.toStdString());
        retval = mOulContex->getProgram(programID);

    } catch (cl::Error &error)
    {
        reportError("Could not build an OpenCL program. Reason: "+QString(error.what()));
        reportError(qstring_cast(oul::getCLErrorString(error.err())));
    }
    return retval;
}

bool VNNclAlgorithm::initializeFrameBlocks(frameBlock_t* framePointers, int numBlocks, ProcessedUSInputDataPtr inputFrames)
{
    // Compute the size of each frame in bytes
    Eigen::Array3i dims = inputFrames->getDimensions();
    size_t frameSize = dims[0] * dims[1];
    size_t numFrames = dims[2];
    report(QString("Input dims: (%1, %2, %3)").arg(dims[0]).arg(dims[1]).arg(dims[2]));

    // Find out how many frames need to be in each block
    size_t framesPerBlock = numFrames / numBlocks;
    report(QString("Frames: %1, Blocks: %2, Frames per block: %3").arg(numFrames).arg(numBlocks).arg(framesPerBlock));

    // Some blocks will need to contain one extra frame,
    // since numFrames is probably not evenly divisible by numBlocks.
    size_t numBigBlocks = numFrames % numBlocks;
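    // Worked example: with 53 frames and 10 blocks, framesPerBlock = 53 / 10 = 5 and
    // numBigBlocks = 53 % 10 = 3, so 3 blocks hold 6 frames each and the remaining 7 hold 5.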

    // Allocate the big blocks
    report(QString("Allocating %1 big blocks outside of OpenCL").arg(numBigBlocks));
    for (unsigned int block = 0; block < numBigBlocks; block++)
    {
        framePointers[block].length = (1 + framesPerBlock) * frameSize;
        framePointers[block].data = new unsigned char[framePointers[block].length];
    }

    // Then the small ones
    report(QString("Allocating %1 small blocks outside of OpenCL").arg(numBlocks - numBigBlocks));
    for (int block = numBigBlocks; block < numBlocks; block++)
    {
        framePointers[block].length = (framesPerBlock) * frameSize;
        framePointers[block].data = new unsigned char[framePointers[block].length];
    }

    // Now fill them
    unsigned int frame = 0;
    for (int block = 0; block < numBlocks; block++)
    {
        for (unsigned int frameInThisBlock = 0; frameInThisBlock < framePointers[block].length / frameSize; frameInThisBlock++)
        {
            memcpy(&(framePointers[block].data[frameInThisBlock * frameSize]), inputFrames->getFrame(frame), frameSize);
            frame++;
        }
    }
    return true;
}

bool VNNclAlgorithm::reconstruct(ProcessedUSInputDataPtr input, vtkImageDataPtr outputData, float radius, int nClosePlanes)
{
    mMeasurementNames.clear();

    int numBlocks = 10; // FIXME? needs to be the same as the number of input bscans to the voxel_method kernel

    // Split the input US into blocks.
    // This splits and copies data from the processed input in the way the kernel will process it, which is per frameBlock.
    frameBlock_t* inputBlocks = new frameBlock_t[numBlocks];
    size_t nPlanes_numberOfInputImages = input->getDimensions()[2];
    this->initializeFrameBlocks(inputBlocks, numBlocks, input);

    // Allocate CL memory for each frame block
    VECTOR_CLASS<cl::Buffer> clBlocks;
    report("Allocating OpenCL input block buffers");
    for (int i = 0; i < numBlocks; i++)
    {
        //TODO why does the context suddenly contain a "dummy" device?
        cl::Buffer buffer = mOulContex->createBuffer(mOulContex->getContext(), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, inputBlocks[i].length, inputBlocks[i].data, "block buffer "+QString::number(i).toStdString());
        clBlocks.push_back(buffer);
    }
    // Allocate output memory
    int *outputDims = outputData->GetDimensions();

    size_t outputVolumeSize = outputDims[0] * outputDims[1] * outputDims[2] * sizeof(unsigned char);

    report(QString("Allocating CL output buffer, size %1").arg(outputVolumeSize));

    cl_ulong globalMemUse = 10 * inputBlocks[0].length + outputVolumeSize + sizeof(float) * 16 * nPlanes_numberOfInputImages + sizeof(cl_uchar) * input->getDimensions()[0] * input->getDimensions()[1];
    if(isUsingTooMuchMemory(outputVolumeSize, inputBlocks[0].length, globalMemUse))
        return false;

    cl::Buffer outputBuffer = mOulContex->createBuffer(mOulContex->getContext(), CL_MEM_WRITE_ONLY, outputVolumeSize, NULL, "output volume buffer");

    // Fill the plane matrices
    float *planeMatrices = new float[16 * nPlanes_numberOfInputImages]; //4x4 (matrix) = 16
    this->fillPlaneMatrices(planeMatrices, input);

    cl::Buffer clPlaneMatrices = mOulContex->createBuffer(mOulContex->getContext(), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, nPlanes_numberOfInputImages * sizeof(float) * 16, planeMatrices, "plane matrices buffer");

    // US Probe mask
    cl::Buffer clMask = mOulContex->createBuffer(mOulContex->getContext(), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
            sizeof(cl_uchar) * input->getMask()->GetDimensions()[0] * input->getMask()->GetDimensions()[1],
            input->getMask()->GetScalarPointer(), "mask buffer");

    double *out_spacing = outputData->GetSpacing();
    float spacings[2];
    float f_out_spacings[3];
    f_out_spacings[0] = out_spacing[0];
    f_out_spacings[1] = out_spacing[1];
    f_out_spacings[2] = out_spacing[2];

    spacings[0] = input->getSpacing()[0];
    spacings[1] = input->getSpacing()[1];

    //TODO why 4? because float4 is used??
    size_t planes_eqs_size = sizeof(cl_float)*4*nPlanes_numberOfInputImages;
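    // The factor of 4 presumably corresponds to one float4 per input plane, holding the
    // plane equation coefficients (a, b, c, d) of ax + by + cz + d = 0; this is an
    // assumption prompted by the TODO above, not confirmed here.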

    // Find the optimal local work size
    size_t local_work_size;
    unsigned int deviceNumber = 0;
    cl::Device device = mOulContex->getDevice(deviceNumber);
    mKernel.getWorkGroupInfo(device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, &local_work_size);

    size_t close_planes_size = this->calculateSpaceNeededForClosePlanes(mKernel, device, local_work_size, nPlanes_numberOfInputImages, nClosePlanes);

    this->setKernelArguments(
            mKernel,
            outputDims[0],
            outputDims[1],
            outputDims[2],
            f_out_spacings[0],
            f_out_spacings[1],
            f_out_spacings[2],
            input->getDimensions()[0],
            input->getDimensions()[1],
            spacings[0],
            spacings[1],
            clBlocks,
            outputBuffer,
            clPlaneMatrices,
            clMask,
            planes_eqs_size,
            close_planes_size,
            radius);

    report(QString("Using %1 as local workgroup size").arg(local_work_size));

    // We will divide the work into cubes of CUBE_DIM^3 voxels. The global work size is the total number of voxels divided by that.
    int cube_dim = 4;
    int cube_dim_pow3 = cube_dim * cube_dim * cube_dim;
    // Global work items:
    size_t global_work_size = (((outputDims[0] + cube_dim) * (outputDims[1] + cube_dim) * (outputDims[2] + cube_dim)) / cube_dim_pow3); // = number of cubes = number of kernels to run

    // Round global_work_size up to nearest multiple of local_work_size
    if (global_work_size % local_work_size)
        global_work_size = ((global_work_size / local_work_size) + 1) * local_work_size; // ceil(...)
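    // Worked example (illustrative numbers only): for a 252x252x124 voxel output volume and
    // cube_dim = 4, global_work_size = (256 * 256 * 128) / 64 = 131072 work items. If the
    // preferred local_work_size is 96, 131072 is not a multiple of 96, so it is rounded up
    // to (131072 / 96 + 1) * 96 = 131136.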

    unsigned int queueNumber = 0;
    cl::CommandQueue queue = mOulContex->getQueue(queueNumber);
    this->measureAndExecuteKernel(queue, mKernel, global_work_size, local_work_size, mKernelMeasurementName);
    this->measureAndReadBuffer(queue, outputBuffer, outputVolumeSize, outputData->GetScalarPointer(), "vnncl_read_buffer");
    setDeepModified(outputData);

    // Cleaning up
    report(QString("Done, freeing GPU memory"));
    this->freeFrameBlocks(inputBlocks, numBlocks);
    delete[] inputBlocks;
    inputBlocks = NULL;
    delete[] planeMatrices; // the matrices were copied into clPlaneMatrices above, so the host copy can be freed

    return true;
}
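
/* Illustrative usage sketch (not part of the original file; the kernel path and all
 * numeric parameter values below are assumptions chosen for illustration only):
 *
 *   VNNclAlgorithm algorithm;
 *   algorithm.initCL("/path/to/kernels.cl",      // OpenCL kernel source
 *                    8, nInputFrames, 0, 0, 16,  // nMaxPlanes, nPlanes, method, planeMethod, nStarts
 *                    1.0f, 0.0f);                // brightnessWeight, newnessWeight
 *   algorithm.reconstruct(input, outputData, 1.0f, 8); // input frames, output volume, radius, nClosePlanes
 *
 * initCL() must be called before reconstruct(), since reconstruct() runs the
 * "voxel_methods" kernel that initCL() compiles.
 */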

void VNNclAlgorithm::fillPlaneMatrices(float *planeMatrices, ProcessedUSInputDataPtr input)
{
    std::vector<TimedPosition> vecPosition = input->getFrames();

    // Sanity check on the number of frames
    if (input->getDimensions()[2] != vecPosition.end() - vecPosition.begin())
    {
        reportError(QString("Number of frames %1 != %2 dimension 2 of US input").arg(input->getDimensions()[2]).arg(vecPosition.end() - vecPosition.begin()));
        return;
    }

    int i = 0;
    for (std::vector<TimedPosition>::iterator it = vecPosition.begin(); it != vecPosition.end(); ++it)
    {
        Transform3D pos = it->mPos;

        // Now store the result in the output
        for (int j = 0; j < 16; j++)
        {
            planeMatrices[i++] = pos(j / 4, j % 4);
        }
    }
}

void VNNclAlgorithm::setProfiling(bool on)
{
    if(on)
        mRuntime->enable();
    else
        mRuntime->disable();
}

double VNNclAlgorithm::getTotalExecutionTime()
{
    double totalExecutionTime = -1;
    std::set<std::string>::iterator it;
    for(it = mMeasurementNames.begin(); it != mMeasurementNames.end(); ++it)
    {
        oul::RuntimeMeasurement measurement = mRuntime->getTiming(*it);
        totalExecutionTime += measurement.getSum();
    }
    return totalExecutionTime;
}

double VNNclAlgorithm::getKernelExecutionTime()
{
    return mRuntime->getTiming(mKernelMeasurementName).getSum();
}

void VNNclAlgorithm::freeFrameBlocks(frameBlock_t *framePointers, int numBlocks)
{
    for (int i = 0; i < numBlocks; i++)
    {
        delete[] framePointers[i].data;
    }
}

void VNNclAlgorithm::setKernelArguments(
        cl::Kernel kernel,
        int volume_xsize,
        int volume_ysize,
        int volume_zsize,
        float volume_xspacing,
        float volume_yspacing,
        float volume_zspacing,
        int in_xsize,
        int in_ysize,
        float in_xspacing,
        float in_yspacing,
        std::vector<cl::Buffer>& blocks,
        cl::Buffer out_volume,
        cl::Buffer plane_matrices,
        cl::Buffer mask,
        size_t plane_eqs_size,
        size_t close_planes_size,
        float radius)
{
    int arg = 0;
    kernel.setArg(arg++, volume_xsize);
    kernel.setArg(arg++, volume_ysize);
    kernel.setArg(arg++, volume_zsize);
    kernel.setArg(arg++, volume_xspacing);
    kernel.setArg(arg++, volume_yspacing);
    kernel.setArg(arg++, volume_zspacing);
    kernel.setArg(arg++, in_xsize);
    kernel.setArg(arg++, in_ysize);
    kernel.setArg(arg++, in_xspacing);
    kernel.setArg(arg++, in_yspacing);
    for (int i = 0; i < blocks.size(); i++)
    {
        kernel.setArg(arg++, blocks[i]);
    }
    kernel.setArg(arg++, out_volume);
    kernel.setArg(arg++, plane_matrices);
    kernel.setArg(arg++, mask);
    kernel.setArg<cl::LocalSpaceArg>(arg++, cl::__local(plane_eqs_size));
    kernel.setArg<cl::LocalSpaceArg>(arg++, cl::__local(close_planes_size));
    kernel.setArg(arg++, radius);
}
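
// Note: the argument order in setKernelArguments() must match the parameter list of the
// voxel_methods kernel in the kernel source, and the number of block buffers passed in
// must equal the numBlocks used in reconstruct() (currently 10, see the FIXME there).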

size_t VNNclAlgorithm::calculateSpaceNeededForClosePlanes(cl::Kernel kernel, cl::Device device, size_t local_work_size, size_t nPlanes_numberOfInputImages, int nClosePlanes)
{
    // Find out how much local memory the device has
    size_t dev_local_mem_size;
    dev_local_mem_size = device.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();

    // Find the maximum work group size
    size_t max_work_size;
    kernel.getWorkGroupInfo(device, CL_KERNEL_WORK_GROUP_SIZE, &max_work_size);

    // Now find the largest multiple of the preferred work group size that will fit into local mem
    size_t constant_local_mem = sizeof(cl_float) * 4 * nPlanes_numberOfInputImages;

    size_t varying_local_mem = (sizeof(cl_float) + sizeof(cl_short) + sizeof(cl_uchar) + sizeof(cl_uchar)) * (nClosePlanes + 1); //see _close_plane struct in kernels.cl
    report(QString("Device has %1 bytes of local memory").arg(dev_local_mem_size));
    dev_local_mem_size -= constant_local_mem + 128; //Hmmm? 128?
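    // The extra 128 bytes are presumably a safety margin for local memory that the OpenCL
    // compiler may allocate on its own; this is an assumption, the original comment above
    // does not explain the value.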

    // How many work items can the local mem support?
    size_t maxItems = dev_local_mem_size / varying_local_mem;
    // And what is the biggest multiple of local_work_size that fits into that?
    int multiple = maxItems / local_work_size;

    if(multiple == 0)
    {
        // If the maximum number of work items is smaller than the preferred multiple, we end up here.
        // This means the local memory use is so big that we cannot even fit the preferred multiple, and
        // have to use a sub-optimal local work size.
        local_work_size = std::min(max_work_size, maxItems);
    }
    else
    {
        // Otherwise, use the largest multiple of the preferred local work size that fits.
        local_work_size = std::min(max_work_size, multiple * local_work_size);
    }

    size_t close_planes_size = varying_local_mem*local_work_size;
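    // Worked example (illustrative numbers only): with nClosePlanes = 8, varying_local_mem =
    // (4 + 2 + 1 + 1) * 9 = 72 bytes per work item. On a device with 32768 bytes of local
    // memory and 180 input planes (constant_local_mem = 2880), maxItems = (32768 - 2880 - 128)
    // / 72 = 413. With a preferred local_work_size of 96 this gives multiple = 4, so
    // local_work_size becomes 384 (if max_work_size allows) and close_planes_size = 72 * 384
    // = 27648 bytes.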

    return close_planes_size;
}

bool VNNclAlgorithm::isUsingTooMuchMemory(size_t outputVolumeSize, size_t inputBlocksLength, cl_ulong globalMemUse)
{
    bool usingTooMuchMemory = false;

    unsigned int deviceNumber = 0;
    cl_ulong maxAllocSize = mOulContex->getDevice(deviceNumber).getInfo<CL_DEVICE_MAX_MEM_ALLOC_SIZE>();
    cl_ulong globalMemSize = mOulContex->getDevice(deviceNumber).getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>();
    if (maxAllocSize < outputVolumeSize)
    {
        reportError(QString("Output volume size too large! %1 > %2\n").arg(outputVolumeSize).arg(maxAllocSize));
        usingTooMuchMemory = true;
    }

    if (maxAllocSize < inputBlocksLength)
    {
        reportError(QString("Input blocks too large! %1 > %2\n").arg(inputBlocksLength).arg(maxAllocSize));
        usingTooMuchMemory = true;
    }

    if (globalMemSize < globalMemUse)
    {
        reportError(QString("Using too much global memory! %1 > %2").arg(globalMemUse).arg(globalMemSize));
        usingTooMuchMemory = true;
    }

    report(QString("Using %1 of %2 global memory").arg(globalMemUse).arg(globalMemSize));
    return usingTooMuchMemory;
}

void VNNclAlgorithm::measureAndExecuteKernel(cl::CommandQueue queue, cl::Kernel kernel, size_t global_work_size, size_t local_work_size, std::string measurementName)
{
    this->startProfiling(measurementName, queue);
    mOulContex->executeKernel(queue, kernel, global_work_size, local_work_size);
    this->stopProfiling(measurementName, queue);
}

void VNNclAlgorithm::measureAndReadBuffer(cl::CommandQueue queue, cl::Buffer outputBuffer, size_t outputVolumeSize, void *outputData, std::string measurementName)
{
    this->startProfiling(measurementName, queue);
    mOulContex->readBuffer(queue, outputBuffer, outputVolumeSize, outputData);
    this->stopProfiling(measurementName, queue);
}

void VNNclAlgorithm::startProfiling(std::string name, cl::CommandQueue queue)
{
    if(!mRuntime->isEnabled())
        return;

    mRuntime->startCLTimer(name, queue);
    mMeasurementNames.insert(name);
}

void VNNclAlgorithm::stopProfiling(std::string name, cl::CommandQueue queue)
{
    if(!mRuntime->isEnabled())
        return;

    mRuntime->stopCLTimer(name, queue);
    mRuntime->printAll();
}

} /* namespace cx */