38 #include "HelperFunctions.hpp"
39 #include <vtkImageData.h>
40 #include <recConfig.h>
46 mRuntime(new oul::RuntimeMeasurementsManager()),
47 mKernelMeasurementName(
"vnncl_execute_kernel")
49 oul::DeviceCriteria criteria;
50 criteria.setTypeCriteria(oul::DEVICE_TYPE_GPU);
51 bool enableProfilling =
true;
52 mOulContex = oul::opencl()->createContextPtr(criteria, NULL, enableProfilling);
60 bool VNNclAlgorithm::initCL(QString kernelFilePath,
int nMaxPlanes,
int nPlanes,
int method,
int planeMethod,
int nStarts,
float brightnessWeight,
float newnessWeight)
63 report(QString(
"Kernel path: %1").arg(kernelFilePath));
64 std::string source = oul::readFile(kernelFilePath.toStdString());
66 QFileInfo path(kernelFilePath);
70 cl::Program clprogram = this->
buildCLProgram(source, path.absolutePath().toStdString(), nMaxPlanes, nPlanes, method, planeMethod, nStarts,brightnessWeight, newnessWeight);
73 mKernel = mOulContex->createKernel(clprogram,
"voxel_methods");
78 cl::Program
VNNclAlgorithm::buildCLProgram(std::string program_src, std::string kernelPath,
int nMaxPlanes,
int nPlanes,
int method,
int planeMethod,
int nStarts,
float newnessWeight,
float brightnessWeight)
82 VECTOR_CLASS<cl::Device> devices;
85 QString define =
"-D MAX_PLANES=%1 -D N_PLANES=%2 -D METHOD=%3 -D PLANE_METHOD=%4 -D MAX_MULTISTART_STARTS=%5 -D NEWNESS_FACTOR=%6 -D BRIGHTNESS_FACTOR=%7";
86 define = define.arg(nMaxPlanes).arg(nPlanes).arg(method).arg(planeMethod).arg(nStarts).arg(newnessWeight).arg(brightnessWeight);
88 int programID = mOulContex->createProgramFromString(program_src,
"-I " + std::string(kernelPath) +
" " + define.toStdString());
89 retval = mOulContex->getProgram(programID);
91 }
catch (cl::Error &error)
93 reportError(
"Could not build a OpenCL program. Reason: "+QString(error.what()));
102 Eigen::Array3i dims = inputFrames->getDimensions();
103 size_t frameSize = dims[0] * dims[1];
104 size_t numFrames = dims[2];
105 report(QString(
"Input dims: (%1, %2, %3)").arg(dims[0]).arg(dims[1]).arg(dims[2]));
108 size_t framesPerBlock = numFrames / numBlocks;
109 report(QString(
"Frames: %1, Blocks: %2, Frames per block: %3").arg(numFrames).arg(numBlocks).arg(framesPerBlock));
113 size_t numBigBlocks = numFrames % numBlocks;
116 report(QString(
"Allocating %1 big blocks outside of OpenCL").arg(numBigBlocks));
117 for (
unsigned int block = 0; block < numBigBlocks; block++)
119 framePointers[block].
length = (1 + framesPerBlock) * frameSize;
120 framePointers[block].
data =
new unsigned char[framePointers[block].
length];
124 report(QString(
"Allocating %1 small blocks outside of OpenCL").arg(numBlocks - numBigBlocks));
125 for (
int block = numBigBlocks; block < numBlocks; block++)
127 framePointers[block].
length = (framesPerBlock) * frameSize;
128 framePointers[block].
data =
new unsigned char[framePointers[block].
length];
132 unsigned int frame = 0;
133 for (
int block = 0; block < numBlocks; block++)
135 for (
unsigned int frameInThisBlock = 0; frameInThisBlock < framePointers[block].
length / frameSize; frameInThisBlock++)
137 memcpy(&(framePointers[block].data[frameInThisBlock * frameSize]), inputFrames->getFrame(frame), frameSize);
146 mMeasurementNames.clear();
153 size_t nPlanes_numberOfInputImages = input->getDimensions()[2];
157 VECTOR_CLASS<cl::Buffer> clBlocks;
158 report(
"Allocating OpenCL input block buffers");
159 for (
int i = 0; i < numBlocks; i++)
162 cl::Buffer buffer = mOulContex->createBuffer(mOulContex->getContext(), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, inputBlocks[i].
length, inputBlocks[i].
data,
"block buffer "+QString::number(i).toStdString());
163 clBlocks.push_back(buffer);
166 int *outputDims = outputData->GetDimensions();
168 size_t outputVolumeSize = outputDims[0] * outputDims[1] * outputDims[2] *
sizeof(
unsigned char);
170 report(QString(
"Allocating CL output buffer, size %1").arg(outputVolumeSize));
172 cl_ulong globalMemUse = 10 * inputBlocks[0].
length + outputVolumeSize +
sizeof(float) * 16 * nPlanes_numberOfInputImages +
sizeof(cl_uchar) * input->getDimensions()[0] * input->getDimensions()[1];
173 if(isUsingTooMuchMemory(outputVolumeSize, inputBlocks[0].
length, globalMemUse))
176 cl::Buffer outputBuffer = mOulContex->createBuffer(mOulContex->getContext(), CL_MEM_WRITE_ONLY, outputVolumeSize, NULL,
"output volume buffer");
179 float *planeMatrices =
new float[16 * nPlanes_numberOfInputImages];
182 cl::Buffer clPlaneMatrices = mOulContex->createBuffer(mOulContex->getContext(), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, nPlanes_numberOfInputImages *
sizeof(float) * 16, planeMatrices,
"plane matrices buffer");
185 cl::Buffer clMask = mOulContex->createBuffer(mOulContex->getContext(), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
186 sizeof(cl_uchar) * input->getMask()->GetDimensions()[0] * input->getMask()->GetDimensions()[1],
187 input->getMask()->GetScalarPointer(),
"mask buffer");
189 double *out_spacing = outputData->GetSpacing();
191 float f_out_spacings[3];
192 f_out_spacings[0] = out_spacing[0];
193 f_out_spacings[1] = out_spacing[1];
194 f_out_spacings[2] = out_spacing[2];
197 spacings[0] = input->getSpacing()[0];
198 spacings[1] = input->getSpacing()[1];
201 size_t planes_eqs_size =
sizeof(cl_float)*4*nPlanes_numberOfInputImages;
204 size_t local_work_size;
205 unsigned int deviceNumber = 0;
206 cl::Device device = mOulContex->getDevice(deviceNumber);
207 mKernel.getWorkGroupInfo(device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, &local_work_size);
209 size_t close_planes_size = this->calculateSpaceNeededForClosePlanes(mKernel, device, local_work_size, nPlanes_numberOfInputImages, nClosePlanes);
211 this->setKernelArguments(
219 input->getDimensions()[0],
220 input->getDimensions()[1],
231 report(QString(
"Using %1 as local workgroup size").arg(local_work_size));
235 int cube_dim_pow3 = cube_dim * cube_dim * cube_dim;
237 size_t global_work_size = (((outputDims[0] + cube_dim) * (outputDims[1] + cube_dim) * (outputDims[2] + cube_dim)) / cube_dim_pow3);
240 if (global_work_size % local_work_size)
241 global_work_size = ((global_work_size / local_work_size) + 1) * local_work_size;
243 unsigned int queueNumber = 0;
244 cl::CommandQueue queue = mOulContex->getQueue(queueNumber);
245 this->measureAndExecuteKernel(queue, mKernel, global_work_size, local_work_size, mKernelMeasurementName);
246 this->measureAndReadBuffer(queue, outputBuffer, outputVolumeSize, outputData->GetScalarPointer(),
"vnncl_read_buffer");
249 report(QString(
"Done, freeing GPU memory"));
251 delete[] inputBlocks;
260 std::vector<TimedPosition> vecPosition = input->getFrames();
263 if (input->getDimensions()[2] != vecPosition.end() - vecPosition.begin())
265 reportError(QString(
"Number of frames %1 != %2 dimension 2 of US input").arg(input->getDimensions()[2]).arg(vecPosition.end() - vecPosition.begin()));
270 for (std::vector<TimedPosition>::iterator it = vecPosition.begin(); it != vecPosition.end(); ++it)
275 for (
int j = 0; j < 16; j++)
277 planeMatrices[i++] = pos(j / 4, j % 4);
292 double totalExecutionTime = -1;
293 std::set<std::string>::iterator it;
294 for(it = mMeasurementNames.begin(); it != mMeasurementNames.end(); ++it)
296 oul::RuntimeMeasurement measurement = mRuntime->getTiming(*it);
297 totalExecutionTime += measurement.getSum();
299 return totalExecutionTime;
305 return mRuntime->getTiming(mKernelMeasurementName).getSum();
311 for (
int i = 0; i < numBlocks; i++)
313 delete[] framePointers[i].
data;
317 void VNNclAlgorithm::setKernelArguments(
322 float volume_xspacing,
323 float volume_yspacing,
324 float volume_zspacing,
329 std::vector<cl::Buffer>& blocks,
330 cl::Buffer out_volume,
331 cl::Buffer plane_matrices,
333 size_t plane_eqs_size,
334 size_t close_planes_size,
338 kernel.setArg(arg++, volume_xsize);
339 kernel.setArg(arg++, volume_ysize);
340 kernel.setArg(arg++, volume_zsize);
341 kernel.setArg(arg++, volume_xspacing);
342 kernel.setArg(arg++, volume_yspacing);
343 kernel.setArg(arg++, volume_zspacing);
344 kernel.setArg(arg++, in_xsize);
345 kernel.setArg(arg++, in_ysize);
346 kernel.setArg(arg++, in_xspacing);
347 kernel.setArg(arg++, in_yspacing);
348 for (
int i = 0; i < blocks.size(); i++)
350 kernel.setArg(arg++, blocks[i]);
352 kernel.setArg(arg++, out_volume);
353 kernel.setArg(arg++, plane_matrices);
354 kernel.setArg(arg++, mask);
355 kernel.setArg<cl::LocalSpaceArg>(arg++, cl::__local(plane_eqs_size));
356 kernel.setArg<cl::LocalSpaceArg>(arg++, cl::__local(close_planes_size));
357 kernel.setArg(arg++, radius);
360 size_t VNNclAlgorithm::calculateSpaceNeededForClosePlanes(cl::Kernel kernel, cl::Device device,
size_t local_work_size,
size_t nPlanes_numberOfInputImages,
int nClosePlanes)
363 size_t dev_local_mem_size;
364 dev_local_mem_size = device.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
367 size_t max_work_size;
368 kernel.getWorkGroupInfo(device, CL_KERNEL_WORK_GROUP_SIZE, &max_work_size);
371 size_t constant_local_mem =
sizeof(cl_float) * 4 * nPlanes_numberOfInputImages;
373 size_t varying_local_mem = (
sizeof(cl_float) +
sizeof(cl_short) +
sizeof(cl_uchar) +
sizeof(cl_uchar)) * (nClosePlanes + 1);
374 report(QString(
"Device has %1 bytes of local memory").arg(dev_local_mem_size));
375 dev_local_mem_size -= constant_local_mem + 128;
378 size_t maxItems = dev_local_mem_size / varying_local_mem;
380 int multiple = maxItems / local_work_size;
387 local_work_size = std::min(max_work_size, maxItems);
392 local_work_size = std::min(max_work_size, multiple * local_work_size);
395 size_t close_planes_size = varying_local_mem*local_work_size;
397 return close_planes_size;
400 bool VNNclAlgorithm::isUsingTooMuchMemory(
size_t outputVolumeSize,
size_t inputBlocksLength, cl_ulong globalMemUse)
402 bool usingTooMuchMemory =
false;
404 unsigned int deviceNumber = 0;
405 cl_ulong maxAllocSize = mOulContex->getDevice(deviceNumber).getInfo<CL_DEVICE_MAX_MEM_ALLOC_SIZE>();
406 cl_ulong globalMemSize = mOulContex->getDevice(deviceNumber).getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>();
407 if (maxAllocSize < outputVolumeSize)
409 reportError(QString(
"Output volume size too large! %1 > %2\n").arg(outputVolumeSize).arg(maxAllocSize));
410 usingTooMuchMemory =
true;
413 if (maxAllocSize < inputBlocksLength)
415 reportError(QString(
"Input blocks too large! %1 > %2\n").arg(inputBlocksLength).arg(maxAllocSize));
416 usingTooMuchMemory =
true;
419 if (globalMemSize < globalMemUse)
421 reportError(QString(
"Using too much global memory! %1 > %2").arg(globalMemUse).arg(globalMemSize));
422 usingTooMuchMemory =
true;
425 report(QString(
"Using %1 of %2 global memory").arg(globalMemUse).arg(globalMemSize));
426 return usingTooMuchMemory;
429 void VNNclAlgorithm::measureAndExecuteKernel(cl::CommandQueue queue, cl::Kernel kernel,
size_t global_work_size,
size_t local_work_size, std::string measurementName)
431 this->startProfiling(measurementName, queue);
432 mOulContex->executeKernel(queue, kernel, global_work_size, local_work_size);
433 this->stopProfiling(measurementName, queue);
436 void VNNclAlgorithm::measureAndReadBuffer(cl::CommandQueue queue, cl::Buffer outputBuffer,
size_t outputVolumeSize,
void *outputData, std::string measurementName)
438 this->startProfiling(measurementName, queue);
439 mOulContex->readBuffer(queue, outputBuffer, outputVolumeSize, outputData);
440 this->stopProfiling(measurementName, queue);
443 void VNNclAlgorithm::startProfiling(std::string name, cl::CommandQueue queue) {
444 if(!mRuntime->isEnabled())
447 mRuntime->startCLTimer(name, queue);
448 mMeasurementNames.insert(name);
451 void VNNclAlgorithm::stopProfiling(std::string name, cl::CommandQueue queue) {
452 if(!mRuntime->isEnabled())
455 mRuntime->stopCLTimer(name, queue);
456 mRuntime->printAll();
QString qstring_cast(const T &val)
void reportError(QString msg)
virtual bool initCL(QString kernelFile, int nMaxPlanes, int nPlanes, int method, int planeMethod, int nStarts, float brightnessWeight, float newnessWeight)
Transform3D Transform3D
Transform3D is a representation of an affine 3D transform.
double getTotalExecutionTime()
virtual void fillPlaneMatrices(float *planeMatrices, ProcessedUSInputDataPtr input)
virtual cl::Program buildCLProgram(std::string program_src, std::string kernelPath, int nMaxPlanes, int nPlanes, int method, int planeMethod, int nStarts, float brightnessWeight, float newnessWeight)
virtual bool reconstruct(ProcessedUSInputDataPtr input, vtkImageDataPtr outputData, float radius, int nClosePlanes)
virtual bool initializeFrameBlocks(frameBlock_t *framePointers, int numBlocks, ProcessedUSInputDataPtr inputFrames)
void setProfiling(bool on)
void setDeepModified(vtkImageDataPtr image)
RealScalar length() const
boost::shared_ptr< class ProcessedUSInputData > ProcessedUSInputDataPtr
virtual void freeFrameBlocks(frameBlock_t *framePointers, int numBlocks)
double getKernelExecutionTime()