17 #include "HelperFunctions.hpp" 18 #include <vtkImageData.h> 19 #include <recConfig.h> 25 mRuntime(new oul::RuntimeMeasurementsManager()),
26 mKernelMeasurementName(
"vnncl_execute_kernel")
// Device selection: the criteria's type is set to oul::DEVICE_TYPE_GPU.
28 oul::DeviceCriteria criteria;
29 criteria.setTypeCriteria(oul::DEVICE_TYPE_GPU);
// The context is created with the profiling flag set to true; it is stored in
// mOulContex and used by the other member functions for buffers, kernels and queues.
30 bool enableProfilling =
true;
31 mOulContex = oul::opencl()->createContextPtr(criteria, NULL, enableProfilling);
// Reads the OpenCL kernel source from kernelFilePath, builds it through
// buildCLProgram() and stores the resulting "voxel_methods" kernel in mKernel.
//
// kernelFilePath   path of the kernel source file; its directory (absolutePath())
//                  is handed to buildCLProgram() as the kernel path.
// nMaxPlanes, nPlanes, method, planeMethod, nStarts, brightnessWeight,
// newnessWeight    forwarded unchanged, in this order, to buildCLProgram().
39 bool VNNclAlgorithm::initCL(QString kernelFilePath,
int nMaxPlanes,
int nPlanes,
int method,
int planeMethod,
int nStarts,
float brightnessWeight,
float newnessWeight)
// Log the path and load the whole file as a string.
42 report(QString(
"Kernel path: %1").arg(kernelFilePath));
43 std::string source = oul::readFile(kernelFilePath.toStdString());
45 QFileInfo path(kernelFilePath);
// Build the program from the loaded source.
49 cl::Program clprogram = this->
buildCLProgram(source, path.absolutePath().toStdString(), nMaxPlanes, nPlanes, method, planeMethod, nStarts,brightnessWeight, newnessWeight);
// Create the kernel object for the "voxel_methods" entry point.
52 mKernel = mOulContex->createKernel(clprogram,
"voxel_methods");
57 cl::Program
VNNclAlgorithm::buildCLProgram(std::string program_src, std::string kernelPath,
int nMaxPlanes,
int nPlanes,
int method,
int planeMethod,
int nStarts,
float newnessWeight,
float brightnessWeight)
61 VECTOR_CLASS<cl::Device> devices;
64 QString define =
"-D MAX_PLANES=%1 -D N_PLANES=%2 -D METHOD=%3 -D PLANE_METHOD=%4 -D MAX_MULTISTART_STARTS=%5 -D NEWNESS_FACTOR=%6 -D BRIGHTNESS_FACTOR=%7";
65 define = define.arg(nMaxPlanes).arg(nPlanes).arg(method).arg(planeMethod).arg(nStarts).arg(newnessWeight).arg(brightnessWeight);
67 int programID = mOulContex->createProgramFromString(program_src,
"-I " + std::string(kernelPath) +
" " + define.toStdString());
68 retval = mOulContex->getProgram(programID);
70 }
catch (cl::Error &error)
72 reportError(
"Could not build a OpenCL program. Reason: "+QString(error.what()));
// Distributes the input frames over numBlocks host-side blocks: each block gets
// a `length` and a freshly allocated `data` array, and the frames are then
// copied into the blocks one after another.
81 Eigen::Array3i dims = inputFrames->getDimensions();
// One frame is dims[0] * dims[1] elements; the copy below moves frameSize bytes
// per frame into unsigned char storage.
// NOTE(review): the product is evaluated in the element type of dims before it
// is widened to size_t — confirm it cannot overflow for large frames.
82 size_t frameSize = dims[0] * dims[1];
83 size_t numFrames = dims[2];
84 report(QString(
"Input dims: (%1, %2, %3)").arg(dims[0]).arg(dims[1]).arg(dims[2]));
// Integer division: every block holds at least framesPerBlock frames.
87 size_t framesPerBlock = numFrames / numBlocks;
88 report(QString(
"Frames: %1, Blocks: %2, Frames per block: %3").arg(numFrames).arg(numBlocks).arg(framesPerBlock));
// The remainder is spread out by giving the first numBigBlocks blocks one
// extra frame each.
92 size_t numBigBlocks = numFrames % numBlocks;
95 report(QString(
"Allocating %1 big blocks outside of OpenCL").arg(numBigBlocks));
96 for (
unsigned int block = 0; block < numBigBlocks; block++)
98 framePointers[block].
length = (1 + framesPerBlock) * frameSize;
99 framePointers[block].
data =
new unsigned char[framePointers[block].
length];
// The remaining blocks hold exactly framesPerBlock frames.
103 report(QString(
"Allocating %1 small blocks outside of OpenCL").arg(numBlocks - numBigBlocks));
104 for (
int block = numBigBlocks; block < numBlocks; block++)
106 framePointers[block].
length = (framesPerBlock) * frameSize;
107 framePointers[block].
data =
new unsigned char[framePointers[block].
length];
// Copy pass: for each block, fill length / frameSize frame slots from the input.
111 unsigned int frame = 0;
112 for (
int block = 0; block < numBlocks; block++)
114 for (
unsigned int frameInThisBlock = 0; frameInThisBlock < framePointers[block].
length / frameSize; frameInThisBlock++)
116 memcpy(&(framePointers[block].data[frameInThisBlock * frameSize]), inputFrames->getFrame(frame), frameSize);
// Reconstruction driver: uploads the input blocks, plane matrices and mask to
// OpenCL buffers, sets the kernel arguments, runs the kernel and reads the
// output volume back into outputData.

// Forget the measurement names recorded by earlier profiling.
125 mMeasurementNames.clear();
// The third input dimension is used as the number of input images/planes.
132 size_t nPlanes_numberOfInputImages = input->getDimensions()[2];
// One read-only device buffer per input block, initialised from the host data
// (CL_MEM_COPY_HOST_PTR).
136 VECTOR_CLASS<cl::Buffer> clBlocks;
137 report(
"Allocating OpenCL input block buffers");
138 for (
int i = 0; i < numBlocks; i++)
141 cl::Buffer buffer = mOulContex->createBuffer(mOulContex->getContext(), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, inputBlocks[i].
length, inputBlocks[i].
data,
"block buffer "+QString::number(i).toStdString());
142 clBlocks.push_back(buffer);
145 int *outputDims = outputData->GetDimensions();
// Output volume size with one unsigned char per voxel.
// NOTE(review): the three int dimensions are multiplied before the size_t
// factor is applied, so the product is computed in int — confirm large
// volumes cannot overflow.
147 size_t outputVolumeSize = outputDims[0] * outputDims[1] * outputDims[2] *
sizeof(
unsigned char);
149 report(QString(
"Allocating CL output buffer, size %1").arg(outputVolumeSize));
// Estimate of total global memory use: 10 times the first block's length,
// the output volume, the plane matrices (16 floats per image) and a mask of
// one byte per input pixel.
// NOTE(review): the factor 10 presumably corresponds to the number of input
// blocks — confirm against numBlocks.
151 cl_ulong globalMemUse = 10 * inputBlocks[0].
length + outputVolumeSize +
sizeof(float) * 16 * nPlanes_numberOfInputImages +
sizeof(cl_uchar) * input->getDimensions()[0] * input->getDimensions()[1];
// Check the planned allocations against the device limits.
152 if(isUsingTooMuchMemory(outputVolumeSize, inputBlocks[0].
length, globalMemUse))
// Write-only device buffer that receives the reconstructed volume.
155 cl::Buffer outputBuffer = mOulContex->createBuffer(mOulContex->getContext(), CL_MEM_WRITE_ONLY, outputVolumeSize, NULL,
"output volume buffer");
// Host staging array for the 4x4 matrices, 16 floats per input image.
158 float *planeMatrices =
new float[16 * nPlanes_numberOfInputImages];
161 cl::Buffer clPlaneMatrices = mOulContex->createBuffer(mOulContex->getContext(), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, nPlanes_numberOfInputImages *
sizeof(float) * 16, planeMatrices,
"plane matrices buffer");
// Mask buffer copied from the input's mask image, one cl_uchar per mask pixel.
164 cl::Buffer clMask = mOulContex->createBuffer(mOulContex->getContext(), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
165 sizeof(cl_uchar) * input->getMask()->GetDimensions()[0] * input->getMask()->GetDimensions()[1],
166 input->getMask()->GetScalarPointer(),
"mask buffer");
// Output spacing is narrowed from double to float.
168 double *out_spacing = outputData->GetSpacing();
170 float f_out_spacings[3];
171 f_out_spacings[0] = out_spacing[0];
172 f_out_spacings[1] = out_spacing[1];
173 f_out_spacings[2] = out_spacing[2];
// Input spacing, x and y only.
176 spacings[0] = input->getSpacing()[0];
177 spacings[1] = input->getSpacing()[1];
// Four floats per input image for the plane equations.
180 size_t planes_eqs_size =
sizeof(cl_float)*4*nPlanes_numberOfInputImages;
// Start from the kernel's preferred work-group size multiple on device 0.
183 size_t local_work_size;
184 unsigned int deviceNumber = 0;
185 cl::Device device = mOulContex->getDevice(deviceNumber);
186 mKernel.getWorkGroupInfo(device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, &local_work_size);
// local_work_size is passed by value here, so any adjustment made inside the
// helper does not change the value used below.
188 size_t close_planes_size = this->calculateSpaceNeededForClosePlanes(mKernel, device, local_work_size, nPlanes_numberOfInputImages, nClosePlanes);
190 this->setKernelArguments(
198 input->getDimensions()[0],
199 input->getDimensions()[1],
210 report(QString(
"Using %1 as local workgroup size").arg(local_work_size));
// Global work size: the output dimensions, each padded by cube_dim, multiplied
// together and divided by cube_dim^3.
// NOTE(review): presumably one work item per cube of cube_dim^3 voxels — confirm
// against the kernel. The product is computed in int; check for overflow on
// large volumes.
214 int cube_dim_pow3 = cube_dim * cube_dim * cube_dim;
216 size_t global_work_size = (((outputDims[0] + cube_dim) * (outputDims[1] + cube_dim) * (outputDims[2] + cube_dim)) / cube_dim_pow3);
// Round the global size up to the next multiple of the local size.
219 if (global_work_size % local_work_size)
220 global_work_size = ((global_work_size / local_work_size) + 1) * local_work_size;
// Run the kernel on queue 0, then read the output buffer back into the VTK
// image's scalar storage; both steps go through the profiling wrappers.
222 unsigned int queueNumber = 0;
223 cl::CommandQueue queue = mOulContex->getQueue(queueNumber);
224 this->measureAndExecuteKernel(queue, mKernel, global_work_size, local_work_size, mKernelMeasurementName);
225 this->measureAndReadBuffer(queue, outputBuffer, outputVolumeSize, outputData->GetScalarPointer(),
"vnncl_read_buffer");
228 report(QString(
"Done, freeing GPU memory"));
// NOTE(review): planeMatrices is allocated with new[] above; no matching
// delete[] appears in this listing — confirm it is released.
230 delete[] inputBlocks;
// Writes one flattened 4x4 matrix (16 floats) per frame position into
// planeMatrices.
239 std::vector<TimedPosition> vecPosition = input->getFrames();
// Sanity check: the number of positions must equal input dimension 2.
242 if (input->getDimensions()[2] != vecPosition.end() - vecPosition.begin())
244 reportError(QString(
"Number of frames %1 != %2 dimension 2 of US input").arg(input->getDimensions()[2]).arg(vecPosition.end() - vecPosition.begin()));
249 for (std::vector<TimedPosition>::iterator it = vecPosition.begin(); it != vecPosition.end(); ++it)
// Elements are emitted in the order pos(0,0), pos(0,1), ..., pos(3,3): the
// first index advances every four elements, the second cycles 0..3.
254 for (
int j = 0; j < 16; j++)
256 planeMatrices[i++] = pos(j / 4, j % 4);
271 double totalExecutionTime = -1;
272 std::set<std::string>::iterator it;
273 for(it = mMeasurementNames.begin(); it != mMeasurementNames.end(); ++it)
275 oul::RuntimeMeasurement measurement = mRuntime->getTiming(*it);
276 totalExecutionTime += measurement.getSum();
278 return totalExecutionTime;
// Returns the accumulated time recorded under the kernel-execution
// measurement name (mKernelMeasurementName).
284 return mRuntime->getTiming(mKernelMeasurementName).getSum();
// Releases the data array of each of the numBlocks frame blocks with delete[].
// Only the per-block data arrays are freed here, not the framePointers array.
290 for (
int i = 0; i < numBlocks; i++)
292 delete[] framePointers[i].
data;
// Binds all kernel arguments in positional order. The order of the setArg
// calls below defines the argument indices, so it has to match the kernel's
// parameter list.
296 void VNNclAlgorithm::setKernelArguments(
301 float volume_xspacing,
302 float volume_yspacing,
303 float volume_zspacing,
308 std::vector<cl::Buffer>& blocks,
309 cl::Buffer out_volume,
310 cl::Buffer plane_matrices,
312 size_t plane_eqs_size,
313 size_t close_planes_size,
// Output volume size (x, y, z) and spacing (x, y, z).
317 kernel.setArg(arg++, volume_xsize);
318 kernel.setArg(arg++, volume_ysize);
319 kernel.setArg(arg++, volume_zsize);
320 kernel.setArg(arg++, volume_xspacing);
321 kernel.setArg(arg++, volume_yspacing);
322 kernel.setArg(arg++, volume_zspacing);
// Input image size and spacing (x, y).
323 kernel.setArg(arg++, in_xsize);
324 kernel.setArg(arg++, in_ysize);
325 kernel.setArg(arg++, in_xspacing);
326 kernel.setArg(arg++, in_yspacing);
// One argument per input block buffer, so the number of kernel arguments
// depends on blocks.size().
// NOTE(review): the loop compares a signed int with the unsigned blocks.size().
327 for (
int i = 0; i < blocks.size(); i++)
329 kernel.setArg(arg++, blocks[i]);
331 kernel.setArg(arg++, out_volume);
332 kernel.setArg(arg++, plane_matrices);
333 kernel.setArg(arg++, mask);
// Two local-memory arguments: only a size is passed, no host data.
334 kernel.setArg<cl::LocalSpaceArg>(arg++, cl::__local(plane_eqs_size));
335 kernel.setArg<cl::LocalSpaceArg>(arg++, cl::__local(close_planes_size));
336 kernel.setArg(arg++, radius);
// Computes the size of the local-memory area for close planes:
// (local memory needed per work item) * (chosen local work size).
//
// kernel, device               queried for work-group and local-memory limits
// local_work_size              starting work-group size; passed by value, so the
//                              adjusted value computed below is not seen by the caller
// nPlanes_numberOfInputImages  number of input images (4 floats of local memory each)
// nClosePlanes                 number of close planes per work item
// Returns varying_local_mem * local_work_size.
339 size_t VNNclAlgorithm::calculateSpaceNeededForClosePlanes(cl::Kernel kernel, cl::Device device,
size_t local_work_size,
size_t nPlanes_numberOfInputImages,
int nClosePlanes)
// Local memory available on the device.
342 size_t dev_local_mem_size;
343 dev_local_mem_size = device.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
// Largest work-group size the kernel can run with on this device.
346 size_t max_work_size;
347 kernel.getWorkGroupInfo(device, CL_KERNEL_WORK_GROUP_SIZE, &max_work_size);
// Local memory that does not depend on the work-group size.
350 size_t constant_local_mem =
sizeof(cl_float) * 4 * nPlanes_numberOfInputImages;
// Local memory per work item: one float, one short and two uchars for each of
// (nClosePlanes + 1) entries.
352 size_t varying_local_mem = (
sizeof(cl_float) +
sizeof(cl_short) +
sizeof(cl_uchar) +
sizeof(cl_uchar)) * (nClosePlanes + 1);
353 report(QString(
"Device has %1 bytes of local memory").arg(dev_local_mem_size));
// Subtract the constant part plus 128 extra bytes; the reason for the 128 is
// not evident from this code.
// NOTE(review): size_t subtraction — wraps around to a huge value if
// constant_local_mem + 128 exceeds the device's local memory.
354 dev_local_mem_size -= constant_local_mem + 128;
// Number of work items the remaining local memory can serve.
357 size_t maxItems = dev_local_mem_size / varying_local_mem;
// Number of whole starting work-group sizes that fit into maxItems.
359 int multiple = maxItems / local_work_size;
// NOTE(review): the two assignments below are presumably alternative branches
// (maxItems used when no whole multiple fits, the multiple otherwise) — confirm.
366 local_work_size = std::min(max_work_size, maxItems);
371 local_work_size = std::min(max_work_size, multiple * local_work_size);
374 size_t close_planes_size = varying_local_mem*local_work_size;
376 return close_planes_size;
379 bool VNNclAlgorithm::isUsingTooMuchMemory(
size_t outputVolumeSize,
size_t inputBlocksLength, cl_ulong globalMemUse)
381 bool usingTooMuchMemory =
false;
383 unsigned int deviceNumber = 0;
384 cl_ulong maxAllocSize = mOulContex->getDevice(deviceNumber).getInfo<CL_DEVICE_MAX_MEM_ALLOC_SIZE>();
385 cl_ulong globalMemSize = mOulContex->getDevice(deviceNumber).getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>();
386 if (maxAllocSize < outputVolumeSize)
388 reportError(QString(
"Output volume size too large! %1 > %2\n").arg(outputVolumeSize).arg(maxAllocSize));
389 usingTooMuchMemory =
true;
392 if (maxAllocSize < inputBlocksLength)
394 reportError(QString(
"Input blocks too large! %1 > %2\n").arg(inputBlocksLength).arg(maxAllocSize));
395 usingTooMuchMemory =
true;
398 if (globalMemSize < globalMemUse)
400 reportError(QString(
"Using too much global memory! %1 > %2").arg(globalMemUse).arg(globalMemSize));
401 usingTooMuchMemory =
true;
404 report(QString(
"Using %1 of %2 global memory").arg(globalMemUse).arg(globalMemSize));
405 return usingTooMuchMemory;
408 void VNNclAlgorithm::measureAndExecuteKernel(cl::CommandQueue queue, cl::Kernel kernel,
size_t global_work_size,
size_t local_work_size, std::string measurementName)
410 this->startProfiling(measurementName, queue);
411 mOulContex->executeKernel(queue, kernel, global_work_size, local_work_size);
412 this->stopProfiling(measurementName, queue);
415 void VNNclAlgorithm::measureAndReadBuffer(cl::CommandQueue queue, cl::Buffer outputBuffer,
size_t outputVolumeSize,
void *outputData, std::string measurementName)
417 this->startProfiling(measurementName, queue);
418 mOulContex->readBuffer(queue, outputBuffer, outputVolumeSize, outputData);
419 this->stopProfiling(measurementName, queue);
422 void VNNclAlgorithm::startProfiling(std::string name, cl::CommandQueue queue) {
423 if(!mRuntime->isEnabled())
426 mRuntime->startCLTimer(name, queue);
427 mMeasurementNames.insert(name);
430 void VNNclAlgorithm::stopProfiling(std::string name, cl::CommandQueue queue) {
431 if(!mRuntime->isEnabled())
434 mRuntime->stopCLTimer(name, queue);
435 mRuntime->printAll();
QString qstring_cast(const T &val)
void reportError(QString msg)
virtual bool initCL(QString kernelFile, int nMaxPlanes, int nPlanes, int method, int planeMethod, int nStarts, float brightnessWeight, float newnessWeight)
Transform3D Transform3D
Transform3D is a representation of an affine 3D transform.
double getTotalExecutionTime()
virtual void fillPlaneMatrices(float *planeMatrices, ProcessedUSInputDataPtr input)
virtual cl::Program buildCLProgram(std::string program_src, std::string kernelPath, int nMaxPlanes, int nPlanes, int method, int planeMethod, int nStarts, float brightnessWeight, float newnessWeight)
virtual bool reconstruct(ProcessedUSInputDataPtr input, vtkImageDataPtr outputData, float radius, int nClosePlanes)
virtual bool initializeFrameBlocks(frameBlock_t *framePointers, int numBlocks, ProcessedUSInputDataPtr inputFrames)
void setProfiling(bool on)
void setDeepModified(vtkImageDataPtr image)
RealScalar length() const
boost::shared_ptr< class ProcessedUSInputData > ProcessedUSInputDataPtr
virtual void freeFrameBlocks(frameBlock_t *framePointers, int numBlocks)
double getKernelExecutionTime()
Namespace for all CustusX production code.