54 typedef std::vector<Device> Devices;
63 template<
typename T,
typename CloudType>
68 for (
int i = 0; i < v.size(); ++i)
88 template<
typename T,
typename CloudType>
89 struct EnableCLTypeSupport {};
92 template<
typename CloudType>
93 struct EnableCLTypeSupport<float, CloudType>
96 static string code(
const cl::Device& device)
98 return "typedef float T;\n";
103 template<
typename CloudType>
104 struct EnableCLTypeSupport<double, CloudType>
108 static string code(
const cl::Device& device)
111 const string& exts(device.getInfo<CL_DEVICE_EXTENSIONS>());
114 if (exts.find(
"cl_khr_fp64") != string::npos)
115 s +=
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
116 else if (exts.find(
"cl_amd_fp64") != string::npos)
117 s +=
"#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n";
119 throw runtime_error(
"The OpenCL platform does not support 64 bits double-precision floating-points scalars.");
120 s +=
"typedef double T;\n";
129 typedef std::vector<cl::Device> Devices;
131 typedef std::map<std::string, cl::Program> ProgramCache;
135 ProgramCache cachedPrograms;
138 SourceCacher(
const cl_device_type deviceType)
141 vector<cl::Platform> platforms;
142 cl::Platform::get(&platforms);
143 if (platforms.empty())
144 throw runtime_error(
"No OpenCL platform found");
147 cl::Platform platform = platforms[0];
148 const char *userDefinedPlatform(getenv(
"NABO_OPENCL_USE_PLATFORM"));
149 if (userDefinedPlatform)
151 size_t userDefinedPlatformId = atoi(userDefinedPlatform);
152 if (userDefinedPlatformId < platforms.size())
153 platform = platforms[userDefinedPlatformId];
157 cl_context_properties properties[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform(), 0 };
158 bool deviceFound =
false;
160 context = cl::Context(deviceType, properties);
162 }
catch (
const cl::Error& e) {
163 cerr <<
"Cannot find device type " << deviceType <<
" for OpenCL, falling back to any device" << endl;
166 context = cl::Context(CL_DEVICE_TYPE_ALL, properties);
167 devices = context.getInfo<CL_CONTEXT_DEVICES>();
169 throw runtime_error(
"No devices on OpenCL platform");
175 cerr <<
"Destroying source cacher containing " << cachedPrograms.size() <<
" cached programs" << endl;
179 bool contains(
const std::string& source)
181 return cachedPrograms.find(source) != cachedPrograms.end();
190 typedef std::map<cl_device_type, SourceCacher*> Devices;
195 cerr <<
"Destroying CL context manager, used " << devices.size() <<
" contexts" << endl;
196 for (Devices::iterator it(devices.begin()); it != devices.end(); ++it)
200 cl::Context& createContext(
const cl_device_type deviceType)
202 std::lock_guard lock(mutex);
203 Devices::iterator it(devices.find(deviceType));
204 if (it == devices.end())
207 pair<cl_device_type, SourceCacher*>(deviceType,
new SourceCacher(deviceType))
210 return it->second->context;
213 SourceCacher* getSourceCacher(
const cl_device_type deviceType)
215 std::lock_guard lock(mutex);
216 Devices::iterator it(devices.find(deviceType));
217 if (it == devices.end())
218 throw runtime_error(
"Attempt to get source cacher before creating a context");
228 static ContextManager contextManager;
230 template<
typename T,
typename CloudType>
231 OpenCLSearch<T, CloudType>::OpenCLSearch(
const CloudType& cloud,
const Index dim,
const unsigned creationOptionFlags,
const cl_device_type deviceType):
233 deviceType(deviceType),
234 context(contextManager.createContext(deviceType))
238 template<
typename T,
typename CloudType>
239 void OpenCLSearch<T, CloudType>::initOpenCL(
const char* clFileName,
const char* kernelName,
const std::string& additionalDefines)
243 SourceCacher* sourceCacher(contextManager.getSourceCacher(deviceType));
244 SourceCacher::Devices& devices(sourceCacher->devices);
247 cl::Program::Sources sources;
250 oss << EnableCLTypeSupport<T, CloudType>::code(devices.back());
251 oss <<
"#define EPSILON " << numeric_limits<T>::epsilon() <<
"\n";
252 oss <<
"#define DIM_COUNT " << dim <<
"\n";
254 oss <<
"#define POINT_STRIDE " << cloud.stride() <<
"\n";
255 oss <<
"#define MAX_K " << MAX_K <<
"\n";
256 if (collectStatistics)
257 oss <<
"#define TOUCH_STATISTICS\n";
258 oss << additionalDefines;
261 const std::string& source(oss.str());
262 if (!sourceCacher->contains(source))
264 const size_t defLen(source.length());
265 char *defContent(
new char[defLen+1]);
266 strcpy(defContent, source.c_str());
267 sources.push_back(std::make_pair(defContent, defLen));
268 string sourceFileName(OPENCL_SOURCE_DIR);
269 sourceFileName += clFileName;
271 const char* files[] = {
272 OPENCL_SOURCE_DIR
"structure.cl",
273 OPENCL_SOURCE_DIR
"heap.cl",
274 sourceFileName.c_str(),
277 for (
const char** file = files; *file != NULL; ++file)
279 std::ifstream stream(*file);
281 throw runtime_error((
string(
"cannot open file: ") + *file));
283 stream.seekg(0, std::ios_base::end);
284 size_t size(stream.tellg());
285 stream.seekg(0, std::ios_base::beg);
287 char* content(
new char[size + 1]);
288 std::copy(std::istreambuf_iterator<char>(stream),
289 std::istreambuf_iterator<char>(), content);
290 content[size] =
'\0';
292 sources.push_back(std::make_pair(content, size));
294 sourceCacher->cachedPrograms[source] = cl::Program(context, sources);
295 cl::Program& program = sourceCacher->cachedPrograms[source];
298 cl::Error error(CL_SUCCESS);
300 program.build(devices);
301 }
catch (cl::Error e) {
306 for (cl::Devices::const_iterator it = devices.begin(); it != devices.end(); ++it)
308 cerr <<
"device : " << it->getInfo<CL_DEVICE_NAME>() <<
"\n";
309 cerr <<
"compilation log:\n" << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(*it) << endl;
312 for (cl::Program::Sources::iterator it = sources.begin(); it != sources.end(); ++it)
319 if (error.err() != CL_SUCCESS)
322 cl::Program& program = sourceCacher->cachedPrograms[source];
325 knnKernel = cl::Kernel(program, kernelName);
326 queue = cl::CommandQueue(context, devices.back());
329 if (!(cloud.Flags & Eigen::DirectAccessBit) || (cloud.Flags & Eigen::RowMajorBit))
330 throw runtime_error(
"wrong memory mapping in point cloud");
331 const size_t cloudCLSize(cloud.cols() * cloud.stride() *
sizeof(T));
332 cloudCL = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, cloudCLSize,
const_cast<T*
>(&cloud.coeff(0,0)));
333 knnKernel.setArg(0,
sizeof(cl_mem), &cloudCL);
336 template<
typename T,
typename CloudType>
337 unsigned long OpenCLSearch<T, CloudType>::knn(
const Matrix& query, IndexMatrix& indices, Matrix& dists2,
const Index k,
const T epsilon,
const unsigned optionFlags,
const T maxRadius)
const
339 checkSizesKnn(query, indices, dists2, k, optionFlags);
344 throw runtime_error(
"number of neighbors too large for OpenCL");
347 if (query.stride() != cloud.stride() ||
348 query.rows() != cloud.rows())
349 throw runtime_error(
"query is not of the same dimensionality as the point cloud");
352 if (!(query.Flags & Eigen::DirectAccessBit) || (query.Flags & Eigen::RowMajorBit))
353 throw runtime_error(
"wrong memory mapping in query data");
354 const size_t queryCLSize(query.cols() * query.stride() *
sizeof(T));
355 cl::Buffer queryCL(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, queryCLSize,
const_cast<T*
>(&query.coeff(0,0)));
356 knnKernel.setArg(1,
sizeof(cl_mem), &queryCL);
358 assert((indices.Flags & Eigen::DirectAccessBit) && (!(indices.Flags & Eigen::RowMajorBit)));
359 const int indexStride(indices.stride());
360 const size_t indicesCLSize(indices.cols() * indexStride *
sizeof(
int));
361 cl::Buffer indicesCL(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, indicesCLSize, &indices.coeffRef(0,0));
362 knnKernel.setArg(2,
sizeof(cl_mem), &indicesCL);
364 assert((dists2.Flags & Eigen::DirectAccessBit) && (!(dists2.Flags & Eigen::RowMajorBit)));
365 const int dists2Stride(dists2.stride());
366 const size_t dists2CLSize(dists2.cols() * dists2Stride *
sizeof(T));
367 cl::Buffer dists2CL(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, dists2CLSize, &dists2.coeffRef(0,0));
368 knnKernel.setArg(3,
sizeof(cl_mem), &dists2CL);
371 knnKernel.setArg(4, k);
372 knnKernel.setArg(5, (1 + epsilon)*(1 + epsilon));
373 knnKernel.setArg(6, maxRadius*maxRadius);
374 knnKernel.setArg(7, optionFlags);
375 knnKernel.setArg(8, indexStride);
376 knnKernel.setArg(9, dists2Stride);
377 knnKernel.setArg(10, cl_uint(cloud.cols()));
380 vector<cl_uint> visitCounts;
381 const size_t visitCountCLSize(query.cols() *
sizeof(cl_uint));
382 cl::Buffer visitCountCL;
383 if (collectStatistics)
385 visitCounts.resize(query.cols());
386 visitCountCL = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, visitCountCLSize, &visitCounts[0]);
387 knnKernel.setArg(11,
sizeof(cl_mem), &visitCountCL);
391 queue.enqueueNDRangeKernel(knnKernel, cl::NullRange, cl::NDRange(query.cols()), cl::NullRange);
392 queue.enqueueMapBuffer(indicesCL,
true, CL_MAP_READ, 0, indicesCLSize, 0, 0);
393 queue.enqueueMapBuffer(dists2CL,
true, CL_MAP_READ, 0, dists2CLSize, 0, 0);
394 if (collectStatistics)
395 queue.enqueueMapBuffer(visitCountCL,
true, CL_MAP_READ, 0, visitCountCLSize, 0, 0);
399 if (collectStatistics)
401 unsigned long totalVisitCounts(0);
402 for (
size_t i = 0; i < visitCounts.size(); ++i)
403 totalVisitCounts += (
unsigned long)visitCounts[i];
404 return totalVisitCounts;
410 template<
typename T,
typename CloudType>
411 BruteForceSearchOpenCL<T, CloudType>::BruteForceSearchOpenCL(
const CloudType& cloud,
const Index dim,
const unsigned creationOptionFlags,
const cl_device_type deviceType):
412 OpenCLSearch<T, CloudType>::OpenCLSearch(cloud, dim, creationOptionFlags, deviceType)
415 const_cast<Vector&
>(this->
minBound) = cloud.topRows(this->dim).rowwise().minCoeff();
416 const_cast<Vector&
>(this->
maxBound) = cloud.topRows(this->dim).rowwise().maxCoeff();
419 for (
int i = 0; i <
cloud.cols(); ++i)
427 initOpenCL(
"knn_bf.cl",
"knnBruteForce");
430 template struct BruteForceSearchOpenCL<float>;
431 template struct BruteForceSearchOpenCL<double>;
432 template struct BruteForceSearchOpenCL<float, Eigen::Matrix3Xf>;
433 template struct BruteForceSearchOpenCL<double, Eigen::Matrix3Xd>;
434 template struct BruteForceSearchOpenCL<float, Eigen::Map<const Eigen::Matrix3Xf, Eigen::Aligned> >;
435 template struct BruteForceSearchOpenCL<double, Eigen::Map<const Eigen::Matrix3Xd, Eigen::Aligned> >;
439 template<
typename T,
typename CloudType>
440 size_t KDTreeBalancedPtInLeavesStackOpenCL<T, CloudType>::getTreeSize(
size_t elCount)
const
449 if (elCount & (1 << i))
452 for (
int j = 0; j <= i; ++j)
459 template<
typename T,
typename CloudType>
460 size_t KDTreeBalancedPtInLeavesStackOpenCL<T, CloudType>::getTreeDepth(
size_t elCount)
const
468 if (elCount & (1 << i))
474 template<
typename T,
typename CloudType>
475 void KDTreeBalancedPtInLeavesStackOpenCL<T, CloudType>::buildNodes(
const BuildPointsIt first,
const BuildPointsIt last,
const size_t pos,
const Vector minValues,
const Vector maxValues)
477 const size_t count(last - first);
481 const int d = -2-(first->index);
482 assert(pos < nodes.size());
483 nodes[pos] = Node(d);
488 size_t cutDim = argMax<T, CloudType>(maxValues - minValues);
491 const size_t rightCount(count/2);
492 const size_t leftCount(count - rightCount);
493 assert(last - rightCount == first + leftCount);
496 nth_element(first, first + leftCount, last, CompareDim(cutDim));
499 const T cutVal((first+leftCount)->pos.coeff(cutDim));
500 nodes[pos] = Node(cutDim, cutVal);
505 Vector leftMaxValues(maxValues);
506 leftMaxValues[cutDim] = cutVal;
508 Vector rightMinValues(minValues);
509 rightMinValues[cutDim] = cutVal;
512 buildNodes(first, first + leftCount, childLeft(pos), minValues, leftMaxValues);
513 buildNodes(first + leftCount, last, childRight(pos), rightMinValues, maxValues);
516 template<
typename T,
typename CloudType>
517 KDTreeBalancedPtInLeavesStackOpenCL<T, CloudType>::KDTreeBalancedPtInLeavesStackOpenCL(
const CloudType& cloud,
const Index dim,
const unsigned creationOptionFlags,
const cl_device_type deviceType):
518 OpenCLSearch<T, CloudType>::OpenCLSearch(cloud, dim, creationOptionFlags, deviceType)
524 buildPoints.reserve(
cloud.cols());
525 for (
int i = 0; i <
cloud.cols(); ++i)
528 buildPoints.push_back(BuildPoint(v, i));
541 const unsigned maxStackDepth(getTreeDepth(
nodes.size()) + 1);
544 initOpenCL(
"knn_kdtree_pt_in_leaves.cl",
"knnKDTree",
"#define MAX_STACK_DEPTH " + std::to_string(maxStackDepth) +
"\n");
547 const size_t nodesCLSize(
nodes.size() *
sizeof(Node));
548 nodesCL = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, nodesCLSize, &
nodes[0]);
549 if (collectStatistics)
550 knnKernel.setArg(12,
sizeof(cl_mem), &nodesCL);
552 knnKernel.setArg(11,
sizeof(cl_mem), &nodesCL);
555 template struct KDTreeBalancedPtInLeavesStackOpenCL<float>;
556 template struct KDTreeBalancedPtInLeavesStackOpenCL<double>;
557 template struct KDTreeBalancedPtInLeavesStackOpenCL<float, Eigen::Matrix3Xf>;
558 template struct KDTreeBalancedPtInLeavesStackOpenCL<double, Eigen::Matrix3Xd>;
559 template struct KDTreeBalancedPtInLeavesStackOpenCL<float, Eigen::Map<const Eigen::Matrix3Xf, Eigen::Aligned> >;
560 template struct KDTreeBalancedPtInLeavesStackOpenCL<double, Eigen::Map<const Eigen::Matrix3Xd, Eigen::Aligned> >;
563 template<
typename T,
typename CloudType>
564 size_t KDTreeBalancedPtInNodesStackOpenCL<T, CloudType>::getTreeSize(
size_t elCount)
const
571 if (elCount & (1 << i))
574 for (
int j = 0; j <= i; ++j)
580 template<
typename T,
typename CloudType>
581 size_t KDTreeBalancedPtInNodesStackOpenCL<T, CloudType>::getTreeDepth(
size_t elCount)
const
587 if (elCount & (1 << i))
593 template<
typename T,
typename CloudType>
594 void KDTreeBalancedPtInNodesStackOpenCL<T, CloudType>::buildNodes(
const BuildPointsIt first,
const BuildPointsIt last,
const size_t pos,
const Vector minValues,
const Vector maxValues)
596 const size_t count(last - first);
600 nodes[pos] = Node(-1, *first);
605 const size_t cutDim = argMax<T, CloudType>(maxValues - minValues);
608 const size_t recurseCount(count-1);
609 const size_t rightCount(recurseCount/2);
610 const size_t leftCount(recurseCount-rightCount);
611 assert(last - rightCount == first + leftCount + 1);
614 nth_element(first, first + leftCount, last, CompareDim(cloud, cutDim));
617 const Index index(*(first+leftCount));
618 const T cutVal(cloud.coeff(cutDim, index));
619 nodes[pos] = Node(cutDim, index);
624 Vector leftMaxValues(maxValues);
625 leftMaxValues[cutDim] = cutVal;
627 Vector rightMinValues(minValues);
628 rightMinValues[cutDim] = cutVal;
633 buildNodes(first, first + leftCount, childLeft(pos), minValues, leftMaxValues);
634 buildNodes(first + leftCount + 1, last, childRight(pos), rightMinValues, maxValues);
638 nodes[childLeft(pos)] = Node(-1, *first);
639 nodes[childRight(pos)] = Node(-2, 0);
643 template<
typename T,
typename CloudType>
644 KDTreeBalancedPtInNodesStackOpenCL<T, CloudType>::KDTreeBalancedPtInNodesStackOpenCL(
const CloudType& cloud,
const Index dim,
const unsigned creationOptionFlags,
const cl_device_type deviceType):
645 OpenCLSearch<T, CloudType>::OpenCLSearch(cloud, dim, creationOptionFlags, deviceType)
651 buildPoints.reserve(
cloud.cols());
652 for (
int i = 0; i <
cloud.cols(); ++i)
654 buildPoints.push_back(i);
668 const unsigned maxStackDepth(getTreeDepth(
nodes.size()) + 1);
671 initOpenCL(
"knn_kdtree_pt_in_nodes.cl",
"knnKDTree",
"#define MAX_STACK_DEPTH " + std::to_string(maxStackDepth) +
"\n");
674 const size_t nodesCLSize(
nodes.size() *
sizeof(Node));
675 nodesCL = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, nodesCLSize, &
nodes[0]);
676 if (collectStatistics)
677 knnKernel.setArg(12,
sizeof(cl_mem), &nodesCL);
679 knnKernel.setArg(11,
sizeof(cl_mem), &nodesCL);
682 template struct KDTreeBalancedPtInNodesStackOpenCL<float>;
683 template struct KDTreeBalancedPtInNodesStackOpenCL<double>;
684 template struct KDTreeBalancedPtInNodesStackOpenCL<float, Eigen::Matrix3Xf>;
685 template struct KDTreeBalancedPtInNodesStackOpenCL<double, Eigen::Matrix3Xd>;
686 template struct KDTreeBalancedPtInNodesStackOpenCL<float, Eigen::Map<const Eigen::Matrix3Xf, Eigen::Aligned> >;
687 template struct KDTreeBalancedPtInNodesStackOpenCL<double, Eigen::Map<const Eigen::Matrix3Xd, Eigen::Aligned> >;
692 #endif // HAVE_OPENCL