54         typedef std::vector<Device> Devices;
 
   63         template<
typename T, 
typename CloudType>
 
   68                 for (
int i = 0; i < v.size(); ++i)
 
   88         template<
typename T, 
typename CloudType>
 
   89         struct EnableCLTypeSupport {};
 
   92         template<
typename CloudType>
 
   93         struct EnableCLTypeSupport<float, CloudType>
 
   96                 static string code(
const cl::Device& device)
 
   98                         return "typedef float T;\n";
 
  103         template<
typename CloudType>
 
  104         struct EnableCLTypeSupport<double, CloudType>
 
  108                 static string code(
const cl::Device& device)
 
  111                         const string& exts(device.getInfo<CL_DEVICE_EXTENSIONS>());
 
  114                         if (exts.find(
"cl_khr_fp64") != string::npos)
 
  115                                 s += 
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
 
  116                         else if (exts.find(
"cl_amd_fp64") != string::npos)
 
  117                                 s += 
"#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n";
 
  119                                 throw runtime_error(
"The OpenCL platform does not support 64 bits double-precision floating-points scalars.");
 
  120                         s += 
"typedef double T;\n";
 
  129                 typedef std::vector<cl::Device> Devices;
 
  131                 typedef std::map<std::string, cl::Program> ProgramCache;
 
  135                 ProgramCache cachedPrograms; 
 
  138                 SourceCacher(
const cl_device_type deviceType)
 
  141                         vector<cl::Platform> platforms;
 
  143                         if (platforms.empty())
 
  144                                 throw runtime_error(
"No OpenCL platform found");
 
  147                         cl::Platform platform = platforms[0];
 
  148                         const char *userDefinedPlatform(getenv(
"NABO_OPENCL_USE_PLATFORM"));
 
  149                         if (userDefinedPlatform)
 
  151                                 size_t userDefinedPlatformId = atoi(userDefinedPlatform);
 
  152                                 if (userDefinedPlatformId < platforms.size())
 
  153                                         platform = platforms[userDefinedPlatformId];
 
  157                         cl_context_properties properties[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform(), 0 };
 
  158                         bool deviceFound = 
false;
 
  160                                 context = cl::Context(deviceType, properties);
 
  162                         } 
catch (
const cl::Error& e) {
 
  163                                 cerr << 
"Cannot find device type " << deviceType << 
" for OpenCL, falling back to any device" << endl;
 
  166                                 context = cl::Context(CL_DEVICE_TYPE_ALL, properties);
 
  167                         devices = context.getInfo<CL_CONTEXT_DEVICES>();
 
  169                                 throw runtime_error(
"No devices on OpenCL platform");
 
  175                         cerr << 
"Destroying source cacher containing " << cachedPrograms.size() << 
" cached programs" << endl;
 
  181                         return cachedPrograms.find(source) != cachedPrograms.end();
 
  190                 typedef std::map<cl_device_type, SourceCacher*> Devices;
 
  195                         cerr << 
"Destroying CL context manager, used " << devices.size() << 
" contexts" << endl;
 
  196                         for (Devices::iterator it(devices.begin()); it != devices.end(); ++it)
 
  200                 cl::Context& createContext(
const cl_device_type deviceType)
 
  202                         std::lock_guard lock(mutex);
 
  203                         Devices::iterator it(devices.find(deviceType));
 
  204                         if (it == devices.end())
 
  207                                         pair<cl_device_type, SourceCacher*>(deviceType, 
new SourceCacher(deviceType))
 
  210                         return it->second->context;
 
  213                 SourceCacher* getSourceCacher(
const cl_device_type deviceType)
 
  215                         std::lock_guard lock(mutex);
 
  216                         Devices::iterator it(devices.find(deviceType));
 
  217                         if (it == devices.end())
 
  218                                 throw runtime_error(
"Attempt to get source cacher before creating a context");
 
  228         static ContextManager contextManager;
 
  230         template<
typename T, 
typename CloudType>
 
  231         OpenCLSearch<T, CloudType>::OpenCLSearch(
const CloudType& cloud, 
const Index dim, 
const unsigned creationOptionFlags, 
const cl_device_type deviceType):
 
  233                 deviceType(deviceType),
 
  234                 context(contextManager.createContext(deviceType))
 
  238         template<
typename T, 
typename CloudType>
 
  239         void OpenCLSearch<T, CloudType>::initOpenCL(
const char* clFileName, 
const char* kernelName, 
const std::string& additionalDefines)
 
  243                 SourceCacher* sourceCacher(contextManager.getSourceCacher(deviceType));
 
  244                 SourceCacher::Devices& devices(sourceCacher->devices);
 
  247                 cl::Program::Sources sources;
 
  250                 oss << EnableCLTypeSupport<T, CloudType>::code(devices.back());
 
  251                 oss << 
"#define EPSILON " << numeric_limits<T>::epsilon() << 
"\n";
 
  252                 oss << 
"#define DIM_COUNT " << dim << 
"\n";
 
  254                 oss << 
"#define POINT_STRIDE " << cloud.stride() << 
"\n";
 
  255                 oss << 
"#define MAX_K " << MAX_K << 
"\n";
 
  256                 if (collectStatistics)
 
  257                         oss << 
"#define TOUCH_STATISTICS\n";
 
  258                 oss << additionalDefines;
 
  262                 if (!sourceCacher->contains(source))
 
  264                         const size_t defLen(source.length());
 
  265                         char *defContent(
new char[defLen+1]);
 
  266                         strcpy(defContent, source.c_str());
 
  267                         sources.push_back(std::make_pair(defContent, defLen));
 
  268                         string sourceFileName(OPENCL_SOURCE_DIR);
 
  269                         sourceFileName += clFileName;
 
  271                         const char* files[] = {
 
  272                                 OPENCL_SOURCE_DIR 
"structure.cl",
 
  273                                 OPENCL_SOURCE_DIR 
"heap.cl",
 
  274                                 sourceFileName.c_str(),
 
  277                         for (
const char** file = files; *file != NULL; ++file)
 
  279                                 std::ifstream stream(*file);
 
  281                                         throw runtime_error((
string(
"cannot open file: ") + *file));
 
  283                                 stream.seekg(0, std::ios_base::end);
 
  284                                 size_t size(stream.tellg());
 
  285                                 stream.seekg(0, std::ios_base::beg);
 
  287                                 char* content(
new char[size + 1]);
 
  288                                 std::copy(std::istreambuf_iterator<char>(stream),
 
  289                                                         std::istreambuf_iterator<char>(), content);
 
  290                                 content[size] = 
'\0';
 
  292                                 sources.push_back(std::make_pair(content, size));
 
  294                         sourceCacher->cachedPrograms[source] = cl::Program(context, sources);
 
  295                         cl::Program& program = sourceCacher->cachedPrograms[source];
 
  298                         cl::Error error(CL_SUCCESS);
 
  300                                 program.build(devices);
 
  301                         } 
catch (cl::Error e) {
 
  306                         for (cl::Devices::const_iterator it = devices.begin(); it != devices.end(); ++it)
 
  308                                 cerr << 
"device : " << it->getInfo<CL_DEVICE_NAME>() << 
"\n";
 
  309                                 cerr << 
"compilation log:\n" << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(*it) << endl;
 
  312                         for (cl::Program::Sources::iterator it = sources.begin(); it != sources.end(); ++it)
 
  319                         if (error.err() != CL_SUCCESS)
 
  322                 cl::Program& program = sourceCacher->cachedPrograms[source];
 
  325                 knnKernel = cl::Kernel(program, kernelName); 
 
  326                 queue = cl::CommandQueue(context, devices.back());
 
  329                 if (!(cloud.Flags & Eigen::DirectAccessBit) || (cloud.Flags & Eigen::RowMajorBit))
 
  330                         throw runtime_error(
"wrong memory mapping in point cloud");
 
  331                 const size_t cloudCLSize(cloud.cols() * cloud.stride() * 
sizeof(T));
 
  332                 cloudCL = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, cloudCLSize, 
const_cast<T*
>(&cloud.coeff(0,0)));
 
  333                 knnKernel.setArg(0, 
sizeof(cl_mem), &cloudCL);
 
  336         template<
typename T, 
typename CloudType>
 
  337         unsigned long OpenCLSearch<T, CloudType>::knn(
const Matrix& query, IndexMatrix& indices, Matrix& dists2, 
const Index k, 
const T epsilon, 
const unsigned optionFlags, 
const T maxRadius)
 const 
  339                 checkSizesKnn(query, indices, dists2, k, optionFlags);
 
  344                         throw runtime_error(
"number of neighbors too large for OpenCL");
 
  347                 if (query.stride() != cloud.stride() ||
 
  348                         query.rows() != cloud.rows())
 
  349                         throw runtime_error(
"query is not of the same dimensionality as the point cloud");
 
  352                 if (!(query.Flags & Eigen::DirectAccessBit) || (query.Flags & Eigen::RowMajorBit))
 
  353                         throw runtime_error(
"wrong memory mapping in query data");
 
  354                 const size_t queryCLSize(query.cols() * query.stride() * 
sizeof(T));
 
  355                 cl::Buffer queryCL(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, queryCLSize, 
const_cast<T*
>(&query.coeff(0,0)));
 
  356                 knnKernel.setArg(1, 
sizeof(cl_mem), &queryCL);
 
  358                 assert((indices.Flags & Eigen::DirectAccessBit) && (!(indices.Flags & Eigen::RowMajorBit)));
 
  359                 const int indexStride(indices.stride());
 
  360                 const size_t indicesCLSize(indices.cols() * indexStride * 
sizeof(
int));
 
  361                 cl::Buffer indicesCL(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, indicesCLSize, &indices.coeffRef(0,0));
 
  362                 knnKernel.setArg(2, 
sizeof(cl_mem), &indicesCL);
 
  364                 assert((dists2.Flags & Eigen::DirectAccessBit) && (!(dists2.Flags & Eigen::RowMajorBit)));
 
  365                 const int dists2Stride(dists2.stride());
 
  366                 const size_t dists2CLSize(dists2.cols() * dists2Stride * 
sizeof(T));
 
  367                 cl::Buffer dists2CL(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, dists2CLSize, &dists2.coeffRef(0,0));
 
  368                 knnKernel.setArg(3, 
sizeof(cl_mem), &dists2CL);
 
  371                 knnKernel.setArg(4, k);
 
  372                 knnKernel.setArg(5, (1 + epsilon)*(1 + epsilon));
 
  373                 knnKernel.setArg(6, maxRadius*maxRadius);
 
  374                 knnKernel.setArg(7, optionFlags);
 
  375                 knnKernel.setArg(8, indexStride);
 
  376                 knnKernel.setArg(9, dists2Stride);
 
  377                 knnKernel.setArg(10, cl_uint(cloud.cols()));
 
  380                 vector<cl_uint> visitCounts;
 
  381                 const size_t visitCountCLSize(query.cols() * 
sizeof(cl_uint));
 
  382                 cl::Buffer visitCountCL;
 
  383                 if (collectStatistics)
 
  385                         visitCounts.resize(query.cols());
 
  386                         visitCountCL = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, visitCountCLSize, &visitCounts[0]);
 
  387                         knnKernel.setArg(11, 
sizeof(cl_mem), &visitCountCL);
 
  391                 queue.enqueueNDRangeKernel(knnKernel, cl::NullRange, cl::NDRange(query.cols()), cl::NullRange);
 
  392                 queue.enqueueMapBuffer(indicesCL, 
true, CL_MAP_READ, 0, indicesCLSize, 0, 0);
 
  393                 queue.enqueueMapBuffer(dists2CL, 
true, CL_MAP_READ, 0, dists2CLSize, 0, 0);
 
  394                 if (collectStatistics)
 
  395                         queue.enqueueMapBuffer(visitCountCL, 
true, CL_MAP_READ, 0, visitCountCLSize, 0, 0);
 
  399                 if (collectStatistics)
 
  401                         unsigned long totalVisitCounts(0);
 
  402                         for (
size_t i = 0; i < visitCounts.size(); ++i)
 
  403                                 totalVisitCounts += (
unsigned long)visitCounts[i];
 
  404                         return totalVisitCounts;
 
  410         template<
typename T, 
typename CloudType>
 
  411         BruteForceSearchOpenCL<T, CloudType>::BruteForceSearchOpenCL(
const CloudType& cloud, 
const Index dim, 
const unsigned creationOptionFlags, 
const cl_device_type deviceType):
 
  412         OpenCLSearch<T, CloudType>::OpenCLSearch(cloud, dim, creationOptionFlags, deviceType)
 
  415                 const_cast<Vector&
>(this->
minBound) = cloud.topRows(this->dim).rowwise().minCoeff();
 
  416                 const_cast<Vector&
>(this->
maxBound) = cloud.topRows(this->dim).rowwise().maxCoeff();
 
  419                 for (
int i = 0; i < 
cloud.cols(); ++i)
 
  427                 initOpenCL(
"knn_bf.cl", 
"knnBruteForce");
 
  430         template struct BruteForceSearchOpenCL<float>;
 
  431         template struct BruteForceSearchOpenCL<double>;
 
  432         template struct BruteForceSearchOpenCL<float, Eigen::Matrix3Xf>;
 
  433         template struct BruteForceSearchOpenCL<double, Eigen::Matrix3Xd>;
 
  434         template struct BruteForceSearchOpenCL<float, Eigen::Map<const Eigen::Matrix3Xf, Eigen::Aligned> >;
 
  435         template struct BruteForceSearchOpenCL<double, Eigen::Map<const Eigen::Matrix3Xd, Eigen::Aligned> >;
 
  439         template<
typename T, 
typename CloudType>
 
  440         size_t KDTreeBalancedPtInLeavesStackOpenCL<T, CloudType>::getTreeSize(
size_t elCount)
 const 
  449                         if (elCount & (1 << i))
 
  452                 for (
int j = 0; j <= i; ++j)
 
  459         template<
typename T, 
typename CloudType>
 
  460         size_t KDTreeBalancedPtInLeavesStackOpenCL<T, CloudType>::getTreeDepth(
size_t elCount)
 const 
  468                         if (elCount & (1 << i))
 
  474         template<
typename T, 
typename CloudType>
 
  475         void KDTreeBalancedPtInLeavesStackOpenCL<T, CloudType>::buildNodes(
const BuildPointsIt first, 
const BuildPointsIt last, 
const size_t pos, 
const Vector minValues, 
const Vector maxValues)
 
  477                 const size_t count(last - first);
 
  481                         const int d = -2-(first->index);
 
  482                         assert(pos < nodes.size());
 
  483                         nodes[pos] = Node(d);
 
  488                 size_t cutDim = argMax<T, CloudType>(maxValues - minValues);
 
  491                 const size_t rightCount(count/2);
 
  492                 const size_t leftCount(count - rightCount);
 
  493                 assert(last - rightCount == first + leftCount);
 
  496                 nth_element(first, first + leftCount, last, CompareDim(cutDim));
 
  499                 const T cutVal((first+leftCount)->pos.coeff(cutDim));
 
  500                 nodes[pos] = Node(cutDim, cutVal);
 
  505                 Vector leftMaxValues(maxValues);
 
  506                 leftMaxValues[cutDim] = cutVal;
 
  508                 Vector rightMinValues(minValues);
 
  509                 rightMinValues[cutDim] = cutVal;
 
  512                 buildNodes(first, first + leftCount, childLeft(pos), minValues, leftMaxValues);
 
  513                 buildNodes(first + leftCount, last, childRight(pos), rightMinValues, maxValues);
 
  516         template<
typename T, 
typename CloudType>
 
  517         KDTreeBalancedPtInLeavesStackOpenCL<T, CloudType>::KDTreeBalancedPtInLeavesStackOpenCL(
const CloudType& cloud, 
const Index dim, 
const unsigned creationOptionFlags, 
const cl_device_type deviceType):
 
  518                 OpenCLSearch<T, CloudType>::OpenCLSearch(cloud, dim, creationOptionFlags, deviceType)
 
  524                 buildPoints.reserve(
cloud.cols());
 
  525                 for (
int i = 0; i < 
cloud.cols(); ++i)
 
  528                         buildPoints.push_back(BuildPoint(v, i));
 
  541                 const unsigned maxStackDepth(getTreeDepth(
nodes.size()) + 1);
 
  544                 initOpenCL(
"knn_kdtree_pt_in_leaves.cl", 
"knnKDTree", 
"#define MAX_STACK_DEPTH " + std::to_string(maxStackDepth) + 
"\n");
 
  547                 const size_t nodesCLSize(
nodes.size() * 
sizeof(Node));
 
  548                 nodesCL = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, nodesCLSize, &
nodes[0]);
 
  549                 if (collectStatistics)
 
  550                         knnKernel.setArg(12, 
sizeof(cl_mem), &nodesCL);
 
  552                         knnKernel.setArg(11, 
sizeof(cl_mem), &nodesCL);
 
  555         template struct KDTreeBalancedPtInLeavesStackOpenCL<float>;
 
  556         template struct KDTreeBalancedPtInLeavesStackOpenCL<double>;
 
  557         template struct KDTreeBalancedPtInLeavesStackOpenCL<float, Eigen::Matrix3Xf>;
 
  558         template struct KDTreeBalancedPtInLeavesStackOpenCL<double, Eigen::Matrix3Xd>;
 
  559         template struct KDTreeBalancedPtInLeavesStackOpenCL<float, Eigen::Map<const Eigen::Matrix3Xf, Eigen::Aligned> >;
 
  560         template struct KDTreeBalancedPtInLeavesStackOpenCL<double, Eigen::Map<const Eigen::Matrix3Xd, Eigen::Aligned> >;
 
  563         template<
typename T, 
typename CloudType>
 
  564         size_t KDTreeBalancedPtInNodesStackOpenCL<T, CloudType>::getTreeSize(
size_t elCount)
 const 
  571                         if (elCount & (1 << i))
 
  574                 for (
int j = 0; j <= i; ++j)
 
  580         template<
typename T, 
typename CloudType>
 
  581         size_t KDTreeBalancedPtInNodesStackOpenCL<T, CloudType>::getTreeDepth(
size_t elCount)
 const 
  587                         if (elCount & (1 << i))
 
  593         template<
typename T, 
typename CloudType>
 
  594         void KDTreeBalancedPtInNodesStackOpenCL<T, CloudType>::buildNodes(
const BuildPointsIt first, 
const BuildPointsIt last, 
const size_t pos, 
const Vector minValues, 
const Vector maxValues)
 
  596                 const size_t count(last - first);
 
  600                         nodes[pos] = Node(-1, *first);
 
  605                 const size_t cutDim = argMax<T, CloudType>(maxValues - minValues);
 
  608                 const size_t recurseCount(count-1);
 
  609                 const size_t rightCount(recurseCount/2);
 
  610                 const size_t leftCount(recurseCount-rightCount);
 
  611                 assert(last - rightCount == first + leftCount + 1);
 
  614                 nth_element(first, first + leftCount, last, CompareDim(cloud, cutDim));
 
  617                 const Index index(*(first+leftCount));
 
  618                 const T cutVal(cloud.coeff(cutDim, index));
 
  619                 nodes[pos] = Node(cutDim, index);
 
  624                 Vector leftMaxValues(maxValues);
 
  625                 leftMaxValues[cutDim] = cutVal;
 
  627                 Vector rightMinValues(minValues);
 
  628                 rightMinValues[cutDim] = cutVal;
 
  633                         buildNodes(first, first + leftCount, childLeft(pos), minValues, leftMaxValues);
 
  634                         buildNodes(first + leftCount + 1, last, childRight(pos), rightMinValues, maxValues);
 
  638                         nodes[childLeft(pos)] = Node(-1, *first);
 
  639                         nodes[childRight(pos)] = Node(-2, 0);
 
  643         template<
typename T, 
typename CloudType>
 
  644         KDTreeBalancedPtInNodesStackOpenCL<T, CloudType>::KDTreeBalancedPtInNodesStackOpenCL(
const CloudType& cloud, 
const Index dim, 
const unsigned creationOptionFlags, 
const cl_device_type deviceType):
 
  645         OpenCLSearch<T, CloudType>::OpenCLSearch(cloud, dim, creationOptionFlags, deviceType)
 
  651                 buildPoints.reserve(
cloud.cols());
 
  652                 for (
int i = 0; i < 
cloud.cols(); ++i)
 
  654                         buildPoints.push_back(i);
 
  668                 const unsigned maxStackDepth(getTreeDepth(
nodes.size()) + 1);
 
  671                 initOpenCL(
"knn_kdtree_pt_in_nodes.cl", 
"knnKDTree", 
"#define MAX_STACK_DEPTH " + std::to_string(maxStackDepth) + 
"\n");
 
  674                 const size_t nodesCLSize(
nodes.size() * 
sizeof(Node));
 
  675                 nodesCL = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, nodesCLSize, &
nodes[0]);
 
  676                 if (collectStatistics)
 
  677                         knnKernel.setArg(12, 
sizeof(cl_mem), &nodesCL);
 
  679                         knnKernel.setArg(11, 
sizeof(cl_mem), &nodesCL);
 
  682         template struct KDTreeBalancedPtInNodesStackOpenCL<float>;
 
  683         template struct KDTreeBalancedPtInNodesStackOpenCL<double>;
 
  684         template struct KDTreeBalancedPtInNodesStackOpenCL<float, Eigen::Matrix3Xf>;
 
  685         template struct KDTreeBalancedPtInNodesStackOpenCL<double, Eigen::Matrix3Xd>;
 
  686         template struct KDTreeBalancedPtInNodesStackOpenCL<float, Eigen::Map<const Eigen::Matrix3Xf, Eigen::Aligned> >;
 
  687         template struct KDTreeBalancedPtInNodesStackOpenCL<double, Eigen::Map<const Eigen::Matrix3Xd, Eigen::Aligned> >;
 
  692 #endif // HAVE_OPENCL