This is an example of how to use Kdtree queries within SYCL kernels to speed-up the Ponca Fitting library.
#include <Ponca/Ponca>
#include <sycl/sycl.hpp>
#include "sycl_kdtree.h"
template <typename Fit, typename DataPoint, typename GPUBuffers>
typename DataPoint::Scalar test_fit(GPUBuffers buffers, int i, typename DataPoint::Scalar scale)
{
KdTreeGPU<DataPoint> kdtree(buffers);
typename DataPoint::VectorType pos = kdtree.points()[i].pos();
Fit fit;
fit.setNeighborFilter({pos, scale});
fit.computeWithIds(kdtree.rangeNeighbors(i, scale), kdtree.points());
if (!fit.isStable())
return NAN;
return fit.potential(pos);
}
template <typename Scalar, unsigned int Dim>
void testPlaneSycl(sycl::device device, const bool _bUnoriented = false, const bool _bAddPositionNoise = false,
const bool _bAddNormalNoise = false)
{
std::cout << "Running on: " << device.get_info<sycl::info::device::name>() << std::endl;
using VectorType = typename DataPoint::VectorType;
const unsigned int nbPoints = Eigen::internal::random<int>(100, 1000);
const Scalar width = Eigen::internal::random<Scalar>(1., 10.);
const Scalar height = width;
const Scalar analysisScale = Scalar(15.) * std::sqrt(width * height / nbPoints);
const Scalar centerScale = Eigen::internal::random<Scalar>(1, 10000);
const VectorType center = VectorType::Random() * centerScale;
const VectorType direction = VectorType::Random().normalized();
std::vector<DataPoint> points(nbPoints);
std::vector<Scalar> potentials(nbPoints);
for (unsigned int i = 0; i < nbPoints; ++i)
{
potentials[i] = NAN;
_bUnoriented);
}
auto queue = sycl::queue(device);
using BuffersGPU = typename KdTreeGPU<DataPoint>::Buffers;
BuffersGPU deviceKDTree;
deepCopyBuffersToDevice<Ponca::KdTreePointerTraits<DataPoint>>(queue, kdtree.buffers(), deviceKDTree);
DataPoint* devicePoints = sycl::malloc_device<DataPoint>(nbPoints, queue);
Scalar* devicePotentials = sycl::malloc_device<Scalar>(nbPoints, queue);
queue.memcpy(devicePoints, points.data(), nbPoints * sizeof(DataPoint));
queue.memcpy(devicePotentials, potentials.data(), nbPoints * sizeof(Scalar));
queue.wait();
auto env = queue.submit([&](sycl::handler& h) {
h.parallel_for(sycl::range<1>(nbPoints), [=](sycl::id<1> i) {
size_t idx = i[0];
devicePotentials[idx] = test_fit<MeanFitSmooth, DataPoint>(deviceKDTree, idx, analysisScale);
});
});
queue.memcpy(potentials.data(), devicePotentials, nbPoints * sizeof(Scalar), env).wait();
sycl::free(devicePotentials, queue);
sycl::free(devicePoints, queue);
}
int main(const int , char** )
{
testPlaneSycl<double, 3>(sycl::device(sycl::default_selector_v));
return 0;
}
Aggregator class used to declare specialized structures using CRTP.