#ifndef EIGEN_TEST_CUDA_COMMON_H
#define EIGEN_TEST_CUDA_COMMON_H

#include <cuda_runtime.h>
#include <cuda_runtime_api.h>

// Host-side runner: invokes `ker` once per index, sequentially.
//
//   ker  callable invoked as ker(i, in.data(), out.data())
//   n    number of invocations; i takes the values 0, 1, ..., n-1
//        (nothing is executed when n <= 0)
//   in   object exposing data(); passed through to `ker` unchanged
//   out  object exposing data(); `ker` receives its buffer
//
// Whether and where `ker` writes is entirely up to `ker`; this function only
// drives the loop.
template<typename Kernel, typename Input, typename Output>
void run_on_cpu(const Kernel& ker, int n, const Input& in, Output& out)
{
  for(int i=0; i<n; i++)
    ker(i, in.data(), out.data());
}
// Template parameter list (Kernel, Input, Output) with no declaration after it
// in the text shown here.
// NOTE(review): presumably this introduces run_on_cuda_meta_kernel, the kernel
// that run_on_cuda launches with <<<Grids,Blocks>>> — confirm, and restore the
// declaration and body that should follow this header.
22 template<
typename Kernel,
typename Input,
typename Output>
// Reports a failed CUDA runtime call on std::cout.
// Returns true when `status` is cudaSuccess, false otherwise; `what` names the
// call in the printed message.
inline bool run_on_cuda_call_ok(cudaError_t status, const char* what)
{
  if(status == cudaSuccess)
    return true;
  std::cout << "CUDA error in " << what << ": " << cudaGetErrorString(status) << "\n";
  return false;
}

// Device-side counterpart of run_on_cpu: runs `ker` through
// run_on_cuda_meta_kernel on device copies of `in` and `out`.
//
//   ker  kernel functor, passed by value to run_on_cuda_meta_kernel
//   n    number of indices requested; no launch is attempted when n <= 0
//   in   host input exposing size(), data() and a nested Scalar type
//   out  host output exposing size(), data() and a nested Scalar type
//
// Steps: allocate two device buffers, upload `in` and `out`, launch on a 1D
// grid of ceil(n / Blocks.x) blocks, then download BOTH buffers. The input is
// written back over the caller's storage (hence the const_cast), so any change
// the kernel made to its input buffer becomes visible in `in`.
//
// Every runtime call is checked. After the first failure the remaining steps
// are skipped, a message is printed, and `in`/`out` keep whatever they held at
// that point. Both device buffers are released on every path.
template<typename Kernel, typename Input, typename Output>
void run_on_cuda(const Kernel& ker, int n, const Input& in, Output& out)
{
  typedef typename Input::Scalar  InScalar;
  typedef typename Output::Scalar OutScalar;

  // Null-initialised so the cudaFree calls below are safe even when an
  // allocation fails or is skipped.
  InScalar*  d_in  = 0;
  OutScalar* d_out = 0;
  std::ptrdiff_t in_bytes  = in.size()  * sizeof(InScalar);
  std::ptrdiff_t out_bytes = out.size() * sizeof(OutScalar);

  bool ok = run_on_cuda_call_ok(cudaMalloc((void**)(&d_in), in_bytes), "cudaMalloc(d_in)");
  ok = ok && run_on_cuda_call_ok(cudaMalloc((void**)(&d_out), out_bytes), "cudaMalloc(d_out)");

  ok = ok && run_on_cuda_call_ok(cudaMemcpy(d_in, in.data(), in_bytes, cudaMemcpyHostToDevice),
                                 "cudaMemcpy(in -> device)");
  // `out` is uploaded as well, so elements the kernel does not write keep
  // their host values after the download below.
  ok = ok && run_on_cuda_call_ok(cudaMemcpy(d_out, out.data(), out_bytes, cudaMemcpyHostToDevice),
                                 "cudaMemcpy(out -> device)");

  // n <= 0 would give a zero-sized grid, which is an invalid launch
  // configuration; there is nothing to run in that case.
  if(ok && n > 0)
  {
    // NOTE(review): the threads-per-block value is an assumption; it is not
    // visible in the text this function was restored from — confirm.
    dim3 Blocks(128);
    dim3 Grids( (n+int(Blocks.x)-1)/int(Blocks.x) );

    // cudaDeviceSynchronize replaces the deprecated cudaThreadSynchronize.
    ok = run_on_cuda_call_ok(cudaDeviceSynchronize(), "cudaDeviceSynchronize (before launch)");
    if(ok)
    {
      run_on_cuda_meta_kernel<<<Grids,Blocks>>>(ker, n, d_in, d_out);
      // Launch-configuration errors are only reported through cudaGetLastError;
      // faults during execution surface at the following synchronisation.
      ok = run_on_cuda_call_ok(cudaGetLastError(), "run_on_cuda_meta_kernel launch");
      ok = ok && run_on_cuda_call_ok(cudaDeviceSynchronize(), "run_on_cuda_meta_kernel execution");
    }
  }

  ok = ok && run_on_cuda_call_ok(cudaMemcpy(const_cast<InScalar*>(in.data()), d_in, in_bytes, cudaMemcpyDeviceToHost),
                                 "cudaMemcpy(device -> in)");
  ok = ok && run_on_cuda_call_ok(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost),
                                 "cudaMemcpy(device -> out)");

  // Always release the device buffers, whether or not the steps above succeeded.
  run_on_cuda_call_ok(cudaFree(d_in), "cudaFree(d_in)");
  run_on_cuda_call_ok(cudaFree(d_out), "cudaFree(d_out)");
}
// Template header and opening statements of a helper templated on
// Kernel, Input and Output.
// NOTE(review): presumably this is the start of
// run_and_compare_to_cuda(ker, n, in, out); the signature line and everything
// after the copies below are not visible here — confirm before relying on it.
65 template<
typename Kernel,
typename Input,
typename Output>
// Two default-constructed working copies per argument type: a "ref" one and a
// "cuda" one.
68 Input in_ref, in_cuda;
69 Output out_ref, out_cuda;
// Chained assignment: both copies of each pair receive the value of `in` /
// `out`, so the two sets start out identical.
71 in_ref = in_cuda = in;
72 out_ref = out_cuda = out;
// Queries the properties of CUDA device `device` and prints a summary to
// std::cout, one field per line. `device` is declared outside the lines shown
// here.
// NOTE(review): the status returned by cudaGetDeviceProperties is not checked
// and deviceProp is not initialised, so a failed query would print
// indeterminate values.
86 cudaDeviceProp deviceProp;
87 cudaGetDeviceProperties(&deviceProp, device);
88 std::cout <<
"CUDA device info:\n";
89 std::cout <<
" name: " << deviceProp.name <<
"\n";
// Printed in the form "<major>.<minor>".
90 std::cout <<
" capability: " << deviceProp.major <<
"." << deviceProp.minor <<
"\n";
91 std::cout <<
" multiProcessorCount: " << deviceProp.multiProcessorCount <<
"\n";
92 std::cout <<
" maxThreadsPerMultiProcessor: " << deviceProp.maxThreadsPerMultiProcessor <<
"\n";
93 std::cout <<
" warpSize: " << deviceProp.warpSize <<
"\n";
94 std::cout <<
" regsPerBlock: " << deviceProp.regsPerBlock <<
"\n";
95 std::cout <<
" concurrentKernels: " << deviceProp.concurrentKernels <<
"\n";
96 std::cout <<
" clockRate: " << deviceProp.clockRate <<
"\n";
97 std::cout <<
" canMapHostMemory: " << deviceProp.canMapHostMemory <<
"\n";
98 std::cout <<
" computeMode: " << deviceProp.computeMode <<
"\n";
101 #endif // EIGEN_TEST_CUDA_COMMON_H
__global__ void run_on_cuda_meta_kernel(const Kernel ker, int n, const Input *in, Output *out)
void run_on_cuda(const Kernel &ker, int n, const Input &in, Output &out)
#define VERIFY_IS_APPROX(a, b)
void run_on_cpu(const Kernel &ker, int n, const Input &in, Output &out)
void run_and_compare_to_cuda(const Kernel &ker, int n, const Input &in, Output &out)