#include <parallel_array.h>
#include <parallel_common.h>
#include <parallel_math.h>
#include <parallel_reduce.h>

#include <iostream>
#include <stdio.h>

#ifdef USE_CUDA
#include "cuda_kernels.cuh"
#elif defined(USE_OPENCL)
#include "opencl_kernels.h"
#endif

using namespace parallel_utils;
using namespace parallel_ode;

template <typename T, ParallelType PType>
int parallelReduceStrategyTest( ReduceType reduceType )
{
  typedef typename vec4<T>::Type Vec4T;
  typedef ParallelHDArray<Vec4T,PType> ParallelHDArrayT;

  dxInitDevice();

  const ArraySize reduceStride = 32;
  const ArraySize reduceSize = 2;
  const ArraySize reduceBufferSize = reduceSize * reduceStride;

  ParallelHDArrayT testArrayHD0( reduceBufferSize );
  ParallelHDArrayT testArrayHD1( reduceBufferSize );

  Vec4T* testArrayHP0 = testArrayHD0.getHostBuffer( );
  Vec4T* testArrayHP1 = testArrayHD1.getHostBuffer( );

  // Fill both host buffers with a known ramp so the reduced values are easy to inspect.
  for( size_t i = 0; i < reduceBufferSize; i++ ) {
    testArrayHP0[ i ] = make_vec4( (T)i );
    testArrayHP1[ i ] = make_vec4( (T)i );
  }

  testArrayHD0.syncToDevice( CopyTypes::COPY_SYNC );
  testArrayHD1.syncToDevice( CopyTypes::COPY_SYNC );

  ReduceStrategy* reduceStrategy = ReduceStrategyFactory::create( reduceType, false );
  std::vector<int> reduceRepetitionCount;

  // The sequential strategy takes its size arguments in the opposite order of the strided one.
  switch( reduceType ) {
    case ReduceTypes::REDUCE_SEQUENTIAL:
      reduceStrategy->initialize( reduceSize, reduceStride, reduceRepetitionCount );
      break;
    case ReduceTypes::REDUCE_STRIDED:
    default:
      reduceStrategy->initialize( reduceStride, reduceSize, reduceRepetitionCount );
      break;
  }

#ifdef USE_CUDA
  cudaPGSReduce<T>( testArrayHD0.getDeviceBuffer( ), testArrayHD1.getDeviceBuffer( ),
                    testArrayHD0.getDeviceBuffer( ), testArrayHD1.getDeviceBuffer( ), reduceStrategy );
#elif defined(USE_OPENCL)
  oclPGSReduce( testArrayHD0.getDeviceBuffer( ), testArrayHD1.getDeviceBuffer( ),
                testArrayHD0.getDeviceBuffer( ), testArrayHD1.getDeviceBuffer( ), reduceStrategy );
#endif

  testArrayHD0.syncToHost( CopyTypes::COPY_SYNC );
  testArrayHD1.syncToHost( CopyTypes::COPY_SYNC );

  testArrayHD0.print( "TestReduceArray" );
  testArrayHD1.print( "TestReduceArray2" );

  // Clear the second host buffer, then zero the first array directly on the device.
  for( size_t i = 0; i < reduceBufferSize; i++ ) {
    testArrayHP1[ i ] = make_vec4( (T)0.0 );
  }

#ifdef USE_CUDA
  cudaZeroVector<Vec4T>( testArrayHD0.getDeviceBuffer( ), reduceBufferSize );
#elif defined(USE_OPENCL)
  oclZeroVector( testArrayHD0.getDeviceBuffer( ), reduceBufferSize );
#endif

  testArrayHD1.syncToDevice( CopyTypes::COPY_SYNC );
  testArrayHD1.syncToHost( CopyTypes::COPY_SYNC );
  testArrayHD0.syncToHost( CopyTypes::COPY_SYNC );

  testArrayHD0.print( "TestZeroArray" );
  testArrayHD1.print( "TestZeroArray2" );

  delete reduceStrategy;
  dxShutdownDevice();

  return 0;
}

template <typename T, ParallelType PType>
int parallelReduceTest()
{
  return parallelReduceStrategyTest<T,PType>( ReduceTypes::REDUCE_STRIDED );
}

#ifdef CUDA_DOUBLESUPPORT
template int parallelReduceTest<dReal,ParallelTypes::PARALLEL_TYPE>();
#else
template int parallelReduceTest<float,ParallelTypes::PARALLEL_TYPE>();
#endif
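
// Illustrative sketch only (not part of the original file): a minimal standalone driver
// showing how the instantiated test could be invoked. The PARALLEL_REDUCE_TEST_STANDALONE
// guard is an assumption made for this sketch; how the instantiations above are actually
// driven by the existing build/test harness is not shown here.
#ifdef PARALLEL_REDUCE_TEST_STANDALONE
int main()
{
#ifdef CUDA_DOUBLESUPPORT
  // Use the double-precision instantiation when the device supports it.
  return parallelReduceTest<dReal,ParallelTypes::PARALLEL_TYPE>();
#else
  // Otherwise fall back to the single-precision instantiation.
  return parallelReduceTest<float,ParallelTypes::PARALLEL_TYPE>();
#endif
}
#endif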