#include <parallel_array.h>
#include <parallel_common.h>
#include <parallel_math.h>
#include <parallel_reduce.h>

#include <iostream>
#include <stdio.h>
#include <vector>  // std::vector is used below; include it directly rather than transitively

#ifdef USE_CUDA
#include "cuda_kernels.cuh"
#elif USE_OPENCL
#include "opencl_kernels.h"
#endif

using namespace parallel_utils;
using namespace parallel_ode;
00017
00018 template <typename T, ParallelType PType>
00019 int parallelReduceStrategyTest( ReduceType reduceType )
00020 {
00021 typedef typename vec4<T>::Type Vec4T;
00022 typedef ParallelHDArray<Vec4T,PType> ParallelHDArray;
00023
00024 dxInitDevice();
00025
00026 const ArraySize reduceStride = 32;
00027 const ArraySize reduceSize = 2;
00028 const ArraySize reduceBufferSize = reduceSize * reduceStride;
00029
00030 ParallelHDArray testArrayHD0( reduceBufferSize );
00031 ParallelHDArray testArrayHD1( reduceBufferSize );
00032
00033 Vec4T* testArrayHP0 = testArrayHD0.getHostBuffer( );
00034 Vec4T* testArrayHP1 = testArrayHD1.getHostBuffer( );
00035
00036 for( size_t i = 0; i < reduceBufferSize; i++) {
00037 testArrayHP0[ i ] = make_vec4( (T)i );
00038 testArrayHP1[ i ] = make_vec4( (T)i );
00039 }
00040
00041 testArrayHD0.syncToDevice( CopyTypes::COPY_SYNC );
00042 testArrayHD1.syncToDevice( CopyTypes::COPY_SYNC );
00043
00044 parallel_ode::ReduceStrategy* reduceStrategy = parallel_ode::ReduceStrategyFactory::create( reduceType, false );
00045 std::vector<int> reduceRepetitionCount;
00046 switch( reduceType ) {
00047 case ReduceTypes::REDUCE_STRIDED:
00048 reduceStrategy->initialize( reduceStride, reduceSize, reduceRepetitionCount );
00049 break;
00050 case ReduceTypes::REDUCE_SEQUENTIAL:
00051 reduceStrategy->initialize( reduceSize, reduceStride, reduceRepetitionCount );
00052 break;
00053 default:
00054 reduceStrategy->initialize( reduceStride, reduceSize, reduceRepetitionCount );
00055 break;
00056 }
00057
00058 #ifdef USE_CUDA
00059 cudaPGSReduce<T>( testArrayHD0.getDeviceBuffer( ), testArrayHD1.getDeviceBuffer( ),
00060 testArrayHD0.getDeviceBuffer( ), testArrayHD1.getDeviceBuffer( ), reduceStrategy );
00061 #elif USE_OPENCL
00062 oclPGSReduce( testArrayHD0.getDeviceBuffer( ), testArrayHD1.getDeviceBuffer( ),
00063 testArrayHD0.getDeviceBuffer( ), testArrayHD1.getDeviceBuffer( ), reduceStrategy );
00064 #endif
00065 testArrayHD0.syncToHost( CopyTypes::COPY_SYNC );
00066 testArrayHD1.syncToHost( CopyTypes::COPY_SYNC );
00067
00068 testArrayHD0.print( "TestReduceArray" );
00069 testArrayHD1.print( "TestReduceArray2");
00070
00071 for( size_t i = 0; i < reduceBufferSize; i++) {
00072 testArrayHP1[ i ] = make_vec4( (T)0.0 );
00073 }
00074
00075 #ifdef USE_CUDA
00076 cudaZeroVector<Vec4T>( testArrayHD0.getDeviceBuffer( ), reduceBufferSize );
00077 #elif USE_OPENCL
00078 oclZeroVector( testArrayHD0.getDeviceBuffer( ), reduceBufferSize );
00079 #endif
00080
00081 testArrayHD1.syncToDevice( CopyTypes::COPY_SYNC );
00082 testArrayHD1.syncToHost( CopyTypes::COPY_SYNC );
00083 testArrayHD0.syncToHost( CopyTypes::COPY_SYNC );
00084
00085 testArrayHD0.print( "TestZeroArray" );
00086 testArrayHD1.print( "TestZeroArray2" );
00087
00088 delete reduceStrategy;
00089 dxShutdownDevice();
00090 return 0;
00091 }
00092
00093 template <typename T, ParallelType PType>
00094 int parallelReduceTest()
00095 {
00096 parallelReduceStrategyTest<T,PType>( ReduceTypes::REDUCE_STRIDED );
00097 return 0;
00098 }
00099
// Explicitly instantiate the test for the configured parallel backend.
#ifdef CUDA_DOUBLESUPPORT
// Device supports double precision: instantiate with dReal
// (presumably ODE's configurable real type — confirm against the ODE headers).
template int parallelReduceTest<dReal,ParallelTypes::PARALLEL_TYPE>();
#else
// No double support on the device: fall back to single precision.
template int parallelReduceTest<float,ParallelTypes::PARALLEL_TYPE>();
#endif
00105