00001 #include "gemm.h"
00002 #include "utils.h"
00003 #include "cuda.h"
00004 #include <stdlib.h>
00005 #include <stdio.h>
00006 #include <math.h>
00007
00008 void gemm_bin(int M, int N, int K, float ALPHA,
00009 char *A, int lda,
00010 float *B, int ldb,
00011 float *C, int ldc)
00012 {
00013 int i,j,k;
00014 for(i = 0; i < M; ++i){
00015 for(k = 0; k < K; ++k){
00016 char A_PART = A[i*lda+k];
00017 if(A_PART){
00018 for(j = 0; j < N; ++j){
00019 C[i*ldc+j] += B[k*ldb+j];
00020 }
00021 } else {
00022 for(j = 0; j < N; ++j){
00023 C[i*ldc+j] -= B[k*ldb+j];
00024 }
00025 }
00026 }
00027 }
00028 }
00029
00030 float *random_matrix(int rows, int cols)
00031 {
00032 int i;
00033 float *m = calloc(rows*cols, sizeof(float));
00034 for(i = 0; i < rows*cols; ++i){
00035 m[i] = (float)rand()/RAND_MAX;
00036 }
00037 return m;
00038 }
00039
00040 void time_random_matrix(int TA, int TB, int m, int k, int n)
00041 {
00042 float *a;
00043 if(!TA) a = random_matrix(m,k);
00044 else a = random_matrix(k,m);
00045 int lda = (!TA)?k:m;
00046 float *b;
00047 if(!TB) b = random_matrix(k,n);
00048 else b = random_matrix(n,k);
00049 int ldb = (!TB)?n:k;
00050
00051 float *c = random_matrix(m,n);
00052 int i;
00053 clock_t start = clock(), end;
00054 for(i = 0; i<10; ++i){
00055 gemm_cpu(TA,TB,m,n,k,1,a,lda,b,ldb,1,c,n);
00056 }
00057 end = clock();
00058 printf("Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %lf ms\n",m,k,k,n, TA, TB, (float)(end-start)/CLOCKS_PER_SEC);
00059 free(a);
00060 free(b);
00061 free(c);
00062 }
00063
00064
00065 void gemm(int TA, int TB, int M, int N, int K, float ALPHA,
00066 float *A, int lda,
00067 float *B, int ldb,
00068 float BETA,
00069 float *C, int ldc)
00070 {
00071 gemm_cpu( TA, TB, M, N, K, ALPHA,A,lda, B, ldb,BETA,C,ldc);
00072 }
00073
00074 void gemm_nn(int M, int N, int K, float ALPHA,
00075 float *A, int lda,
00076 float *B, int ldb,
00077 float *C, int ldc)
00078 {
00079 int i,j,k;
00080 for(i = 0; i < M; ++i){
00081 for(k = 0; k < K; ++k){
00082 register float A_PART = ALPHA*A[i*lda+k];
00083 for(j = 0; j < N; ++j){
00084 C[i*ldc+j] += A_PART*B[k*ldb+j];
00085 }
00086 }
00087 }
00088 }
00089
00090 void gemm_nt(int M, int N, int K, float ALPHA,
00091 float *A, int lda,
00092 float *B, int ldb,
00093 float *C, int ldc)
00094 {
00095 int i,j,k;
00096 for(i = 0; i < M; ++i){
00097 for(j = 0; j < N; ++j){
00098 register float sum = 0;
00099 for(k = 0; k < K; ++k){
00100 sum += ALPHA*A[i*lda+k]*B[j*ldb + k];
00101 }
00102 C[i*ldc+j] += sum;
00103 }
00104 }
00105 }
00106
00107 void gemm_tn(int M, int N, int K, float ALPHA,
00108 float *A, int lda,
00109 float *B, int ldb,
00110 float *C, int ldc)
00111 {
00112 int i,j,k;
00113 for(i = 0; i < M; ++i){
00114 for(k = 0; k < K; ++k){
00115 register float A_PART = ALPHA*A[k*lda+i];
00116 for(j = 0; j < N; ++j){
00117 C[i*ldc+j] += A_PART*B[k*ldb+j];
00118 }
00119 }
00120 }
00121 }
00122
00123 void gemm_tt(int M, int N, int K, float ALPHA,
00124 float *A, int lda,
00125 float *B, int ldb,
00126 float *C, int ldc)
00127 {
00128 int i,j,k;
00129 for(i = 0; i < M; ++i){
00130 for(j = 0; j < N; ++j){
00131 register float sum = 0;
00132 for(k = 0; k < K; ++k){
00133 sum += ALPHA*A[i+k*lda]*B[k+j*ldb];
00134 }
00135 C[i*ldc+j] += sum;
00136 }
00137 }
00138 }
00139
00140
00141 void gemm_cpu(int TA, int TB, int M, int N, int K, float ALPHA,
00142 float *A, int lda,
00143 float *B, int ldb,
00144 float BETA,
00145 float *C, int ldc)
00146 {
00147
00148 int i, j;
00149 for(i = 0; i < M; ++i){
00150 for(j = 0; j < N; ++j){
00151 C[i*ldc + j] *= BETA;
00152 }
00153 }
00154 if(!TA && !TB)
00155 gemm_nn(M, N, K, ALPHA,A,lda, B, ldb,C,ldc);
00156 else if(TA && !TB)
00157 gemm_tn(M, N, K, ALPHA,A,lda, B, ldb,C,ldc);
00158 else if(!TA && TB)
00159 gemm_nt(M, N, K, ALPHA,A,lda, B, ldb,C,ldc);
00160 else
00161 gemm_tt(M, N, K, ALPHA,A,lda, B, ldb,C,ldc);
00162 }
00163
00164 #ifdef GPU
00165
00166 #include <math.h>
00167
00168 void gemm_ongpu(int TA, int TB, int M, int N, int K, float ALPHA,
00169 float *A_gpu, int lda,
00170 float *B_gpu, int ldb,
00171 float BETA,
00172 float *C_gpu, int ldc)
00173 {
00174 cublasHandle_t handle = blas_handle();
00175 cudaError_t status = cublasSgemm(handle, (TB ? CUBLAS_OP_T : CUBLAS_OP_N),
00176 (TA ? CUBLAS_OP_T : CUBLAS_OP_N), N, M, K, &ALPHA, B_gpu, ldb, A_gpu, lda, &BETA, C_gpu, ldc);
00177 check_error(status);
00178 }
00179
00180 void gemm_gpu(int TA, int TB, int M, int N, int K, float ALPHA,
00181 float *A, int lda,
00182 float *B, int ldb,
00183 float BETA,
00184 float *C, int ldc)
00185 {
00186 float *A_gpu = cuda_make_array(A, (TA ? lda*K:lda*M));
00187 float *B_gpu = cuda_make_array(B, (TB ? ldb*N : ldb*K));
00188 float *C_gpu = cuda_make_array(C, ldc*M);
00189
00190 gemm_ongpu(TA, TB, M, N, K, ALPHA, A_gpu, lda, B_gpu, ldb, BETA, C_gpu, ldc);
00191
00192 cuda_pull_array(C_gpu, C, ldc*M);
00193 cuda_free(A_gpu);
00194 cuda_free(B_gpu);
00195 cuda_free(C_gpu);
00196 }
00197
00198 #include <stdio.h>
00199 #include <stdlib.h>
00200 #include <string.h>
00201 #include <time.h>
00202
00203 void time_gpu_random_matrix(int TA, int TB, int m, int k, int n)
00204 {
00205 float *a;
00206 if(!TA) a = random_matrix(m,k);
00207 else a = random_matrix(k,m);
00208 int lda = (!TA)?k:m;
00209 float *b;
00210 if(!TB) b = random_matrix(k,n);
00211 else b = random_matrix(n,k);
00212 int ldb = (!TB)?n:k;
00213
00214 float *c = random_matrix(m,n);
00215 int i;
00216 clock_t start = clock(), end;
00217 for(i = 0; i<32; ++i){
00218 gemm_gpu(TA,TB,m,n,k,1,a,lda,b,ldb,1,c,n);
00219 }
00220 end = clock();
00221 printf("Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %lf s\n",m,k,k,n, TA, TB, (float)(end-start)/CLOCKS_PER_SEC);
00222 free(a);
00223 free(b);
00224 free(c);
00225 }
00226
00227 void time_ongpu(int TA, int TB, int m, int k, int n)
00228 {
00229 int iter = 10;
00230 float *a = random_matrix(m,k);
00231 float *b = random_matrix(k,n);
00232
00233 int lda = (!TA)?k:m;
00234 int ldb = (!TB)?n:k;
00235
00236 float *c = random_matrix(m,n);
00237
00238 float *a_cl = cuda_make_array(a, m*k);
00239 float *b_cl = cuda_make_array(b, k*n);
00240 float *c_cl = cuda_make_array(c, m*n);
00241
00242 int i;
00243 clock_t start = clock(), end;
00244 for(i = 0; i<iter; ++i){
00245 gemm_ongpu(TA,TB,m,n,k,1,a_cl,lda,b_cl,ldb,1,c_cl,n);
00246 cudaThreadSynchronize();
00247 }
00248 double flop = ((double)m)*n*(2.*k + 2.)*iter;
00249 double gflop = flop/pow(10., 9);
00250 end = clock();
00251 double seconds = sec(end-start);
00252 printf("Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %lf s, %lf GFLOPS\n",m,k,k,n, TA, TB, seconds, gflop/seconds);
00253 cuda_free(a_cl);
00254 cuda_free(b_cl);
00255 cuda_free(c_cl);
00256 free(a);
00257 free(b);
00258 free(c);
00259 }
00260
00261
00262 void test_gpu_accuracy(int TA, int TB, int m, int k, int n)
00263 {
00264 srand(0);
00265 float *a;
00266 if(!TA) a = random_matrix(m,k);
00267 else a = random_matrix(k,m);
00268 int lda = (!TA)?k:m;
00269 float *b;
00270 if(!TB) b = random_matrix(k,n);
00271 else b = random_matrix(n,k);
00272 int ldb = (!TB)?n:k;
00273
00274 float *c = random_matrix(m,n);
00275 float *c_gpu = random_matrix(m,n);
00276 memset(c, 0, m*n*sizeof(float));
00277 memset(c_gpu, 0, m*n*sizeof(float));
00278 int i;
00279
00280 gemm_gpu(TA,TB,m,n,k,1,a,lda,b,ldb,1,c_gpu,n);
00281
00282
00283
00284 gemm_cpu(TA,TB,m,n,k,1,a,lda,b,ldb,1,c,n);
00285
00286
00287 double sse = 0;
00288 for(i = 0; i < m*n; ++i) {
00289
00290 sse += pow(c[i]-c_gpu[i], 2);
00291 }
00292 printf("Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %g SSE\n",m,k,k,n, TA, TB, sse/(m*n));
00293 free(a);
00294 free(b);
00295 free(c);
00296 free(c_gpu);
00297 }
00298
00299 int test_gpu_blas()
00300 {
00301
00302
00303
00304
00305
00306
00307
00308
00309
00310
00311
00312
00313
00314
00315
00316
00317
00318
00319
00320
00321
00322
00323
00324
00325
00326 time_ongpu(0,0,64,75,12544);
00327 time_ongpu(0,0,64,75,12544);
00328 time_ongpu(0,0,64,75,12544);
00329 time_ongpu(0,0,64,576,12544);
00330 time_ongpu(0,0,256,2304,784);
00331 time_ongpu(1,1,2304,256,784);
00332 time_ongpu(0,0,512,4608,196);
00333 time_ongpu(1,1,4608,512,196);
00334
00335 return 0;
00336 }
00337 #endif
00338