#include "connected_layer.h"
#include "batchnorm_layer.h"
#include "utils.h"
#include "cuda.h"
#include "blas.h"
#include "gemm.h"

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Build a fully connected (dense) layer: output = activation(input*W^T + b). */
connected_layer make_connected_layer(int batch, int inputs, int outputs, ACTIVATION activation, int batch_normalize)
{
    int i;
    connected_layer l = {0};
    l.type = CONNECTED;

    l.inputs = inputs;
    l.outputs = outputs;
    l.batch = batch;
    l.batch_normalize = batch_normalize;
    l.h = 1;
    l.w = 1;
    l.c = inputs;
    l.out_h = 1;
    l.out_w = 1;
    l.out_c = outputs;

    l.output = calloc(batch*outputs, sizeof(float));
    l.delta = calloc(batch*outputs, sizeof(float));

    l.weight_updates = calloc(inputs*outputs, sizeof(float));
    l.bias_updates = calloc(outputs, sizeof(float));

    l.weights = calloc(outputs*inputs, sizeof(float));
    l.biases = calloc(outputs, sizeof(float));

    l.forward = forward_connected_layer;
    l.backward = backward_connected_layer;
    l.update = update_connected_layer;

    /* He-style initialization: uniform in [-scale, scale] with scale = sqrt(2/fan_in). */
    float scale = sqrt(2./inputs);
    for(i = 0; i < outputs*inputs; ++i){
        l.weights[i] = scale*rand_uniform(-1, 1);
    }

    for(i = 0; i < outputs; ++i){
        l.biases[i] = 0;
    }

    if(batch_normalize){
        l.scales = calloc(outputs, sizeof(float));
        l.scale_updates = calloc(outputs, sizeof(float));
        for(i = 0; i < outputs; ++i){
            l.scales[i] = 1;
        }

        l.mean = calloc(outputs, sizeof(float));
        l.mean_delta = calloc(outputs, sizeof(float));
        l.variance = calloc(outputs, sizeof(float));
        l.variance_delta = calloc(outputs, sizeof(float));

        l.rolling_mean = calloc(outputs, sizeof(float));
        l.rolling_variance = calloc(outputs, sizeof(float));

        l.x = calloc(batch*outputs, sizeof(float));
        l.x_norm = calloc(batch*outputs, sizeof(float));
    }

#ifdef GPU
    l.forward_gpu = forward_connected_layer_gpu;
    l.backward_gpu = backward_connected_layer_gpu;
    l.update_gpu = update_connected_layer_gpu;

    l.weights_gpu = cuda_make_array(l.weights, outputs*inputs);
    l.biases_gpu = cuda_make_array(l.biases, outputs);

    l.weight_updates_gpu = cuda_make_array(l.weight_updates, outputs*inputs);
    l.bias_updates_gpu = cuda_make_array(l.bias_updates, outputs);

    l.output_gpu = cuda_make_array(l.output, outputs*batch);
    l.delta_gpu = cuda_make_array(l.delta, outputs*batch);
    if(batch_normalize){
        l.scales_gpu = cuda_make_array(l.scales, outputs);
        l.scale_updates_gpu = cuda_make_array(l.scale_updates, outputs);

        l.mean_gpu = cuda_make_array(l.mean, outputs);
        l.variance_gpu = cuda_make_array(l.variance, outputs);

        /* Seed each GPU buffer from its matching host buffer (all zero at
         * this point; the originals copied from l.mean/l.variance instead). */
        l.rolling_mean_gpu = cuda_make_array(l.rolling_mean, outputs);
        l.rolling_variance_gpu = cuda_make_array(l.rolling_variance, outputs);

        l.mean_delta_gpu = cuda_make_array(l.mean_delta, outputs);
        l.variance_delta_gpu = cuda_make_array(l.variance_delta, outputs);

        l.x_gpu = cuda_make_array(l.output, l.batch*outputs);
        l.x_norm_gpu = cuda_make_array(l.output, l.batch*outputs);
    }
#endif
    l.activation = activation;
    fprintf(stderr, "connected %4d -> %4d\n", inputs, outputs);
    return l;
}
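
/*
 * Minimal usage sketch (illustrative only, compiled out): build a small
 * layer and run one forward pass on the CPU. It assumes only the
 * network_state fields the forward pass below actually reads (input,
 * train) and that RELU is one of the available ACTIVATION values.
 */
#if 0
static void connected_layer_demo()
{
    float in[2*4] = {0};                        /* batch of 2, 4 inputs each */
    connected_layer l = make_connected_layer(2, 4, 3, RELU, 0);
    network_state state = {0};
    state.input = in;
    state.train = 0;
    forward_connected_layer(l, state);          /* l.output: 2 x 3 activations */
}
#endif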

/* Apply accumulated gradients: SGD step with momentum and L2 weight decay. */
void update_connected_layer(connected_layer l, int batch, float learning_rate, float momentum, float decay)
{
    axpy_cpu(l.outputs, learning_rate/batch, l.bias_updates, 1, l.biases, 1);
    scal_cpu(l.outputs, momentum, l.bias_updates, 1);

    if(l.batch_normalize){
        axpy_cpu(l.outputs, learning_rate/batch, l.scale_updates, 1, l.scales, 1);
        scal_cpu(l.outputs, momentum, l.scale_updates, 1);
    }

    axpy_cpu(l.inputs*l.outputs, -decay*batch, l.weights, 1, l.weight_updates, 1);
    axpy_cpu(l.inputs*l.outputs, learning_rate/batch, l.weight_updates, 1, l.weights, 1);
    scal_cpu(l.inputs*l.outputs, momentum, l.weight_updates, 1);
}
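
/*
 * The calls above implement SGD with momentum and L2 weight decay. With
 * accumulated gradient g (summed over the batch) for a weight w:
 *
 *     g <- g - decay*batch*w        (decay, pre-scaled by batch so the next
 *                                    step applies exactly -learning_rate*decay*w)
 *     w <- w + (learning_rate/batch)*g
 *     g <- momentum*g               (leftover gradient acts as momentum)
 *
 * Biases and batch-norm scales follow the same scheme without the decay term.
 */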

void forward_connected_layer(connected_layer l, network_state state)
{
    int i;
    fill_cpu(l.outputs*l.batch, 0, l.output, 1);
    int m = l.batch;
    int k = l.inputs;
    int n = l.outputs;
    float *a = state.input;
    float *b = l.weights;
    float *c = l.output;
    /* output[batch x outputs] = input[batch x inputs] * weights[outputs x inputs]^T */
    gemm(0,1,m,n,k,1,a,k,b,k,1,c,n);
    if(l.batch_normalize){
        if(state.train){
            mean_cpu(l.output, l.batch, l.outputs, 1, l.mean);
            variance_cpu(l.output, l.mean, l.batch, l.outputs, 1, l.variance);

            /* Exponential moving average of the batch statistics, used at inference. */
            scal_cpu(l.outputs, .95, l.rolling_mean, 1);
            axpy_cpu(l.outputs, .05, l.mean, 1, l.rolling_mean, 1);
            scal_cpu(l.outputs, .95, l.rolling_variance, 1);
            axpy_cpu(l.outputs, .05, l.variance, 1, l.rolling_variance, 1);

            copy_cpu(l.outputs*l.batch, l.output, 1, l.x, 1);
            normalize_cpu(l.output, l.mean, l.variance, l.batch, l.outputs, 1);
            copy_cpu(l.outputs*l.batch, l.output, 1, l.x_norm, 1);
        } else {
            normalize_cpu(l.output, l.rolling_mean, l.rolling_variance, l.batch, l.outputs, 1);
        }
        scale_bias(l.output, l.scales, l.batch, l.outputs, 1);
    }
    for(i = 0; i < l.batch; ++i){
        axpy_cpu(l.outputs, 1, l.biases, 1, l.output + i*l.outputs, 1);
    }
    activate_array(l.output, l.outputs*l.batch, l.activation);
}
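
/*
 * Batch normalization in the training branch above, per output unit j with
 * batch mean mu_j and variance var_j (eps is a small constant guarding the
 * square root):
 *
 *     x_hat[i][j] = (out[i][j] - mu_j) / sqrt(var_j + eps)
 *     out[i][j]   = scales[j] * x_hat[i][j] + biases[j]
 *
 * (the bias is added by the axpy loop that follows). The rolling statistics,
 * rolling <- .95*rolling + .05*batch_stat, replace mu/var at inference time.
 * l.x and l.x_norm keep the pre- and post-normalization activations for the
 * backward pass.
 */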

void backward_connected_layer(connected_layer l, network_state state)
{
    int i;
    gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);
    for(i = 0; i < l.batch; ++i){
        axpy_cpu(l.outputs, 1, l.delta + i*l.outputs, 1, l.bias_updates, 1);
    }
    if(l.batch_normalize){
        backward_scale_cpu(l.x_norm, l.delta, l.batch, l.outputs, 1, l.scale_updates);

        scale_bias(l.delta, l.scales, l.batch, l.outputs, 1);

        mean_delta_cpu(l.delta, l.variance, l.batch, l.outputs, 1, l.mean_delta);
        variance_delta_cpu(l.x, l.delta, l.mean, l.variance, l.batch, l.outputs, 1, l.variance_delta);
        normalize_delta_cpu(l.x, l.mean, l.variance, l.mean_delta, l.variance_delta, l.batch, l.outputs, 1, l.delta);
    }

    int m = l.outputs;
    int k = l.batch;
    int n = l.inputs;
    float *a = l.delta;
    float *b = state.input;
    float *c = l.weight_updates;
    gemm(1,0,m,n,k,1,a,m,b,n,1,c,n);

    m = l.batch;
    k = l.outputs;
    n = l.inputs;

    a = l.delta;
    b = l.weights;
    c = state.delta;

    if(c) gemm(0,0,m,n,k,1,a,k,b,n,1,c,n);
}
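
/*
 * The two GEMMs above are the dense-layer gradients written as matrix
 * products, with delta = dL/d(output) of shape [batch x outputs]:
 *
 *     weight_updates += delta^T * input      ([outputs x inputs])
 *     state.delta    += delta * weights      ([batch x inputs])
 *
 * The second product is skipped when the caller provides no upstream
 * gradient buffer (state.delta == NULL), e.g. for the first layer.
 */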

/* Fold the batch-norm transform into the raw weights and biases. */
void denormalize_connected_layer(layer l)
{
    int i, j;
    for(i = 0; i < l.outputs; ++i){
        float scale = l.scales[i]/sqrt(l.rolling_variance[i] + .000001);
        for(j = 0; j < l.inputs; ++j){
            l.weights[i*l.inputs + j] *= scale;
        }
        l.biases[i] -= l.rolling_mean[i] * scale;
        l.scales[i] = 1;
        l.rolling_mean[i] = 0;
        l.rolling_variance[i] = 1;
    }
}
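
/*
 * Why the folding above works: with batch norm the layer computes, per
 * output unit i,
 *
 *     out_i = scales_i * (w_i.x - mu_i)/sqrt(var_i + eps) + b_i
 *
 * so with k = scales_i/sqrt(var_i + eps) the same function is
 *
 *     out_i = (k*w_i).x + (b_i - k*mu_i)
 *
 * which is exactly the rescaling applied before resetting the batch-norm
 * statistics to the identity.
 */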

void statistics_connected_layer(layer l)
{
    if(l.batch_normalize){
        printf("Scales ");
        print_statistics(l.scales, l.outputs);
    }
    printf("Biases ");
    print_statistics(l.biases, l.outputs);
    printf("Weights ");
    /* stats over the full [outputs x inputs] weight matrix */
    print_statistics(l.weights, l.outputs*l.inputs);
}

#ifdef GPU

/* Copy layer parameters and gradients from GPU buffers back to the host. */
void pull_connected_layer(connected_layer l)
{
    cuda_pull_array(l.weights_gpu, l.weights, l.inputs*l.outputs);
    cuda_pull_array(l.biases_gpu, l.biases, l.outputs);
    cuda_pull_array(l.weight_updates_gpu, l.weight_updates, l.inputs*l.outputs);
    cuda_pull_array(l.bias_updates_gpu, l.bias_updates, l.outputs);
    if (l.batch_normalize){
        cuda_pull_array(l.scales_gpu, l.scales, l.outputs);
        cuda_pull_array(l.rolling_mean_gpu, l.rolling_mean, l.outputs);
        cuda_pull_array(l.rolling_variance_gpu, l.rolling_variance, l.outputs);
    }
}

/* Copy layer parameters and gradients from the host to GPU buffers. */
void push_connected_layer(connected_layer l)
{
    cuda_push_array(l.weights_gpu, l.weights, l.inputs*l.outputs);
    cuda_push_array(l.biases_gpu, l.biases, l.outputs);
    cuda_push_array(l.weight_updates_gpu, l.weight_updates, l.inputs*l.outputs);
    cuda_push_array(l.bias_updates_gpu, l.bias_updates, l.outputs);
    if (l.batch_normalize){
        cuda_push_array(l.scales_gpu, l.scales, l.outputs);
        cuda_push_array(l.rolling_mean_gpu, l.rolling_mean, l.outputs);
        cuda_push_array(l.rolling_variance_gpu, l.rolling_variance, l.outputs);
    }
}

/* GPU mirror of update_connected_layer; see the update equations above. */
void update_connected_layer_gpu(connected_layer l, int batch, float learning_rate, float momentum, float decay)
{
    axpy_ongpu(l.outputs, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
    scal_ongpu(l.outputs, momentum, l.bias_updates_gpu, 1);

    if(l.batch_normalize){
        axpy_ongpu(l.outputs, learning_rate/batch, l.scale_updates_gpu, 1, l.scales_gpu, 1);
        scal_ongpu(l.outputs, momentum, l.scale_updates_gpu, 1);
    }

    axpy_ongpu(l.inputs*l.outputs, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
    axpy_ongpu(l.inputs*l.outputs, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
    scal_ongpu(l.inputs*l.outputs, momentum, l.weight_updates_gpu, 1);
}

/* GPU forward pass; batch norm is delegated to the shared batchnorm layer. */
void forward_connected_layer_gpu(connected_layer l, network_state state)
{
    int i;
    fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1);

    int m = l.batch;
    int k = l.inputs;
    int n = l.outputs;
    float * a = state.input;
    float * b = l.weights_gpu;
    float * c = l.output_gpu;
    gemm_ongpu(0,1,m,n,k,1,a,k,b,k,1,c,n);
    if(l.batch_normalize){
        forward_batchnorm_layer_gpu(l, state);
    }
    for(i = 0; i < l.batch; ++i){
        axpy_ongpu(l.outputs, 1, l.biases_gpu, 1, l.output_gpu + i*l.outputs, 1);
    }
    activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
}

void backward_connected_layer_gpu(connected_layer l, network_state state)
{
    int i;
    /* Clamp incoming deltas to [-1, 1] before backpropagating. */
    constrain_ongpu(l.outputs*l.batch, 1, l.delta_gpu, 1);
    gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
    for(i = 0; i < l.batch; ++i){
        axpy_ongpu(l.outputs, 1, l.delta_gpu + i*l.outputs, 1, l.bias_updates_gpu, 1);
    }

    if(l.batch_normalize){
        backward_batchnorm_layer_gpu(l, state);
    }

    int m = l.outputs;
    int k = l.batch;
    int n = l.inputs;
    float * a = l.delta_gpu;
    float * b = state.input;
    float * c = l.weight_updates_gpu;
    gemm_ongpu(1,0,m,n,k,1,a,m,b,n,1,c,n);

    m = l.batch;
    k = l.outputs;
    n = l.inputs;

    a = l.delta_gpu;
    b = l.weights_gpu;
    c = state.delta;

    if(c) gemm_ongpu(0,0,m,n,k,1,a,k,b,n,1,c,n);
}
#endif