00001 #include "local_layer.h"
00002 #include "utils.h"
00003 #include "im2col.h"
00004 #include "col2im.h"
00005 #include "blas.h"
00006 #include "gemm.h"
00007 #include <stdio.h>
00008 #include <time.h>
00009
00010 int local_out_height(local_layer l)
00011 {
00012 int h = l.h;
00013 if (!l.pad) h -= l.size;
00014 else h -= 1;
00015 return h/l.stride + 1;
00016 }
00017
00018 int local_out_width(local_layer l)
00019 {
00020 int w = l.w;
00021 if (!l.pad) w -= l.size;
00022 else w -= 1;
00023 return w/l.stride + 1;
00024 }
00025
00026 local_layer make_local_layer(int batch, int h, int w, int c, int n, int size, int stride, int pad, ACTIVATION activation)
00027 {
00028 int i;
00029 local_layer l = {0};
00030 l.type = LOCAL;
00031
00032 l.h = h;
00033 l.w = w;
00034 l.c = c;
00035 l.n = n;
00036 l.batch = batch;
00037 l.stride = stride;
00038 l.size = size;
00039 l.pad = pad;
00040
00041 int out_h = local_out_height(l);
00042 int out_w = local_out_width(l);
00043 int locations = out_h*out_w;
00044 l.out_h = out_h;
00045 l.out_w = out_w;
00046 l.out_c = n;
00047 l.outputs = l.out_h * l.out_w * l.out_c;
00048 l.inputs = l.w * l.h * l.c;
00049
00050 l.weights = calloc(c*n*size*size*locations, sizeof(float));
00051 l.weight_updates = calloc(c*n*size*size*locations, sizeof(float));
00052
00053 l.biases = calloc(l.outputs, sizeof(float));
00054 l.bias_updates = calloc(l.outputs, sizeof(float));
00055
00056
00057 float scale = sqrt(2./(size*size*c));
00058 for(i = 0; i < c*n*size*size; ++i) l.weights[i] = scale*rand_uniform(-1,1);
00059
00060 l.col_image = calloc(out_h*out_w*size*size*c, sizeof(float));
00061 l.output = calloc(l.batch*out_h * out_w * n, sizeof(float));
00062 l.delta = calloc(l.batch*out_h * out_w * n, sizeof(float));
00063
00064 l.forward = forward_local_layer;
00065 l.backward = backward_local_layer;
00066 l.update = update_local_layer;
00067
00068 #ifdef GPU
00069 l.forward_gpu = forward_local_layer_gpu;
00070 l.backward_gpu = backward_local_layer_gpu;
00071 l.update_gpu = update_local_layer_gpu;
00072
00073 l.weights_gpu = cuda_make_array(l.weights, c*n*size*size*locations);
00074 l.weight_updates_gpu = cuda_make_array(l.weight_updates, c*n*size*size*locations);
00075
00076 l.biases_gpu = cuda_make_array(l.biases, l.outputs);
00077 l.bias_updates_gpu = cuda_make_array(l.bias_updates, l.outputs);
00078
00079 l.col_image_gpu = cuda_make_array(l.col_image, out_h*out_w*size*size*c);
00080 l.delta_gpu = cuda_make_array(l.delta, l.batch*out_h*out_w*n);
00081 l.output_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);
00082
00083 #endif
00084 l.activation = activation;
00085
00086 fprintf(stderr, "Local Layer: %d x %d x %d image, %d filters -> %d x %d x %d image\n", h,w,c,n, out_h, out_w, n);
00087
00088 return l;
00089 }
00090
00091 void forward_local_layer(const local_layer l, network_state state)
00092 {
00093 int out_h = local_out_height(l);
00094 int out_w = local_out_width(l);
00095 int i, j;
00096 int locations = out_h * out_w;
00097
00098 for(i = 0; i < l.batch; ++i){
00099 copy_cpu(l.outputs, l.biases, 1, l.output + i*l.outputs, 1);
00100 }
00101
00102 for(i = 0; i < l.batch; ++i){
00103 float *input = state.input + i*l.w*l.h*l.c;
00104 im2col_cpu(input, l.c, l.h, l.w,
00105 l.size, l.stride, l.pad, l.col_image);
00106 float *output = l.output + i*l.outputs;
00107 for(j = 0; j < locations; ++j){
00108 float *a = l.weights + j*l.size*l.size*l.c*l.n;
00109 float *b = l.col_image + j;
00110 float *c = output + j;
00111
00112 int m = l.n;
00113 int n = 1;
00114 int k = l.size*l.size*l.c;
00115
00116 gemm(0,0,m,n,k,1,a,k,b,locations,1,c,locations);
00117 }
00118 }
00119 activate_array(l.output, l.outputs*l.batch, l.activation);
00120 }
00121
/* Backward pass: accumulates bias and weight gradients from l.delta and,
 * when state.delta is non-NULL, propagates the error to the previous
 * layer. l.col_image is reused as scratch for both directions. */
void backward_local_layer(local_layer l, network_state state)
{
    int i, j;
    int locations = l.out_w*l.out_h;

    /* Scale the incoming error by the activation gradient, in place. */
    gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);

    /* Bias gradient: sum delta over the batch (one bias per output element). */
    for(i = 0; i < l.batch; ++i){
        axpy_cpu(l.outputs, 1, l.delta + i*l.outputs, 1, l.bias_updates, 1);
    }

    for(i = 0; i < l.batch; ++i){
        float *input = state.input + i*l.w*l.h*l.c;
        /* Rebuild the im2col patch matrix for this image (as in forward). */
        im2col_cpu(input, l.c, l.h, l.w,
                l.size, l.stride, l.pad, l.col_image);

        /* Weight gradient: per location j, outer product of the l.n deltas
         * at j (column-strided by `locations`) with the size*size*c patch
         * column at j, accumulated into that location's filter bank. */
        for(j = 0; j < locations; ++j){
            float *a = l.delta + i*l.outputs + j;
            float *b = l.col_image + j;
            float *c = l.weight_updates + j*l.size*l.size*l.c*l.n;
            int m = l.n;
            int n = l.size*l.size*l.c;
            int k = 1;

            gemm(0,1,m,n,k,1,a,locations,b,locations,1,c,n);
        }

        if(state.delta){
            /* Input gradient: weights^T * delta per location, written over
             * l.col_image (beta = 0 discards the forward patches)... */
            for(j = 0; j < locations; ++j){
                float *a = l.weights + j*l.size*l.size*l.c*l.n;
                float *b = l.delta + i*l.outputs + j;
                float *c = l.col_image + j;

                int m = l.size*l.size*l.c;
                int n = 1;
                int k = l.n;

                gemm(1,0,m,n,k,1,a,m,b,locations,0,c,locations);
            }

            /* ...then scatter the column gradients back to image layout. */
            col2im_cpu(l.col_image, l.c, l.h, l.w, l.size, l.stride, l.pad, state.delta+i*l.c*l.h*l.w);
        }
    }
}
00166
00167 void update_local_layer(local_layer l, int batch, float learning_rate, float momentum, float decay)
00168 {
00169 int locations = l.out_w*l.out_h;
00170 int size = l.size*l.size*l.c*l.n*locations;
00171 axpy_cpu(l.outputs, learning_rate/batch, l.bias_updates, 1, l.biases, 1);
00172 scal_cpu(l.outputs, momentum, l.bias_updates, 1);
00173
00174 axpy_cpu(size, -decay*batch, l.weights, 1, l.weight_updates, 1);
00175 axpy_cpu(size, learning_rate/batch, l.weight_updates, 1, l.weights, 1);
00176 scal_cpu(size, momentum, l.weight_updates, 1);
00177 }
00178
00179 #ifdef GPU
00180
00181 void forward_local_layer_gpu(const local_layer l, network_state state)
00182 {
00183 int out_h = local_out_height(l);
00184 int out_w = local_out_width(l);
00185 int i, j;
00186 int locations = out_h * out_w;
00187
00188 for(i = 0; i < l.batch; ++i){
00189 copy_ongpu(l.outputs, l.biases_gpu, 1, l.output_gpu + i*l.outputs, 1);
00190 }
00191
00192 for(i = 0; i < l.batch; ++i){
00193 float *input = state.input + i*l.w*l.h*l.c;
00194 im2col_ongpu(input, l.c, l.h, l.w,
00195 l.size, l.stride, l.pad, l.col_image_gpu);
00196 float *output = l.output_gpu + i*l.outputs;
00197 for(j = 0; j < locations; ++j){
00198 float *a = l.weights_gpu + j*l.size*l.size*l.c*l.n;
00199 float *b = l.col_image_gpu + j;
00200 float *c = output + j;
00201
00202 int m = l.n;
00203 int n = 1;
00204 int k = l.size*l.size*l.c;
00205
00206 gemm_ongpu(0,0,m,n,k,1,a,k,b,locations,1,c,locations);
00207 }
00208 }
00209 activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
00210 }
00211
/* GPU backward pass: mirrors backward_local_layer using device buffers.
 * Accumulates bias/weight gradients and, if state.delta is non-NULL,
 * back-propagates the error to the previous layer. */
void backward_local_layer_gpu(local_layer l, network_state state)
{
    int i, j;
    int locations = l.out_w*l.out_h;

    /* Scale the incoming error by the activation gradient, in place. */
    gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
    /* Bias gradient: sum delta over the batch. */
    for(i = 0; i < l.batch; ++i){
        axpy_ongpu(l.outputs, 1, l.delta_gpu + i*l.outputs, 1, l.bias_updates_gpu, 1);
    }

    for(i = 0; i < l.batch; ++i){
        float *input = state.input + i*l.w*l.h*l.c;
        /* Rebuild the im2col patch matrix for this image (as in forward). */
        im2col_ongpu(input, l.c, l.h, l.w,
                l.size, l.stride, l.pad, l.col_image_gpu);

        /* Weight gradient: per location j, outer product of the l.n deltas
         * at j (column-strided by `locations`) with the size*size*c patch
         * column at j, accumulated into that location's filter bank. */
        for(j = 0; j < locations; ++j){
            float *a = l.delta_gpu + i*l.outputs + j;
            float *b = l.col_image_gpu + j;
            float *c = l.weight_updates_gpu + j*l.size*l.size*l.c*l.n;
            int m = l.n;
            int n = l.size*l.size*l.c;
            int k = 1;

            gemm_ongpu(0,1,m,n,k,1,a,locations,b,locations,1,c,n);
        }

        if(state.delta){
            /* Input gradient: weights^T * delta per location, overwriting
             * l.col_image_gpu (beta = 0)... */
            for(j = 0; j < locations; ++j){
                float *a = l.weights_gpu + j*l.size*l.size*l.c*l.n;
                float *b = l.delta_gpu + i*l.outputs + j;
                float *c = l.col_image_gpu + j;

                int m = l.size*l.size*l.c;
                int n = 1;
                int k = l.n;

                gemm_ongpu(1,0,m,n,k,1,a,m,b,locations,0,c,locations);
            }

            /* ...then scatter the column gradients back to image layout. */
            col2im_ongpu(l.col_image_gpu, l.c, l.h, l.w, l.size, l.stride, l.pad, state.delta+i*l.c*l.h*l.w);
        }
    }
}
00255
00256 void update_local_layer_gpu(local_layer l, int batch, float learning_rate, float momentum, float decay)
00257 {
00258 int locations = l.out_w*l.out_h;
00259 int size = l.size*l.size*l.c*l.n*locations;
00260 axpy_ongpu(l.outputs, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
00261 scal_ongpu(l.outputs, momentum, l.bias_updates_gpu, 1);
00262
00263 axpy_ongpu(size, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
00264 axpy_ongpu(size, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
00265 scal_ongpu(size, momentum, l.weight_updates_gpu, 1);
00266 }
00267
00268 void pull_local_layer(local_layer l)
00269 {
00270 int locations = l.out_w*l.out_h;
00271 int size = l.size*l.size*l.c*l.n*locations;
00272 cuda_pull_array(l.weights_gpu, l.weights, size);
00273 cuda_pull_array(l.biases_gpu, l.biases, l.outputs);
00274 }
00275
00276 void push_local_layer(local_layer l)
00277 {
00278 int locations = l.out_w*l.out_h;
00279 int size = l.size*l.size*l.c*l.n*locations;
00280 cuda_push_array(l.weights_gpu, l.weights, size);
00281 cuda_push_array(l.biases_gpu, l.biases, l.outputs);
00282 }
00283 #endif