00001 #include "gru_layer.h" 00002 #include "connected_layer.h" 00003 #include "utils.h" 00004 #include "cuda.h" 00005 #include "blas.h" 00006 #include "gemm.h" 00007 00008 #include <math.h> 00009 #include <stdio.h> 00010 #include <stdlib.h> 00011 #include <string.h> 00012 00013 static void increment_layer(layer *l, int steps) 00014 { 00015 int num = l->outputs*l->batch*steps; 00016 l->output += num; 00017 l->delta += num; 00018 l->x += num; 00019 l->x_norm += num; 00020 00021 #ifdef GPU 00022 l->output_gpu += num; 00023 l->delta_gpu += num; 00024 l->x_gpu += num; 00025 l->x_norm_gpu += num; 00026 #endif 00027 } 00028 00029 layer make_gru_layer(int batch, int inputs, int outputs, int steps, int batch_normalize) 00030 { 00031 fprintf(stderr, "GRU Layer: %d inputs, %d outputs\n", inputs, outputs); 00032 batch = batch / steps; 00033 layer l = {0}; 00034 l.batch = batch; 00035 l.type = GRU; 00036 l.steps = steps; 00037 l.inputs = inputs; 00038 00039 l.input_z_layer = malloc(sizeof(layer)); 00040 fprintf(stderr, "\t\t"); 00041 *(l.input_z_layer) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize); 00042 l.input_z_layer->batch = batch; 00043 00044 l.state_z_layer = malloc(sizeof(layer)); 00045 fprintf(stderr, "\t\t"); 00046 *(l.state_z_layer) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize); 00047 l.state_z_layer->batch = batch; 00048 00049 00050 00051 l.input_r_layer = malloc(sizeof(layer)); 00052 fprintf(stderr, "\t\t"); 00053 *(l.input_r_layer) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize); 00054 l.input_r_layer->batch = batch; 00055 00056 l.state_r_layer = malloc(sizeof(layer)); 00057 fprintf(stderr, "\t\t"); 00058 *(l.state_r_layer) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize); 00059 l.state_r_layer->batch = batch; 00060 00061 00062 00063 l.input_h_layer = malloc(sizeof(layer)); 00064 fprintf(stderr, "\t\t"); 00065 *(l.input_h_layer) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize); 00066 l.input_h_layer->batch = batch; 00067 00068 l.state_h_layer = malloc(sizeof(layer)); 00069 fprintf(stderr, "\t\t"); 00070 *(l.state_h_layer) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize); 00071 l.state_h_layer->batch = batch; 00072 00073 l.batch_normalize = batch_normalize; 00074 00075 00076 l.outputs = outputs; 00077 l.output = calloc(outputs*batch*steps, sizeof(float)); 00078 l.delta = calloc(outputs*batch*steps, sizeof(float)); 00079 l.state = calloc(outputs*batch, sizeof(float)); 00080 l.prev_state = calloc(outputs*batch, sizeof(float)); 00081 l.forgot_state = calloc(outputs*batch, sizeof(float)); 00082 l.forgot_delta = calloc(outputs*batch, sizeof(float)); 00083 00084 l.r_cpu = calloc(outputs*batch, sizeof(float)); 00085 l.z_cpu = calloc(outputs*batch, sizeof(float)); 00086 l.h_cpu = calloc(outputs*batch, sizeof(float)); 00087 00088 l.forward = forward_gru_layer; 00089 l.backward = backward_gru_layer; 00090 l.update = update_gru_layer; 00091 00092 #ifdef GPU 00093 l.forward_gpu = forward_gru_layer_gpu; 00094 l.backward_gpu = backward_gru_layer_gpu; 00095 l.update_gpu = update_gru_layer_gpu; 00096 00097 l.forgot_state_gpu = cuda_make_array(l.output, batch*outputs); 00098 l.forgot_delta_gpu = cuda_make_array(l.output, batch*outputs); 00099 l.prev_state_gpu = cuda_make_array(l.output, batch*outputs); 00100 l.state_gpu = cuda_make_array(l.output, batch*outputs); 00101 l.output_gpu = cuda_make_array(l.output, batch*outputs*steps); 00102 l.delta_gpu = cuda_make_array(l.delta, batch*outputs*steps); 00103 l.r_gpu = cuda_make_array(l.output_gpu, batch*outputs); 00104 l.z_gpu = cuda_make_array(l.output_gpu, batch*outputs); 00105 l.h_gpu = cuda_make_array(l.output_gpu, batch*outputs); 00106 #endif 00107 00108 return l; 00109 } 00110 00111 void update_gru_layer(layer l, int batch, float learning_rate, float momentum, float decay) 00112 { 00113 update_connected_layer(*(l.input_layer), batch, learning_rate, momentum, decay); 00114 update_connected_layer(*(l.self_layer), batch, learning_rate, momentum, decay); 00115 update_connected_layer(*(l.output_layer), batch, learning_rate, momentum, decay); 00116 } 00117 00118 void forward_gru_layer(layer l, network_state state) 00119 { 00120 network_state s = {0}; 00121 s.train = state.train; 00122 int i; 00123 layer input_z_layer = *(l.input_z_layer); 00124 layer input_r_layer = *(l.input_r_layer); 00125 layer input_h_layer = *(l.input_h_layer); 00126 00127 layer state_z_layer = *(l.state_z_layer); 00128 layer state_r_layer = *(l.state_r_layer); 00129 layer state_h_layer = *(l.state_h_layer); 00130 00131 fill_cpu(l.outputs * l.batch * l.steps, 0, input_z_layer.delta, 1); 00132 fill_cpu(l.outputs * l.batch * l.steps, 0, input_r_layer.delta, 1); 00133 fill_cpu(l.outputs * l.batch * l.steps, 0, input_h_layer.delta, 1); 00134 00135 fill_cpu(l.outputs * l.batch * l.steps, 0, state_z_layer.delta, 1); 00136 fill_cpu(l.outputs * l.batch * l.steps, 0, state_r_layer.delta, 1); 00137 fill_cpu(l.outputs * l.batch * l.steps, 0, state_h_layer.delta, 1); 00138 if(state.train) { 00139 fill_cpu(l.outputs * l.batch * l.steps, 0, l.delta, 1); 00140 copy_cpu(l.outputs*l.batch, l.state, 1, l.prev_state, 1); 00141 } 00142 00143 for (i = 0; i < l.steps; ++i) { 00144 s.input = l.state; 00145 forward_connected_layer(state_z_layer, s); 00146 forward_connected_layer(state_r_layer, s); 00147 00148 s.input = state.input; 00149 forward_connected_layer(input_z_layer, s); 00150 forward_connected_layer(input_r_layer, s); 00151 forward_connected_layer(input_h_layer, s); 00152 00153 00154 copy_cpu(l.outputs*l.batch, input_z_layer.output, 1, l.z_cpu, 1); 00155 axpy_cpu(l.outputs*l.batch, 1, state_z_layer.output, 1, l.z_cpu, 1); 00156 00157 copy_cpu(l.outputs*l.batch, input_r_layer.output, 1, l.r_cpu, 1); 00158 axpy_cpu(l.outputs*l.batch, 1, state_r_layer.output, 1, l.r_cpu, 1); 00159 00160 activate_array(l.z_cpu, l.outputs*l.batch, LOGISTIC); 00161 activate_array(l.r_cpu, l.outputs*l.batch, LOGISTIC); 00162 00163 copy_cpu(l.outputs*l.batch, l.state, 1, l.forgot_state, 1); 00164 mul_cpu(l.outputs*l.batch, l.r_cpu, 1, l.forgot_state, 1); 00165 00166 s.input = l.forgot_state; 00167 forward_connected_layer(state_h_layer, s); 00168 00169 copy_cpu(l.outputs*l.batch, input_h_layer.output, 1, l.h_cpu, 1); 00170 axpy_cpu(l.outputs*l.batch, 1, state_h_layer.output, 1, l.h_cpu, 1); 00171 00172 #ifdef USET 00173 activate_array(l.h_cpu, l.outputs*l.batch, TANH); 00174 #else 00175 activate_array(l.h_cpu, l.outputs*l.batch, LOGISTIC); 00176 #endif 00177 00178 weighted_sum_cpu(l.state, l.h_cpu, l.z_cpu, l.outputs*l.batch, l.output); 00179 00180 copy_cpu(l.outputs*l.batch, l.output, 1, l.state, 1); 00181 00182 state.input += l.inputs*l.batch; 00183 l.output += l.outputs*l.batch; 00184 increment_layer(&input_z_layer, 1); 00185 increment_layer(&input_r_layer, 1); 00186 increment_layer(&input_h_layer, 1); 00187 00188 increment_layer(&state_z_layer, 1); 00189 increment_layer(&state_r_layer, 1); 00190 increment_layer(&state_h_layer, 1); 00191 } 00192 } 00193 00194 void backward_gru_layer(layer l, network_state state) 00195 { 00196 } 00197 00198 #ifdef GPU 00199 00200 void pull_gru_layer(layer l) 00201 { 00202 } 00203 00204 void push_gru_layer(layer l) 00205 { 00206 } 00207 00208 void update_gru_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay) 00209 { 00210 update_connected_layer_gpu(*(l.input_r_layer), batch, learning_rate, momentum, decay); 00211 update_connected_layer_gpu(*(l.input_z_layer), batch, learning_rate, momentum, decay); 00212 update_connected_layer_gpu(*(l.input_h_layer), batch, learning_rate, momentum, decay); 00213 update_connected_layer_gpu(*(l.state_r_layer), batch, learning_rate, momentum, decay); 00214 update_connected_layer_gpu(*(l.state_z_layer), batch, learning_rate, momentum, decay); 00215 update_connected_layer_gpu(*(l.state_h_layer), batch, learning_rate, momentum, decay); 00216 } 00217 00218 void forward_gru_layer_gpu(layer l, network_state state) 00219 { 00220 network_state s = {0}; 00221 s.train = state.train; 00222 int i; 00223 layer input_z_layer = *(l.input_z_layer); 00224 layer input_r_layer = *(l.input_r_layer); 00225 layer input_h_layer = *(l.input_h_layer); 00226 00227 layer state_z_layer = *(l.state_z_layer); 00228 layer state_r_layer = *(l.state_r_layer); 00229 layer state_h_layer = *(l.state_h_layer); 00230 00231 fill_ongpu(l.outputs * l.batch * l.steps, 0, input_z_layer.delta_gpu, 1); 00232 fill_ongpu(l.outputs * l.batch * l.steps, 0, input_r_layer.delta_gpu, 1); 00233 fill_ongpu(l.outputs * l.batch * l.steps, 0, input_h_layer.delta_gpu, 1); 00234 00235 fill_ongpu(l.outputs * l.batch * l.steps, 0, state_z_layer.delta_gpu, 1); 00236 fill_ongpu(l.outputs * l.batch * l.steps, 0, state_r_layer.delta_gpu, 1); 00237 fill_ongpu(l.outputs * l.batch * l.steps, 0, state_h_layer.delta_gpu, 1); 00238 if(state.train) { 00239 fill_ongpu(l.outputs * l.batch * l.steps, 0, l.delta_gpu, 1); 00240 copy_ongpu(l.outputs*l.batch, l.state_gpu, 1, l.prev_state_gpu, 1); 00241 } 00242 00243 for (i = 0; i < l.steps; ++i) { 00244 s.input = l.state_gpu; 00245 forward_connected_layer_gpu(state_z_layer, s); 00246 forward_connected_layer_gpu(state_r_layer, s); 00247 00248 s.input = state.input; 00249 forward_connected_layer_gpu(input_z_layer, s); 00250 forward_connected_layer_gpu(input_r_layer, s); 00251 forward_connected_layer_gpu(input_h_layer, s); 00252 00253 00254 copy_ongpu(l.outputs*l.batch, input_z_layer.output_gpu, 1, l.z_gpu, 1); 00255 axpy_ongpu(l.outputs*l.batch, 1, state_z_layer.output_gpu, 1, l.z_gpu, 1); 00256 00257 copy_ongpu(l.outputs*l.batch, input_r_layer.output_gpu, 1, l.r_gpu, 1); 00258 axpy_ongpu(l.outputs*l.batch, 1, state_r_layer.output_gpu, 1, l.r_gpu, 1); 00259 00260 activate_array_ongpu(l.z_gpu, l.outputs*l.batch, LOGISTIC); 00261 activate_array_ongpu(l.r_gpu, l.outputs*l.batch, LOGISTIC); 00262 00263 copy_ongpu(l.outputs*l.batch, l.state_gpu, 1, l.forgot_state_gpu, 1); 00264 mul_ongpu(l.outputs*l.batch, l.r_gpu, 1, l.forgot_state_gpu, 1); 00265 00266 s.input = l.forgot_state_gpu; 00267 forward_connected_layer_gpu(state_h_layer, s); 00268 00269 copy_ongpu(l.outputs*l.batch, input_h_layer.output_gpu, 1, l.h_gpu, 1); 00270 axpy_ongpu(l.outputs*l.batch, 1, state_h_layer.output_gpu, 1, l.h_gpu, 1); 00271 00272 #ifdef USET 00273 activate_array_ongpu(l.h_gpu, l.outputs*l.batch, TANH); 00274 #else 00275 activate_array_ongpu(l.h_gpu, l.outputs*l.batch, LOGISTIC); 00276 #endif 00277 00278 weighted_sum_gpu(l.state_gpu, l.h_gpu, l.z_gpu, l.outputs*l.batch, l.output_gpu); 00279 00280 copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.state_gpu, 1); 00281 00282 state.input += l.inputs*l.batch; 00283 l.output_gpu += l.outputs*l.batch; 00284 increment_layer(&input_z_layer, 1); 00285 increment_layer(&input_r_layer, 1); 00286 increment_layer(&input_h_layer, 1); 00287 00288 increment_layer(&state_z_layer, 1); 00289 increment_layer(&state_r_layer, 1); 00290 increment_layer(&state_h_layer, 1); 00291 } 00292 } 00293 00294 void backward_gru_layer_gpu(layer l, network_state state) 00295 { 00296 network_state s = {0}; 00297 s.train = state.train; 00298 int i; 00299 layer input_z_layer = *(l.input_z_layer); 00300 layer input_r_layer = *(l.input_r_layer); 00301 layer input_h_layer = *(l.input_h_layer); 00302 00303 layer state_z_layer = *(l.state_z_layer); 00304 layer state_r_layer = *(l.state_r_layer); 00305 layer state_h_layer = *(l.state_h_layer); 00306 00307 increment_layer(&input_z_layer, l.steps - 1); 00308 increment_layer(&input_r_layer, l.steps - 1); 00309 increment_layer(&input_h_layer, l.steps - 1); 00310 00311 increment_layer(&state_z_layer, l.steps - 1); 00312 increment_layer(&state_r_layer, l.steps - 1); 00313 increment_layer(&state_h_layer, l.steps - 1); 00314 00315 state.input += l.inputs*l.batch*(l.steps-1); 00316 if(state.delta) state.delta += l.inputs*l.batch*(l.steps-1); 00317 l.output_gpu += l.outputs*l.batch*(l.steps-1); 00318 l.delta_gpu += l.outputs*l.batch*(l.steps-1); 00319 for (i = l.steps-1; i >= 0; --i) { 00320 if(i != 0) copy_ongpu(l.outputs*l.batch, l.output_gpu - l.outputs*l.batch, 1, l.prev_state_gpu, 1); 00321 float *prev_delta_gpu = (i == 0) ? 0 : l.delta_gpu - l.outputs*l.batch; 00322 00323 copy_ongpu(l.outputs*l.batch, input_z_layer.output_gpu, 1, l.z_gpu, 1); 00324 axpy_ongpu(l.outputs*l.batch, 1, state_z_layer.output_gpu, 1, l.z_gpu, 1); 00325 00326 copy_ongpu(l.outputs*l.batch, input_r_layer.output_gpu, 1, l.r_gpu, 1); 00327 axpy_ongpu(l.outputs*l.batch, 1, state_r_layer.output_gpu, 1, l.r_gpu, 1); 00328 00329 activate_array_ongpu(l.z_gpu, l.outputs*l.batch, LOGISTIC); 00330 activate_array_ongpu(l.r_gpu, l.outputs*l.batch, LOGISTIC); 00331 00332 copy_ongpu(l.outputs*l.batch, input_h_layer.output_gpu, 1, l.h_gpu, 1); 00333 axpy_ongpu(l.outputs*l.batch, 1, state_h_layer.output_gpu, 1, l.h_gpu, 1); 00334 00335 #ifdef USET 00336 activate_array_ongpu(l.h_gpu, l.outputs*l.batch, TANH); 00337 #else 00338 activate_array_ongpu(l.h_gpu, l.outputs*l.batch, LOGISTIC); 00339 #endif 00340 00341 weighted_delta_gpu(l.prev_state_gpu, l.h_gpu, l.z_gpu, prev_delta_gpu, input_h_layer.delta_gpu, input_z_layer.delta_gpu, l.outputs*l.batch, l.delta_gpu); 00342 00343 #ifdef USET 00344 gradient_array_ongpu(l.h_gpu, l.outputs*l.batch, TANH, input_h_layer.delta_gpu); 00345 #else 00346 gradient_array_ongpu(l.h_gpu, l.outputs*l.batch, LOGISTIC, input_h_layer.delta_gpu); 00347 #endif 00348 00349 copy_ongpu(l.outputs*l.batch, input_h_layer.delta_gpu, 1, state_h_layer.delta_gpu, 1); 00350 00351 copy_ongpu(l.outputs*l.batch, l.prev_state_gpu, 1, l.forgot_state_gpu, 1); 00352 mul_ongpu(l.outputs*l.batch, l.r_gpu, 1, l.forgot_state_gpu, 1); 00353 fill_ongpu(l.outputs*l.batch, 0, l.forgot_delta_gpu, 1); 00354 00355 s.input = l.forgot_state_gpu; 00356 s.delta = l.forgot_delta_gpu; 00357 00358 backward_connected_layer_gpu(state_h_layer, s); 00359 if(prev_delta_gpu) mult_add_into_gpu(l.outputs*l.batch, l.forgot_delta_gpu, l.r_gpu, prev_delta_gpu); 00360 mult_add_into_gpu(l.outputs*l.batch, l.forgot_delta_gpu, l.prev_state_gpu, input_r_layer.delta_gpu); 00361 00362 gradient_array_ongpu(l.r_gpu, l.outputs*l.batch, LOGISTIC, input_r_layer.delta_gpu); 00363 copy_ongpu(l.outputs*l.batch, input_r_layer.delta_gpu, 1, state_r_layer.delta_gpu, 1); 00364 00365 gradient_array_ongpu(l.z_gpu, l.outputs*l.batch, LOGISTIC, input_z_layer.delta_gpu); 00366 copy_ongpu(l.outputs*l.batch, input_z_layer.delta_gpu, 1, state_z_layer.delta_gpu, 1); 00367 00368 s.input = l.prev_state_gpu; 00369 s.delta = prev_delta_gpu; 00370 00371 backward_connected_layer_gpu(state_r_layer, s); 00372 backward_connected_layer_gpu(state_z_layer, s); 00373 00374 s.input = state.input; 00375 s.delta = state.delta; 00376 00377 backward_connected_layer_gpu(input_h_layer, s); 00378 backward_connected_layer_gpu(input_r_layer, s); 00379 backward_connected_layer_gpu(input_z_layer, s); 00380 00381 00382 state.input -= l.inputs*l.batch; 00383 if(state.delta) state.delta -= l.inputs*l.batch; 00384 l.output_gpu -= l.outputs*l.batch; 00385 l.delta_gpu -= l.outputs*l.batch; 00386 increment_layer(&input_z_layer, -1); 00387 increment_layer(&input_r_layer, -1); 00388 increment_layer(&input_h_layer, -1); 00389 00390 increment_layer(&state_z_layer, -1); 00391 increment_layer(&state_r_layer, -1); 00392 increment_layer(&state_h_layer, -1); 00393 } 00394 } 00395 #endif