jsk_recognition_utils: roi_pooling

Go to the documentation of this file.
00001 # Modified work:
00002 # -----------------------------------------------------------------------------
00003 # Copyright (c) 2015 Preferred Infrastructure, Inc.
00004 # Copyright (c) 2015 Preferred Networks, Inc.
00005 # -----------------------------------------------------------------------------
00006 
00007 # Original work of _roi_pooling_slice, forward_cpu and backward_cpu:
00008 # -----------------------------------------------------------------------------
00009 # Copyright 2014 Nervana Systems Inc.
00010 # Licensed under the Apache License, Version 2.0 (the "License");
00011 # you may not use this file except in compliance with the License.
00012 # You may obtain a copy of the License at
00013 #
00014 #      http://www.apache.org/licenses/LICENSE-2.0
00015 #
00016 # Unless required by applicable law or agreed to in writing, software
00017 # distributed under the License is distributed on an "AS IS" BASIS,
00018 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00019 # See the License for the specific language governing permissions and
00020 # limitations under the License.
00021 # -----------------------------------------------------------------------------
00022 
00023 # Original work of forward_gpu and backward_gpu:
00024 # -----------------------------------------------------------------------------
00025 # Fast R-CNN
00026 # Copyright (c) 2015 Microsoft
00027 # Licensed under The MIT License [see fast-rcnn/LICENSE for details]
00028 # Written by Ross Girshick
00029 # -----------------------------------------------------------------------------
00030 
00031 import numpy
00032 import six
00033 
00034 from chainer import cuda
00035 from chainer import function
00036 from chainer.utils import type_check
00037 
00038 
00039 def _roi_pooling_slice(size, stride, max_size, roi_offset):
00040     start = int(numpy.floor(size * stride))
00041     end = int(numpy.ceil((size + 1) * stride))
00042 
00043     start = min(max(start + roi_offset, 0), max_size)
00044     end = min(max(end + roi_offset, 0), max_size)
00045 
00046     return slice(start, end), end - start
00047 
00048 
class ROIPooling2D(function.Function):

    """RoI max pooling over a set of 2d planes.

    The function takes two inputs: a 4-d float32 feature map and a 2-d
    float32 array of RoIs with five columns. Each RoI row is read as
    ``(batch_index, xmin, ymin, xmax, ymax)``; the four coordinates are
    multiplied by ``spatial_scale`` and rounded before use. Every RoI is
    divided into ``outh x outw`` bins and the maximum of each bin is taken
    per channel, giving an output of shape
    ``(n_rois, channels, outh, outw)``.

    Args:
        outh (int): Number of output bins along the height axis.
        outw (int): Number of output bins along the width axis.
        spatial_scale (float): Factor applied to RoI coordinates to map
            them onto the feature map.

    """

    def __init__(self, outh, outw, spatial_scale):
        self.outh, self.outw = outh, outw
        self.spatial_scale = spatial_scale

    def check_type_forward(self, in_types):
        """Require a 4-d float32 input and a float32 ``(N, 5)`` RoI array."""
        type_check.expect(in_types.size() == 2)

        x_type, roi_type = in_types
        type_check.expect(
            x_type.dtype == numpy.float32,
            x_type.ndim == 4,
            roi_type.dtype == numpy.float32,
            roi_type.ndim == 2,
            roi_type.shape[1] == 5,
        )

    def _scaled_roi(self, roi):
        """Return one RoI row as ints ``(idx, xmin, ymin, xmax, ymax)``.

        The coordinates are scaled by ``spatial_scale`` and rounded.
        """
        # NOTE(review): Python's round() and CUDA's round() may treat exact
        # .5 ties differently; confirm if CPU/GPU parity on ties matters.
        idx, xmin, ymin, xmax, ymax = roi
        scale = self.spatial_scale
        return (int(idx),
                int(round(xmin * scale)),
                int(round(ymin * scale)),
                int(round(xmax * scale)),
                int(round(ymax * scale)))

    def forward_cpu(self, inputs):
        """Pool every RoI on the CPU and remember the argmax positions.

        Args:
            inputs (tuple): ``(bottom_data, bottom_rois)`` arrays.

        Returns:
            tuple: One-element tuple holding the pooled array of shape
            ``(n_rois, channels, outh, outw)``.

        """
        bottom_data, bottom_rois = inputs
        channels, height, width = bottom_data.shape[1:]
        # The output has one entry per RoI, not per image in the batch.
        n_rois = bottom_rois.shape[0]
        # Bins whose window is empty are skipped below, so initialise them
        # the same way the GPU kernel does: value 0 and argmax -1.
        top_data = numpy.zeros((n_rois, channels, self.outh, self.outw),
                               dtype=numpy.float32)
        self.argmax_data = numpy.full(top_data.shape, -1, dtype=numpy.int32)

        for i_roi in six.moves.range(n_rois):
            idx, xmin, ymin, xmax, ymax = self._scaled_roi(
                bottom_rois[i_roi])
            # Malformed RoIs (max < min) are treated as 1x1.
            roi_width = max(xmax - xmin + 1, 1)
            roi_height = max(ymax - ymin + 1, 1)
            strideh = 1. * roi_height / self.outh
            stridew = 1. * roi_width / self.outw

            for ph in six.moves.range(self.outh):
                sliceh, lenh = _roi_pooling_slice(
                    ph, strideh, height, ymin)
                if sliceh.stop <= sliceh.start:
                    continue
                for pw in six.moves.range(self.outw):
                    slicew, lenw = _roi_pooling_slice(
                        pw, stridew, width, xmin)
                    if slicew.stop <= slicew.start:
                        continue
                    roi_data = bottom_data[idx, :, sliceh, slicew]\
                        .reshape(channels, -1)
                    top_data[i_roi, :, ph, pw] =\
                        numpy.max(roi_data, axis=1)

                    # Convert the per-channel argmax inside the window into
                    # a flat index (h * width + w) on the feature map.
                    win_h, win_w = numpy.unravel_index(
                        numpy.argmax(roi_data, axis=1), (lenh, lenw))
                    map_h = win_h + sliceh.start
                    map_w = win_w + slicew.start
                    self.argmax_data[i_roi, :, ph, pw] = \
                        map_h * width + map_w
        return top_data,

    def forward_gpu(self, inputs):
        """Pool every RoI on the GPU with an elementwise CUDA kernel.

        Args:
            inputs (tuple): ``(bottom_data, bottom_rois)`` arrays.

        Returns:
            tuple: One-element tuple holding the pooled array of shape
            ``(n_rois, channels, outh, outw)``.

        """
        bottom_data, bottom_rois = inputs
        channels, height, width = bottom_data.shape[1:]
        n_rois = bottom_rois.shape[0]
        top_data = cuda.cupy.empty((n_rois, channels, self.outh,
                                    self.outw), dtype=numpy.float32)
        # The kernel writes every element of both outputs, so they do not
        # need to be initialised here.
        self.argmax_data = cuda.cupy.empty_like(top_data).astype(numpy.int32)
        cuda.cupy.ElementwiseKernel(
            '''
            raw float32 bottom_data, float32 spatial_scale, int32 channels,
            int32 height, int32 width, int32 pooled_height, int32 pooled_width,
            raw float32 bottom_rois
            ''',
            'float32 top_data, int32 argmax_data',
            '''
            // pos in output filter
            int pw = i % pooled_width;
            int ph = (i / pooled_width) % pooled_height;
            int c = (i / pooled_width / pooled_height) % channels;
            int num = i / pooled_width / pooled_height / channels;

            int roi_batch_ind = bottom_rois[num * 5 + 0];
            int roi_start_w = round(bottom_rois[num * 5 + 1] * spatial_scale);
            int roi_start_h = round(bottom_rois[num * 5 + 2] * spatial_scale);
            int roi_end_w = round(bottom_rois[num * 5 + 3] * spatial_scale);
            int roi_end_h = round(bottom_rois[num * 5 + 4] * spatial_scale);

            // Force malformed ROIs to be 1x1
            int roi_width = max(roi_end_w - roi_start_w + 1, 1);
            int roi_height = max(roi_end_h - roi_start_h + 1, 1);
            float bin_size_h = static_cast<float>(roi_height)
                           / static_cast<float>(pooled_height);
            float bin_size_w = static_cast<float>(roi_width)
                           / static_cast<float>(pooled_width);

            int hstart = static_cast<int>(floor(static_cast<float>(ph)
                                          * bin_size_h));
            int wstart = static_cast<int>(floor(static_cast<float>(pw)
                                          * bin_size_w));
            int hend = static_cast<int>(ceil(static_cast<float>(ph + 1)
                                        * bin_size_h));
            int wend = static_cast<int>(ceil(static_cast<float>(pw + 1)
                                        * bin_size_w));

            // Add roi offsets and clip to input boundaries
            hstart = min(max(hstart + roi_start_h, 0), height);
            hend = min(max(hend + roi_start_h, 0), height);
            wstart = min(max(wstart + roi_start_w, 0), width);
            wend = min(max(wend + roi_start_w, 0), width);
            bool is_empty = (hend <= hstart) || (wend <= wstart);

            // Define an empty pooling region to be zero
            float maxval = is_empty ? 0 : -1E+37;
            // If nothing is pooled, argmax=-1 causes nothing to be backprop'd
            int maxidx = -1;
            int data_offset = (roi_batch_ind * channels + c) * height * width;
            for (int h = hstart; h < hend; ++h) {
                for (int w = wstart; w < wend; ++w) {
                    int bottom_index = h * width + w;
                    if (bottom_data[data_offset + bottom_index] > maxval) {
                        maxval = bottom_data[data_offset + bottom_index];
                        maxidx = bottom_index;
                    }
                }
            }
            top_data = maxval;
            argmax_data = maxidx;
            ''', 'roi_poolig_2d_fwd'
        )(bottom_data, self.spatial_scale, channels, height, width,
          self.outh, self.outw, bottom_rois, top_data,
          self.argmax_data)

        return top_data,

    def backward_cpu(self, inputs, gy):
        """Route output gradients back to the argmax input positions (CPU).

        Uses ``self.argmax_data`` stored by the forward pass.

        Args:
            inputs (tuple): ``(bottom_data, bottom_rois)`` arrays.
            gy (tuple): Gradients w.r.t. the output; only ``gy[0]`` is read.

        Returns:
            tuple: ``(bottom_delta, None)``; no gradient is computed for
            the RoIs.

        """
        bottom_data, bottom_rois = inputs
        channels, height, width = bottom_data.shape[1:]
        # Iterate over RoIs, not over images in the batch.
        n_rois = bottom_rois.shape[0]
        bottom_delta = numpy.zeros_like(bottom_data, dtype=numpy.float32)
        top_diff = gy[0]

        for i_roi in six.moves.range(n_rois):
            idx, xmin, ymin, xmax, ymax = self._scaled_roi(
                bottom_rois[i_roi])
            roi_width = max(xmax - xmin + 1, 1)
            roi_height = max(ymax - ymin + 1, 1)

            strideh = float(roi_height) / float(self.outh)
            stridew = float(roi_width) / float(self.outw)

            # Visit the feature-map cells covered by this RoI. The ranges
            # are clipped to the map so that out-of-range coordinates can
            # neither alias another cell's flat index nor index outside
            # bottom_delta (the GPU kernel likewise only visits real cells).
            for w in six.moves.range(max(xmin, 0), min(xmax + 1, width)):
                for h in six.moves.range(max(ymin, 0),
                                         min(ymax + 1, height)):
                    # Range of output bins whose window may contain (h, w).
                    phstart = int(numpy.floor(float(h - ymin) / strideh))
                    phend = int(numpy.ceil(float(h - ymin + 1) / strideh))
                    pwstart = int(numpy.floor(float(w - xmin) / stridew))
                    pwend = int(numpy.ceil(float(w - xmin + 1) / stridew))

                    phstart = min(max(phstart, 0), self.outh)
                    phend = min(max(phend, 0), self.outh)
                    pwstart = min(max(pwstart, 0), self.outw)
                    pwend = min(max(pwend, 0), self.outw)

                    flat_index = h * width + w
                    for ph in six.moves.range(phstart, phend):
                        for pw in six.moves.range(pwstart, pwend):
                            # Channels for which this cell was the maximum.
                            hit = (self.argmax_data[i_roi, :, ph, pw]
                                   == flat_index)
                            bottom_delta[idx, hit, h, w] += \
                                top_diff[i_roi, hit, ph, pw]
        return bottom_delta, None

    def backward_gpu(self, inputs, gy):
        """Route output gradients back to the argmax input positions (GPU).

        Uses ``self.argmax_data`` stored by the forward pass.

        Args:
            inputs (tuple): ``(bottom_data, bottom_rois)`` arrays.
            gy (tuple): Gradients w.r.t. the output; only ``gy[0]`` is read.

        Returns:
            tuple: ``(bottom_diff, None)``; no gradient is computed for
            the RoIs.

        """
        bottom_data, bottom_rois = inputs
        channels, height, width = bottom_data.shape[1:]
        bottom_diff = cuda.cupy.zeros_like(bottom_data, dtype=numpy.float32)
        cuda.cupy.ElementwiseKernel(
            '''
            raw float32 top_diff, raw int32 argmax_data, int32 num_rois,
            float32 spatial_scale, int32 channels, int32 height, int32 width,
            int32 pooled_height, int32 pooled_width, raw float32 bottom_rois
            ''',
            'float32 bottom_diff',
            '''
            int w = i % width;
            int h = (i / width) % height;
            int c = (i / (width * height)) % channels;
            int num = i / (width * height * channels);

            float gradient = 0;
            // Accumulate gradient over all ROIs that pooled this element
            for (int roi_n = 0; roi_n < num_rois; ++roi_n) {
                // Skip if ROI's batch index doesn't match num
                if (num != static_cast<int>(bottom_rois[roi_n * 5])) {
                    continue;
                }

                int roi_start_w = round(bottom_rois[roi_n * 5 + 1]
                                        * spatial_scale);
                int roi_start_h = round(bottom_rois[roi_n * 5 + 2]
                                        * spatial_scale);
                int roi_end_w = round(bottom_rois[roi_n * 5 + 3]
                                      * spatial_scale);
                int roi_end_h = round(bottom_rois[roi_n * 5 + 4]
                                      * spatial_scale);

                // Skip if ROI doesn't include (h, w)
                const bool in_roi = (w >= roi_start_w && w <= roi_end_w &&
                                     h >= roi_start_h && h <= roi_end_h);
                if (!in_roi) {
                    continue;
                }

                int offset = (roi_n * channels + c) * pooled_height
                             * pooled_width;

                // Compute feasible set of pooled units that could have pooled
                // this bottom unit

                // Force malformed ROIs to be 1x1
                int roi_width = max(roi_end_w - roi_start_w + 1, 1);
                int roi_height = max(roi_end_h - roi_start_h + 1, 1);

                float bin_size_h = static_cast<float>(roi_height)
                               / static_cast<float>(pooled_height);
                float bin_size_w = static_cast<float>(roi_width)
                               / static_cast<float>(pooled_width);

                int phstart = floor(static_cast<float>(h - roi_start_h)
                                    / bin_size_h);
                int phend = ceil(static_cast<float>(h - roi_start_h + 1)
                                 / bin_size_h);
                int pwstart = floor(static_cast<float>(w - roi_start_w)
                                    / bin_size_w);
                int pwend = ceil(static_cast<float>(w - roi_start_w + 1)
                                 / bin_size_w);

                phstart = min(max(phstart, 0), pooled_height);
                phend = min(max(phend, 0), pooled_height);
                pwstart = min(max(pwstart, 0), pooled_width);
                pwend = min(max(pwend, 0), pooled_width);

                for (int ph = phstart; ph < phend; ++ph) {
                    for (int pw = pwstart; pw < pwend; ++pw) {
                        int index_ = ph * pooled_width + pw + offset;
                        if (argmax_data[index_] == (h * width + w)) {
                            gradient += top_diff[index_];
                        }
                    }
                }
            }
            bottom_diff = gradient;
            ''', 'roi_pooling_2d_bwd'
        )(gy[0], self.argmax_data, bottom_rois.shape[0], self.spatial_scale,
          channels, height, width, self.outh, self.outw,
          bottom_rois, bottom_diff)

        return bottom_diff, None
00311 
00312 
def roi_pooling_2d(x, rois, outh, outw, spatial_scale):
    """Spatial Region of Interest (ROI) pooling function.

    Behaves much like :class:`~functions.MaxPooling2D`, except that the
    per-channel maximum is taken over patches of the input selected by
    each region of interest.

    Args:
        x (~chainer.Variable): Input variable.
        rois (~chainer.Variable): Input roi variable.
        outh (int): Height of the pooled output.
        outw (int): Width of the pooled output.
        spatial_scale (float): Factor by which roi coordinates are scaled.

    Returns:
        ~chainer.Variable: Output variable.

    See the original paper proposing ROIPooling:
    `Fast R-CNN <http://arxiv.org/abs/1504.08083>`_.

    """
    pooling = ROIPooling2D(outh, outw, spatial_scale)
    return pooling(x, rois)