00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031 import numpy
00032 import six
00033
00034 from chainer import cuda
00035 from chainer import function
00036 from chainer.utils import type_check
00037
00038
00039 def _roi_pooling_slice(size, stride, max_size, roi_offset):
00040 start = int(numpy.floor(size * stride))
00041 end = int(numpy.ceil((size + 1) * stride))
00042
00043 start = min(max(start + roi_offset, 0), max_size)
00044 end = min(max(end + roi_offset, 0), max_size)
00045
00046 return slice(start, end), end - start
00047
00048
class ROIPooling2D(function.Function):

    """RoI pooling over a set of 2d planes.

    Takes a 4-dimensional float32 feature map and a float32 array of
    regions of interest with five columns per row, read as
    ``(batch_index, x_min, y_min, x_max, y_max)``. The coordinates are
    multiplied by ``spatial_scale`` and rounded to obtain feature-map
    positions. Every region is divided into ``outh x outw`` bins and each
    bin is max-pooled independently for every channel.

    Bins that do not overlap the feature map produce ``0`` and record
    ``-1`` as their argmax so that no gradient flows through them.

    Args:
        outh (int): Number of output bins along the height axis.
        outw (int): Number of output bins along the width axis.
        spatial_scale (float): Factor applied to RoI coordinates.

    """

    def __init__(self, outh, outw, spatial_scale):
        self.outh, self.outw = outh, outw
        self.spatial_scale = spatial_scale

    def check_type_forward(self, in_types):
        """Require a 4-d float32 input and a 2-d float32 (n, 5) RoI array."""
        type_check.expect(in_types.size() == 2)

        x_type, roi_type = in_types
        type_check.expect(
            x_type.dtype == numpy.float32,
            x_type.ndim == 4,
            roi_type.dtype == numpy.float32,
            roi_type.ndim == 2,
            roi_type.shape[1] == 5,
        )

    def forward_cpu(self, inputs):
        """Max-pool every RoI on the CPU.

        Stores the flat ``h * width + w`` index of each maximum in
        ``self.argmax_data`` for use by the backward pass.

        Returns:
            tuple: One array of shape ``(n_rois, channels, outh, outw)``.

        """
        bottom_data, bottom_rois = inputs
        channels, height, width = bottom_data.shape[1:]
        # The number of outputs follows the number of RoIs, not the batch
        # size of the feature map.
        n_rois = bottom_rois.shape[0]
        # Empty bins are skipped in the loops below, so pre-fill with the
        # values the GPU kernel uses for them: 0 for the pooled value and
        # -1 for the argmax.
        top_data = numpy.zeros((n_rois, channels, self.outh, self.outw),
                               dtype=numpy.float32)
        self.argmax_data = numpy.full(top_data.shape, -1, dtype=numpy.int32)

        for i_roi in six.moves.range(n_rois):
            idx, xmin, ymin, xmax, ymax = bottom_rois[i_roi]
            idx = int(idx)
            xmin = int(round(xmin * self.spatial_scale))
            xmax = int(round(xmax * self.spatial_scale))
            ymin = int(round(ymin * self.spatial_scale))
            ymax = int(round(ymax * self.spatial_scale))
            # Force malformed RoIs to be 1x1.
            roi_width = max(xmax - xmin + 1, 1)
            roi_height = max(ymax - ymin + 1, 1)
            strideh = 1. * roi_height / self.outh
            stridew = 1. * roi_width / self.outw

            for outh in six.moves.range(self.outh):
                sliceh, lenh = _roi_pooling_slice(
                    outh, strideh, height, ymin)
                if sliceh.stop <= sliceh.start:
                    continue
                for outw in six.moves.range(self.outw):
                    slicew, lenw = _roi_pooling_slice(
                        outw, stridew, width, xmin)
                    if slicew.stop <= slicew.start:
                        continue
                    roi_data = bottom_data[idx, :, sliceh, slicew]\
                        .reshape(channels, -1)
                    top_data[i_roi, :, outh, outw] =\
                        numpy.max(roi_data, axis=1)

                    # Convert the argmax inside the bin into a flat index
                    # in feature-map coordinates.
                    max_idx_slice = numpy.unravel_index(
                        numpy.argmax(roi_data, axis=1), (lenh, lenw))
                    max_idx_slice_h = max_idx_slice[0] + sliceh.start
                    max_idx_slice_w = max_idx_slice[1] + slicew.start
                    max_idx_slice = max_idx_slice_h * width + max_idx_slice_w
                    self.argmax_data[i_roi, :, outh, outw] = max_idx_slice
        return top_data,

    def forward_gpu(self, inputs):
        """Max-pool every RoI with an elementwise CUDA kernel.

        One kernel element computes one output value together with the
        flat index of its maximum, which is kept in ``self.argmax_data``.

        Returns:
            tuple: One array of shape ``(n_rois, channels, outh, outw)``.

        """
        bottom_data, bottom_rois = inputs
        channels, height, width = bottom_data.shape[1:]
        n_rois = bottom_rois.shape[0]
        top_data = cuda.cupy.empty((n_rois, channels, self.outh,
                                    self.outw), dtype=numpy.float32)
        # Allocate the int32 buffer directly; the kernel writes every
        # element, so no initialisation is needed.
        self.argmax_data = cuda.cupy.empty(top_data.shape, numpy.int32)
        cuda.cupy.ElementwiseKernel(
            '''
            raw float32 bottom_data, float32 spatial_scale, int32 channels,
            int32 height, int32 width, int32 pooled_height, int32 pooled_width,
            raw float32 bottom_rois
            ''',
            'float32 top_data, int32 argmax_data',
            '''
            // pos in output filter
            int pw = i % pooled_width;
            int ph = (i / pooled_width) % pooled_height;
            int c = (i / pooled_width / pooled_height) % channels;
            int num = i / pooled_width / pooled_height / channels;

            int roi_batch_ind = bottom_rois[num * 5 + 0];
            int roi_start_w = round(bottom_rois[num * 5 + 1] * spatial_scale);
            int roi_start_h = round(bottom_rois[num * 5 + 2] * spatial_scale);
            int roi_end_w = round(bottom_rois[num * 5 + 3] * spatial_scale);
            int roi_end_h = round(bottom_rois[num * 5 + 4] * spatial_scale);

            // Force malformed ROIs to be 1x1
            int roi_width = max(roi_end_w - roi_start_w + 1, 1);
            int roi_height = max(roi_end_h - roi_start_h + 1, 1);
            float bin_size_h = static_cast<float>(roi_height)
                           / static_cast<float>(pooled_height);
            float bin_size_w = static_cast<float>(roi_width)
                           / static_cast<float>(pooled_width);

            int hstart = static_cast<int>(floor(static_cast<float>(ph)
                                          * bin_size_h));
            int wstart = static_cast<int>(floor(static_cast<float>(pw)
                                          * bin_size_w));
            int hend = static_cast<int>(ceil(static_cast<float>(ph + 1)
                                        * bin_size_h));
            int wend = static_cast<int>(ceil(static_cast<float>(pw + 1)
                                        * bin_size_w));

            // Add roi offsets and clip to input boundaries
            hstart = min(max(hstart + roi_start_h, 0), height);
            hend = min(max(hend + roi_start_h, 0), height);
            wstart = min(max(wstart + roi_start_w, 0), width);
            wend = min(max(wend + roi_start_w, 0), width);
            bool is_empty = (hend <= hstart) || (wend <= wstart);

            // Define an empty pooling region to be zero
            float maxval = is_empty ? 0 : -1E+37;
            // If nothing is pooled, argmax=-1 causes nothing to be backprop'd
            int maxidx = -1;
            int data_offset = (roi_batch_ind * channels + c) * height * width;
            for (int h = hstart; h < hend; ++h) {
                for (int w = wstart; w < wend; ++w) {
                    int bottom_index = h * width + w;
                    if (bottom_data[data_offset + bottom_index] > maxval) {
                        maxval = bottom_data[data_offset + bottom_index];
                        maxidx = bottom_index;
                    }
                }
            }
            top_data = maxval;
            argmax_data = maxidx;
            ''', 'roi_pooling_2d_fwd'
        )(bottom_data, self.spatial_scale, channels, height, width,
          self.outh, self.outw, bottom_rois, top_data,
          self.argmax_data)

        return top_data,

    def backward_cpu(self, inputs, gy):
        """Route output gradients back to the pooled maxima on the CPU.

        For every feature-map pixel inside a RoI, the bins that could have
        pooled it are visited and the output gradient is added wherever
        the stored argmax equals that pixel's flat index.

        Returns:
            tuple: Gradient with respect to the feature map, and ``None``
            for the RoIs.

        """
        bottom_data, bottom_rois = inputs
        channels, height, width = bottom_data.shape[1:]
        # Iterate over the RoIs, not over the batch of the feature map.
        n_rois = bottom_rois.shape[0]
        top_delta = gy[0]
        bottom_delta = numpy.zeros_like(bottom_data, dtype=numpy.float32)

        for i_roi in six.moves.range(n_rois):
            idx, xmin, ymin, xmax, ymax = bottom_rois[i_roi]
            idx = int(idx)
            xmin = int(round(xmin * self.spatial_scale))
            xmax = int(round(xmax * self.spatial_scale))
            ymin = int(round(ymin * self.spatial_scale))
            ymax = int(round(ymax * self.spatial_scale))
            roi_width = max(xmax - xmin + 1, 1)
            roi_height = max(ymax - ymin + 1, 1)

            strideh = float(roi_height) / float(self.outh)
            stridew = float(roi_width) / float(self.outw)

            # Only pixels that exist in the feature map can have been
            # pooled. Without clipping, an out-of-range (h, w) could alias
            # the flat index of another pixel and then index wrongly.
            w_begin, w_end = max(xmin, 0), min(xmax + 1, width)
            h_begin, h_end = max(ymin, 0), min(ymax + 1, height)

            for w in six.moves.range(w_begin, w_end):
                for h in six.moves.range(h_begin, h_end):
                    # Feasible set of bins that could contain (h, w).
                    phstart = int(numpy.floor(float(h - ymin) / strideh))
                    phend = int(numpy.ceil(float(h - ymin + 1) / strideh))
                    pwstart = int(numpy.floor(float(w - xmin) / stridew))
                    pwend = int(numpy.ceil(float(w - xmin + 1) / stridew))

                    phstart = min(max(phstart, 0), self.outh)
                    phend = min(max(phend, 0), self.outh)
                    pwstart = min(max(pwstart, 0), self.outw)
                    pwend = min(max(pwend, 0), self.outw)

                    flat_index = h * width + w
                    for ph in six.moves.range(phstart, phend):
                        for pw in six.moves.range(pwstart, pwend):
                            max_idx_tmp = self.argmax_data[i_roi, :, ph, pw]
                            for c in six.moves.range(channels):
                                if max_idx_tmp[c] == flat_index:
                                    bottom_delta[idx, c, h, w] += \
                                        top_delta[i_roi, c, ph, pw]
        return bottom_delta, None

    def backward_gpu(self, inputs, gy):
        """Route output gradients back to the pooled maxima on the GPU.

        One kernel element handles one feature-map value and accumulates
        the gradient over every RoI bin whose stored argmax points at it.

        Returns:
            tuple: Gradient with respect to the feature map, and ``None``
            for the RoIs.

        """
        bottom_data, bottom_rois = inputs
        channels, height, width = bottom_data.shape[1:]
        bottom_diff = cuda.cupy.zeros_like(bottom_data, dtype=numpy.float32)
        cuda.cupy.ElementwiseKernel(
            '''
            raw float32 top_diff, raw int32 argmax_data, int32 num_rois,
            float32 spatial_scale, int32 channels, int32 height, int32 width,
            int32 pooled_height, int32 pooled_width, raw float32 bottom_rois
            ''',
            'float32 bottom_diff',
            '''
            int w = i % width;
            int h = (i / width) % height;
            int c = (i / (width * height)) % channels;
            int num = i / (width * height * channels);

            float gradient = 0;
            // Accumulate gradient over all ROIs that pooled this element
            for (int roi_n = 0; roi_n < num_rois; ++roi_n) {
                // Skip if ROI's batch index doesn't match num
                if (num != static_cast<int>(bottom_rois[roi_n * 5])) {
                    continue;
                }

                int roi_start_w = round(bottom_rois[roi_n * 5 + 1]
                                        * spatial_scale);
                int roi_start_h = round(bottom_rois[roi_n * 5 + 2]
                                        * spatial_scale);
                int roi_end_w = round(bottom_rois[roi_n * 5 + 3]
                                      * spatial_scale);
                int roi_end_h = round(bottom_rois[roi_n * 5 + 4]
                                      * spatial_scale);

                // Skip if ROI doesn't include (h, w)
                const bool in_roi = (w >= roi_start_w && w <= roi_end_w &&
                                     h >= roi_start_h && h <= roi_end_h);
                if (!in_roi) {
                    continue;
                }

                int offset = (roi_n * channels + c) * pooled_height
                             * pooled_width;

                // Compute feasible set of pooled units that could have pooled
                // this bottom unit

                // Force malformed ROIs to be 1x1
                int roi_width = max(roi_end_w - roi_start_w + 1, 1);
                int roi_height = max(roi_end_h - roi_start_h + 1, 1);

                float bin_size_h = static_cast<float>(roi_height)
                               / static_cast<float>(pooled_height);
                float bin_size_w = static_cast<float>(roi_width)
                               / static_cast<float>(pooled_width);

                int phstart = floor(static_cast<float>(h - roi_start_h)
                                    / bin_size_h);
                int phend = ceil(static_cast<float>(h - roi_start_h + 1)
                                 / bin_size_h);
                int pwstart = floor(static_cast<float>(w - roi_start_w)
                                    / bin_size_w);
                int pwend = ceil(static_cast<float>(w - roi_start_w + 1)
                                 / bin_size_w);

                phstart = min(max(phstart, 0), pooled_height);
                phend = min(max(phend, 0), pooled_height);
                pwstart = min(max(pwstart, 0), pooled_width);
                pwend = min(max(pwend, 0), pooled_width);

                for (int ph = phstart; ph < phend; ++ph) {
                    for (int pw = pwstart; pw < pwend; ++pw) {
                        int index_ = ph * pooled_width + pw + offset;
                        if (argmax_data[index_] == (h * width + w)) {
                            gradient += top_diff[index_];
                        }
                    }
                }
            }
            bottom_diff = gradient;
            ''', 'roi_pooling_2d_bwd'
        )(gy[0], self.argmax_data, bottom_rois.shape[0], self.spatial_scale,
          channels, height, width, self.outh, self.outw,
          bottom_rois, bottom_diff)

        return bottom_diff, None
00311
00312
def roi_pooling_2d(x, rois, outh, outw, spatial_scale):
    """Spatial Region of Interest (ROI) pooling function.

    This function acts similarly to :class:`~functions.MaxPooling2D`, but
    it computes the maximum of the input spatial patch for each channel
    within each region of interest.

    Args:
        x (~chainer.Variable): Input variable.
        rois (~chainer.Variable): Input roi variable.
        outh (int): Height of the output image after pooling.
        outw (int): Width of the output image after pooling.
        spatial_scale (float): Scale by which the roi is resized.

    Returns:
        ~chainer.Variable: Output variable.

    See the original paper proposing ROIPooling:
    `Fast R-CNN <http://arxiv.org/abs/1504.08083>`_.

    """
    pooling = ROIPooling2D(outh, outw, spatial_scale)
    return pooling(x, rois)