Program Listing for File xf_custom_convolution.hpp

Return to documentation for file (/tmp/ws/src/vitis_common/include/imgproc/xf_custom_convolution.hpp)

/*
 * Copyright 2019 Xilinx, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef _XF_CUSTOM_CONVOLUTION_HPP_
#define _XF_CUSTOM_CONVOLUTION_HPP_

#include "../common/xf_common.hpp"
#include "../common/xf_utility.hpp"
#include "hls_stream.h"
typedef unsigned char uchar;

namespace xf {
namespace cv {

/****************************************************************************************
 * xFApplyCustomFilter: Applies the user defined kernel to the input image.
 *
 * _lbuf       ->  Buffer containing the input image data
 * _kernel     ->  Kernel provided by the user of type 16S
 * shift       ->  Fixed point format of the filter co-efficients for unity
 *gain filter
 ****************************************************************************************/
template <int DEPTH_SRC,
          int DEPTH_DST,
          int filter_height,
          int filter_width,
          int NPC,
          int PLANES,
          int buf_width,
          typename buf_type>
XF_PTNAME(DEPTH_DST)
xFApplyCustomFilter(buf_type _lbuf[][buf_width], short int _kernel[][filter_width], int ind, unsigned char shift) {
// clang-format off
    #pragma HLS INLINE off
    // clang-format on
    XF_PTNAME(DEPTH_DST) res = 0;
    ap_int32_t tmp_res[PLANES];
    ap_int24_t conv_val[filter_height][filter_width][PLANES];
// clang-format off
    #pragma HLS ARRAY_PARTITION variable=conv_val complete dim=0
    // clang-format on

    ap_int32_t row_sum[filter_height][PLANES], fix_res = 0, tmp_row_sum = 0;
// clang-format off
    #pragma HLS ARRAY_PARTITION variable=row_sum complete dim=1
    // clang-format on

    XF_PTNAME(DEPTH_DST) arr_ind = ind;

// performing kernel operation and storing in the temporary buffer
filterLoopI:
    for (uchar i = 0; i < filter_height; i++) {
// clang-format off
#pragma HLS UNROLL
        // clang-format on
        arr_ind = ind;

    filterLoopJ:
        for (uchar j = 0; j < filter_width; j++) {
// clang-format off
#pragma HLS UNROLL
        // clang-format on
        planes_loop1:
            for (uchar k = 0; k < PLANES; k++) {
// clang-format off
#pragma HLS UNROLL
                // clang-format on
                conv_val[i][j][k] = ((_lbuf[i][arr_ind].range((k * 8) + 7, k * 8)) * _kernel[i][j]);
            }
            arr_ind++;
        }
    }

// accumulating the row sum values of the temporary buffer
planes_add_row:
    for (uchar p = 0; p < PLANES; p++) {
// clang-format off
#pragma HLS UNROLL
    // clang-format on
    addFilterLoopI:
        for (uchar i = 0; i < filter_height; i++) {
// clang-format off
#pragma HLS UNROLL
            // clang-format on
            tmp_row_sum = 0;

        addFilterLoopJ:
            for (uchar j = 0; j < filter_width; j++) {
// clang-format off
#pragma HLS UNROLL
                // clang-format on
                tmp_row_sum += conv_val[i][j][p];
            }
            row_sum[i][p] = tmp_row_sum;
        }
    }

// adding the row_sum buffer elements and storing in the result
add_row_col_plane_loop:
    for (uchar p = 0; p < PLANES; p++) {
// clang-format off
#pragma HLS UNROLL
        // clang-format on
        fix_res = 0;
    resultFilterLoopI:
        for (uchar i = 0; i < filter_height; i++) {
// clang-format off
#pragma HLS UNROLL
            // clang-format on
            fix_res += row_sum[i][p];
        }

        // converting the input type from Q1.shift
        tmp_res[p] = (fix_res >> shift);
    }

    // overflow handling depending upon the input type
    if ((DEPTH_DST == XF_8UP) || (DEPTH_DST == XF_24UP)) {
    planes_loop_out8:
        for (uchar p = 0; p < PLANES; p++) {
// clang-format off
#pragma HLS UNROLL
            // clang-format on
            if (tmp_res[p] > 255) {
                res.range((p * 8) + 7, p * 8) = 255;
            } else if (tmp_res[p] < 0) {
                res.range((p * 8) + 7, p * 8) = 0;
            } else {
                res.range((p * 8) + 7, p * 8) = tmp_res[p];
            }
        }
    } else if ((DEPTH_DST == XF_16SP) || (DEPTH_DST == XF_48SP)) {
    planes_loop_out16:
        for (uchar p = 0; p < PLANES; p++) {
// clang-format off
#pragma HLS UNROLL
            // clang-format on
            int tmp_val = (int)tmp_res[p];
            if (tmp_val > ((1 << (16 - 1)) - 1)) {
                res.range((p * 16) + 15, p * 16) = ((1 << (16 - 1)) - 1);
            } else if (tmp_val < -(1 << (16 - 1))) {
                res.range((p * 16) + 15, p * 16) = -(1 << (16 - 1));
            } else {
                res.range((p * 16) + 15, p * 16) = (short)tmp_val;
            }
        }
    }
    return res;
}

/****************************************************************************************
 * xFComputeCustomFilter : Applies the mask and Computes the filter value for
 *NPC
 *                  number of times.
 *
 * _lbuf       ->  Buffer containing the input image data
 * _kernel     ->  Kernel provided by the user of type 16S
 * _mask_value ->  The output buffer containing ouput image data
 * shift       ->  Fixed point format of the filter co-efficients for unity
 *gain filter
 ****************************************************************************************/
template <int filter_height, int filter_width, int buf_width, int NPC, int DEPTH_SRC, int DEPTH_DST, int PLANES>
void xFComputeCustomFilter(XF_PTNAME(DEPTH_SRC) _lbuf[][buf_width],
                           short int _kernel[][filter_width],
                           XF_PTNAME(DEPTH_DST) * _mask_value,
                           unsigned char shift) {
// clang-format off
    #pragma HLS inline
// clang-format on
// computes the filter operation depending upon the mode of parallelism
computeFilterLoop:
    for (ap_uint<5> j = 0; j < XF_NPIXPERCYCLE(NPC); j++) {
// clang-format off
        #pragma HLS UNROLL
        // clang-format on
        _mask_value[j] = xFApplyCustomFilter<DEPTH_SRC, DEPTH_DST, filter_height, filter_width, NPC, PLANES>(
            _lbuf, _kernel, j, shift);
    }
}

template <int SRC_T,
          int DST_T,
          int ROWS,
          int COLS,
          int DEPTH_SRC,
          int DEPTH_DST,
          int NPC,
          int WORDWIDTH_SRC,
          int WORDWIDTH_DST,
          int TC,
          int FW,
          int filter_height,
          int filter_width,
          int F_COUNT,
          int PLANES>
void Convolution_Process(xf::cv::Mat<SRC_T, ROWS, COLS, NPC>& _src,
                         xf::cv::Mat<DST_T, ROWS, COLS, NPC>& _dst,
                         XF_SNAME(WORDWIDTH_SRC) buf[filter_height][COLS >> XF_BITSHIFT(NPC)],
                         XF_PTNAME(DEPTH_SRC) lbuf[filter_height][XF_NPIXPERCYCLE(NPC) + filter_width - 1],
                         XF_SNAME(WORDWIDTH_SRC) tmp_buf[filter_height],
                         XF_PTNAME(DEPTH_DST) mask_value[XF_NPIXPERCYCLE(NPC)],
                         short int _filter[][filter_width],
                         uint16_t image_width,
                         uchar row_ind,
                         unsigned char shift,
                         XF_SNAME(WORDWIDTH_DST) & P0,
                         unsigned char index[filter_height],
                         ap_uint<13> col_factor,
                         uchar filter_width_factor,
                         unsigned short image_height,
                         ap_uint<13> row,
                         int& rd_ind,
                         int& wr_ind) {
// clang-format off
    #pragma HLS INLINE
    // clang-format on
    uchar step = XF_PIXELDEPTH(DEPTH_DST);
    unsigned short max_loop = XF_WORDDEPTH(WORDWIDTH_DST);
mainColLoop:
    for (ap_uint<13> col = 0; col < (image_width); col++) // Width of the image
    {
// clang-format off
        #pragma HLS LOOP_TRIPCOUNT min=TC max=TC
        #pragma HLS PIPELINE II=1
        // clang-format on

        // reading the data from the stream to the input buffer

        if (row < image_height) {
            buf[row_ind][col] = _src.read(rd_ind);
            rd_ind++;
        } else {
            buf[row_ind][col] = 0;
        }

    // loading the data from the input buffer to the temporary buffer
    fillTempBuffer_1:
        for (uchar l = 0; l < filter_height; l++) {
// clang-format off
            #pragma HLS UNROLL
            // clang-format on
            tmp_buf[l] = buf[index[l]][col];
        }

    // extracting the pixels from the temporary buffer to the line buffer
    extractPixelsLoop_1:
        for (uchar l = 0; l < filter_height; l++) {
// clang-format off
            #pragma HLS UNROLL
            // clang-format on
            xfExtractPixels<NPC, WORDWIDTH_SRC, DEPTH_SRC>(&lbuf[l][(filter_width - 1)], tmp_buf[l], 0);
        }

        // computing the mask value
        xFComputeCustomFilter<filter_height, filter_width, (XF_NPIXPERCYCLE(NPC) + filter_width - 1), NPC, DEPTH_SRC,
                              DEPTH_DST, PLANES>(lbuf, _filter, mask_value, shift);

        // left column border condition
        if (col <= col_factor) {
            ap_uint<13> ind = filter_width_factor;
            ap_uint<13> range_step = 0;

            if ((XF_NPIXPERCYCLE(NPC) - filter_width_factor) >= 0) {
            packMaskToTempRes_1:
                for (uchar l = 0; l < (XF_NPIXPERCYCLE(NPC) - FW); l++) {
// clang-format off
                    #pragma HLS LOOP_TRIPCOUNT min=F_COUNT max=F_COUNT
                    #pragma HLS UNROLL
                    // clang-format on
                    P0.range((range_step + (step - 1)), range_step) = mask_value[ind++];
                    range_step += step;
                }
            } else {
                filter_width_factor -= XF_NPIXPERCYCLE(NPC);
            }
        }

        // packing the data from the mask value to the temporary result P0 and
        // pushing data into stream
        else {
            ap_uint<10> max_range_step = max_loop - (filter_width_factor * step);

        packMaskToTempRes_2:
            for (uchar l = 0; l < FW; l++) {
// clang-format off
                #pragma HLS LOOP_TRIPCOUNT min=FW max=FW
                #pragma HLS UNROLL
                // clang-format on
                P0.range((max_range_step + (step - 1)), (max_range_step)) = mask_value[l];
                max_range_step += step;
            }

            // writing the temporary result into the stream
            _dst.write(wr_ind, P0);
            wr_ind++;

            ap_uint<13> ind = filter_width_factor;
            ap_uint<13> range_step = 0;

        packMaskToTempRes_3:
            for (ap_uint<13> l = 0; l < (XF_NPIXPERCYCLE(NPC) - FW); l++) {
// clang-format off
                #pragma HLS LOOP_TRIPCOUNT min=F_COUNT max=F_COUNT
                #pragma HLS UNROLL
                // clang-format on
                P0.range((range_step + (step - 1)), range_step) = mask_value[ind++];
                range_step += step;
            }
        }

    // re-initializing the line buffers
    copyEndPixelsI_1:
        for (uchar i = 0; i < filter_height; i++) {
// clang-format off
            #pragma HLS UNROLL
        // clang-format on
        copyEndPixelsJ_1:
            for (uchar l = 0; l < (filter_width - 1); l++) {
// clang-format off
                #pragma HLS UNROLL
                // clang-format on
                lbuf[i][l] = lbuf[i][XF_NPIXPERCYCLE(NPC) + l];
            }
        }
    } //  end of main column loop*/
}

/************************************************************************************
 * xFCustomConvKernel : Convolutes the input filter over the input image and
 *writes
 *                  onto the output image.
 *
 * _src     ->  Input image of type 8U
 * _filter  ->  Kernel provided by the user of type 16S
 * _dst     ->  Output image after applying the filter operation, of type
 *8U or 16S
 * shift    ->  Fixed point format of the filter co-efficients for unity
 *gain
 *filter
 ************************************************************************************/
template <int SRC_T,
          int DST_T,
          int ROWS,
          int COLS,
          int DEPTH_SRC,
          int DEPTH_DST,
          int NPC,
          int WORDWIDTH_SRC,
          int WORDWIDTH_DST,
          int COLS_COUNT,
          int filter_height,
          int filter_width,
          int F_COUNT,
          int FW,
          int COL_FACTOR_COUNT,
          int PLANES>
void xFCustomConvolutionKernel(xf::cv::Mat<SRC_T, ROWS, COLS, NPC>& _src,
                               short int _filter[][filter_width],
                               xf::cv::Mat<DST_T, ROWS, COLS, NPC>& _dst,
                               unsigned char shift,
                               unsigned short img_width,
                               unsigned short img_height) {
    uchar step = XF_PIXELDEPTH(DEPTH_DST);
    unsigned short max_loop = XF_WORDDEPTH(WORDWIDTH_DST);
    uchar buf_size = (XF_NPIXPERCYCLE(NPC) + filter_width - 1);

    uchar row_ind = 0, row_ptr = 0;
    unsigned char index[filter_height];
// clang-format off
    #pragma HLS ARRAY_PARTITION variable=index complete dim=1
    // clang-format on

    XF_SNAME(WORDWIDTH_DST) P0;
    XF_SNAME(WORDWIDTH_SRC) buf[filter_height][COLS >> XF_BITSHIFT(NPC)];
// clang-format off
    #pragma HLS ARRAY_PARTITION variable=buf complete dim=1
    // clang-format on

    XF_PTNAME(DEPTH_SRC)
    lbuf[filter_height][XF_NPIXPERCYCLE(NPC) + filter_width - 1];
// clang-format off
    #pragma HLS ARRAY_PARTITION variable=lbuf complete dim=0
    // clang-format on

    XF_SNAME(WORDWIDTH_SRC) tmp_buf[filter_height];
// clang-format off
    #pragma HLS ARRAY_PARTITION variable=tmp_buf complete dim=1
    // clang-format on

    XF_PTNAME(DEPTH_DST) mask_value[XF_NPIXPERCYCLE(NPC)];
// clang-format off
    #pragma HLS ARRAY_PARTITION variable=mask_value complete dim=1
    // clang-format on

    XF_PTNAME(DEPTH_DST) col_border_mask[(filter_width >> 1)];
// clang-format off
    #pragma HLS ARRAY_PARTITION variable=col_border_mask complete dim=1
    // clang-format on

    ap_uint<13> col_factor = 0;
    uchar filter_width_factor = (filter_width >> 1);
    int rd_ind = 0, wr_ind = 0;

// setting the column factor depending upon the filter dimensions
colFactorLoop:
    for (uchar f = (filter_width >> 1); f > (XF_NPIXPERCYCLE(NPC)); f = (f - XF_NPIXPERCYCLE(NPC))) {
        col_factor++;
    }

// initializing the first two rows to zeros
fillBufZerosI:
    for (uchar i = 0; i < (filter_height >> 1); i++) {
// clang-format off
        #pragma HLS UNROLL
    // clang-format on
    fillBufZerosJ:
        for (ap_uint<13> j = 0; j < (img_width); j++) {
// clang-format off
            #pragma HLS LOOP_TRIPCOUNT min=COLS_COUNT max=COLS_COUNT
            #pragma HLS UNROLL
            // clang-format on
            buf[row_ind][j] = 0;
        }
        row_ind++;
    }

// reading the first two rows from the input stream
readTopBorderI:
    for (uchar i = 0; i < (filter_height >> 1); i++) {
// clang-format off
        #pragma HLS UNROLL
    // clang-format on
    readTopBorderJ:
        for (ap_uint<13> j = 0; j < (img_width); j++) {
// clang-format off
            #pragma HLS LOOP_TRIPCOUNT min=COLS_COUNT max=COLS_COUNT
            #pragma HLS PIPELINE
            // clang-format on
            buf[row_ind][j] = _src.read(rd_ind);
            rd_ind++;
        }
        row_ind++;
    }

// row loop from 1 to the end of the image
mainRowLoop:
    for (ap_uint<13> row = (filter_height >> 1); row < (img_height + ((filter_height >> 1))); row++) {
// clang-format off
        #pragma HLS LOOP_TRIPCOUNT min=ROWS max=ROWS
        // clang-format on

        row_ptr = row_ind + 1;

    // index calculation
    settingIndex_1:
        for (int l = 0; l < filter_height; l++) {
// clang-format off
            #pragma HLS UNROLL
            // clang-format on
            if (row_ptr >= filter_height) row_ptr = 0;

            index[l] = row_ptr++;
        }

    // initializing the line buffer to zero
    fillingLineBufferZerosI_1:
        for (uchar i = 0; i < filter_height; i++) {
// clang-format off
            #pragma HLS UNROLL
        // clang-format on
        fillingLineBufferZerosJ_1:
            for (uchar j = 0; j < (filter_width - 1); j++) {
// clang-format off
                #pragma HLS UNROLL
                // clang-format on
                lbuf[i][j] = 0;
            }
        }
        // initializing the temporary result value to zero
        P0 = 0;

        Convolution_Process<SRC_T, DST_T, ROWS, COLS, DEPTH_SRC, DEPTH_DST, NPC, WORDWIDTH_SRC, WORDWIDTH_DST,
                            COLS_COUNT, FW, filter_height, filter_width, F_COUNT, PLANES>(
            _src, _dst, buf, lbuf, tmp_buf, mask_value, _filter, img_width, row_ind, shift, P0, index, col_factor,
            filter_width_factor, img_height, row, rd_ind, wr_ind);


    // initializing the line buffers to zero
    fillingLineBufferZerosI_2:
        for (uchar i = 0; i < filter_height; i++) {
// clang-format off
            #pragma HLS UNROLL
        // clang-format on
        fillingLineBufferZerosJ_2:
            for (ap_uint<13> l = (filter_width - 1); l < buf_size; l++) {
// clang-format off
                #pragma HLS UNROLL
                // clang-format on
                lbuf[i][l] = 0;
            }
        }

        // applying the filter and computing the mask_value
        if ((filter_width >> 1) > 0) {
        getMaskValue_1:
            for (uchar i = 0; i < (filter_width >> 1); i++) {
// clang-format off
                #pragma HLS UNROLL
                // clang-format on
                col_border_mask[i] =
                    xFApplyCustomFilter<DEPTH_SRC, DEPTH_DST, filter_height, filter_width, NPC, PLANES>(lbuf, _filter,
                                                                                                        i, shift);
            }
        }

        int max_range_step = max_loop - (FW * step);

    packMaskToTempRes_4:
        for (uchar l = 0; l < FW; l++) {
// clang-format off
            #pragma HLS LOOP_TRIPCOUNT min=FW max=FW
            #pragma HLS UNROLL
            // clang-format on
            P0.range((max_range_step + step - 1), (max_range_step)) = col_border_mask[l];
            max_range_step += step;
        }

        // writing the temporary result into the stream
        _dst.write(wr_ind, P0);
        wr_ind++;

    colFactorLoopBorder:
        for (ap_uint<13> c = 0; c < col_factor; c++) {
// clang-format off
            #pragma HLS LOOP_TRIPCOUNT min=COL_FACTOR_COUNT max=COL_FACTOR_COUNT
            // clang-format on

            max_range_step = 0;
        widthFactorLoopBorder:
            for (int l = FW; l < (XF_NPIXPERCYCLE(NPC) + FW); l++) {
                P0.range((max_range_step + (step - 1)), (max_range_step)) = col_border_mask[l];
                max_range_step += step;
            }
            _dst.write(wr_ind, P0);
            wr_ind++;
        }

        // incrementing the row_ind for each iteration of row
        row_ind++;
        if (row_ind == filter_height) {
            row_ind = 0;
        }
    } // end of main row loop
} // end of xFCustomConvKernel

template <int DEPTH_SRC, int DEPTH_DST, int F_HEIGHT, int F_WIDTH, int PLANES>
void xFApplyFilter2D(XF_PTNAME(DEPTH_SRC) _kernel_pixel[F_HEIGHT][F_WIDTH],
                     short int _kernel_filter[F_HEIGHT][F_WIDTH],
                     XF_PTNAME(DEPTH_DST) & out,
                     unsigned char shift) {
// clang-format off
    #pragma HLS INLINE off
    // clang-format on

    ap_int<32> sum = 0, in_step = 0, out_step = 0, p = 0;
    ap_int<32> temp = 0;
    ap_int<32> tmp_sum = 0;
FILTER_LOOP_HEIGHT:
    ap_uint<24> bgr_val;
    if ((DEPTH_DST == XF_8UP) || (DEPTH_DST == XF_24UP)) {
        in_step = 8;
        out_step = 8;
    } else {
        in_step = 8;
        out_step = 16;
    }
    for (ap_uint<8> c = 0, k = 0; c < PLANES; c++, k += out_step) {
        sum = 0;
        temp = 0;
        tmp_sum = 0;
        for (ap_int<8> m = 0; m < F_HEIGHT; m++) {
        FILTER_LOOP_WIDTH:
            for (ap_int<8> n = 0; n < F_WIDTH; n++) {
                XF_PTNAME(DEPTH_SRC)
                src_v = _kernel_pixel[F_HEIGHT - m - 1][F_WIDTH - 1 - n];

                short int filter_v = _kernel_filter[m][n];
                temp = src_v.range(p + (in_step - 1), p) * filter_v;
                sum = sum + temp;
            }
        }
        p = p + 8;
        tmp_sum = sum >> shift;

        if ((DEPTH_DST == XF_8UP) || (DEPTH_DST == XF_24UP)) {
            if (tmp_sum > ((1 << (8)) - 1)) {
                out.range(k + 7, k) = ((1 << (8)) - 1);
            } else if (tmp_sum < 0) {
                out.range(k + 7, k) = 0;
            } else {
                out.range(k + 7, k) = tmp_sum;
            }
        } else if ((DEPTH_DST == XF_16SP) || (DEPTH_DST == XF_48SP)) {
            if (tmp_sum > ((1 << (16 - 1)) - 1)) {
                out.range(k + 15, k) = ((1 << (16 - 1)) - 1);
            } else if (tmp_sum < -(1 << (16 - 1))) {
                out.range(k + 15, k) = -(1 << (16 - 1));
            } else {
                out.range(k + 15, k) = tmp_sum;
            }
        }
    }
}
static int borderInterpolate(int p, int len, int borderType) {
// clang-format off
    #pragma HLS INLINE
    // clang-format on

    if (p >= 0 && p < len)
        return p;
    else
        p = -1;
    return p;
}

template <int SRC_T,
          int DST_T,
          int ROWS,
          int COLS,
          int DEPTH_SRC,
          int DEPTH_DST,
          int NPC,
          int WORDWIDTH_SRC,
          int WORDWIDTH_DST,
          int TC,
          int K_HEIGHT,
          int K_WIDTH,
          int PLANES>
static void xFFilter2Dkernel(xf::cv::Mat<SRC_T, ROWS, COLS, NPC>& _src_mat,
                             xf::cv::Mat<DST_T, ROWS, COLS, NPC>& _dst_mat,
                             short int _filter_kernel[K_HEIGHT][K_WIDTH],
                             unsigned char shift,
                             uint16_t rows,
                             uint16_t cols)

{
    XF_SNAME(WORDWIDTH_SRC) fillvalue = 0;
// clang-format off
    #pragma HLS INLINE off
    // clang-format on

    // The main processing window
    XF_PTNAME(DEPTH_SRC) src_kernel_win[K_HEIGHT][K_WIDTH];
    // The main line buffer
    XF_SNAME(WORDWIDTH_SRC) k_buf[K_HEIGHT][COLS >> XF_BITSHIFT(NPC)];
    // A small buffer keeping a few pixels from the line
    // buffer, so that we can complete right borders correctly.
    XF_SNAME(WORDWIDTH_SRC) right_border_buf[K_HEIGHT][K_WIDTH];
    // Temporary storage for reading from the line buffers.
    XF_SNAME(WORDWIDTH_SRC) col_buf[K_HEIGHT];
#ifndef __SYNTHESIS__
    assert(rows >= 8);
    assert(cols >= 8);
    assert(rows <= ROWS);
    assert(cols <= COLS);
#endif
// clang-format off
    #pragma HLS ARRAY_PARTITION variable=col_buf complete dim=0
    #pragma HLS ARRAY_PARTITION variable=_filter_kernel complete dim=0
    #pragma HLS ARRAY_PARTITION variable=src_kernel_win complete dim=0
    #pragma HLS ARRAY_PARTITION variable=k_buf complete dim=1
    #pragma HLS ARRAY_PARTITION variable=right_border_buf complete dim=0
    // clang-format on

    int heightloop = rows + K_HEIGHT - 1 + K_HEIGHT;
    int widthloop = cols + K_WIDTH - 1; // one pixel overlap, so it should minus one
    /*ap_uint<13> i,j;
    ap_uint<13> anchorx=K_WIDTH/2,anchory=K_HEIGHT/2;
    ap_uint<13> ImagLocx=0,ImagLocy =0;*/

    uint16_t i, j;
    int rd_ind = 0, wr_ind = 0;
    uint16_t anchorx = K_WIDTH >> 1, anchory = K_HEIGHT >> 1;
    int16_t ImagLocx = 0, ImagLocy = 0;

ROW_LOOP:
    for (i = 0; i < heightloop; i++) {
    COL_LOOP:
        for (j = 0; j < widthloop; j++) {
// This DEPENDENCE pragma is necessary because the border mode handling is not
// affine.
// clang-format off
            #pragma HLS DEPENDENCE array inter false
            #pragma HLS LOOP_FLATTEN OFF
            #pragma HLS PIPELINE
            // clang-format on

            // fill data x,y are the coordinate in the image, it could be negative.
            // For example (-1,-1) represents the
            // interpolation pixel.
            ImagLocx = j - anchorx;
            ImagLocy = i - K_HEIGHT - anchory;
            int16_t x = borderInterpolate(ImagLocx, cols, 0);

            // column left shift
            for (ap_int<8> row = 0; row < K_HEIGHT; row++)
                for (ap_int<8> col = K_WIDTH - 1; col >= 1; col--)
                    src_kernel_win[row][col] = src_kernel_win[row][col - 1];

            for (ap_int<8> buf_row = 0; buf_row < K_HEIGHT; buf_row++) {
// Fetch the column from the line buffer to shift into the window.
#ifndef __SYNTHESIS__
                assert((x < COLS));
#endif
                col_buf[buf_row] = ((x < 0)) ? fillvalue : k_buf[buf_row][x];
            }

            if ((ImagLocy < (-anchory)) || (ImagLocy >= K_HEIGHT - 1 && ImagLocy < rows - 1)) {
                // Advance load and body process
                if (ImagLocx >= 0 && ImagLocx < cols) {
                    XF_SNAME(WORDWIDTH_SRC)
                    Toppixel = col_buf[K_HEIGHT - 1]; // k_buf[k](K_HEIGHT-1,ImagLocx);
                    src_kernel_win[K_HEIGHT - 1][0] = Toppixel;
                    if (ImagLocx >= cols - K_WIDTH) {
                        right_border_buf[0][ImagLocx - (cols - K_WIDTH)] = Toppixel;
                    }
                    for (ap_int<8> buf_row = K_HEIGHT - 1; buf_row >= 1; buf_row--) {
                        XF_SNAME(WORDWIDTH_SRC)
                        temp = col_buf[buf_row - 1]; // k_buf[k](buf_row-1,ImagLocx);
                        src_kernel_win[buf_row - 1][0] = temp;
                        k_buf[buf_row][x] = temp;
                        if (ImagLocx >= cols - K_WIDTH) {
                            right_border_buf[K_HEIGHT - buf_row][ImagLocx - (cols - K_WIDTH)] = temp;
                        }
                    }
                    XF_SNAME(WORDWIDTH_SRC) temp = 0;
                    temp = (_src_mat.read(rd_ind));
                    rd_ind++;

                    k_buf[0][x] = temp;
                } else if (ImagLocx < 0) {
                    for (int buf_row = 0; buf_row < K_HEIGHT; buf_row++) {
                        src_kernel_win[buf_row][0] = fillvalue;
                    }
                } else if (ImagLocx >= cols) {
                    for (int buf_row = 0; buf_row < K_HEIGHT; buf_row++) {
                        src_kernel_win[buf_row][0] = fillvalue;
                    }
                }
            } else if (ImagLocy >= 0) { //   && ImagLocy < K_HEIGHT-1) ||
                // (ImagLocy >= rows-1      && ImagLocy < heightloop)) {
                // top extend pixel bottom keep the buffer 0 with the data rows-1
                // content.
                int ref = K_HEIGHT - 1;
                if (ImagLocy >= rows - 1) ref = rows - 1;
                int y = ImagLocy;
                for (int buf_row = 0; buf_row < K_HEIGHT; buf_row++) {
                    int t = borderInterpolate(y, rows, 0);
                    int locy = ref - t;
#ifndef __SYNTHESIS__
                    assert(t < 0 || (locy >= 0 && locy < K_HEIGHT));
#endif
                    if (y >= rows)
                        src_kernel_win[buf_row][0] = fillvalue;
                    else if (y < 0)
                        src_kernel_win[buf_row][0] = fillvalue;
                    else
                        src_kernel_win[buf_row][0] = col_buf[locy];
                    y--;
                }
            }

            // figure out the output image pixel value
            if (i >= (K_HEIGHT + K_HEIGHT - 1) && j >= (K_WIDTH - 1)) {
                XF_PTNAME(DEPTH_DST) temp;
                xFApplyFilter2D<DEPTH_SRC, DEPTH_DST, K_HEIGHT, K_WIDTH, PLANES>(src_kernel_win, _filter_kernel, temp,
                                                                                 shift);
                XF_SNAME(WORDWIDTH_DST) temp1 = temp;
                _dst_mat.write(wr_ind, temp1);
                wr_ind++;
            }
        }
    }
}

template <int BORDER_TYPE, int FILTER_WIDTH, int FILTER_HEIGHT, int SRC_T, int DST_T, int ROWS, int COLS, int NPC>
void filter2D(xf::cv::Mat<SRC_T, ROWS, COLS, NPC>& _src_mat,
              xf::cv::Mat<DST_T, ROWS, COLS, NPC>& _dst_mat,
              short int filter[FILTER_HEIGHT * FILTER_WIDTH],
              unsigned char _shift) {
// clang-format off
    #pragma HLS INLINE OFF
// clang-format on
#ifndef __SYNTHESIS__
    assert(((_src_mat.rows <= ROWS) && (_src_mat.cols <= COLS)) && "ROWS and COLS should be greater than input image");
#endif
    unsigned short img_width = _src_mat.cols >> XF_BITSHIFT(NPC);
    unsigned short img_height = _src_mat.rows;

    short int lfilter[FILTER_HEIGHT][FILTER_WIDTH];
// clang-format off
    #pragma HLS ARRAY_PARTITION variable=lfilter complete dim=0
    // clang-format on
    for (unsigned char i = 0; i < FILTER_HEIGHT; i++) {
        for (unsigned char j = 0; j < FILTER_WIDTH; j++) {
            lfilter[i][j] = filter[i * FILTER_WIDTH + j];
        }
    }

    if (NPC == XF_NPPC8) {
        xFCustomConvolutionKernel<SRC_T, DST_T, ROWS, COLS, XF_DEPTH(SRC_T, NPC), XF_DEPTH(DST_T, NPC), NPC,
                                  XF_WORDWIDTH(SRC_T, NPC), XF_WORDWIDTH(DST_T, NPC), (COLS >> XF_BITSHIFT(NPC)),
                                  FILTER_HEIGHT, FILTER_WIDTH,
                                  (XF_NPIXPERCYCLE(NPC) - ((FILTER_WIDTH >> 1) % XF_NPIXPERCYCLE(NPC))),
                                  ((FILTER_WIDTH >> 1) % XF_NPIXPERCYCLE(NPC)),
                                  (((FILTER_WIDTH >> 1) - 1) >> XF_BITSHIFT(NPC)), XF_CHANNELS(SRC_T, NPC)>(
            _src_mat, lfilter, _dst_mat, _shift, img_width, img_height);

    }

    else if (NPC == XF_NPPC1) {
        xFFilter2Dkernel<SRC_T, DST_T, ROWS, COLS, XF_DEPTH(SRC_T, NPC), XF_DEPTH(DST_T, NPC), NPC,
                         XF_WORDWIDTH(SRC_T, NPC), XF_WORDWIDTH(DST_T, NPC), COLS, FILTER_HEIGHT, FILTER_WIDTH,
                         XF_CHANNELS(SRC_T, NPC)>(_src_mat, _dst_mat, lfilter, _shift, img_height, img_width);
    }
}
} // namespace cv
} // namespace xf
#endif // _XF_CUSTOM_CONVOLUTION_HPP_