Program Listing for File xf_gaussian_filter.hpp

Return to documentation for file (/tmp/ws/src/vitis_common/include/imgproc/xf_gaussian_filter.hpp)

/*
 * Copyright 2019 Xilinx, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef _XF_GAUSSIAN_HPP_
#define _XF_GAUSSIAN_HPP_

#ifndef __cplusplus
#error C++ is needed to include this header
#endif

#include "hls_stream.h"
#include "../common/xf_common.hpp"
#include "../common/xf_utility.hpp"

namespace xf {
namespace cv {

/**
 * Computes the 3-tap, 1-D Gaussian kernel as unsigned 8-bit fixed-point weights.
 *
 * Each tap is the normalised Gaussian coefficient scaled by 256 and rounded to
 * the nearest integer; the rounding error of one tap is added to the next tap
 * before it is rounded.
 *
 * @param sigma   Standard deviation of the Gaussian. Values <= 0 select 0.8.
 * @param weights Output array receiving 3 weights.
 */
static void weightsghcalculation3x3(float sigma, unsigned char* weights) {
// clang-format off
    #pragma HLS INLINE
    // clang-format on

    float cf[3];

    float sum = 0;

    if (sigma <= 0) {
        sigma = 0.8;
    }

    int n = 3;

    float scale2X = -(1 / ((sigma * sigma) * 2));

    // Unnormalised Gaussian samples at x = -1, 0, 1 and their sum.
    for (int i = 0; i < n; i++) {
        float x = i - ((n - 1) >> 1);
        float t = expf(scale2X * x * x);

        cf[i] = (float)t;
        sum += cf[i];
    }

    float err = 0.0f;
    sum = 1. / sum;
    for (int i = 0; i < n; i++) {
        cf[i] = (float)(cf[i] * sum);

        // For a very small sigma the centre tap reaches 256.5, which does not fit
        // in an unsigned char; converting such a value is undefined behaviour, so
        // saturate explicitly instead of relying on the conversion.
        const double rounded = (cf[i] * 256) + err + 0.5;
        if (rounded >= 255.0) {
            weights[i] = 255;
        } else if (rounded <= 0.0) {
            weights[i] = 0;
        } else {
            weights[i] = (unsigned char)rounded;
        }

        err = ((cf[i] * 256)) - weights[i];
    }
}

/**
 * Computes the 5-tap, 1-D Gaussian kernel as unsigned 8-bit fixed-point weights.
 *
 * Each tap is the normalised Gaussian coefficient scaled by 256 and rounded to
 * the nearest integer; the rounding error of one tap is added to the next tap
 * before it is rounded.
 *
 * @param sigma   Standard deviation of the Gaussian. Values <= 0 select 1.1.
 * @param weights Output array receiving 5 weights.
 */
static void weightsghcalculation5x5(float sigma, unsigned char weights[5]) {
// clang-format off
    #pragma HLS INLINE
    // clang-format on

    float cf[5];

    float sum = 0;

    if (sigma <= 0) {
        sigma = 1.1f;
    }

    int n = 5;

    float scale2X = -(1 / ((sigma * sigma) * 2));

    // Unnormalised Gaussian samples at x = -2 .. 2 and their sum.
    for (int i = 0; i < n; i++) {
        float x = i - ((n - 1) >> 1);
        float t = expf(scale2X * x * x);

        cf[i] = (float)t;
        sum += cf[i];
    }
    float err = 0.0f;
    sum = 1. / sum;
    for (int i = 0; i < n; i++) {
        cf[i] = (float)(cf[i] * sum);

        // For a very small sigma the centre tap reaches 256.5, which does not fit
        // in an unsigned char; converting such a value is undefined behaviour, so
        // saturate explicitly instead of relying on the conversion.
        const double rounded = (float)(cf[i] * 256) + err + 0.5;
        if (rounded >= 255.0) {
            weights[i] = 255;
        } else if (rounded <= 0.0) {
            weights[i] = 0;
        } else {
            weights[i] = (unsigned char)rounded;
        }

        err = ((cf[i] * 256)) - weights[i];
    }
}

/**
 * Computes the 7-tap, 1-D Gaussian kernel as unsigned 8-bit fixed-point weights.
 *
 * Each tap is the normalised Gaussian coefficient scaled by 256 and rounded to
 * the nearest integer. Unlike the 3- and 5-tap variants, no rounding error is
 * carried from one tap to the next.
 *
 * @param sigma   Standard deviation of the Gaussian. Values <= 0 select 1.4.
 * @param weights Output array receiving 7 weights.
 */
static void weightsghcalculation7x7(float sigma, unsigned char weights[7]) {
// clang-format off
    #pragma HLS INLINE
    // clang-format on

    float cf[7];

    float sum = 0;

    if (sigma <= 0) {
        sigma = 1.4f;
    }

    int n = 7;

    float scale2X = -(1 / ((sigma * sigma) * 2));

    // Unnormalised Gaussian samples at x = -3 .. 3 and their sum.
    for (int i = 0; i < n; i++) {
        float x = i - ((n - 1) >> 1);
        float t = expf(scale2X * x * x);

        cf[i] = (float)t;
        sum += cf[i];
    }

    sum = 1. / sum;
    for (int i = 0; i < n; i++) {
        cf[i] = (float)(cf[i] * sum);

        // For a very small sigma the centre tap reaches 256.5, which does not fit
        // in an unsigned char; converting such a value is undefined behaviour, so
        // saturate explicitly instead of relying on the conversion.
        const double rounded = ((float)cf[i] * 256) + 0.5;
        if (rounded >= 255.0) {
            weights[i] = 255;
        } else {
            weights[i] = (unsigned char)rounded;
        }
    }
}

/**
 * Applies the separable 3x3 Gaussian kernel to one 3x3 neighbourhood.
 *
 * The neighbourhood is passed row by row: D1 D2 D3 / D4 D5 D6 / D7 D8 D9.
 * Only weights[0] (edge tap) and weights[1] (centre tap) are read; the kernel is
 * treated as symmetric. The weighted sum is rounded and shifted right by 16,
 * matching taps that are scaled by 256 in each direction.
 *
 * @param D1..D9  Neighbourhood samples. The intermediate widths (15 and 18 bits)
 *                look sized for 8-bit samples - TODO confirm against callers.
 * @param weights Fixed-point kernel taps; at least 2 entries are read.
 * @return The filtered sample, saturated to 255.
 */
template <int DEPTH>
XF_PTNAME(DEPTH)
xFapplygaussian3x3(XF_PTNAME(DEPTH) D1,
                   XF_PTNAME(DEPTH) D2,
                   XF_PTNAME(DEPTH) D3,
                   XF_PTNAME(DEPTH) D4,
                   XF_PTNAME(DEPTH) D5,
                   XF_PTNAME(DEPTH) D6,
                   XF_PTNAME(DEPTH) D7,
                   XF_PTNAME(DEPTH) D8,
                   XF_PTNAME(DEPTH) D9,
                   unsigned char* weights) {
// clang-format off
    #pragma HLS INLINE OFF
    // clang-format on
    XF_PTNAME(DEPTH) out_pix = 0;
    unsigned int sum = 0;

    ap_uint<18> sum2;

    // Middle row, filtered horizontally.
    sum2 = (D4 + D6) * weights[0] + D5 * weights[1];

    // Top and bottom rows share the vertical edge tap, so their samples are
    // added first: the four corners, then the two edge-centre samples.
    ap_uint<15> sumvalue0 = D1 + D3 + D7 + D9;

    ap_uint<15> sumvalue1 = D2 + D8;

    unsigned int value1 = sumvalue0 * weights[0] + sumvalue1 * weights[1];

    // Vertical pass.
    sum = (value1)*weights[0] + sum2 * weights[1];

    // Round to nearest and drop the 16 fractional bits.
    unsigned int rounded = (sum + 32768) >> 16;

    // Saturate like the 5x5 and 7x7 variants do; a plain narrowing would wrap
    // around whenever the taps add up to more than 256.
    unsigned char val;
    if (rounded >= 255) {
        val = 255;
    } else {
        val = rounded;
    }

    out_pix = (XF_PTNAME(DEPTH))val;

    return out_pix;
}

/**
 * Applies the separable 5x5 Gaussian kernel to one 5x5 neighbourhood, handling
 * each 8-bit plane of the pixel independently.
 *
 * src_buf1..src_buf5 point at the left-most sample of rows 1..5 of the window;
 * indices 0..4 are read from each. Only weights[0..2] are read (outer, inner and
 * centre tap); the kernel is treated as symmetric.
 *
 * @return The filtered pixel, each plane rounded, shifted right by 16 and
 *         saturated to 255.
 */
template <int PLANES, int DEPTH, bool FOR_IMAGE_PYRAMID>
XF_PTNAME(DEPTH)
xfapplygaussian5x5(XF_PTNAME(DEPTH) * src_buf1,
                   XF_PTNAME(DEPTH) * src_buf2,
                   XF_PTNAME(DEPTH) * src_buf3,
                   XF_PTNAME(DEPTH) * src_buf4,
                   XF_PTNAME(DEPTH) * src_buf5,
                   unsigned char weights[5]) {
// clang-format off
    #pragma HLS INLINE OFF
    // clang-format on
    XF_PTNAME(DEPTH) out_pix = 0;

    for (int plane = 0, lsb = 0; plane < PLANES; plane++, lsb += 8) {
        const int msb = lsb + 7;

        // Per row: columns 0 and 4 share the outer tap.
        ap_uint<10> outer_cols[5];
        outer_cols[0] = src_buf1[0].range(msb, lsb) + src_buf1[4].range(msb, lsb);
        outer_cols[1] = src_buf2[0].range(msb, lsb) + src_buf2[4].range(msb, lsb);
        outer_cols[2] = src_buf3[0].range(msb, lsb) + src_buf3[4].range(msb, lsb);
        outer_cols[3] = src_buf4[0].range(msb, lsb) + src_buf4[4].range(msb, lsb);
        outer_cols[4] = src_buf5[0].range(msb, lsb) + src_buf5[4].range(msb, lsb);

        // Per row: columns 1 and 3 share the inner tap.
        ap_uint<10> inner_cols[5];
        inner_cols[0] = src_buf1[1].range(msb, lsb) + src_buf1[3].range(msb, lsb);
        inner_cols[1] = src_buf2[1].range(msb, lsb) + src_buf2[3].range(msb, lsb);
        inner_cols[2] = src_buf3[1].range(msb, lsb) + src_buf3[3].range(msb, lsb);
        inner_cols[3] = src_buf4[1].range(msb, lsb) + src_buf4[3].range(msb, lsb);
        inner_cols[4] = src_buf5[1].range(msb, lsb) + src_buf5[3].range(msb, lsb);

        // Horizontal pass; rows that share a vertical tap are merged first.
        ap_uint<24> rows_1_5 = 0;
        ap_uint<24> rows_2_4 = 0;
        ap_uint<24> row_3 = 0;

        rows_1_5 = (ap_uint<24>)(outer_cols[0] + outer_cols[4]) * weights[0] +
                   (ap_uint<24>)(inner_cols[0] + inner_cols[4]) * weights[1] +
                   (ap_uint<24>)(src_buf1[2].range(msb, lsb) + src_buf5[2].range(msb, lsb)) * weights[2];
        rows_2_4 = (ap_uint<24>)(outer_cols[1] + outer_cols[3]) * weights[0] +
                   (ap_uint<24>)(inner_cols[1] + inner_cols[3]) * weights[1] +
                   (ap_uint<24>)(src_buf2[2].range(msb, lsb) + src_buf4[2].range(msb, lsb)) * weights[2];
        row_3 = (ap_uint<24>)outer_cols[2] * weights[0] + (ap_uint<24>)inner_cols[2] * weights[1] +
                (ap_uint<24>)src_buf3[2].range(msb, lsb) * weights[2];

        // Vertical pass.
        unsigned int sumval = 0;
        sumval = (unsigned int)rows_1_5 * weights[0] + rows_2_4 * weights[1] + row_3 * weights[2];

        // Round to nearest, drop the 16 fractional bits, then saturate.
        unsigned short val = ((sumval + 32768) >> 16);
        unsigned char value = (val >= 255) ? 255 : val;

        out_pix.range(msb, lsb) = (XF_PTNAME(DEPTH))value;
    }
    return out_pix;
}

/**
 * Applies the separable 7x7 Gaussian kernel to one 7x7 neighbourhood, handling
 * each 8-bit plane of the pixel independently.
 *
 * src_buf1..src_buf7 point at the left-most sample of rows 1..7 of the window;
 * indices 0..6 are read from each. Only weights[0..3] are read (outermost tap
 * to centre tap); the kernel is treated as symmetric.
 *
 * @return The filtered pixel, each plane rounded, shifted right by 16 and
 *         saturated to 255.
 */
template <int PLANES, int DEPTH>
XF_PTNAME(DEPTH)
xfapplygaussian7x7(XF_PTNAME(DEPTH) * src_buf1,
                   XF_PTNAME(DEPTH) * src_buf2,
                   XF_PTNAME(DEPTH) * src_buf3,
                   XF_PTNAME(DEPTH) * src_buf4,
                   XF_PTNAME(DEPTH) * src_buf5,
                   XF_PTNAME(DEPTH) * src_buf6,
                   XF_PTNAME(DEPTH) * src_buf7,
                   unsigned char weights[7]) {
// clang-format off
    #pragma HLS INLINE OFF
    // clang-format on

    XF_PTNAME(DEPTH) out_pix = 0;
    for (int c = 0, k = 0; c < PLANES; c++, k += 8) {
        unsigned long long int sum_value = 0;
        unsigned int sum = 0, sum1 = 0, sum2 = 0, sum3 = 0;

        // Horizontal pass. Rows that share a vertical tap (1/7, 2/6, 3/5) are
        // merged before weighting; row 4 is the centre row.
        sum3 = (unsigned int)(src_buf4[0].range(k + 7, k) + src_buf4[6].range(k + 7, k)) * weights[0] +
               (src_buf4[1].range(k + 7, k) + src_buf4[5].range(k + 7, k)) * weights[1] +
               (src_buf4[2].range(k + 7, k) + src_buf4[4].range(k + 7, k)) * weights[2] +
               src_buf4[3].range(k + 7, k) * weights[3];
        sum = (unsigned int)(src_buf1[0].range(k + 7, k) + src_buf1[6].range(k + 7, k) + src_buf7[0].range(k + 7, k) +
                             src_buf7[6].range(k + 7, k)) *
                  weights[0] +
              (src_buf1[1].range(k + 7, k) + src_buf1[5].range(k + 7, k) + src_buf7[1].range(k + 7, k) +
               src_buf7[5].range(k + 7, k)) *
                  weights[1] +
              (src_buf1[2].range(k + 7, k) + src_buf1[4].range(k + 7, k) + src_buf7[2].range(k + 7, k) +
               src_buf7[4].range(k + 7, k)) *
                  weights[2] +
              (src_buf1[3].range(k + 7, k) + src_buf7[3].range(k + 7, k)) * weights[3];
        sum1 = (unsigned int)(src_buf2[0].range(k + 7, k) + src_buf2[6].range(k + 7, k) + src_buf6[0].range(k + 7, k) +
                              src_buf6[6].range(k + 7, k)) *
                   weights[0] +
               (src_buf2[1].range(k + 7, k) + src_buf2[5].range(k + 7, k) + src_buf6[1].range(k + 7, k) +
                src_buf6[5].range(k + 7, k)) *
                   weights[1] +
               (src_buf2[2].range(k + 7, k) + src_buf2[4].range(k + 7, k) + src_buf6[2].range(k + 7, k) +
                src_buf6[4].range(k + 7, k)) *
                   weights[2] +
               (src_buf2[3].range(k + 7, k) + src_buf6[3].range(k + 7, k)) * weights[3];
        sum2 = (unsigned int)(src_buf3[0].range(k + 7, k) + src_buf3[6].range(k + 7, k) + src_buf5[0].range(k + 7, k) +
                              src_buf5[6].range(k + 7, k)) *
                   weights[0] +
               (src_buf3[1].range(k + 7, k) + src_buf3[5].range(k + 7, k) + src_buf5[1].range(k + 7, k) +
                src_buf5[5].range(k + 7, k)) *
                   weights[1] +
               (src_buf3[2].range(k + 7, k) + src_buf3[4].range(k + 7, k) + src_buf5[2].range(k + 7, k) +
                src_buf5[4].range(k + 7, k)) *
                   weights[2] +
               (src_buf3[3].range(k + 7, k) + src_buf5[3].range(k + 7, k)) * weights[3];

        // Vertical pass.
        sum_value = (sum)*weights[0] + (sum1)*weights[1] + (sum2)*weights[2] + (sum3)*weights[3];

        // Round to nearest and drop the 16 fractional bits.
        unsigned short val = ((sum_value + 32768) >> 16);

        unsigned char value;

        // Saturate to the 8-bit range.
        if (val >= 255) {
            value = 255;
        } else {
            value = val;
        }

        out_pix.range(k + 7, k) = (uchar_t)value;
    }
    return out_pix;
}

/**
 * Filters XF_NPIXPERCYCLE(NPC) horizontally adjacent pixels with the 3x3 kernel.
 *
 * Output pixel j is computed from columns j, j+1 and j+2 of the three row
 * windows, one 8-bit plane at a time, and stored in OutputValues[j].
 *
 * @param OutputValues Receives one filtered pixel per processed column.
 * @param src_buf1     Window over the top row.
 * @param src_buf2     Window over the middle row.
 * @param src_buf3     Window over the bottom row.
 * @param weights      Fixed-point kernel taps.
 */
template <int NPC, int DEPTH, int PLANES>
void auGaussian3x3(XF_PTNAME(DEPTH) * OutputValues,
                   XF_PTNAME(DEPTH) * src_buf1,
                   XF_PTNAME(DEPTH) * src_buf2,
                   XF_PTNAME(DEPTH) * src_buf3,
                   unsigned char weights[3]) {
// clang-format off
    #pragma HLS INLINE
    // clang-format on
    ap_uint<24> packed;
Compute_Grad_Loop:
    for (ap_uint<5> pix = 0; pix < (XF_NPIXPERCYCLE(NPC)); pix++) {
// clang-format off
        #pragma HLS UNROLL
        // clang-format on
        for (ap_uint<5> plane = 0, lsb = 0; plane < PLANES; plane++, lsb += 8) {
// clang-format off
            #pragma HLS UNROLL
            // clang-format on

            // Filter this plane and place the result in the same byte lane.
            packed.range(lsb + 7, lsb) = xFapplygaussian3x3<DEPTH>(
                src_buf1[pix].range(lsb + 7, lsb), src_buf1[pix + 1].range(lsb + 7, lsb),
                src_buf1[pix + 2].range(lsb + 7, lsb), src_buf2[pix].range(lsb + 7, lsb),
                src_buf2[pix + 1].range(lsb + 7, lsb), src_buf2[pix + 2].range(lsb + 7, lsb),
                src_buf3[pix].range(lsb + 7, lsb), src_buf3[pix + 1].range(lsb + 7, lsb),
                src_buf3[pix + 2].range(lsb + 7, lsb), weights);
        }
        OutputValues[pix] = packed;
    }
}

// Processes one image row of the 3x3 Gaussian filter.
//
// For every column word the loop
//   1. stores the next input word in line buffer `bottom` (or a zero word once
//      `row` has reached img_height, which pads the image at the bottom),
//   2. fetches the words of the three line buffers `tp`, `mid`, `bottom` and
//      places them in the row windows src_buf1..3 starting at index 2,
//   3. filters the windows with auGaussian3x3,
//   4. writes the output word to _out_mat one iteration late (nothing is written
//      at col == 0), and
//   5. moves the last two window entries to indices 0 and 1 for the next column.
// The final output word of the row is NOT written here; the pending state is left
// in P0 / src_buf1..3 for the caller to finish.
//
// read_index and write_index are running positions into _src_mat / _out_mat and
// are advanced in place.
// NOTE(review): xfExtractPixels / xfPackPixels are defined elsewhere; the
// comments on them below describe how they are used here, not what they do.
template <int SRC_T, int ROWS, int COLS, int PLANES, int DEPTH, int NPC, int WORDWIDTH, int TC>
void ProcessGaussian3x3(xf::cv::Mat<SRC_T, ROWS, COLS, NPC>& _src_mat,
                        xf::cv::Mat<SRC_T, ROWS, COLS, NPC>& _out_mat,
                        XF_SNAME(WORDWIDTH) buf[3][(COLS >> XF_BITSHIFT(NPC))],
                        XF_PTNAME(DEPTH) src_buf1[XF_NPIXPERCYCLE(NPC) + 2],
                        XF_PTNAME(DEPTH) src_buf2[XF_NPIXPERCYCLE(NPC) + 2],
                        XF_PTNAME(DEPTH) src_buf3[XF_NPIXPERCYCLE(NPC) + 2],
                        XF_PTNAME(DEPTH) OutputValues[XF_NPIXPERCYCLE(NPC)],
                        XF_SNAME(WORDWIDTH) & P0,
                        uint16_t img_width,
                        uint16_t img_height,
                        uint16_t& shift_x,
                        ap_uint<2> tp,
                        ap_uint<2> mid,
                        ap_uint<2> bottom,
                        ap_uint<13> row,
                        unsigned char* weights,
                        int& read_index,
                        int& write_index) {
// clang-format off
    #pragma HLS INLINE
    // clang-format on

    XF_SNAME(WORDWIDTH) buf0, buf1, buf2;
    uint16_t npc = XF_NPIXPERCYCLE(NPC);
    ap_uint<5> buf_size = XF_NPIXPERCYCLE(NPC) + 2;

Col_Loop:
    for (ap_uint<13> col = 0; col < img_width; col++) {
// clang-format off
        #pragma HLS LOOP_TRIPCOUNT min=TC max=TC
        #pragma HLS pipeline
        // clang-format on
        // Fill the bottom line buffer; past the last image row it is zeroed.
        if (row < img_height)
            buf[bottom][col] = _src_mat.read(read_index++); // Read data
        else
            buf[bottom][col] = 0;

        buf0 = buf[tp][col];
        buf1 = buf[mid][col];
        buf2 = buf[bottom][col];

        // New samples go to index 2 onwards; indices 0 and 1 hold the two samples
        // carried over from the previous column.
        if (NPC == XF_NPPC8) {
            xfExtractPixels<NPC, WORDWIDTH, DEPTH>(&src_buf1[2], buf0, 0);
            xfExtractPixels<NPC, WORDWIDTH, DEPTH>(&src_buf2[2], buf1, 0);
            xfExtractPixels<NPC, WORDWIDTH, DEPTH>(&src_buf3[2], buf2, 0);

        }

        else {
            src_buf1[2] = buf0;
            src_buf2[2] = buf1;
            src_buf3[2] = buf2;
        }

        auGaussian3x3<NPC, DEPTH, PLANES>(OutputValues, src_buf1, src_buf2, src_buf3, weights);

        if (col == 0) {
            // First column: start a new output word, nothing is written yet.
            shift_x = 0;
            P0 = 0;
            if (NPC == XF_NPPC8) {
                xfPackPixels<NPC, WORDWIDTH, DEPTH>(&OutputValues[0], P0, 1, (npc - 1), shift_x);
            } else {
                P0 = OutputValues[0];
            }

        } else {
            // Complete the pending output word with the first result of this
            // column and write it out.
            if (NPC == XF_NPPC8) {
                xfPackPixels<NPC, WORDWIDTH, DEPTH>(&OutputValues[0], P0, 0, 1, shift_x);
            } else {
                P0 = OutputValues[0];
            }

            _out_mat.write(write_index++, P0);

            // Start the next output word with the remaining results.
            shift_x = 0;
            P0 = 0;
            if (NPC == XF_NPPC8) {
                xfPackPixels<NPC, WORDWIDTH, DEPTH>(&OutputValues[0], P0, 1, (npc - 1), shift_x);

            } else {
                P0 = OutputValues[0];
            }
        }

        // Carry the last two samples of each row window over to the next column.
        src_buf1[0] = src_buf1[buf_size - 2];
        src_buf1[1] = src_buf1[buf_size - 1];
        src_buf2[0] = src_buf2[buf_size - 2];
        src_buf2[1] = src_buf2[buf_size - 1];
        src_buf3[0] = src_buf3[buf_size - 2];
        src_buf3[1] = src_buf3[buf_size - 1];
    } // Col_Loop
}

// Top-level 3x3 Gaussian filter kernel.
//
// Keeps three image rows in the line buffer `buf`. Line 0 is zeroed and line 1 is
// loaded with the first image row up front, so the first output row sees a zero
// row above the image. Row_Loop then runs img_height times; each iteration fills
// one more line via ProcessGaussian3x3 (zeros once the input is exhausted),
// writes the output words of that row, and finally computes and writes the last
// output word of the row with zeros in place of the column right of the image.
//
// img_width is used as the number of words per row (it bounds the column loops
// that index `buf`); `weights` holds the fixed-point kernel taps.
template <int SRC_T, int ROWS, int COLS, int PLANES, int DEPTH, int NPC, int WORDWIDTH, int TC>
void xfGaussianFilter3x3(xf::cv::Mat<SRC_T, ROWS, COLS, NPC>& _src_mat,
                         xf::cv::Mat<SRC_T, ROWS, COLS, NPC>& _out_mat,
                         uint16_t img_height,
                         uint16_t img_width,
                         unsigned char* weights) {
    ap_uint<13> row_ind;
    ap_uint<2> tp, mid, bottom;
    ap_uint<8> buf_size = XF_NPIXPERCYCLE(NPC) + 2;
    uint16_t shift_x = 0;
    ap_uint<13> row, col;
    int read_index = 0, write_index = 0;

    XF_PTNAME(DEPTH) OutputValues[XF_NPIXPERCYCLE(NPC)];

// clang-format off
    #pragma HLS ARRAY_PARTITION variable=OutputValues complete dim=1
    // clang-format on

    // Row windows: the samples of the current word plus two carried-over samples.
    XF_PTNAME(DEPTH)
    src_buf1[XF_NPIXPERCYCLE(NPC) + 2] = {0}, src_buf2[XF_NPIXPERCYCLE(NPC) + 2] = {0},
                                    src_buf3[XF_NPIXPERCYCLE(NPC) + 2] = {0};
// clang-format off
    #pragma HLS ARRAY_PARTITION variable=src_buf1 complete dim=1
    #pragma HLS ARRAY_PARTITION variable=src_buf2 complete dim=1
    #pragma HLS ARRAY_PARTITION variable=src_buf3 complete dim=1
    // clang-format on

    XF_SNAME(WORDWIDTH) P0;

    // Three-row line buffer, one memory per row.
    XF_SNAME(WORDWIDTH) buf[3][(COLS >> XF_BITSHIFT(NPC))];
// clang-format off
    #pragma HLS RESOURCE variable=buf core=RAM_S2P_BRAM
    #pragma HLS ARRAY_PARTITION variable=buf complete dim=1
    // clang-format on
    row_ind = 1;

    // Prologue: zero line 0 (top border) and load the first image row into line 1.
Clear_Row_Loop:
    for (col = 0; col < img_width; col++) {
// clang-format off
        #pragma HLS LOOP_TRIPCOUNT min=TC max=TC
        #pragma HLS pipeline
        // clang-format on
        //#pragma HLS LOOP_FLATTEN off
        buf[0][col] = 0;
        buf[row_ind][col] = _src_mat.read(read_index++); // data[read_index++];
    }
    row_ind++;

Row_Loop:
    for (row = 1; row < img_height + 1; row++) {
// clang-format off
        #pragma HLS LOOP_TRIPCOUNT min=ROWS max=ROWS
        // clang-format on
        // row_ind is the line that receives the new row; the other two lines are
        // the rows above it, in rotating order.
        if (row_ind == 2) {
            tp = 0;
            mid = 1;
            bottom = 2;
        } else if (row_ind == 0) {
            tp = 1;
            mid = 2;
            bottom = 0;
        } else if (row_ind == 1) {
            tp = 2;
            mid = 0;
            bottom = 1;
        }

        // Zero the carried-over samples: left border of the new row.
        src_buf1[0] = src_buf1[1] = 0;
        src_buf2[0] = src_buf2[1] = 0;
        src_buf3[0] = src_buf3[1] = 0;

        P0 = 0;

        ProcessGaussian3x3<SRC_T, ROWS, COLS, PLANES, DEPTH, NPC, WORDWIDTH, TC>(
            _src_mat, _out_mat, buf, src_buf1, src_buf2, src_buf3, OutputValues, P0, img_width, img_height, shift_x, tp,
            mid, bottom, row, weights, read_index, write_index);

        // Last output of the row: the two final samples of each row window plus a
        // zero column on the right.
        if ((NPC == XF_NPPC8) || (NPC == XF_NPPC16)) {
            OutputValues[0] = xFapplygaussian3x3<DEPTH>(src_buf1[buf_size - 2], src_buf1[buf_size - 1], 0,
                                                        src_buf2[buf_size - 2], src_buf2[buf_size - 1], 0,
                                                        src_buf3[buf_size - 2], src_buf3[buf_size - 1], 0, weights);

        } else {
            // Same computation, done separately for each 8-bit plane.
            ap_uint<24> out_val1;
            for (int i = 0, k = 0; i < PLANES; i++, k += 8) {
                ap_uint<8> srcbuf10 = src_buf1[buf_size - 3].range(k + 7, k);
                ap_uint<8> srcbuf11 = src_buf1[buf_size - 2].range(k + 7, k);
                ap_uint<8> srcbuf20 = src_buf2[buf_size - 3].range(k + 7, k);
                ap_uint<8> srcbuf21 = src_buf2[buf_size - 2].range(k + 7, k);
                ap_uint<8> srcbuf30 = src_buf3[buf_size - 3].range(k + 7, k);
                ap_uint<8> srcbuf31 = src_buf3[buf_size - 2].range(k + 7, k);
                out_val1.range(k + 7, k) = xFapplygaussian3x3<DEPTH>(srcbuf10, srcbuf11, 0, srcbuf20, srcbuf21, 0,
                                                                     srcbuf30, srcbuf31, 0, weights);
            }
            OutputValues[0] = out_val1;
        }

        if (NPC == XF_NPPC8) {
            xfPackPixels<NPC, WORDWIDTH, DEPTH>(&OutputValues[0], P0, 0, 1, shift_x);

        } else {
            P0 = OutputValues[0];
        }

        _out_mat.write(write_index++, P0); // data[write_index++] = (P0);

        shift_x = 0;
        P0 = 0;

        // Advance the line-buffer write position, wrapping after line 2.
        row_ind++;
        if (row_ind == 3) {
            row_ind = 0;
        }
    } // Row_Loop
}

/**
 * Filters XF_NPIXPERCYCLE(NPC) horizontally adjacent pixels with the 5x5 kernel.
 *
 * Output pixel j is computed by xfapplygaussian5x5 from the five row windows
 * starting at column j and is stored in OutputValues[j].
 *
 * @param OutputValues Receives one filtered pixel per processed column.
 * @param src_buf1..5  Windows over the five rows, top to bottom.
 * @param weights      Fixed-point kernel taps.
 */
template <int NPC, int DEPTH, int PLANES, bool FOR_IMAGE_PYRAMID>
void xFGaussian5x5(XF_PTNAME(DEPTH) * OutputValues,
                   XF_PTNAME(DEPTH) * src_buf1,
                   XF_PTNAME(DEPTH) * src_buf2,
                   XF_PTNAME(DEPTH) * src_buf3,
                   XF_PTNAME(DEPTH) * src_buf4,
                   XF_PTNAME(DEPTH) * src_buf5,
                   unsigned char weights[5]) {
// clang-format off
    #pragma HLS INLINE
    // clang-format on
Compute_Grad_Loop:
    for (ap_uint<5> j = 0; j < XF_NPIXPERCYCLE(NPC); j++) {
        OutputValues[j] = xfapplygaussian5x5<PLANES, DEPTH, FOR_IMAGE_PYRAMID>(&src_buf1[j], &src_buf2[j], &src_buf3[j],
                                                                               &src_buf4[j], &src_buf5[j], weights);
    }
}

// Processes one image row of the 5x5 Gaussian filter.
//
// For every column word the loop
//   1. stores the next input word in line buffer `row_ind`, or zeroes line
//      `bottom2` once `row` has reached img_height (bottom padding),
//   2. fetches the words of the five line buffers tp1, tp2, mid, bottom1,
//      bottom2 and places them in the row windows src_buf1..5 from index 4 on,
//   3. filters the windows with xFGaussian5x5,
//   4. moves the last four window entries to indices 0..3 for the next column,
//   5. emits output: in the XF_NPPC8 case one word per column from col 1 on; in
//      the other case one word per column from col 2 on.
// The outputs still pending at the end of the row are NOT written here; the
// state is left in inter_valx / src_buf1..5 for the caller to finish.
//
// read_index and write_index are running positions into _src_mat / _out_mat and
// are advanced in place.
// NOTE(review): the visible caller always passes row_ind equal to bottom2, so the
// two branches of the first `if` address the same line - confirm before relying
// on them being different.
// NOTE(review): xfExtractPixels / xfPackPixels are defined elsewhere; the
// comments on them below describe how they are used here, not what they do.
template <int SRC_T, int ROWS, int COLS, int PLANES, int DEPTH, int NPC, int WORDWIDTH, int TC, bool FOR_IMAGE_PYRAMID>
void ProcessGaussian5x5(xf::cv::Mat<SRC_T, ROWS, COLS, NPC>& _src_mat,
                        xf::cv::Mat<SRC_T, ROWS, COLS, NPC>& _out_mat,
                        XF_SNAME(WORDWIDTH) buf[5][(COLS >> XF_BITSHIFT(NPC))],
                        XF_PTNAME(DEPTH) src_buf1[XF_NPIXPERCYCLE(NPC) + 4],
                        XF_PTNAME(DEPTH) src_buf2[XF_NPIXPERCYCLE(NPC) + 4],
                        XF_PTNAME(DEPTH) src_buf3[XF_NPIXPERCYCLE(NPC) + 4],
                        XF_PTNAME(DEPTH) src_buf4[XF_NPIXPERCYCLE(NPC) + 4],
                        XF_PTNAME(DEPTH) src_buf5[XF_NPIXPERCYCLE(NPC) + 4],
                        XF_PTNAME(DEPTH) OutputValues[XF_NPIXPERCYCLE(NPC)],
                        XF_SNAME(WORDWIDTH) & inter_valx,
                        uint16_t img_width,
                        uint16_t img_height,
                        ap_uint<13> row_ind,
                        uint16_t& shift_x,
                        ap_uint<4> tp1,
                        ap_uint<4> tp2,
                        ap_uint<4> mid,
                        ap_uint<4> bottom1,
                        ap_uint<4> bottom2,
                        ap_uint<13> row,
                        unsigned char weights[5],
                        int& read_index,
                        int& write_index) {
// clang-format off
    #pragma HLS INLINE
    // clang-format on
    XF_SNAME(WORDWIDTH) buf0, buf1, buf2, buf3, buf4;
    ap_uint<8> buf_size = XF_NPIXPERCYCLE(NPC) + 4;
    uint16_t npc = XF_NPIXPERCYCLE(NPC);
    ap_uint<8> max_loop = XF_WORDDEPTH(WORDWIDTH);
    ap_uint<8> step = XF_PIXELDEPTH(DEPTH);

Col_Loop:
    for (ap_uint<13> col = 0; col < img_width; col++) {
// clang-format off
        #pragma HLS LOOP_TRIPCOUNT min=TC max=TC
        #pragma HLS pipeline
        // clang-format on
        // Fill the newest line buffer; past the last image row it is zeroed.
        if (row < img_height)
            buf[row_ind][col] = _src_mat.read(read_index++); //.data[read_index++];
        else
            buf[bottom2][col] = 0;

        buf0 = buf[tp1][col];
        buf1 = buf[tp2][col];
        buf2 = buf[mid][col];
        buf3 = buf[bottom1][col];
        buf4 = buf[bottom2][col];

        // New samples go to index 4 onwards; indices 0..3 hold the four samples
        // carried over from the previous column.
        if (NPC == XF_NPPC8) {
            xfExtractPixels<NPC, WORDWIDTH, DEPTH>(&src_buf1[4], buf0, 0);
            xfExtractPixels<NPC, WORDWIDTH, DEPTH>(&src_buf2[4], buf1, 0);
            xfExtractPixels<NPC, WORDWIDTH, DEPTH>(&src_buf3[4], buf2, 0);
            xfExtractPixels<NPC, WORDWIDTH, DEPTH>(&src_buf4[4], buf3, 0);
            xfExtractPixels<NPC, WORDWIDTH, DEPTH>(&src_buf5[4], buf4, 0);
        } else {
            src_buf1[4] = buf0;
            src_buf2[4] = buf1;
            src_buf3[4] = buf2;
            src_buf4[4] = buf3;
            src_buf5[4] = buf4;
        }

        xFGaussian5x5<NPC, DEPTH, PLANES, FOR_IMAGE_PYRAMID>(OutputValues, src_buf1, src_buf2, src_buf3, src_buf4,
                                                             src_buf5, weights);

        // Carry the last four samples of each row window over to the next column.
        for (ap_uint<4> i = 0; i < 4; i++) {
// clang-format off
            #pragma HLS unroll
            // clang-format on
            src_buf1[i] = src_buf1[buf_size - (4 - i)];
            src_buf2[i] = src_buf2[buf_size - (4 - i)];
            src_buf3[i] = src_buf3[buf_size - (4 - i)];
            src_buf4[i] = src_buf4[buf_size - (4 - i)];
            src_buf5[i] = src_buf5[buf_size - (4 - i)];
        }

        if (col == 0) {
            // First column: start a new output word, nothing is written yet.
            // Note that the pack helper is called here for every NPC value.
            shift_x = 0;
            inter_valx = 0;

            xfPackPixels<NPC, WORDWIDTH, DEPTH>(&OutputValues[0], inter_valx, 2, (npc - 2), shift_x);

        } else {
            if (NPC == XF_NPPC8) {
                // Complete the pending output word with the first two results of
                // this column, write it, then start the next word.
                xfPackPixels<NPC, WORDWIDTH, DEPTH>(&OutputValues[0], inter_valx, 0, 2, shift_x);

                _out_mat.write(write_index++, inter_valx);

                shift_x = 0;
                inter_valx = 0;

                xfPackPixels<NPC, WORDWIDTH, DEPTH>(&OutputValues[0], inter_valx, 2, (npc - 2), shift_x);

            } else {
                // One result per column; writing starts at col 2, i.e. two
                // columns behind the input.
                if (col >= 2) {
                    if (PLANES == 1) {
                        // Result goes into the top 8 bits of the output word.
                        inter_valx((max_loop - 1), (max_loop - 8)) = OutputValues[0];
                        _out_mat.write(write_index++, inter_valx);
                    } else {
                        //                      _out_mat.data[write_index++] =
                        //(OutputValues[0]);
                        _out_mat.write(write_index++, OutputValues[0]);
                    }
                }
            }
        }
    } // Col_Loop
}

// Top-level 5x5 Gaussian filter kernel.
//
// Keeps five image rows in the line buffer `buf`. Lines 0 and 1 are zeroed and
// lines 2 and 3 are loaded with the first two image rows up front, so the first
// output rows see zero rows above the image. Row_Loop then runs img_height
// times; each iteration fills one more line via ProcessGaussian5x5 (zeros once
// the input is exhausted), which writes most of the output row, and then
// computes and writes the remaining outputs of the row with zeros in place of
// the columns right of the image.
//
// img_width is used as the number of words per row (it bounds the column loops
// that index `buf`); `weights` holds the fixed-point kernel taps.
// NOTE(review): the first two rows are read unconditionally, so img_height is
// presumably expected to be at least 2 - confirm against callers.
template <int SRC_T, int ROWS, int COLS, int PLANES, int DEPTH, int NPC, int WORDWIDTH, int TC, bool FOR_IMAGE_PYRAMID>
void xFGaussianFilter5x5(xf::cv::Mat<SRC_T, ROWS, COLS, NPC>& _src_mat,
                         xf::cv::Mat<SRC_T, ROWS, COLS, NPC>& _out_mat,
                         uint16_t img_height,
                         uint16_t img_width,
                         unsigned char weights[5]) {
    ap_uint<13> row_ind;
    ap_uint<13> row, col;
    ap_uint<4> tp1, tp2, mid, bottom1, bottom2;
    ap_uint<5> i;

    ap_uint<5> buf_size = XF_NPIXPERCYCLE(NPC) + 4;
    ap_uint<9> step = XF_PIXELDEPTH(DEPTH);
    ap_uint<9> max_loop = XF_WORDDEPTH(WORDWIDTH);
    uint16_t shift_x = 0;
    ap_uint<8> npc = XF_NPIXPERCYCLE(NPC);
    int read_index = 0, write_index = 0;

    XF_PTNAME(DEPTH) OutputValues[XF_NPIXPERCYCLE(NPC)];

// clang-format off
    #pragma HLS ARRAY_PARTITION variable=OutputValues complete dim=1
    // clang-format on

    XF_SNAME(WORDWIDTH) buf0, buf1, buf2, buf3, buf4;

    // Row windows: the samples of the current word plus four carried-over samples.
    XF_PTNAME(DEPTH)
    src_buf1[XF_NPIXPERCYCLE(NPC) + 4], src_buf2[XF_NPIXPERCYCLE(NPC) + 4], src_buf3[XF_NPIXPERCYCLE(NPC) + 4],
        src_buf4[XF_NPIXPERCYCLE(NPC) + 4], src_buf5[XF_NPIXPERCYCLE(NPC) + 4];
// clang-format off
    #pragma HLS ARRAY_PARTITION variable=src_buf1 complete dim=1
    #pragma HLS ARRAY_PARTITION variable=src_buf2 complete dim=1
    #pragma HLS ARRAY_PARTITION variable=src_buf3 complete dim=1
    #pragma HLS ARRAY_PARTITION variable=src_buf4 complete dim=1
    #pragma HLS ARRAY_PARTITION variable=src_buf5 complete dim=1
    // clang-format on

    XF_SNAME(WORDWIDTH) tmp_in;
    XF_SNAME(WORDWIDTH) inter_valx = 0;

    // Five-row line buffer, one memory per row.
    XF_SNAME(WORDWIDTH) buf[5][(COLS >> XF_BITSHIFT(NPC))];
// clang-format off
    #pragma HLS RESOURCE variable=buf core=RAM_S2P_BRAM
    #pragma HLS ARRAY_PARTITION variable=buf complete dim=1
    // clang-format on

    row_ind = 2;

    // Prologue, part 1: zero lines 0 and 1 (top border) and load the first image
    // row into line 2.
Clear_Row_Loop:
    for (col = 0; col < img_width; col++) {
// clang-format off
        #pragma HLS LOOP_TRIPCOUNT min=TC max=TC
        #pragma HLS pipeline
        // clang-format on
        buf[0][col] = 0;
        buf[1][col] = 0;
        buf[row_ind][col] = _src_mat.read(read_index++); //.data[read_index++];
    }

    row_ind++;

    // Prologue, part 2: load the second image row into line 3.
Read_Row2_Loop:
    for (col = 0; col < img_width; col++) {
// clang-format off
        #pragma HLS LOOP_TRIPCOUNT min=TC max=TC
        #pragma HLS pipeline
        // clang-format on

        buf[row_ind][col] = _src_mat.read(read_index++); //_src_mat.data[read_index++];
    }
    row_ind++;

Row_Loop:
    for (row = 2; row < img_height + 2; row++) {
// clang-format off
        #pragma HLS LOOP_TRIPCOUNT min=ROWS max=ROWS
        // clang-format on

        // modify the buffer indices to re use
        // row_ind is the line that receives the new row (always bottom2); the
        // other four lines are the rows above it, in rotating order.
        if (row_ind == 4) {
            tp1 = 0;
            tp2 = 1;
            mid = 2;
            bottom1 = 3;
            bottom2 = 4;
        } else if (row_ind == 0) {
            tp1 = 1;
            tp2 = 2;
            mid = 3;
            bottom1 = 4;
            bottom2 = 0;
        } else if (row_ind == 1) {
            tp1 = 2;
            tp2 = 3;
            mid = 4;
            bottom1 = 0;
            bottom2 = 1;
        } else if (row_ind == 2) {
            tp1 = 3;
            tp2 = 4;
            mid = 0;
            bottom1 = 1;
            bottom2 = 2;
        } else if (row_ind == 3) {
            tp1 = 4;
            tp2 = 0;
            mid = 1;
            bottom1 = 2;
            bottom2 = 3;
        }

        // Zero the carried-over samples: left border of the new row.
        src_buf1[0] = src_buf1[1] = src_buf1[2] = src_buf1[3] = 0;
        src_buf2[0] = src_buf2[1] = src_buf2[2] = src_buf2[3] = 0;
        src_buf3[0] = src_buf3[1] = src_buf3[2] = src_buf3[3] = 0;
        src_buf4[0] = src_buf4[1] = src_buf4[2] = src_buf4[3] = 0;
        src_buf5[0] = src_buf5[1] = src_buf5[2] = src_buf5[3] = 0;

        inter_valx = 0;

        ProcessGaussian5x5<SRC_T, ROWS, COLS, PLANES, DEPTH, NPC, WORDWIDTH, TC, FOR_IMAGE_PYRAMID>(
            _src_mat, _out_mat, buf, src_buf1, src_buf2, src_buf3, src_buf4, src_buf5, OutputValues, inter_valx,
            img_width, img_height, row_ind, shift_x, tp1, tp2, mid, bottom1, bottom2, row, weights, read_index,
            write_index);

        // Row epilogue: produce the outputs that still need samples right of the
        // image; those samples are taken as zero.
        if ((NPC == XF_NPPC8) || (NPC == XF_NPPC16)) {
            // Zero everything after the four carried-over samples.
            for (ap_uint<6> i = 4; i < (XF_NPIXPERCYCLE(NPC) + 4); i++) {
                src_buf1[i] = 0;
                src_buf2[i] = 0;
                src_buf3[i] = 0;
                src_buf4[i] = 0;
                src_buf5[i] = 0;
            }
            OutputValues[0] = xfapplygaussian5x5<PLANES, DEPTH, FOR_IMAGE_PYRAMID>(
                &src_buf1[0], &src_buf2[0], &src_buf3[0], &src_buf4[0], &src_buf5[0], weights);

            OutputValues[1] = xfapplygaussian5x5<PLANES, DEPTH, FOR_IMAGE_PYRAMID>(
                &src_buf1[1], &src_buf2[1], &src_buf3[1], &src_buf4[1], &src_buf5[1], weights);

            // Complete the pending output word with these two results.
            xfPackPixels<NPC, WORDWIDTH, DEPTH>(&OutputValues[0], inter_valx, 0, 2, shift_x);

            //_out_mat.data[write_index++] = (inter_valx);
            _out_mat.write(write_index++, inter_valx);

        } else {
// clang-format off
            #pragma HLS ALLOCATION function instances=xfapplygaussian5x5<PLANES, DEPTH, FOR_IMAGE_PYRAMID> limit=1
            // clang-format on
            // First pending output: last window entry replaced by zero.
            src_buf1[buf_size - 1] = 0;
            src_buf2[buf_size - 1] = 0;
            src_buf3[buf_size - 1] = 0;
            src_buf4[buf_size - 1] = 0;
            src_buf5[buf_size - 1] = 0;
            OutputValues[0] = xfapplygaussian5x5<PLANES, DEPTH, FOR_IMAGE_PYRAMID>(
                &src_buf1[0], &src_buf2[0], &src_buf3[0], &src_buf4[0], &src_buf5[0], weights);

            // Result goes into the top `step` bits of the output word.
            inter_valx((max_loop - 1), (max_loop - step)) = OutputValues[0];

            _out_mat.write(write_index++, inter_valx);
            // Shift the windows again (same carry-over as in the column loop) so
            // that one more zero sample enters on the right.
            for (ap_uint<4> i = 0; i < 4; i++) {
// clang-format off
                #pragma HLS unroll
                // clang-format on
                src_buf1[i] = src_buf1[buf_size - (4 - i)];
                src_buf2[i] = src_buf2[buf_size - (4 - i)];
                src_buf3[i] = src_buf3[buf_size - (4 - i)];
                src_buf4[i] = src_buf4[buf_size - (4 - i)];
                src_buf5[i] = src_buf5[buf_size - (4 - i)];
            }

            // Second pending output.
            OutputValues[0] = xfapplygaussian5x5<PLANES, DEPTH, FOR_IMAGE_PYRAMID>(
                &src_buf1[0], &src_buf2[0], &src_buf3[0], &src_buf4[0], &src_buf5[0], weights);

            inter_valx((max_loop - 1), (max_loop - step)) = OutputValues[0];
            _out_mat.write(write_index++, inter_valx);
        }
        // Advance the line-buffer write position, wrapping after line 4.
        row_ind++;

        if (row_ind == 5) {
            row_ind = 0;
        }
    } // Row_Loop
}

/**
 * Filters XF_NPIXPERCYCLE(NPC) horizontally adjacent pixels with the 7x7 kernel.
 *
 * Output pixel number `pix` is computed by xfapplygaussian7x7 from the seven row
 * windows starting at column `pix` and is stored in OutputValues[pix].
 *
 * @param OutputValues Receives one filtered pixel per processed column.
 * @param src_buf1..7  Windows over the seven rows, top to bottom.
 * @param weights      Fixed-point kernel taps.
 */
template <int NPC, int DEPTH, int PLANES>
void xFGaussian7x7(XF_PTNAME(DEPTH) * OutputValues,
                   XF_PTNAME(DEPTH) * src_buf1,
                   XF_PTNAME(DEPTH) * src_buf2,
                   XF_PTNAME(DEPTH) * src_buf3,
                   XF_PTNAME(DEPTH) * src_buf4,
                   XF_PTNAME(DEPTH) * src_buf5,
                   XF_PTNAME(DEPTH) * src_buf6,
                   XF_PTNAME(DEPTH) * src_buf7,
                   unsigned char weights[7]) {
// clang-format off
    #pragma HLS INLINE
    // clang-format on
    for (ap_uint<9> pix = 0; pix < XF_NPIXPERCYCLE(NPC); pix++) {
// clang-format off
        #pragma HLS LOOP_TRIPCOUNT min=8 max=8
        #pragma HLS UNROLL
        // clang-format on
        // The per-plane loop lives inside xfapplygaussian7x7.
        OutputValues[pix] = xfapplygaussian7x7<PLANES, DEPTH>(
            &src_buf1[pix], &src_buf2[pix], &src_buf3[pix], &src_buf4[pix], &src_buf5[pix], &src_buf6[pix],
            &src_buf7[pix], weights);
    }
}

/**
 * @brief Runs the 7x7 filter across one image row, one word per loop iteration.
 *
 * Per column the function
 *   1. stores a freshly read source word (or zero once row_index has reached
 *      img_height) into line-buffer row `bottom3`,
 *   2. fetches the seven vertically stacked words selected by tp1..bottom3,
 *   3. places them in the src_buf1..7 windows, filters, and shifts the windows
 *      with xfCopyData,
 *   4. writes the results to _out_mat.
 *
 * Output is delayed with respect to input: in the multi-pixel configurations the
 * word started at column c is completed and written at column c + 1; in the
 * single-pixel configuration nothing is written before column 3. The caller is
 * expected to flush what is still pending after the last column.
 *
 * @param _src_mat     source image, read sequentially through read_index
 * @param _out_mat     destination image, written sequentially through write_index
 * @param buf          seven-row line buffer, indexed [row slot][word column]
 * @param src_buf1     row window for slot tp1 (src_buf2..7 follow tp2..bottom3)
 * @param OutputValues scratch array receiving the per-lane filter results
 * @param inter_valx   output word being assembled; carried across iterations
 * @param img_width    number of words per row (loop trip count)
 * @param img_height   number of image rows
 * @param shiftx       packing offset used by xfPackPixels; carried across iterations
 * @param tp1          line-buffer slot of the topmost row; tp2, tp3, mid, bottom1,
 *                     bottom2 follow downwards and bottom3 is the slot refilled here
 * @param row_index    row counter of the caller; rows >= img_height are fed zeros
 * @param weights      seven filter coefficients, forwarded unchanged
 * @param read_index   running read position in _src_mat (updated)
 * @param write_index  running write position in _out_mat (updated)
 */
template <int SRC_T, int ROWS, int COLS, int PLANES, int DEPTH, int NPC, int WORDWIDTH, int TC>
void ProcessGaussian7x7(xf::cv::Mat<SRC_T, ROWS, COLS, NPC>& _src_mat,
                        xf::cv::Mat<SRC_T, ROWS, COLS, NPC>& _out_mat,
                        XF_SNAME(WORDWIDTH) buf[7][(COLS >> XF_BITSHIFT(NPC))],
                        XF_PTNAME(DEPTH) src_buf1[XF_NPIXPERCYCLE(NPC) + 6],
                        XF_PTNAME(DEPTH) src_buf2[XF_NPIXPERCYCLE(NPC) + 6],
                        XF_PTNAME(DEPTH) src_buf3[XF_NPIXPERCYCLE(NPC) + 6],
                        XF_PTNAME(DEPTH) src_buf4[XF_NPIXPERCYCLE(NPC) + 6],
                        XF_PTNAME(DEPTH) src_buf5[XF_NPIXPERCYCLE(NPC) + 6],
                        XF_PTNAME(DEPTH) src_buf6[XF_NPIXPERCYCLE(NPC) + 6],
                        XF_PTNAME(DEPTH) src_buf7[XF_NPIXPERCYCLE(NPC) + 6],
                        XF_PTNAME(DEPTH) OutputValues[XF_NPIXPERCYCLE(NPC)],
                        XF_SNAME(WORDWIDTH) & inter_valx,
                        uint16_t img_width,
                        uint16_t img_height,
                        uint16_t& shiftx,
                        ap_uint<4> tp1,
                        ap_uint<4> tp2,
                        ap_uint<4> tp3,
                        ap_uint<4> mid,
                        ap_uint<4> bottom1,
                        ap_uint<4> bottom2,
                        ap_uint<4> bottom3,
                        ap_uint<13> row_index,
                        unsigned char weights[7],
                        int& read_index,
                        int& write_index) {
// clang-format off
    #pragma HLS INLINE
    // clang-format on
    const uint16_t pixels_per_word = XF_NPIXPERCYCLE(NPC);
    const ap_uint<10> word_bits = XF_WORDDEPTH(WORDWIDTH);
    // Configurations in which a word holds several pixels and results are packed.
    const bool multi_pixel = (NPC == XF_NPPC8) || (NPC == XF_NPPC16);

Col_Loop:
    for (ap_uint<13> col = 0; col < img_width; col++) {
// clang-format off
        #pragma HLS LOOP_TRIPCOUNT min=TC max=TC
        #pragma HLS pipeline
        // clang-format on
        // Refill the slot of the newest row: real data while rows remain,
        // zeros for the rows below the image.
        if (row_index < img_height) {
            buf[bottom3][col] = _src_mat.read(read_index++);
        } else {
            buf[bottom3][col] = 0;
        }

        // Vertical stack of seven words for this column, top to bottom.
        XF_SNAME(WORDWIDTH) word_tp1 = buf[tp1][col];
        XF_SNAME(WORDWIDTH) word_tp2 = buf[tp2][col];
        XF_SNAME(WORDWIDTH) word_tp3 = buf[tp3][col];
        XF_SNAME(WORDWIDTH) word_mid = buf[mid][col];
        XF_SNAME(WORDWIDTH) word_bt1 = buf[bottom1][col];
        XF_SNAME(WORDWIDTH) word_bt2 = buf[bottom2][col];
        XF_SNAME(WORDWIDTH) word_bt3 = buf[bottom3][col];

        if (NPC == XF_NPPC8) {
            // Multi-pixel word: the helper distributes the words into the windows.
            xfExtractData<NPC, WORDWIDTH, DEPTH>(src_buf1, src_buf2, src_buf3, src_buf4, src_buf5, src_buf6, src_buf7,
                                                 word_tp1, word_tp2, word_tp3, word_mid, word_bt1, word_bt2,
                                                 word_bt3);
        } else {
            // Otherwise the whole word becomes the newest (rightmost) window entry.
            src_buf1[6] = word_tp1;
            src_buf2[6] = word_tp2;
            src_buf3[6] = word_tp3;
            src_buf4[6] = word_mid;
            src_buf5[6] = word_bt1;
            src_buf6[6] = word_bt2;
            src_buf7[6] = word_bt3;
        }

        xFGaussian7x7<NPC, DEPTH, PLANES>(OutputValues, src_buf1, src_buf2, src_buf3, src_buf4, src_buf5, src_buf6,
                                          src_buf7, weights);

        // Advance the windows for the next column.
        xfCopyData<NPC, DEPTH>(src_buf1, src_buf2, src_buf3, src_buf4, src_buf5, src_buf6, src_buf7);

        if (col == 0) {
            // First column: open a fresh output word; nothing is written yet.
            shiftx = 0;
            inter_valx = 0;
            xfPackPixels<NPC, WORDWIDTH, DEPTH>(&OutputValues[0], inter_valx, 3, (pixels_per_word - 3), shiftx);
        } else if (multi_pixel) {
            // Finish the pending word with results 0..2 and emit it ...
            xfPackPixels<NPC, WORDWIDTH, DEPTH>(&OutputValues[0], inter_valx, 0, 3, shiftx);
            _out_mat.write(write_index++, inter_valx);

            // ... then open the next word with the remaining results.
            shiftx = 0;
            inter_valx = 0;
            xfPackPixels<NPC, WORDWIDTH, DEPTH>(&OutputValues[0], inter_valx, 3, (pixels_per_word - 3), shiftx);
        } else if (col >= 3) {
            // Single-pixel word: output starts once three columns have passed.
            if (PLANES == 1) {
                inter_valx((word_bits - 1), (word_bits - 8)) = OutputValues[0];
                _out_mat.write(write_index++, inter_valx);
            } else {
                _out_mat.write(write_index++, OutputValues[0]);
            }
        }
    } // Col_Loop
}

/**
 * @brief 7x7 filter over a whole image, built around a seven-row line buffer.
 *
 * Flow:
 *   1. Line-buffer rows 0..2 are zeroed (rows above the image) and the first
 *      three image rows are loaded into rows 3..5.
 *   2. Row_Loop runs img_height times. Each pass rotates the seven slot indices,
 *      zeroes the six leftmost window entries (columns left of the image), lets
 *      ProcessGaussian7x7 stream one row, and then flushes the three outputs
 *      still pending at the right edge using zeroed window entries.
 * Samples outside the image are therefore treated as zero on all four sides.
 *
 * NOTE(review): the three preload loops read unconditionally, so this appears to
 * assume img_height >= 3 - confirm against callers.
 * NOTE(review): the loop counters are ap_uint<13> while the bounds are uint16_t;
 * presumably the supported image size keeps them within 13 bits - confirm.
 *
 * @param _src_mat    source image, read sequentially one word at a time
 * @param _out_mat    destination image, written sequentially one word at a time
 * @param img_height  number of image rows
 * @param img_width   number of words per row (each loop iteration reads one word)
 * @param weights     seven filter coefficients, forwarded to xfapplygaussian7x7
 */
template <int SRC_T, int ROWS, int COLS, int PLANES, int DEPTH, int NPC, int WORDWIDTH, int TC>
void xFGaussianFilter7x7(xf::cv::Mat<SRC_T, ROWS, COLS, NPC>& _src_mat,
                         xf::cv::Mat<SRC_T, ROWS, COLS, NPC>& _out_mat,
                         uint16_t img_height,
                         uint16_t img_width,
                         unsigned char weights[7]) {
    ap_uint<13> row_ind, row, col;
    // Line-buffer slot indices, ordered from the topmost row (tp1) to the newest row (bottom3).
    ap_uint<4> tp1, tp2, tp3, mid, bottom1, bottom2, bottom3;
    ap_uint<5> i;
    // Window length: one word of pixels plus six neighbouring entries.
    ap_uint<8> buf_size = (XF_NPIXPERCYCLE(NPC) + 6);
    ap_uint<10> max_loop = XF_WORDDEPTH(WORDWIDTH);
    // Running positions in the source and destination; shared with ProcessGaussian7x7 by reference.
    int read_index = 0, write_index = 0;

    XF_PTNAME(DEPTH) OutputValues[XF_NPIXPERCYCLE(NPC)];

// clang-format off
    #pragma HLS ARRAY_PARTITION variable=OutputValues complete dim=1
    // clang-format on

    // Temporary buffers (sliding windows) holding image data from the seven rows.
    XF_PTNAME(DEPTH)
    src_buf1[XF_NPIXPERCYCLE(NPC) + 6], src_buf2[XF_NPIXPERCYCLE(NPC) + 6], src_buf3[XF_NPIXPERCYCLE(NPC) + 6],
        src_buf4[XF_NPIXPERCYCLE(NPC) + 6], src_buf5[XF_NPIXPERCYCLE(NPC) + 6];
    XF_PTNAME(DEPTH) src_buf6[XF_NPIXPERCYCLE(NPC) + 6], src_buf7[XF_NPIXPERCYCLE(NPC) + 6];
// clang-format off
    #pragma HLS ARRAY_PARTITION variable=src_buf1 complete dim=1
    #pragma HLS ARRAY_PARTITION variable=src_buf2 complete dim=1
    #pragma HLS ARRAY_PARTITION variable=src_buf3 complete dim=1
    #pragma HLS ARRAY_PARTITION variable=src_buf4 complete dim=1
    #pragma HLS ARRAY_PARTITION variable=src_buf5 complete dim=1
    #pragma HLS ARRAY_PARTITION variable=src_buf6 complete dim=1
    #pragma HLS ARRAY_PARTITION variable=src_buf7 complete dim=1
    // clang-format on

    // Output word under construction and its packing offset; both are updated by ProcessGaussian7x7.
    XF_SNAME(WORDWIDTH) inter_valx = 0;
    uint16_t shiftx = 0;

    // Seven-row line buffer, one word per column; partitioned so each row is a separate memory.
    XF_SNAME(WORDWIDTH) buf[7][(COLS >> XF_BITSHIFT(NPC))];
// clang-format off
    #pragma HLS RESOURCE variable=buf core=RAM_S2P_BRAM
    #pragma HLS ARRAY_PARTITION variable=buf complete dim=1
    // clang-format on

    // Slots 0..2 stand for the three rows above the image; real data starts in slot 3.
    row_ind = 3;
Clear_Row_Loop:
    for (col = 0; col < img_width; col++) {
// clang-format off
        #pragma HLS LOOP_TRIPCOUNT min=TC max=TC
        #pragma HLS pipeline
        // clang-format on
        // Zero the rows above the image and load image row 0 in the same pass.
        buf[0][col] = 0;
        buf[1][col] = 0;
        buf[2][col] = 0;
        buf[row_ind][col] = _src_mat.read(read_index++); // data[read_index++];
    }
    row_ind++;

    // Load image row 1 into slot 4.
Read_Row1_Loop:
    for (col = 0; col < img_width; col++) {
// clang-format off
        #pragma HLS LOOP_TRIPCOUNT min=TC max=TC
        #pragma HLS pipeline
        // clang-format on

        buf[row_ind][col] = _src_mat.read(read_index++); //_src_mat.data[read_index++];
    }
    row_ind++;

    // Load image row 2 into slot 5.
Read_Row2_Loop:
    for (col = 0; col < img_width; col++) {
// clang-format off
        #pragma HLS LOOP_TRIPCOUNT min=TC max=TC
        #pragma HLS pipeline
        // clang-format on

        buf[row_ind][col] = _src_mat.read(read_index++); //_src_mat.data[read_index++];
    }
    row_ind++;

    // row counts the incoming image row (three rows have been preloaded), so the
    // loop body executes img_height times; row_ind enters the loop as 6.
Row_Loop:
    for (row = 3; row < img_height + 3; row++) {
// clang-format off
        #pragma HLS LOOP_TRIPCOUNT min=ROWS max=ROWS
        // clang-format on
        // modify the buffer indices to re use
        // row_ind is the slot refilled in this pass (bottom3); the remaining six
        // indices follow it cyclically, i.e. tp1 = (row_ind + 1) mod 7 and so on.
        if (row_ind == 0) {
            tp1 = 1;
            tp2 = 2;
            tp3 = 3;
            mid = 4;
            bottom1 = 5;
            bottom2 = 6;
            bottom3 = 0;
        } else if (row_ind == 1) {
            tp1 = 2;
            tp2 = 3;
            tp3 = 4;
            mid = 5;
            bottom1 = 6;
            bottom2 = 0;
            bottom3 = 1;
        } else if (row_ind == 2) {
            tp1 = 3;
            tp2 = 4;
            tp3 = 5;
            mid = 6;
            bottom1 = 0;
            bottom2 = 1;
            bottom3 = 2;
        } else if (row_ind == 3) {
            tp1 = 4;
            tp2 = 5;
            tp3 = 6;
            mid = 0;
            bottom1 = 1;
            bottom2 = 2;
            bottom3 = 3;
        } else if (row_ind == 4) {
            tp1 = 5;
            tp2 = 6;
            tp3 = 0;
            mid = 1;
            bottom1 = 2;
            bottom2 = 3;
            bottom3 = 4;
        } else if (row_ind == 5) {
            tp1 = 6;
            tp2 = 0;
            tp3 = 1;
            mid = 2;
            bottom1 = 3;
            bottom2 = 4;
            bottom3 = 5;
        } else if (row_ind == 6) {
            tp1 = 0;
            tp2 = 1;
            tp3 = 2;
            mid = 3;
            bottom1 = 4;
            bottom2 = 5;
            bottom3 = 6;
        }

        // Zero the six leading window entries: they stand for the columns left of the image.
        for (i = 0; i < 6; i++) {
// clang-format off
            #pragma HLS unroll
            // clang-format on
            src_buf1[i] = 0;
            src_buf2[i] = 0;
            src_buf3[i] = 0;
            src_buf4[i] = 0;
            src_buf5[i] = 0;
            src_buf6[i] = 0;
            src_buf7[i] = 0;
        }
        inter_valx = 0;
        // Stream one full row; `row` is passed as row_index, so rows at or beyond
        // img_height are filled with zeros instead of being read.
        ProcessGaussian7x7<SRC_T, ROWS, COLS, PLANES, DEPTH, NPC, WORDWIDTH, TC>(
            _src_mat, _out_mat, buf, src_buf1, src_buf2, src_buf3, src_buf4, src_buf5, src_buf6, src_buf7, OutputValues,
            inter_valx, img_width, img_height, shiftx, tp1, tp2, tp3, mid, bottom1, bottom2, bottom3, row, weights,
            read_index, write_index);

        // Right-edge flush: three results are still pending after the last column.
        if ((NPC == XF_NPPC8) || (NPC == XF_NPPC16)) {
            // Zero window entries 6..13 (buf_size - XF_NPIXPERCYCLE(NPC) == 6): columns right of the image.
            for (i = 0; i < 8; i++) {
// clang-format off
                #pragma HLS LOOP_TRIPCOUNT min=8 max=8
                #pragma HLS unroll
                // clang-format on
                src_buf1[buf_size + i - (XF_NPIXPERCYCLE(NPC))] = 0;
                src_buf2[buf_size + i - (XF_NPIXPERCYCLE(NPC))] = 0;
                src_buf3[buf_size + i - (XF_NPIXPERCYCLE(NPC))] = 0;
                src_buf4[buf_size + i - (XF_NPIXPERCYCLE(NPC))] = 0;
                src_buf5[buf_size + i - (XF_NPIXPERCYCLE(NPC))] = 0;
                src_buf6[buf_size + i - (XF_NPIXPERCYCLE(NPC))] = 0;
                src_buf7[buf_size + i - (XF_NPIXPERCYCLE(NPC))] = 0;
            }

            // Compute the last three results of the row.
            for (i = 0; i < 3; i++) {
// clang-format off
                #pragma HLS LOOP_TRIPCOUNT min=3 max=3
                #pragma HLS unroll
                // clang-format on

                OutputValues[i] =
                    xfapplygaussian7x7<PLANES, DEPTH>(&src_buf1[i], &src_buf2[i], &src_buf3[i], &src_buf4[i],
                                                      &src_buf5[i], &src_buf6[i], &src_buf7[i], weights);
            }
            // Add them to the word left open by ProcessGaussian7x7 and emit it.
            xfPackPixels<NPC, WORDWIDTH, DEPTH>(&OutputValues[0], inter_valx, 0, 3, shiftx);

            //_out_mat.data[write_index++] = (inter_valx);
            _out_mat.write(write_index++, inter_valx);

            shiftx = 0;
            inter_valx = 0;
        } else {
            // Single-pixel path: feed a zero as the newest window entry.
            src_buf1[6] = 0;
            src_buf2[6] = 0;
            src_buf3[6] = 0;
            src_buf4[6] = 0;
            src_buf5[6] = 0;
            src_buf6[6] = 0;
            src_buf7[6] = 0;

            // Three passes: compute one result, advance the windows via xfCopyData, write it out.
            for (i = 0; i < 3; i++) {
// clang-format off
                #pragma HLS LOOP_TRIPCOUNT min=3 max=3
                #pragma HLS unroll
                #pragma HLS ALLOCATION function instances=xfapplygaussian7x7<PLANES, DEPTH> limit=1
                // clang-format on

                OutputValues[0] =
                    xfapplygaussian7x7<PLANES, DEPTH>(&src_buf1[0], &src_buf2[0], &src_buf3[0], &src_buf4[0],
                                                      &src_buf5[0], &src_buf6[0], &src_buf7[0], weights);

                xfCopyData<NPC, DEPTH>(src_buf1, src_buf2, src_buf3, src_buf4, src_buf5, src_buf6, src_buf7);
                if (PLANES == 1) {
                    // Single plane: the result goes into bits [max_loop-1 : max_loop-8] of the output word.
                    inter_valx((max_loop - 1), (max_loop - 8)) = OutputValues[0];

                    //              _out_mat.data[write_index++] = (inter_valx);
                    _out_mat.write(write_index++, inter_valx);
                } else {
                    //                  _out_mat.data[write_index++] = ( OutputValues[0]);
                    _out_mat.write(write_index++, OutputValues[0]);
                }
            }
        }
        // Advance the slot that will receive the next row, wrapping after slot 6.
        row_ind++;
        if (row_ind == 7) {
            row_ind = 0;
        }
    } // Row_Loop ends here

} /*
 template<int ROWS, int COLS,int PLANES, int DEPTH, int NPC, int WORDWIDTH>
 void xFGaussianFilter(hls::stream< XF_SNAME(WORDWIDTH)> &_src, hls::stream< XF_SNAME(WORDWIDTH) > &_dst, int
 _filter_width, int _border_type, uint16_t imgheight, uint16_t imgwidth, float sigma)
 {



 }*/

/**
 * @brief Gaussian blur entry point: derives the filter coefficients from sigma
 *        and runs the kernel that matches FILTER_SIZE.
 *
 * FILTER_SIZE selects between the 3x3, 5x5 and 7x7 implementations
 * (XF_FILTER_3X3 / XF_FILTER_5X5 / XF_FILTER_7X7); any other value leaves _dst
 * untouched. BORDER_TYPE is part of the interface but is not referenced here.
 *
 * @tparam FILTER_SIZE  kernel size selector
 * @tparam BORDER_TYPE  unused by this function
 * @tparam SRC_T        pixel type of both images
 * @tparam ROWS         maximum image height
 * @tparam COLS         maximum image width
 * @tparam NPC          pixels processed per cycle
 * @param _src          input image; its rows/cols members give the actual size
 * @param _dst          output image
 * @param sigma         standard deviation handed to the weight calculation
 */
template <int FILTER_SIZE, int BORDER_TYPE, int SRC_T, int ROWS, int COLS, int NPC = 1>
void GaussianBlur(xf::cv::Mat<SRC_T, ROWS, COLS, NPC>& _src, xf::cv::Mat<SRC_T, ROWS, COLS, NPC>& _dst, float sigma) {
// clang-format off
    #pragma HLS inline off
    // clang-format on

    // Row length expressed in words (NPC pixels per word), as the kernels expect.
    int words_per_row = _src.cols >> XF_BITSHIFT(NPC);

    if (FILTER_SIZE == XF_FILTER_3X3) {
        unsigned char kernel3[3];
// clang-format off
        #pragma HLS ARRAY_PARTITION variable=kernel3 complete dim=1
        // clang-format on
        weightsghcalculation3x3(sigma, kernel3);
        xfGaussianFilter3x3<SRC_T, ROWS, COLS, XF_CHANNELS(SRC_T, NPC), XF_DEPTH(SRC_T, NPC), NPC,
                            XF_WORDWIDTH(SRC_T, NPC), (COLS >> XF_BITSHIFT(NPC))>(_src, _dst, _src.rows,
                                                                                  words_per_row, kernel3);
    } else if (FILTER_SIZE == XF_FILTER_5X5) {
        unsigned char kernel5[5];
// clang-format off
        #pragma HLS ARRAY_PARTITION variable=kernel5 complete dim=1
        // clang-format on
        weightsghcalculation5x5(sigma, kernel5);
        // The trailing `false` is the FOR_IMAGE_PYRAMID switch of the 5x5 kernel.
        xFGaussianFilter5x5<SRC_T, ROWS, COLS, XF_CHANNELS(SRC_T, NPC), XF_DEPTH(SRC_T, NPC), NPC,
                            XF_WORDWIDTH(SRC_T, NPC), (COLS >> XF_BITSHIFT(NPC)), false>(_src, _dst, _src.rows,
                                                                                         words_per_row, kernel5);
    } else if (FILTER_SIZE == XF_FILTER_7X7) {
        unsigned char kernel7[7];
// clang-format off
        #pragma HLS ARRAY_PARTITION variable=kernel7 complete dim=1
        // clang-format on
        weightsghcalculation7x7(sigma, kernel7);
        xFGaussianFilter7x7<SRC_T, ROWS, COLS, XF_CHANNELS(SRC_T, NPC), XF_DEPTH(SRC_T, NPC), NPC,
                            XF_WORDWIDTH(SRC_T, NPC), (COLS >> XF_BITSHIFT(NPC))>(_src, _dst, _src.rows,
                                                                                  words_per_row, kernel7);
    }
}
} // namespace cv
} // namespace xf
#endif //_XF_GAUSSIAN_HPP_