Program Listing for File xf_utility.hpp

Return to documentation for file (/tmp/ws/src/vitis_common/include/common/xf_utility.hpp)

/*
 * Copyright 2020 Xilinx, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef _XF_UTILITY_H_
#define _XF_UTILITY_H_

#include "ap_axi_sdata.h"
#include "xf_common.hpp"
#include <assert.h>
#include <string.h>

namespace xf {
namespace cv {

// Packs `loopIter` pixels from `tmp_buf` into the wide word `val`.
//
// Pixels are read from `tmp_buf[pos]`, `tmp_buf[pos + 1]`, ... and each one is
// OR-ed into `val` at bit offset `shift * XF_PIXELDEPTH(PIXELDEPTH)`.
// `val` is only OR-ed into (never cleared here). `shift` is taken by reference
// and is advanced by one per packed pixel, so the caller sees the updated slot
// index; `pos` is a by-value copy and the caller's value is unchanged.
template <int NPC, int WORDWIDTH, int PIXELDEPTH>
void xfPackPixels(
    XF_PTNAME(PIXELDEPTH) * tmp_buf, XF_SNAME(WORDWIDTH) & val, uint16_t pos, int16_t loopIter, uint16_t& shift) {
// clang-format off
    #pragma HLS INLINE
    // clang-format on
    // Width in bits of one packed pixel slot.
    ap_uint<8> pixel_bits = XF_PIXELDEPTH(PIXELDEPTH);

    for (ap_int<9> n = 0; n < loopIter; n++) {
// clang-format off
        #pragma HLS unroll
        // clang-format on
        XF_PTUNAME(PIXELDEPTH) pixel = tmp_buf[pos];
        // Widen to the word type first so the left shift is not truncated.
        XF_SNAME(WORDWIDTH) widened = (XF_SNAME(WORDWIDTH))pixel;
        val = val | (widened << (shift * pixel_bits));
        pos++;
        shift++;
    }
}

// Splits the wide word `val1` into individual pixels.
//
// `1 << XF_BITSHIFT(NPC)` pixels are extracted; pixel k is taken from bits
// [k * STEP + STEP - 1 : k * STEP] of the word (STEP = XF_PIXELDEPTH(PIXELDEPTH))
// and stored to `tmp_buf[pos + k]`. `val1` is read only.
template <int NPC, int WORDWIDTH, int PIXELDEPTH>
void xfExtractPixels(XF_PTNAME(PIXELDEPTH) * tmp_buf, XF_SNAME(WORDWIDTH) & val1, int pos) {
// clang-format off
    #pragma HLS inline off
    // clang-format on
    XF_SNAME(WORDWIDTH) word = val1;

    const int pixel_bits = XF_PIXELDEPTH(PIXELDEPTH);
    const int pixel_count = 1 << (XF_BITSHIFT(NPC));
Extract_pixels_loop:
    for (int k = 0; k < pixel_count; k++) {
// clang-format off
        #pragma HLS UNROLL
        // clang-format on
        // Least-significant bit of the k-th pixel slot inside the word.
        const int lsb = k * pixel_bits;
        tmp_buf[pos + k] = word.range(lsb + pixel_bits - 1, lsb);
    }
}

// Unpacks seven wide words (`buf0`..`buf6`) into seven pixel buffers
// (`src_buf1`..`src_buf7`), one word per buffer.
//
// The extracted pixels of `bufK` are written to `src_buf(K+1)` starting at
// element index 6; the first six elements of every buffer are left untouched
// by this function.
template <int NPC, int WORDWIDTH_SRC, int DEPTH_SRC>
void xfExtractData(XF_PTNAME(DEPTH_SRC) * src_buf1,
                   XF_PTNAME(DEPTH_SRC) * src_buf2,
                   XF_PTNAME(DEPTH_SRC) * src_buf3,
                   XF_PTNAME(DEPTH_SRC) * src_buf4,
                   XF_PTNAME(DEPTH_SRC) * src_buf5,
                   XF_PTNAME(DEPTH_SRC) * src_buf6,
                   XF_PTNAME(DEPTH_SRC) * src_buf7,
                   XF_SNAME(WORDWIDTH_SRC) buf0,
                   XF_SNAME(WORDWIDTH_SRC) buf1,
                   XF_SNAME(WORDWIDTH_SRC) buf2,
                   XF_SNAME(WORDWIDTH_SRC) buf3,
                   XF_SNAME(WORDWIDTH_SRC) buf4,
                   XF_SNAME(WORDWIDTH_SRC) buf5,
                   XF_SNAME(WORDWIDTH_SRC) buf6) {
// clang-format off
    #pragma HLS INLINE
    // clang-format on
    // Index of the first element each buffer receives from its word.
    const int dst_offset = 6;

    xfExtractPixels<NPC, WORDWIDTH_SRC, DEPTH_SRC>(src_buf1 + dst_offset, buf0, 0);
    xfExtractPixels<NPC, WORDWIDTH_SRC, DEPTH_SRC>(src_buf2 + dst_offset, buf1, 0);
    xfExtractPixels<NPC, WORDWIDTH_SRC, DEPTH_SRC>(src_buf3 + dst_offset, buf2, 0);
    xfExtractPixels<NPC, WORDWIDTH_SRC, DEPTH_SRC>(src_buf4 + dst_offset, buf3, 0);
    xfExtractPixels<NPC, WORDWIDTH_SRC, DEPTH_SRC>(src_buf5 + dst_offset, buf4, 0);
    xfExtractPixels<NPC, WORDWIDTH_SRC, DEPTH_SRC>(src_buf6 + dst_offset, buf5, 0);
    xfExtractPixels<NPC, WORDWIDTH_SRC, DEPTH_SRC>(src_buf7 + dst_offset, buf6, 0);
}

// Moves the last six entries of each of the seven pixel buffers to the front:
//     src_bufK[i] = src_bufK[XF_NPIXPERCYCLE(NPC) + i]   for i = 0..5
// Every buffer holds XF_NPIXPERCYCLE(NPC) + 6 entries, so the source range is
// exactly the final six elements. The copy runs in ascending index order, which
// keeps it correct when source and destination ranges overlap (fewer than six
// pixels per cycle).
//
// Plain int indices are used on purpose: a 4-bit counter cannot represent the
// source index once XF_NPIXPERCYCLE(NPC) + 5 exceeds 15, and a 5-bit buffer
// size wraps above 31, which would silently read the wrong entries for wide
// pixel-per-cycle configurations. The loop is fully unrolled, so the indices
// are compile-time constants after synthesis.
template <int NPC, int DEPTH_SRC>
void xfCopyData(XF_PTNAME(DEPTH_SRC) src_buf1[XF_NPIXPERCYCLE(NPC) + 6],
                XF_PTNAME(DEPTH_SRC) src_buf2[XF_NPIXPERCYCLE(NPC) + 6],
                XF_PTNAME(DEPTH_SRC) src_buf3[XF_NPIXPERCYCLE(NPC) + 6],
                XF_PTNAME(DEPTH_SRC) src_buf4[XF_NPIXPERCYCLE(NPC) + 6],
                XF_PTNAME(DEPTH_SRC) src_buf5[XF_NPIXPERCYCLE(NPC) + 6],
                XF_PTNAME(DEPTH_SRC) src_buf6[XF_NPIXPERCYCLE(NPC) + 6],
                XF_PTNAME(DEPTH_SRC) src_buf7[XF_NPIXPERCYCLE(NPC) + 6]) {
// clang-format off
    #pragma HLS INLINE
    // clang-format on
    // Index of the first of the six trailing entries (buffer size minus 6).
    const int tail_start = XF_NPIXPERCYCLE(NPC);

    for (int i = 0; i < 6; i++) {
// clang-format off
        #pragma HLS LOOP_TRIPCOUNT min=6 max=6
        #pragma HLS unroll
        // clang-format on
        const int ind = tail_start + i;
        src_buf1[i] = src_buf1[ind];
        src_buf2[i] = src_buf2[ind];
        src_buf3[i] = src_buf3[ind];
        src_buf4[i] = src_buf4[ind];
        src_buf5[i] = src_buf5[ind];
        src_buf6[i] = src_buf6[ind];
        src_buf7[i] = src_buf7[ind];
    }
}

// Copies a block of memory from `_src` (XF_SNAME(WORDWIDTH) words) to `_dst`
// (unsigned long long words) with memcpy.
//
// When `_XF_SYNTHESIS_` evaluates non-zero the copy length is the template
// constant SIZE (passed to memcpy as a byte count) and `nbytes` is ignored.
// Otherwise `nbytes` bytes are copied, and memcpy is skipped when `nbytes` is 0.
template <int SIZE, int WORDWIDTH>
void xFCopyBlockMemoryOut1(XF_SNAME(WORDWIDTH) * _src, unsigned long long int* _dst, int nbytes) {
#if _XF_SYNTHESIS_
    // Fixed-size copy: length comes from the SIZE template argument.
    memcpy((unsigned long long int*)_dst, (unsigned long long int*)_src, SIZE);
#else
    // Run-time sized copy; a zero length skips the call altogether.
    if (nbytes) memcpy((unsigned long long int*)_dst, (unsigned long long int*)_src, nbytes);
#endif
}

// Copies a block of memory from `_src` (unsigned long long words) to `_dst`
// (XF_SNAME(WORDWIDTH) words) with memcpy.
//
// When `_XF_SYNTHESIS_` evaluates non-zero the copy length is the template
// constant SIZE (passed to memcpy as a byte count) and `nbytes` is ignored;
// otherwise `nbytes` bytes are copied.
// NOTE(review): unlike xFCopyBlockMemoryOut1 there is no `if (nbytes)` guard on
// the non-synthesis path - confirm whether that difference is intentional.
template <int SIZE, int WORDWIDTH>
void xFCopyBlockMemoryIn1(unsigned long long int* _src, XF_SNAME(WORDWIDTH) * _dst, int nbytes) {
#if _XF_SYNTHESIS_
    // Fixed-size copy: length comes from the SIZE template argument.
    memcpy((XF_SNAME(WORDWIDTH)*)_dst, (XF_SNAME(WORDWIDTH)*)_src, SIZE);
#else
    // Run-time sized copy.
    memcpy((XF_SNAME(WORDWIDTH)*)_dst, (XF_SNAME(WORDWIDTH)*)_src, nbytes);
#endif
}

// Copies a block of XF_SNAME(WORDWIDTH) words from `_src` to `_dst` with memcpy.
//
// When `_XF_SYNTHESIS_` evaluates non-zero the copy length is the template
// constant SIZE (passed to memcpy as a byte count) and `nbytes` is ignored;
// otherwise `nbytes` bytes are copied.
// NOTE(review): the synthesis path casts both pointers to AU_TNAME(WORDWIDTH)*
// while the sibling copy helpers use XF_SNAME(WORDWIDTH)*; AU_TNAME is not
// defined in this file - confirm it is provided by an included header.
template <int SIZE, int WORDWIDTH>
void xFCopyBlockMemoryIn(XF_SNAME(WORDWIDTH) * _src, XF_SNAME(WORDWIDTH) * _dst, int nbytes) {
#if _XF_SYNTHESIS_
    // Fixed-size copy: length comes from the SIZE template argument.
    memcpy((AU_TNAME(WORDWIDTH)*)_dst, (AU_TNAME(WORDWIDTH)*)_src, SIZE);
#else
    // Run-time sized copy.
    memcpy((XF_SNAME(WORDWIDTH)*)_dst, (XF_SNAME(WORDWIDTH)*)_src, nbytes);
#endif
}

// Copies a block of XF_SNAME(WORDWIDTH) words from `_src` to `_dst` with memcpy.
//
// When `_XF_SYNTHESIS_` evaluates non-zero the copy length is the template
// constant SIZE (passed to memcpy as a byte count) and `nbytes` is ignored;
// otherwise `nbytes` bytes are copied.
template <int SIZE, int WORDWIDTH>
void xFCopyBlockMemoryOut(XF_SNAME(WORDWIDTH) * _src, XF_SNAME(WORDWIDTH) * _dst, int nbytes) {
#if _XF_SYNTHESIS_
    // Fixed-size copy: length comes from the SIZE template argument.
    memcpy((XF_SNAME(WORDWIDTH)*)_dst, (XF_SNAME(WORDWIDTH)*)_src, SIZE);
#else
    // Run-time sized copy.
    memcpy((XF_SNAME(WORDWIDTH)*)_dst, (XF_SNAME(WORDWIDTH)*)_src, nbytes);
#endif
}

// Forwards every word read from `in_strm` to both `out_strm1` and `out_strm2`.
//
// `imheight * (imwidth >> NPC)` words are consumed in total. IN_BH and IN_BW
// are only used as LOOP_TRIPCOUNT hints for the row and column loops.
// NOTE(review): NPC is used directly as the shift amount here, whereas other
// helpers in this file shift by XF_BITSHIFT(NPC) - confirm what callers pass.
template <int WORDWIDTH, int NPC, int IN_BH, int IN_BW>
void xFDuplicateStream(hls::stream<XF_SNAME(WORDWIDTH)>& in_strm,
                       hls::stream<XF_SNAME(WORDWIDTH)>& out_strm1,
                       hls::stream<XF_SNAME(WORDWIDTH)>& out_strm2,
                       int imwidth,
                       int imheight) {
    // Number of stream words that make up one image row.
    const int words_per_row = imwidth >> NPC;

    for (int row = 0; row < imheight; row++) {
// clang-format off
        #pragma HLS LOOP_TRIPCOUNT min=IN_BH max=IN_BH
        #pragma HLS LOOP_FLATTEN off
        // clang-format on
        for (int col = 0; col < words_per_row; col++) {
// clang-format off
            #pragma HLS pipeline
            #pragma HLS LOOP_TRIPCOUNT min=IN_BW max=IN_BW
            // clang-format on
            XF_SNAME(WORDWIDTH) word = in_strm.read();
            out_strm1.write(word);
            out_strm2.write(word);
        }
    }
}

// ==============================================================================
// Class containing the functions required by the accel file (top-level wrapper file)
// ==============================================================================
class accel_utils {
   public:
    // ==============================================================================
    // Read module(s) to handle data transfer from AXI/HLS stream to xfMat
    // ------------------------------------------------------------------------------

    // Streams an image stored in memory into an HLS stream, one PTR_WIDTH-bit
    // word per loop iteration.
    //
    // The number of words transferred is
    //     ceil(rows * cols * COLOR_T * CH_WIDTH / PTR_WIDTH).
    // TRIPCOUNT is only used as the LOOP_TRIPCOUNT upper bound; ROWS, COLS and
    // NPC are not referenced in the body.
    template <int PTR_WIDTH, int ROWS, int COLS, int NPC, int COLOR_T, int CH_WIDTH, int TRIPCOUNT>
    void Array2hlsStrm(ap_uint<PTR_WIDTH>* srcPtr, hls::stream<ap_uint<PTR_WIDTH> >& dstStrm, int rows, int cols) {
        const int bits_per_pixel = COLOR_T * CH_WIDTH;
        const int total_bits = rows * cols * bits_per_pixel;
        // Round up so a trailing partial word is still transferred.
        const int word_count = (total_bits + PTR_WIDTH - 1) / PTR_WIDTH;

        for (int w = 0; w < word_count; w++) {
// clang-format off
            #pragma HLS LOOP_TRIPCOUNT min=1 max=TRIPCOUNT
            #pragma HLS PIPELINE
            // clang-format on
            dstStrm.write(srcPtr[w]);
        }
    }

    // Unpacks a stream of PTR_WIDTH-bit words into `dstMat`, writing one
    // XF_TNAME(MAT_T, NPC) element per loop iteration.
    //
    // Parameters:
    //   srcStrm               - densely packed input words.
    //   dstMat                - destination matrix; element i is written with
    //                           dstMat.write(i, ...).
    //   dstMat_cols_align_npc - dstMat.cols as aligned by the caller; it sets
    //                           the number of elements per row (ncpr) and the
    //                           padding of the last element in each row.
    //
    // Every element consumes K_size bits from the stream: N_size bits normally,
    // last_N_size bits for the final element of each row (the `pad` pixels are
    // not taken from the stream). `r` holds the most recently read word and
    // `valid_bits` counts how many of its upper bits are still unconsumed.
    // TRIPCOUNT is only used as the LOOP_TRIPCOUNT upper bound.
    template <int PTR_WIDTH, int MAT_T, int ROWS, int COLS, int NPC, int TRIPCOUNT>
    void hlsStrm2xfMat(hls::stream<ap_uint<PTR_WIDTH> >& srcStrm,
                       xf::cv::Mat<MAT_T, ROWS, COLS, NPC>& dstMat,
                       int dstMat_cols_align_npc) {
        int rows = dstMat.rows;
        int cols = dstMat.cols;
        int loop_count = (rows * dstMat_cols_align_npc) / XF_NPIXPERCYCLE(NPC);
        int pad = dstMat_cols_align_npc - cols;
        int in_size_bits = XF_PIXELWIDTH(MAT_T, NPC) * rows * dstMat_cols_align_npc; // channels
        // Upper bound on the number of words that may be read from srcStrm.
        int ddr_read_cycles = (((in_size_bits) + (PTR_WIDTH)-1) / (PTR_WIDTH));
        int ddr_read_cnt = 0;

        int valid_bits = 0;
        const int N_size = XF_PIXELWIDTH(MAT_T, NPC) * XF_NPIXPERCYCLE(NPC);
        const int last_N_size = XF_PIXELWIDTH(MAT_T, NPC) * (XF_NPIXPERCYCLE(NPC) - pad);
        const int PTR_WIDTH_min_N = PTR_WIDTH - N_size;
        const int PTR_WIDTH_min_last_N = PTR_WIDTH - last_N_size;
        const int PTR_WIDTH_plus_N = PTR_WIDTH + N_size;
        const int PTR_WIDTH_plus_last_N = PTR_WIDTH + last_N_size;

        ap_uint<PTR_WIDTH> r;
        int ncpr = dstMat_cols_align_npc / XF_NPIXPERCYCLE(NPC); // number of clock per row
        int clk_cnt = 0;                                         // clock counter. reset at the start of every row
    L1:
        for (int i = 0; i < loop_count; i++) {
// clang-format off
            #pragma HLS LOOP_TRIPCOUNT min=1 max=TRIPCOUNT
            #pragma HLS PIPELINE
            // clang-format on

            // Bit count for this element and the matching precomputed offsets.
            int K_size;
            int PTR_WIDTH_min_Ksize;
            int PTR_WIDTH_plus_Ksize;

            if (clk_cnt == ncpr - 1) {
                // Last element of the row: only the non-padding pixels are read.
                clk_cnt = 0;
                K_size = last_N_size;
                PTR_WIDTH_min_Ksize = PTR_WIDTH_min_last_N;
                PTR_WIDTH_plus_Ksize = PTR_WIDTH_plus_last_N;
            } else {
                clk_cnt++;
                K_size = N_size;
                PTR_WIDTH_min_Ksize = PTR_WIDTH_min_N;
                PTR_WIDTH_plus_Ksize = PTR_WIDTH_plus_N;
            }

            XF_TNAME(MAT_T, NPC) out = 0;

            if (valid_bits < K_size) {
                // Not enough bits left in `r`: take the leftover upper bits
                // first, then top up from the low bits of the next word.
                if (valid_bits != 0) {
                    out.range(valid_bits - 1, 0) = r.range(PTR_WIDTH - 1, PTR_WIDTH - valid_bits);
                }
                if (ddr_read_cnt < ddr_read_cycles) {
                    r = srcStrm.read();
                    ddr_read_cnt++;
                } else {
                    // Read budget exhausted: substitute zeros instead of blocking.
                    r = 0;
                }
                out.range(K_size - 1, valid_bits) = r.range(K_size - valid_bits - 1, 0);
                valid_bits = PTR_WIDTH_min_Ksize + valid_bits;
            } else {
                // The whole element is available inside the current word.
                out = r.range(PTR_WIDTH_plus_Ksize - valid_bits - 1, PTR_WIDTH - valid_bits);
                valid_bits = valid_bits - K_size;
            }

            dstMat.write(i, out);
        }
    }

    // Loads an image from memory (`srcPtr`) into `dstMat`.
    //
    // Unless `__XF_USE_OLD_IMPL__` is defined, the work is delegated to
    // MMIterIn<...>::Array2xfMat, which also receives `stride`.
    // With `__XF_USE_OLD_IMPL__` defined, a two-stage DATAFLOW pipeline is used:
    // Array2hlsStrm pushes memory words into a local stream and hlsStrm2xfMat
    // unpacks that stream into `dstMat`; `stride` is not used on this path.
    template <int PTR_WIDTH, int MAT_T, int ROWS, int COLS, int NPC>
    void Array2xfMat(ap_uint<PTR_WIDTH>* srcPtr, xf::cv::Mat<MAT_T, ROWS, COLS, NPC>& dstMat, int stride = -1) {
#if !defined(__XF_USE_OLD_IMPL__)
        MMIterIn<PTR_WIDTH, MAT_T, ROWS, COLS, NPC>::Array2xfMat(srcPtr, dstMat, stride);
#else
// clang-format off
        #pragma HLS DATAFLOW
        // clang-format on
        assert((PTR_WIDTH >= XF_WORDDEPTH(XF_WORDWIDTH(MAT_T, NPC))) &&
               "The PTR_WIDTH must be always greater than or equal to the minimum "
               "width for the corresponding "
               "configuration");
        const int ch_width = XF_DTPIXELDEPTH(MAT_T, NPC);

        // Intermediate stream connecting the two dataflow stages.
        hls::stream<ap_uint<PTR_WIDTH> > strm;
        int rows = dstMat.rows;
        int cols = dstMat.cols;
        // Column count rounded up to a multiple of (1 << XF_BITSHIFT(NPC));
        // presumably that equals NPC - TODO confirm against XF_BITSHIFT.
        int dstMat_cols_align_npc = ((dstMat.cols + (NPC - 1)) >> XF_BITSHIFT(NPC)) << XF_BITSHIFT(NPC);
        // The last template argument of each stage is its LOOP_TRIPCOUNT bound.
        Array2hlsStrm<PTR_WIDTH, ROWS, COLS, NPC, XF_CHANNELS(MAT_T, NPC), ch_width,
                      ((ROWS * COLS * XF_CHANNELS(MAT_T, NPC) * ch_width) / PTR_WIDTH)>(srcPtr, strm, rows, cols);
        hlsStrm2xfMat<PTR_WIDTH, MAT_T, ROWS, COLS, NPC, (ROWS * COLS) / NPC>(strm, dstMat, dstMat_cols_align_npc);
#endif
    }

    // Converts an AXI stream of ap_axiu beats into a plain stream of
    // PTR_WIDTH-bit words by forwarding the `data` field of every beat.
    //
    // The number of beats consumed is
    //     ceil(rows * cols * COLOR_T * CH_WIDTH / PTR_WIDTH).
    // TRIPCOUNT is only used as the LOOP_TRIPCOUNT upper bound; ROWS, COLS and
    // NPC are not referenced in the body.
    template <int PTR_WIDTH, int ROWS, int COLS, int NPC, int COLOR_T, int CH_WIDTH, int TRIPCOUNT>
    void axiStrm2hlsStrm(hls::stream<ap_axiu<PTR_WIDTH, 0, 0, 0> >& srcPtr,
                         hls::stream<ap_uint<PTR_WIDTH> >& dstStrm,
                         int rows,
                         int cols) {
        const int bits_per_pixel = COLOR_T * CH_WIDTH;
        const int total_bits = rows * cols * bits_per_pixel;
        // Round up so a trailing partial word is still transferred.
        const int beat_count = (total_bits + PTR_WIDTH - 1) / PTR_WIDTH;

        for (int b = 0; b < beat_count; b++) {
// clang-format off
            #pragma HLS LOOP_TRIPCOUNT min=1 max=TRIPCOUNT
            #pragma HLS PIPELINE
            // clang-format on
            ap_axiu<PTR_WIDTH, 0, 0, 0> beat = srcPtr.read();
            // Only the payload is kept; the other fields of the beat are dropped.
            dstStrm.write(beat.data);
        }
    }

    // Loads an image arriving on an AXI stream (`srcPtr`) into `dstMat`.
    //
    // Implemented as a two-stage DATAFLOW pipeline: axiStrm2hlsStrm strips the
    // AXI beats down to their data words and pushes them into a local stream,
    // then hlsStrm2xfMat unpacks that stream into `dstMat`.
    // The assert checks that PTR_WIDTH is at least
    // XF_WORDDEPTH(XF_WORDWIDTH(MAT_T, NPC)).
    template <int PTR_WIDTH, int MAT_T, int ROWS, int COLS, int NPC>
    void axiStrm2xfMat(hls::stream<ap_axiu<PTR_WIDTH, 0, 0, 0> >& srcPtr, xf::cv::Mat<MAT_T, ROWS, COLS, NPC>& dstMat) {
// clang-format off
        #pragma HLS DATAFLOW
        // clang-format on
        assert((PTR_WIDTH >= XF_WORDDEPTH(XF_WORDWIDTH(MAT_T, NPC))) &&
               "The PTR_WIDTH must be always greater than or equal to the minimum "
               "width for the corresponding "
               "configuration");
        const int ch_width = XF_DTPIXELDEPTH(MAT_T, NPC);

        // Intermediate stream connecting the two dataflow stages.
        hls::stream<ap_uint<PTR_WIDTH> > strm;
        int rows = dstMat.rows;
        int cols = dstMat.cols;
        // Column count rounded up to a multiple of (1 << XF_BITSHIFT(NPC));
        // presumably that equals NPC - TODO confirm against XF_BITSHIFT.
        int dstMat_cols_align_npc = ((dstMat.cols + (NPC - 1)) >> XF_BITSHIFT(NPC)) << XF_BITSHIFT(NPC);
        // The last template argument of each stage is its LOOP_TRIPCOUNT bound.
        axiStrm2hlsStrm<PTR_WIDTH, ROWS, COLS, NPC, XF_CHANNELS(MAT_T, NPC), ch_width,
                        ((ROWS * COLS * XF_CHANNELS(MAT_T, NPC) * ch_width) / PTR_WIDTH)>(srcPtr, strm, rows, cols);
        hlsStrm2xfMat<PTR_WIDTH, MAT_T, ROWS, COLS, NPC, (ROWS * COLS) / NPC>(strm, dstMat, dstMat_cols_align_npc);
    }

    // ==============================================================================
    // Write module(s) to handle data transfer from xfMat to AXI/HLS stream
    // ------------------------------------------------------------------------------

    // Packs the elements of `srcMat` into a dense stream of PTR_WIDTH-bit words.
    //
    // Parameters:
    //   srcMat                - source matrix; element i is fetched with
    //                           srcMat.read(i).
    //   dstStrm               - receives one word each time `r` fills up, plus a
    //                           final partially filled word if bits remain.
    //   srcMat_cols_align_npc - srcMat.cols as aligned by the caller; it sets
    //                           the number of elements per row (ncpr) and the
    //                           padding of the last element in each row.
    //
    // Every element contributes K_size bits: N_size bits normally, last_N_size
    // bits for the final element of each row (the `pad` pixels are dropped).
    // `r` is the word being assembled and `bits_to_add` is the number of its
    // upper bits that are still free. TRIPCOUNT is only used as the
    // LOOP_TRIPCOUNT upper bound.
    //
    // `r` starts at zero so the first word never carries indeterminate bits;
    // the unused upper part of a later partial word may still hold bits from the
    // previously emitted word, because `r` is not cleared between words.
    template <int PTR_WIDTH, int MAT_T, int ROWS, int COLS, int NPC, int TRIPCOUNT>
    void xfMat2hlsStrm(xf::cv::Mat<MAT_T, ROWS, COLS, NPC>& srcMat,
                       hls::stream<ap_uint<PTR_WIDTH> >& dstStrm,
                       int srcMat_cols_align_npc) {
        int rows = srcMat.rows;
        int cols = srcMat.cols;
        int loop_count = (rows * srcMat_cols_align_npc) / XF_NPIXPERCYCLE(NPC);
        int pad = srcMat_cols_align_npc - cols;

        int bits_to_add = PTR_WIDTH;
        const int N_size = XF_PIXELWIDTH(MAT_T, NPC) * XF_NPIXPERCYCLE(NPC);
        const int last_N_size = XF_PIXELWIDTH(MAT_T, NPC) * (XF_NPIXPERCYCLE(NPC) - pad);
        const int PTR_WIDTH_min_N = PTR_WIDTH - N_size;
        const int PTR_WIDTH_min_last_N = PTR_WIDTH - last_N_size;
        const int PTR_WIDTH_plus_N = PTR_WIDTH + N_size;
        const int PTR_WIDTH_plus_last_N = PTR_WIDTH + last_N_size;

        ap_uint<PTR_WIDTH> r = 0;
        XF_TNAME(MAT_T, NPC) in;
        int ncpr = srcMat_cols_align_npc / XF_NPIXPERCYCLE(NPC); // number of clock per row
        int clk_cnt = 0;                                         // clock counter. reset at the start of every row

    L1:
        for (int i = 0; i < loop_count; i++) {
// clang-format off
            #pragma HLS LOOP_TRIPCOUNT min=1 max=TRIPCOUNT
            #pragma HLS PIPELINE
            // clang-format on
            // Bit count for this element and the matching precomputed offsets.
            int K_size;
            int PTR_WIDTH_min_Ksize;
            int PTR_WIDTH_plus_Ksize;
            if (clk_cnt == ncpr - 1) {
                // Last element of the row: only the non-padding pixels are kept.
                clk_cnt = 0;
                K_size = last_N_size;
                PTR_WIDTH_min_Ksize = PTR_WIDTH_min_last_N;
                PTR_WIDTH_plus_Ksize = PTR_WIDTH_plus_last_N;
            } else {
                clk_cnt++;
                K_size = N_size;
                PTR_WIDTH_min_Ksize = PTR_WIDTH_min_N;
                PTR_WIDTH_plus_Ksize = PTR_WIDTH_plus_N;
            }

            in = srcMat.read(i);

            if (bits_to_add <= K_size) {
                // The element fills (or overflows) the current word: complete
                // the word, emit it, and carry any overflow into the next one.
                r.range(PTR_WIDTH - 1, PTR_WIDTH - bits_to_add) = in.range(bits_to_add - 1, 0);
                dstStrm.write(r);

                if (bits_to_add != K_size) {
                    r.range(K_size - bits_to_add - 1, 0) = in.range(K_size - 1, bits_to_add);
                }
                bits_to_add = PTR_WIDTH_min_Ksize + bits_to_add;
            } else {
                // The element fits with room to spare: append it above the bits
                // already stored in the word.
                r.range(PTR_WIDTH_plus_Ksize - bits_to_add - 1, PTR_WIDTH - bits_to_add) = in;
                bits_to_add -= K_size;
            }
        }

        // Flush a trailing, partially filled word.
        if (bits_to_add != PTR_WIDTH) {
            dstStrm.write(r);
        }
    }

    // Drains an HLS stream into memory, storing one PTR_WIDTH-bit word per loop
    // iteration at consecutive indices of `dstPtr`.
    //
    // The number of words transferred is
    //     ceil(rows * cols * COLOR_T * CH_WIDTH / PTR_WIDTH).
    // TRIPCOUNT is only used as the LOOP_TRIPCOUNT upper bound; ROWS, COLS and
    // NPC are not referenced in the body.
    template <int PTR_WIDTH, int ROWS, int COLS, int NPC, int COLOR_T, int CH_WIDTH, int TRIPCOUNT>
    void hlsStrm2Array(hls::stream<ap_uint<PTR_WIDTH> >& srcStrm, ap_uint<PTR_WIDTH>* dstPtr, int rows, int cols) {
        const int bits_per_pixel = COLOR_T * CH_WIDTH;
        const int total_bits = rows * cols * bits_per_pixel;
        // Round up so a trailing partial word is still transferred.
        const int word_count = (total_bits + PTR_WIDTH - 1) / PTR_WIDTH;

        for (int w = 0; w < word_count; w++) {
// clang-format off
            #pragma HLS LOOP_TRIPCOUNT min=1 max=TRIPCOUNT
            #pragma HLS PIPELINE
            // clang-format on
            dstPtr[w] = srcStrm.read();
        }
    }

    // Stores the contents of `srcMat` to memory at `dstPtr`.
    //
    // Unless `__XF_USE_OLD_IMPL__` is defined, the work is delegated to
    // MMIterOut<...>::xfMat2Array, which also receives FILLZERO and `stride`.
    // With `__XF_USE_OLD_IMPL__` defined, a two-stage DATAFLOW pipeline is used:
    // xfMat2hlsStrm packs the matrix into a local stream and hlsStrm2Array
    // writes that stream to memory; FILLZERO and `stride` are not used on this
    // path.
    template <int PTR_WIDTH, int MAT_T, int ROWS, int COLS, int NPC, int FILLZERO = 1>
    void xfMat2Array(xf::cv::Mat<MAT_T, ROWS, COLS, NPC>& srcMat, ap_uint<PTR_WIDTH>* dstPtr, int stride = -1) {
#if !defined(__XF_USE_OLD_IMPL__)
        MMIterOut<PTR_WIDTH, MAT_T, ROWS, COLS, NPC, FILLZERO>::xfMat2Array(srcMat, dstPtr, stride);
#else
// clang-format off
        #pragma HLS DATAFLOW
        // clang-format on
        assert((PTR_WIDTH >= XF_WORDDEPTH(XF_WORDWIDTH(MAT_T, NPC))) &&
               "The PTR_WIDTH must be always greater than or equal to the minimum "
               "width for the corresponding "
               "configuration");
        const int ch_width = XF_DTPIXELDEPTH(MAT_T, NPC);

        // Intermediate stream connecting the two dataflow stages.
        hls::stream<ap_uint<PTR_WIDTH> > strm;
        int rows = srcMat.rows;
        int cols = srcMat.cols;
        // Column count rounded up to a multiple of (1 << XF_BITSHIFT(NPC));
        // presumably that equals NPC - TODO confirm against XF_BITSHIFT.
        int srcMat_cols_align_npc = ((srcMat.cols + (NPC - 1)) >> XF_BITSHIFT(NPC)) << XF_BITSHIFT(NPC);

        // The last template argument of each stage is its LOOP_TRIPCOUNT bound.
        xfMat2hlsStrm<PTR_WIDTH, MAT_T, ROWS, COLS, NPC, ROWS*((COLS + NPC - 1) / NPC)>(srcMat, strm,
                                                                                        srcMat_cols_align_npc);
        hlsStrm2Array<PTR_WIDTH, ROWS, COLS, NPC, XF_CHANNELS(MAT_T, NPC), ch_width,
                      ((ROWS * COLS * XF_CHANNELS(MAT_T, NPC) * ch_width) / PTR_WIDTH)>(strm, dstPtr, rows, cols);
#endif
    }

    // Converts a plain stream of PTR_WIDTH-bit words into an AXI stream of
    // ap_axiu beats, one beat per word.
    //
    // The number of beats produced is
    //     ceil(rows * cols * COLOR_T * CH_WIDTH / PTR_WIDTH).
    // TRIPCOUNT is only used as the LOOP_TRIPCOUNT upper bound; ROWS, COLS and
    // NPC are not referenced in the body.
    // NOTE(review): only the `data` field of each beat is assigned here; every
    // other field of the ap_axiu struct is left as default-constructed - confirm
    // that downstream consumers do not rely on them.
    template <int PTR_WIDTH, int ROWS, int COLS, int NPC, int COLOR_T, int CH_WIDTH, int TRIPCOUNT>
    void hlsStrm2axiStrm(hls::stream<ap_uint<PTR_WIDTH> >& srcStrm,
                         hls::stream<ap_axiu<PTR_WIDTH, 0, 0, 0> >& dstPtr,
                         int rows,
                         int cols) {
        const int bits_per_pixel = COLOR_T * CH_WIDTH;
        const int total_bits = rows * cols * bits_per_pixel;
        // Round up so a trailing partial word is still transferred.
        const int beat_count = (total_bits + PTR_WIDTH - 1) / PTR_WIDTH;

        for (int b = 0; b < beat_count; b++) {
// clang-format off
            #pragma HLS LOOP_TRIPCOUNT min=1 max=TRIPCOUNT
            #pragma HLS PIPELINE
            // clang-format on
            ap_axiu<PTR_WIDTH, 0, 0, 0> beat;
            beat.data = srcStrm.read();
            dstPtr.write(beat);
        }
    }

    // Sends the contents of `srcMat` out on an AXI stream (`dstPtr`).
    //
    // Implemented as a two-stage DATAFLOW pipeline: xfMat2hlsStrm packs the
    // matrix into a local stream of PTR_WIDTH-bit words, then hlsStrm2axiStrm
    // wraps each word into an ap_axiu beat and writes it to `dstPtr`.
    // The assert checks that PTR_WIDTH is at least
    // XF_WORDDEPTH(XF_WORDWIDTH(MAT_T, NPC)).
    template <int PTR_WIDTH, int MAT_T, int ROWS, int COLS, int NPC>
    void xfMat2axiStrm(xf::cv::Mat<MAT_T, ROWS, COLS, NPC>& srcMat, hls::stream<ap_axiu<PTR_WIDTH, 0, 0, 0> >& dstPtr) {
// clang-format off
        #pragma HLS DATAFLOW
        // clang-format on
        assert((PTR_WIDTH >= XF_WORDDEPTH(XF_WORDWIDTH(MAT_T, NPC))) &&
               "The PTR_WIDTH must be always greater than or equal to the minimum "
               "width for the corresponding "
               "configuration");
        const int ch_width = XF_DTPIXELDEPTH(MAT_T, NPC);

        // Intermediate stream connecting the two dataflow stages.
        hls::stream<ap_uint<PTR_WIDTH> > strm;
        int rows = srcMat.rows;
        int cols = srcMat.cols;
        // Column count rounded up to a multiple of (1 << XF_BITSHIFT(NPC));
        // presumably that equals NPC - TODO confirm against XF_BITSHIFT.
        int srcMat_cols_align_npc = ((srcMat.cols + (NPC - 1)) >> XF_BITSHIFT(NPC)) << XF_BITSHIFT(NPC);

        // The last template argument of each stage is its LOOP_TRIPCOUNT bound.
        xfMat2hlsStrm<PTR_WIDTH, MAT_T, ROWS, COLS, NPC, ROWS*((COLS + NPC - 1) / NPC)>(srcMat, strm,
                                                                                        srcMat_cols_align_npc);
        hlsStrm2axiStrm<PTR_WIDTH, ROWS, COLS, NPC, XF_CHANNELS(MAT_T, NPC), ch_width,
                        ((ROWS * COLS * XF_CHANNELS(MAT_T, NPC) * ch_width) / PTR_WIDTH)>(strm, dstPtr, rows, cols);
    }
};

// Free-function entry point that stores the contents of `srcMat` to memory at
// `dstPtr`.
//
// Unless `__XF_USE_OLD_IMPL__` is defined, it calls
// MMIterOut<...>::xfMat2Array directly, forwarding FILLZERO and `stride`.
// With `__XF_USE_OLD_IMPL__` defined, it delegates to
// accel_utils::xfMat2Array; on that path FILLZERO and `stride` are not
// forwarded.
template <int PTR_WIDTH, int MAT_T, int ROWS, int COLS, int NPC, int FILLZERO = 1>
void xfMat2Array(xf::cv::Mat<MAT_T, ROWS, COLS, NPC>& srcMat, ap_uint<PTR_WIDTH>* dstPtr, int stride = -1) {
#if !defined(__XF_USE_OLD_IMPL__)
    MMIterOut<PTR_WIDTH, MAT_T, ROWS, COLS, NPC, FILLZERO>::xfMat2Array(srcMat, dstPtr, stride);
#else
    accel_utils au;
    au.xfMat2Array<PTR_WIDTH, MAT_T, ROWS, COLS, NPC>(srcMat, dstPtr);
#endif
}

// Free-function entry point that loads an image from memory (`srcPtr`) into
// `dstMat`.
//
// Unless `__XF_USE_OLD_IMPL__` is defined, it calls
// MMIterIn<...>::Array2xfMat directly, forwarding `stride`.
// With `__XF_USE_OLD_IMPL__` defined, it delegates to
// accel_utils::Array2xfMat; on that path `stride` is not forwarded.
template <int PTR_WIDTH, int MAT_T, int ROWS, int COLS, int NPC>
void Array2xfMat(ap_uint<PTR_WIDTH>* srcPtr, xf::cv::Mat<MAT_T, ROWS, COLS, NPC>& dstMat, int stride = -1) {
#if !defined(__XF_USE_OLD_IMPL__)
    MMIterIn<PTR_WIDTH, MAT_T, ROWS, COLS, NPC>::Array2xfMat(srcPtr, dstMat, stride);
#else
    accel_utils au;
    au.Array2xfMat<PTR_WIDTH, MAT_T, ROWS, COLS, NPC>(srcPtr, dstMat);
#endif
}

// Free-function entry point that sends the contents of `srcMat` out on the AXI
// stream `dstPtr`. It creates a local accel_utils object and forwards to
// accel_utils::xfMat2axiStrm with the same template arguments.
template <int PTR_WIDTH, int MAT_T, int ROWS, int COLS, int NPC>
void xfMat2axiStrm(xf::cv::Mat<MAT_T, ROWS, COLS, NPC>& srcMat, hls::stream<ap_axiu<PTR_WIDTH, 0, 0, 0> >& dstPtr) {
    accel_utils au;
    au.xfMat2axiStrm<PTR_WIDTH, MAT_T, ROWS, COLS, NPC>(srcMat, dstPtr);
}

// Free-function entry point that loads an image arriving on the AXI stream
// `srcPtr` into `dstMat`. It creates a local accel_utils object and forwards to
// accel_utils::axiStrm2xfMat with the same template arguments.
template <int PTR_WIDTH, int MAT_T, int ROWS, int COLS, int NPC>
void axiStrm2xfMat(hls::stream<ap_axiu<PTR_WIDTH, 0, 0, 0> >& srcPtr, xf::cv::Mat<MAT_T, ROWS, COLS, NPC>& dstMat) {
    accel_utils au;
    au.axiStrm2xfMat<PTR_WIDTH, MAT_T, ROWS, COLS, NPC>(srcPtr, dstMat);
}

} // namespace cv
} // namespace xf

#endif //_XF_UTILITY_H_