Program Listing for File xf_cvt_color_1.hpp

Return to documentation for file (/tmp/ws/src/vitis_common/include/imgproc/xf_cvt_color_1.hpp)

/*
 * Copyright 2019 Xilinx, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef _XF_CVT_COLOR_1_HPP_
#define _XF_CVT_COLOR_1_HPP_

#ifndef __cplusplus
#error C++ is needed to compile this header !
#endif

#ifndef _XF_CVT_COLOR_HPP_
#error This file can not be included independently !
#endif

#include "xf_cvt_color_utils.hpp"

template <int SRC_T, int DST_T, int ROWS, int COLS, int NPC, int WORDWIDTH_SRC, int TC, int TCC>
void write_y(xf::cv::Mat<SRC_T, ROWS, COLS, NPC>& src_y,
             xf::cv::Mat<DST_T, ROWS, COLS, NPC>& out_y,
             uint16_t height,
             uint16_t width) {
    XF_SNAME(WORDWIDTH_SRC) tmp;
    unsigned long long int idx = 0;
    for (int i = 0; i < height; i++) {
// clang-format off
        #pragma HLS LOOP_TRIPCOUNT min=ROWS max=ROWS
        #pragma HLS LOOP_FLATTEN off
        // clang-format on
        for (int j = 0; j < width; j++) {
// clang-format off
            #pragma HLS LOOP_TRIPCOUNT min=TC max=TC
            // clang-format on
            tmp = src_y.read(i * width + j);
            out_y.write(idx++, tmp);
        }
    }
}
template <int SRC_T, int UV_T, int ROWS, int COLS, int NPC, int NPC_UV, int WORDWIDTH_UV, int WORDWIDTH_DST, int TC>
void KernNv122Yuv4(xf::cv::Mat<UV_T, ROWS / 2, COLS / 2, NPC_UV>& _uv,
                   xf::cv::Mat<SRC_T, ROWS, COLS, NPC>& _u,
                   xf::cv::Mat<SRC_T, ROWS, COLS, NPC>& _v,
                   uint16_t height,
                   uint16_t width) {
    XF_PTNAME(XF_16UP) uv;
    XF_SNAME(WORDWIDTH_DST) u, v;
    XF_SNAME(WORDWIDTH_UV) uvPacked;
    XF_TNAME(SRC_T, NPC) arr_u[COLS];
    XF_TNAME(SRC_T, NPC) arr_v[COLS];

    unsigned long long int idx = 0, idx1 = 0;
    ap_uint<13> i, j;
    bool evenBlock = true;
RowLoop:
    for (i = 0; i < (height >> 1); i++) {
// clang-format off
        #pragma HLS LOOP_FLATTEN off
        #pragma HLS LOOP_TRIPCOUNT min=ROWS max=ROWS
    // clang-format on
    ColLoop:
        for (j = 0; j < width; j++) {
// clang-format off
            #pragma HLS pipeline
            #pragma HLS LOOP_TRIPCOUNT min=COLS max=COLS
            // clang-format on
            if (evenBlock) {
                uv = _uv.read(idx++);
                u.range(7, 0) = (uint8_t)uv.range(7, 0);
                v.range(7, 0) = (uint8_t)uv.range(15, 8);
            }
            arr_u[j] = u;
            arr_v[j] = v;

            _u.write(((i * 2) * (_u.cols >> XF_BITSHIFT(NPC))) + j, u);
            _v.write(((i * 2) * (_v.cols >> XF_BITSHIFT(NPC))) + j, v);
            evenBlock = evenBlock ? false : true;
        }
        for (int k = 0; k < width; k++) {
            _u.write((((i * 2) + 1) * (_u.cols >> XF_BITSHIFT(NPC))) + k, arr_u[k]);
            _v.write((((i * 2) + 1) * (_v.cols >> XF_BITSHIFT(NPC))) + k, arr_v[k]);
        }
    }
}

template <int SRC_T,
          int UV_T,
          int DST_T,
          int ROWS,
          int COLS,
          int NPC,
          int NPC_UV,
          int WORDWIDTH_Y,
          int WORDWIDTH_UV,
          int WORDWIDTH_DST>
void KernNv122Rgba(xf::cv::Mat<SRC_T, ROWS, COLS, NPC>& _y,
                   xf::cv::Mat<UV_T, ROWS / 2, COLS / 2, NPC_UV>& _uv,
                   xf::cv::Mat<DST_T, ROWS, COLS, NPC>& _rgba,
                   uint16_t height,
                   uint16_t width) {
    hls::stream<XF_SNAME(WORDWIDTH_UV)> uvStream;
// clang-format off
    #pragma HLS STREAM variable=&uvStream  depth=COLS
    // clang-format on
    XF_SNAME(WORDWIDTH_Y) yPacked;
    XF_SNAME(WORDWIDTH_UV) uvPacked;
    XF_SNAME(WORDWIDTH_DST) rgba;
    unsigned long long int idx = 0, idx1 = 0;
    uint8_t y1, y2;
    int32_t V2Rtemp, U2Gtemp, V2Gtemp, U2Btemp;
    int8_t u, v;
    bool evenRow = true, evenBlock = true;
RowLoop:
    for (int i = 0; i < height; i++) {
// clang-format off
        #pragma HLS LOOP_FLATTEN off
        #pragma HLS LOOP_TRIPCOUNT min=ROWS max=ROWS
    // clang-format on
    ColLoop:
        for (int j = 0; j < width; j++) {
// clang-format off
            #pragma HLS pipeline
            #pragma HLS LOOP_TRIPCOUNT min=COLS max=COLS
            // clang-format on

            yPacked = _y.read(i * width + j);
            if (evenRow) {
                if (evenBlock) {
                    uvPacked = _uv.read(idx++);
                    uvStream.write(uvPacked);
                }
            } else { // Keep a copy of UV row data in stream to use for oddrow
                if (evenBlock) {
                    uvPacked = uvStream.read();
                }
            }
            //          auExtractPixels<NPC, WORDWIDTH_SRC,
            // XF_8UP>(UVbuf, UVPacked, 0);
            uint8_t t = yPacked.range(7, 0);
            y1 = t > 16 ? t - 16 : 0;
            v = (uint8_t)uvPacked.range(15, 8) - 128;
            u = (uint8_t)uvPacked.range(7, 0) - 128;

            V2Rtemp = v * (short int)V2R;
            U2Gtemp = (short int)U2G * u;
            V2Gtemp = (short int)V2G * v;
            U2Btemp = u * (short int)U2B;

            // R = 1.164*Y + 1.596*V = Y + 0.164*Y + V + 0.596*V
            // G = 1.164*Y - 0.813*V - 0.391*U = Y + 0.164*Y - 0.813*V - 0.391*U
            // B = 1.164*Y + 2.018*U = Y + 0.164 + 2*U + 0.018*U
            rgba.range(7, 0) = CalculateR(y1, V2Rtemp, v);        // R
            rgba.range(15, 8) = CalculateG(y1, U2Gtemp, V2Gtemp); // G
            rgba.range(23, 16) = CalculateB(y1, U2Btemp, u);      // B
            rgba.range(31, 24) = 255;                             // A

            //          PackedPixels =
            // PackRGBAPixels<WORDWIDTH_DST>(RGB);
            _rgba.write(idx1++, rgba);
            evenBlock = evenBlock ? false : true;
        }
        evenRow = evenRow ? false : true;
    }
    if (height & 1) {
        for (int i = 0; i < width; i++) {
// clang-format off
            #pragma HLS LOOP_TRIPCOUNT min=COLS max=COLS
            // clang-format on
            uvStream.read();
        }
    }
}

template <int SRC_T, int UV_T, int ROWS, int COLS, int NPC, int NPC_UV, int WORDWIDTH_SRC, int WORDWIDTH_DST, int TC>
void KernNv122Iyuv(xf::cv::Mat<UV_T, ROWS / 2, COLS / 2, NPC_UV>& _uv,
                   xf::cv::Mat<SRC_T, ROWS / 4, COLS, NPC>& _u,
                   xf::cv::Mat<SRC_T, ROWS / 4, COLS, NPC>& _v,
                   uint16_t height,
                   uint16_t width) {
    XF_PTNAME(XF_8UP) u, v;
    XF_SNAME(WORDWIDTH_SRC) uv;
    unsigned long long int idx = 0;
    ap_uint<13> i, j;
RowLoop:
    for (i = 0; i<height>> 1; i++) {
// clang-format off
        #pragma HLS LOOP_FLATTEN off
        #pragma HLS LOOP_TRIPCOUNT min=ROWS max=ROWS
    // clang-format on
    ColLoop:
        for (j = 0; j < (width >> 1); j++) {
// clang-format off
            #pragma HLS pipeline
            #pragma HLS LOOP_TRIPCOUNT min=TC max=TC
            // clang-format on
            //          u = _uv.read();
            //          v = _uv.read();
            uv = _uv.read(i * (width >> 1) + j);

            _u.write(idx, uv.range(7, 0));
            _v.write(idx++, uv.range(15, 8));
        }
    }
}

template <int SRC_T, int UV_T, int ROWS, int COLS, int NPC, int NPC_UV, int WORDWIDTH_VU, int WORDWIDTH_DST, int TC>
void KernNv212Yuv4(xf::cv::Mat<UV_T, ROWS / 2, COLS / 2, NPC_UV>& _vu,
                   xf::cv::Mat<SRC_T, ROWS, COLS, NPC>& _u,
                   xf::cv::Mat<SRC_T, ROWS, COLS, NPC>& _v,
                   uint16_t height,
                   uint16_t width) {
    XF_PTNAME(XF_16UP) uv;
    XF_SNAME(WORDWIDTH_DST) u, v;
    XF_SNAME(WORDWIDTH_VU) uvPacked;
    XF_TNAME(SRC_T, NPC) arr_u[COLS];
    XF_TNAME(SRC_T, NPC) arr_v[COLS];

    unsigned long long int idx = 0, idx1 = 0;
    ap_uint<13> i, j;
    bool evenBlock = true;
RowLoop:
    for (i = 0; i < (height >> 1); i++) {
// clang-format off
        #pragma HLS LOOP_FLATTEN off
        #pragma HLS LOOP_TRIPCOUNT min=ROWS max=ROWS
    // clang-format on
    ColLoop:
        for (j = 0; j < width; j++) {
// clang-format off
            #pragma HLS pipeline
            #pragma HLS LOOP_TRIPCOUNT min=COLS max=COLS
            // clang-format on
            if (evenBlock) {
                uv = _vu.read(idx++);
                v.range(7, 0) = (uint8_t)uv.range(7, 0);
                u.range(7, 0) = (uint8_t)uv.range(15, 8);
            }
            arr_u[j] = u;
            arr_v[j] = v;

            _u.write(((i * 2) * (_u.cols >> XF_BITSHIFT(NPC))) + j, u);
            _v.write(((i * 2) * (_v.cols >> XF_BITSHIFT(NPC))) + j, v);
            evenBlock = evenBlock ? false : true;
        }
        for (int k = 0; k < width; k++) {
            _u.write((((i * 2) + 1) * (_u.cols >> XF_BITSHIFT(NPC))) + k, arr_u[k]);
            _v.write((((i * 2) + 1) * (_v.cols >> XF_BITSHIFT(NPC))) + k, arr_v[k]);
        }
    }
}

template <int SRC_T,
          int UV_T,
          int DST_T,
          int ROWS,
          int COLS,
          int NPC,
          int NPC_UV,
          int WORDWIDTH_Y,
          int WORDWIDTH_VU,
          int WORDWIDTH_DST>
void KernNv212Rgba(xf::cv::Mat<SRC_T, ROWS, COLS, NPC>& _y,
                   xf::cv::Mat<UV_T, ROWS / 2, COLS / 2, NPC_UV>& _vu,
                   xf::cv::Mat<DST_T, ROWS, COLS, NPC>& _rgba,
                   uint16_t height,
                   uint16_t width) {
    hls::stream<XF_SNAME(WORDWIDTH_VU)> vuStream;
// clang-format off
    #pragma HLS STREAM variable=&vuStream  depth=COLS
    // clang-format on
    XF_SNAME(WORDWIDTH_Y) yPacked;
    XF_SNAME(WORDWIDTH_VU) vuPacked;
    XF_SNAME(WORDWIDTH_DST) rgba;
    unsigned long long int idx = 0, idx1 = 0;
    ap_uint<13> i, j;
    uint8_t y1, y2;
    int32_t V2Rtemp, U2Gtemp, V2Gtemp, U2Btemp;
    int8_t u, v;
    bool evenRow = true, evenBlock = true;
RowLoop:
    for (i = 0; i < (height); i++) {
// clang-format off
        #pragma HLS LOOP_FLATTEN off
        #pragma HLS LOOP_TRIPCOUNT min=ROWS max=ROWS
    // clang-format on
    ColLoop:
        for (j = 0; j < width; j++) {
// clang-format off
            #pragma HLS pipeline
            #pragma HLS LOOP_TRIPCOUNT min=COLS max=COLS
            // clang-format on

            yPacked = _y.read(i * width + j);
            //          auExtractPixels<NPC, WORDWIDTH_SRC,
            // XF_8UP>(Ybuf, YPacked, 0);
            if (evenRow) {
                if (evenBlock) {
                    vuPacked = _vu.read(idx++);
                    vuStream.write(vuPacked);
                }
            } else { // Keep a copy of UV row data in stream to use for oddrow
                if (evenBlock) {
                    vuPacked = vuStream.read();
                }
            }
            //          auExtractPixels<NPC, WORDWIDTH_SRC,
            // XF_8UP>(UVbuf, UVPacked, 0);
            uint8_t t = yPacked.range(7, 0);
            y1 = t > 16 ? t - 16 : 0;
            u = (uint8_t)vuPacked.range(15, 8) - 128;
            v = (uint8_t)vuPacked.range(7, 0) - 128;

            V2Rtemp = v * (short int)V2R;
            U2Gtemp = (short int)U2G * u;
            V2Gtemp = (short int)V2G * v;
            U2Btemp = u * (short int)U2B;

            // R = 1.164*Y + 1.596*V = Y + 0.164*Y + V + 0.596*V
            // G = 1.164*Y - 0.813*V - 0.391*U = Y + 0.164*Y - 0.813*V - 0.391*U
            // B = 1.164*Y + 2.018*U = Y + 0.164 + 2*U + 0.018*U
            rgba.range(7, 0) = CalculateR(y1, V2Rtemp, v);        // R
            rgba.range(15, 8) = CalculateG(y1, U2Gtemp, V2Gtemp); // G
            rgba.range(23, 16) = CalculateB(y1, U2Btemp, u);      // B
            rgba.range(31, 24) = 255;                             // A

            //          PackedPixels =
            // PackRGBAPixels<WORDWIDTH_DST>(RGB);
            _rgba.write(idx1++, rgba);
            evenBlock = evenBlock ? false : true;
        }
        evenRow = evenRow ? false : true;
    }
    if (height & 1) {
        for (i = 0; i < width; i++) {
// clang-format off
            #pragma HLS LOOP_TRIPCOUNT min=COLS max=COLS
            // clang-format on
            vuStream.read();
        }
    }
}

template <int SRC_T, int UV_T, int ROWS, int COLS, int NPC, int NPC_UV, int WORDWIDTH_SRC, int WORDWIDTH_DST, int TC>
void KernNv212Iyuv(xf::cv::Mat<UV_T, ROWS / 2, COLS / 2, NPC_UV>& _vu,
                   xf::cv::Mat<SRC_T, ROWS / 4, COLS, NPC>& _u,
                   xf::cv::Mat<SRC_T, ROWS / 4, COLS, NPC>& _v,
                   uint16_t height,
                   uint16_t width) {
    ap_uint<13> i, j;
    XF_PTNAME(XF_8UP) u, v;
    XF_SNAME(WORDWIDTH_SRC) VUPacked, UVPacked0, UVPacked1;
    unsigned long long int idx = 0, idx1 = 0;
RowLoop:
    for (i = 0; i < (height >> 1); i++) {
// clang-format off
        #pragma HLS LOOP_FLATTEN off
        #pragma HLS LOOP_TRIPCOUNT min=ROWS max=ROWS
    // clang-format on
    ColLoop:
        for (j = 0; j < (width >> 1); j++) {
// clang-format off
            #pragma HLS pipeline
            #pragma HLS LOOP_TRIPCOUNT min=TC max=TC
            // clang-format on
            VUPacked = _vu.read(idx++);
            u = (uint8_t)VUPacked.range(15, 8);
            v = (uint8_t)VUPacked.range(7, 0);
            _u.write(idx1, u);
            _v.write(idx1++, v);
        }
    }
}

template <int SRC_T, int DST_T, int ROWS, int COLS, int NPC, int WORDWIDTH_SRC, int WORDWIDTH_DST, int TC>
void KernIyuv2Rgba(xf::cv::Mat<SRC_T, ROWS, COLS, NPC>& _y,
                   xf::cv::Mat<SRC_T, ROWS / 4, COLS, NPC>& _u,
                   xf::cv::Mat<SRC_T, ROWS / 4, COLS, NPC>& _v,
                   xf::cv::Mat<DST_T, ROWS, COLS, NPC>& _rgba,
                   uint16_t height,
                   uint16_t width) {
    unsigned long long int idx = 0, idx1 = 0;
    ap_uint<13> i, j;
    hls::stream<XF_SNAME(WORDWIDTH_SRC)> uStream, vStream;
// clang-format off
    #pragma HLS STREAM variable=&uStream  depth=COLS
    #pragma HLS STREAM variable=&vStream  depth=COLS
    // clang-format on

    XF_SNAME(WORDWIDTH_SRC) yPacked, uPacked, vPacked;
    XF_SNAME(WORDWIDTH_DST) rgba;

    uint8_t y1, y2;
    int32_t V2Rtemp, U2Gtemp, V2Gtemp, U2Btemp;
    int8_t u, v;
    bool evenRow = true, evenBlock = true;
RowLoop:
    for (i = 0; i < height; i++) {
// clang-format off
        #pragma HLS LOOP_FLATTEN off
        #pragma HLS LOOP_TRIPCOUNT min=ROWS max=ROWS
    // clang-format on
    ColLoop:
        for (j = 0; j < width; j++) {
// clang-format off
            #pragma HLS pipeline
            #pragma HLS LOOP_TRIPCOUNT min=COLS max=COLS
            // clang-format on
            yPacked = _y.read(i * width + j);

            if (evenBlock) {
                if (evenRow) {
                    uPacked = _u.read(idx);
                    uStream.write(uPacked);
                    vPacked = _v.read(idx++);
                    vStream.write(vPacked);
                } else {
                    /* Copy of the U and V values are pushed into stream to be used for
                     * next row */
                    uPacked = uStream.read();
                    vPacked = vStream.read();
                }
            }

            y1 = (uint8_t)yPacked.range(7, 0) > 16 ? (uint8_t)yPacked.range(7, 0) - 16 : 0;

            u = (uint8_t)uPacked.range(7, 0) - 128;
            v = (uint8_t)vPacked.range(7, 0) - 128;

            V2Rtemp = v * (short int)V2R;
            U2Gtemp = (short int)U2G * u;
            V2Gtemp = (short int)V2G * v;
            U2Btemp = u * (short int)U2B;

            // R = 1.164*Y + 1.596*V = Y + 0.164*Y + V + 0.596*V
            // G = 1.164*Y - 0.813*V - 0.391*U = Y + 0.164*Y - 0.813*V - 0.391*U
            // B = 1.164*Y + 2.018*U = Y + 0.164 + 2*U + 0.018*U
            rgba.range(7, 0) = CalculateR(y1, V2Rtemp, v);        // R
            rgba.range(15, 8) = CalculateG(y1, U2Gtemp, V2Gtemp); // G
            rgba.range(23, 16) = CalculateB(y1, U2Btemp, u);      // B
            rgba.range(31, 24) = 255;                             // A

            _rgba.write(idx1++, rgba);
            evenBlock = evenBlock ? false : true;
        }
        evenRow = evenRow ? false : true;
    }
}

template <int SRC_T, int ROWS, int COLS, int NPC, int WORDWIDTH, int rTC, int cTC>
void KernIyuv2Yuv4(xf::cv::Mat<SRC_T, ROWS / 4, COLS, NPC>& _in_u,
                   xf::cv::Mat<SRC_T, ROWS / 4, COLS, NPC>& _in_v,
                   xf::cv::Mat<SRC_T, ROWS, COLS, NPC>& _u_image,
                   xf::cv::Mat<SRC_T, ROWS, COLS, NPC>& _v_image,
                   uint16_t height,
                   uint16_t width) {
    hls::stream<XF_SNAME(WORDWIDTH)> inter_u;
// clang-format off
    #pragma HLS stream variable=inter_u depth=COLS
    // clang-format on

    hls::stream<XF_SNAME(WORDWIDTH)> inter_v;
// clang-format off
    #pragma HLS stream variable=inter_v depth=COLS
    // clang-format on

    XF_TNAME(SRC_T, NPC) arr_U[COLS];
    XF_TNAME(SRC_T, NPC) arr_V[COLS];

    XF_SNAME(WORDWIDTH) IUPacked, IVPacked;
    XF_PTNAME(XF_8UP) in_u, in_v;
    unsigned long long int idx = 0, idx1 = 0, in_idx1 = 0, in_idx2 = 0;
RowLoop:
    for (int i = 0; i < ((height >> 2) << 1); i++) {
// clang-format off
        #pragma HLS LOOP_FLATTEN
        #pragma HLS LOOP_TRIPCOUNT min=rTC max=rTC
    // clang-format on
    ColLoop:
        for (int j = 0, k = 0; j < (width >> 1); j++, k += 2) {
// clang-format off
            #pragma HLS pipeline
            #pragma HLS LOOP_TRIPCOUNT min=cTC max=cTC
            // clang-format on
            IUPacked = _in_u.read(in_idx1++);
            IVPacked = _in_v.read(in_idx2++);

            _u_image.write(((i * 2) * (width)) + k, IUPacked);
            _u_image.write(((i * 2) * (width)) + k + 1, IUPacked);
            _v_image.write(((i * 2) * (width)) + k, IVPacked);
            _v_image.write(((i * 2) * (width)) + k + 1, IVPacked);

            inter_u.write(IUPacked);
            inter_v.write(IVPacked);
            inter_u.write(IUPacked);
            inter_v.write(IVPacked);
        }
        for (int j = 0; j < width; j++) {
// clang-format off
            #pragma HLS pipeline
            // clang-format on
            _u_image.write((((i * 2) + 1) * (width) + j), inter_u.read());
            _v_image.write((((i * 2) + 1) * (width) + j), inter_v.read());
        }
    }
}

template <int SRC_T,
          int UV_T,
          int ROWS,
          int COLS,
          int NPC,
          int NPC_UV,
          int WORDWIDTH_SRC,
          int WORDWIDTH_UV,
          int rTC,
          int cTC>
void KernIyuv2Nv12(xf::cv::Mat<SRC_T, ROWS / 4, COLS, NPC>& _u,
                   xf::cv::Mat<SRC_T, ROWS / 4, COLS, NPC>& _v,
                   xf::cv::Mat<UV_T, ROWS / 2, COLS / 2, NPC_UV>& _uv,
                   uint16_t height,
                   uint16_t width) {
    ap_uint<13> i, j;
    XF_SNAME(WORDWIDTH_SRC) u, v;
    XF_SNAME(WORDWIDTH_UV) uv;
    unsigned long long int idx = 0;
RowLoop:
    for (i = 0; i<height>> 1; i++) {
// Reading the plane interleaved U and V data from streams,
// packing them in pixel interleaved and writing out to UV stream
// clang-format off
        #pragma HLS LOOP_FLATTEN off
        #pragma HLS LOOP_TRIPCOUNT min=rTC max=rTC
    // clang-format on
    ColLoop:
        for (j = 0; j < (width >> 1); j++) {
// clang-format off
            #pragma HLS pipeline
            #pragma HLS LOOP_TRIPCOUNT min=cTC max=cTC
            // clang-format on
            u = _u.read(i * (width >> 1) + j);
            v = _v.read(i * (width >> 1) + j);
            uv.range(7, 0) = u;
            uv.range(15, 8) = v;
            _uv.write(idx++, uv);
        }
    }
}

template <int SRC_T, int DST_T, int ROWS, int COLS, int NPC, int WORDWIDTH_SRC, int WORDWIDTH_DST>
void KernRgba2Yuv4(xf::cv::Mat<SRC_T, ROWS, COLS, NPC>& _rgba,
                   xf::cv::Mat<DST_T, ROWS, COLS, NPC>& _y,
                   xf::cv::Mat<DST_T, ROWS, COLS, NPC>& _u,
                   xf::cv::Mat<DST_T, ROWS, COLS, NPC>& _v,
                   uint16_t height,
                   uint16_t width) {
    XF_SNAME(XF_32UW) rgba;
    uint8_t y, u, v;
    unsigned long long int idx = 0;
RowLoop:
    for (int i = 0; i < height; ++i) {
// clang-format off
        #pragma HLS LOOP_FLATTEN OFF
        #pragma HLS LOOP_TRIPCOUNT min=ROWS max=ROWS
    // clang-format on
    ColLoop:
        for (int j = 0; j < width; ++j) {
// clang-format off
            #pragma HLS LOOP_TRIPCOUNT min=COLS max=COLS
            #pragma HLS PIPELINE
            // clang-format on
            rgba = _rgba.read(i * width + j);

            y = CalculateY(rgba.range(7, 0), rgba.range(15, 8), rgba.range(23, 16));
            u = CalculateU(rgba.range(7, 0), rgba.range(15, 8), rgba.range(23, 16));
            v = CalculateV(rgba.range(7, 0), rgba.range(15, 8), rgba.range(23, 16));

            _y.write(idx, y);
            _u.write(idx, u);
            _v.write(idx++, v);
        }
    }
}

template <int SRC_T,
          int DST_T,
          int ROWS,
          int COLS,
          int NPC,
          int WORDWIDTH_SRC,
          int WORDWIDTH_DST,
          int ROWS_U,
          int ROWS_V>
void KernRgba2Iyuv(xf::cv::Mat<SRC_T, ROWS, COLS, NPC>& _rgba,
                   xf::cv::Mat<DST_T, ROWS, COLS, NPC>& _y,
                   xf::cv::Mat<DST_T, ROWS / 4, COLS, NPC>& _u,
                   xf::cv::Mat<DST_T, ROWS / 4, COLS, NPC>& _v,
                   uint16_t height,
                   uint16_t width) {
    XF_SNAME(XF_32UW) rgba;
    uint8_t y, u, v;
    bool evenRow = true, evenBlock = true;
    unsigned long long int idx = 0, idx1 = 0;
RowLoop:
    for (int i = 0; i < height; i++) {
// clang-format off
        #pragma HLS LOOP_FLATTEN off
        #pragma HLS LOOP_TRIPCOUNT min=ROWS max=ROWS
    // clang-format on
    ColLoop:
        for (int j = 0; j < width; j++) {
// clang-format off
            #pragma HLS pipeline
            #pragma HLS LOOP_TRIPCOUNT min=COLS max=COLS
            // clang-format on
            rgba = _rgba.read(i * width + j);
            uint8_t r = rgba.range(7, 0);
            uint8_t g = rgba.range(15, 8);
            uint8_t b = rgba.range(23, 16);

            y = CalculateY(r, g, b);
            if (evenRow) {
                if (evenBlock) {
                    u = CalculateU(r, g, b);
                    v = CalculateV(r, g, b);
                }
            }
            _y.write(idx1++, y);
            if (evenRow & !evenBlock) {
                _u.write(idx, u);
                _v.write(idx++, v);
            }
            evenBlock = evenBlock ? false : true;
        }
        evenRow = evenRow ? false : true;
    }
}

template <int SRC_T,
          int Y_T,
          int UV_T,
          int ROWS,
          int COLS,
          int NPC,
          int NPC_UV,
          int WORDWIDTH_SRC,
          int WORDWIDTH_Y,
          int WORDWIDTH_UV>
void KernRgba2Nv12(xf::cv::Mat<SRC_T, ROWS, COLS, NPC>& _rgba,
                   xf::cv::Mat<Y_T, ROWS, COLS, NPC>& _y,
                   xf::cv::Mat<UV_T, ROWS / 2, COLS / 2, NPC_UV>& _uv,
                   uint16_t height,
                   uint16_t width) {
    //  XF_SNAME(XF_32UW) rgba;
    XF_TNAME(SRC_T, NPC) rgba;
    ap_uint<16> val1;
    uint8_t y, u, v;
    unsigned long long int idx = 0, idx1 = 0;
    bool evenRow = true, evenBlock = true;

RowLoop:
    for (int i = 0; i < height; i++) {
// clang-format off
        #pragma HLS LOOP_FLATTEN off
        #pragma HLS LOOP_TRIPCOUNT min=ROWS max=ROWS
    // clang-format on
    ColLoop:
        for (int j = 0; j < width; j++) {
// clang-format off
            #pragma HLS pipeline
            #pragma HLS LOOP_TRIPCOUNT min=COLS max=COLS
            // clang-format on
            rgba = _rgba.read(i * width + j);
            uint8_t r = rgba.range(7, 0);
            uint8_t g = rgba.range(15, 8);
            uint8_t b = rgba.range(23, 16);

            y = CalculateY(r, g, b);
            if (evenRow) {
                u = CalculateU(r, g, b);
                v = CalculateV(r, g, b);
            }
            _y.write(idx++, y);
            if (evenRow) {
                if ((j & 0x01) == 0)
                    //{
                    _uv.write(idx1++, u | (uint16_t)v << 8);
                //_uv.write(v);
                //}
                //  _uv.write(u | (uint16_t)v << 8);
            }
        }
        evenRow = evenRow ? false : true;
    }
}

template <int SRC_T,
          int Y_T,
          int UV_T,
          int ROWS,
          int COLS,
          int NPC,
          int NPC_UV,
          int WORDWIDTH_SRC,
          int WORDWIDTH_Y,
          int WORDWIDTH_VU>
void KernRgba2Nv21(xf::cv::Mat<SRC_T, ROWS, COLS, NPC>& _rgba,
                   xf::cv::Mat<Y_T, ROWS, COLS, NPC>& _y,
                   xf::cv::Mat<UV_T, ROWS / 2, COLS / 2, NPC_UV>& _vu,
                   uint16_t height,
                   uint16_t width) {
    width = width >> XF_BITSHIFT(NPC);
    XF_TNAME(SRC_T, NPC) rgba;
    uint8_t y, u, v;
    unsigned long long int idx = 0, idx1 = 0;
    bool evenRow = true, evenBlock = true;

RowLoop:
    for (int i = 0; i < height; i++) {
// clang-format off
        #pragma HLS LOOP_FLATTEN off
        #pragma HLS LOOP_TRIPCOUNT min=ROWS max=ROWS
    // clang-format on
    ColLoop:
        for (int j = 0; j < width; j++) {
// clang-format off
            #pragma HLS pipeline
            #pragma HLS LOOP_TRIPCOUNT min=COLS max=COLS
            // clang-format on
            rgba = _rgba.read(i * width + j);
            uint8_t r = rgba.range(7, 0);
            uint8_t g = rgba.range(15, 8);
            uint8_t b = rgba.range(23, 16);

            y = CalculateY(r, g, b);
            if (evenRow) {
                u = CalculateU(r, g, b);
                v = CalculateV(r, g, b);
            }
            _y.write(idx++, y);
            if (evenRow) {
                if ((j & 0x01) == 0) _vu.write(idx1++, v | ((uint16_t)u << 8));
            }
        }
        evenRow = evenRow ? false : true;
    }
}

// Yuyv2Rgba
template <int SRC_T, int DST_T, int ROWS, int COLS, int NPC, int WORDWIDTH_SRC, int WORDWIDTH_DST, int TC>
void KernYuyv2Rgba(xf::cv::Mat<SRC_T, ROWS, COLS, NPC>& _yuyv,
                   xf::cv::Mat<DST_T, ROWS, COLS, NPC>& _rgba,
                   uint16_t height,
                   uint16_t width) {
    XF_SNAME(WORDWIDTH_DST) rgba;
    XF_SNAME(WORDWIDTH_SRC) yu, yv;
    XF_PTNAME(XF_8UP) r, g, b;
    int8_t y1, y2, u, v;
    int32_t V2Rtemp, U2Gtemp, V2Gtemp, U2Btemp;
    unsigned long long int idx = 0;
RowLoop:
    for (int i = 0; i < height; i++) {
// clang-format off
        #pragma HLS LOOP_TRIPCOUNT min=ROWS max=ROWS
        #pragma HLS LOOP_FLATTEN off
    // clang-format on
    ColLoop:
        for (int j = 0; j < width; j += 2) {
// clang-format off
            #pragma HLS LOOP_TRIPCOUNT min=TC max=TC
            #pragma HLS pipeline
            // clang-format on

            yu = _yuyv.read(i * width + j);
            yv = _yuyv.read(i * width + j + 1);
            u = (uint8_t)yu.range(15, 8) - 128;
            y1 = (yu.range(7, 0) > 16) ? ((uint8_t)yu.range(7, 0) - 16) : 0;

            v = (uint8_t)yv.range(15, 8) - 128;
            y2 = (yv.range(7, 0) > 16) ? ((uint8_t)yv.range(7, 0) - 16) : 0;

            V2Rtemp = v * (short int)V2R;
            U2Gtemp = (short int)U2G * u;
            V2Gtemp = (short int)V2G * v;
            U2Btemp = u * (short int)U2B;

            r = CalculateR(y1, V2Rtemp, v);
            g = CalculateG(y1, U2Gtemp, V2Gtemp);
            b = CalculateB(y1, U2Btemp, u);

            rgba = ((ap_uint32_t)r) | ((ap_uint32_t)g << 8) | ((ap_uint32_t)b << 16) | (0xFF000000);
            _rgba.write(idx++, rgba);

            r = CalculateR(y2, V2Rtemp, v);
            g = CalculateG(y2, U2Gtemp, V2Gtemp);
            b = CalculateB(y2, U2Btemp, u);

            rgba = ((ap_uint32_t)r) | ((ap_uint32_t)g << 8) | ((ap_uint32_t)b << 16) | (0xFF000000);
            _rgba.write(idx++, rgba);
        }
    }
}

// Yuyv2Nv12
template <int SRC_T,
          int Y_T,
          int UV_T,
          int ROWS,
          int COLS,
          int NPC,
          int NPC_UV,
          int WORDWIDTH_SRC,
          int WORDWIDTH_Y,
          int WORDWIDTH_UV,
          int TC>
void KernYuyv2Nv12(xf::cv::Mat<SRC_T, ROWS, COLS, NPC>& _yuyv,
                   xf::cv::Mat<Y_T, ROWS, COLS, NPC>& _y,
                   xf::cv::Mat<UV_T, ROWS / 2, COLS / 2, NPC_UV>& _uv,
                   uint16_t height,
                   uint16_t width) {
    XF_SNAME(WORDWIDTH_SRC) yu, yv;
    XF_PTNAME(XF_8UP) y1, y2;
    unsigned long long int idx = 0, idx1 = 0;
    XF_SNAME(WORDWIDTH_UV) uv;
    bool evenRow = true;
RowLoop:
    for (int i = 0; i < height; i++) {
// clang-format off
        #pragma HLS LOOP_FLATTEN off
        #pragma HLS LOOP_TRIPCOUNT min=ROWS max=ROWS
    // clang-format on
    ColLoop:
        for (int j = 0; j < width; j += 2) {
// clang-format off
            #pragma HLS pipeline
            #pragma HLS LOOP_TRIPCOUNT min=TC max=TC
            // clang-format on

            yu = _yuyv.read(i * width + j);
            yv = _yuyv.read(i * width + j + 1);
            y1 = yu.range(7, 0);
            if (evenRow) uv.range(7, 0) = yu.range(15, 8);

            y2 = yv.range(7, 0);
            if (evenRow) uv.range(15, 8) = yv.range(15, 8);

            _y.write(idx++, y1);
            _y.write(idx++, y2);
            if (evenRow) {
                _uv.write(idx1++, uv);
            }
        }
        evenRow = evenRow ? false : true;
    }
}
// Yuyv2Nv12
template <int SRC_T,
          int Y_T,
          int UV_T,
          int ROWS,
          int COLS,
          int NPC,
          int NPC_UV,
          int WORDWIDTH_SRC,
          int WORDWIDTH_Y,
          int WORDWIDTH_UV,
          int TC>
void KernYuyv2Nv21(xf::cv::Mat<SRC_T, ROWS, COLS, NPC>& _yuyv,
                   xf::cv::Mat<Y_T, ROWS, COLS, NPC>& _y,
                   xf::cv::Mat<UV_T, ROWS / 2, COLS / 2, NPC_UV>& _uv,
                   uint16_t height,
                   uint16_t width) {
    XF_SNAME(WORDWIDTH_SRC) yu, yv;
    XF_PTNAME(XF_8UP) y1, y2;
    unsigned long long int idx = 0, idx1 = 0;
    XF_SNAME(WORDWIDTH_UV) uv;
    bool evenRow = true;
RowLoop:
    for (int i = 0; i < height; i++) {
// clang-format off
        #pragma HLS LOOP_FLATTEN off
        #pragma HLS LOOP_TRIPCOUNT min=ROWS max=ROWS
    // clang-format on
    ColLoop:
        for (int j = 0; j < width; j += 2) {
// clang-format off
            #pragma HLS pipeline
            #pragma HLS LOOP_TRIPCOUNT min=TC max=TC
            // clang-format on

            yu = _yuyv.read(i * width + j);
            yv = _yuyv.read(i * width + j + 1);
            y1 = yu.range(7, 0);
            if (evenRow) uv.range(7, 0) = yv.range(15, 8);

            y2 = yv.range(7, 0);
            if (evenRow) uv.range(15, 8) = yu.range(15, 8);

            _y.write(idx++, y1);
            _y.write(idx++, y2);
            if (evenRow) {
                _uv.write(idx1++, uv);
            }
        }
        evenRow = evenRow ? false : true;
    }
}

// Yuyv2Iyuv
template <int SRC_T, int DST_T, int ROWS, int COLS, int NPC, int WORDWIDTH_SRC, int WORDWIDTH_DST, int TC>
void KernYuyv2Iyuv(xf::cv::Mat<SRC_T, ROWS, COLS, NPC>& _yuyv,
                   xf::cv::Mat<DST_T, ROWS, COLS, NPC>& _y,
                   xf::cv::Mat<DST_T, ROWS / 4, COLS, NPC>& _u,
                   xf::cv::Mat<DST_T, ROWS / 4, COLS, NPC>& _v,
                   uint16_t height,
                   uint16_t width) {
    XF_SNAME(WORDWIDTH_SRC) yu, yv;
    unsigned long long int idx = 0, idx1 = 0;
    bool evenRow = true, evenBlock = true;
    XF_PTNAME(XF_8UP) y1, y2, u, v;

RowLoop:
    for (int i = 0; i < height; i++) {
// clang-format off
        #pragma HLS LOOP_FLATTEN off
        #pragma HLS LOOP_TRIPCOUNT min=ROWS max=ROWS
    // clang-format on
    ColLoop:
        for (int j = 0; j < width; j += 2) {
// clang-format off
            #pragma HLS pipeline
            #pragma HLS LOOP_TRIPCOUNT min=TC max=TC
            // clang-format on

            yu = _yuyv.read(i * width + j);
            yv = _yuyv.read(i * width + j + 1);
            y1 = yu.range(7, 0);
            y2 = yv.range(7, 0);
            _y.write(idx, y1);
            idx++;
            _y.write(idx, y2);
            idx++;
            if (evenRow) u = yu.range(15, 8);

            if (evenRow) v = yv.range(15, 8);

            if (evenRow) {
                _u.write(idx1, u);
                _v.write(idx1, v);
                idx1++;
            }
        }
        evenRow = evenRow ? false : true;
    }
}

template <int SRC_T, int DST_T, int ROWS, int COLS, int NPC, int WORDWIDTH_SRC, int WORDWIDTH_DST, int TC>
void KernUyvy2Iyuv(xf::cv::Mat<SRC_T, ROWS, COLS, NPC>& _uyvy,
                   xf::cv::Mat<DST_T, ROWS, COLS, NPC>& y_plane,
                   xf::cv::Mat<DST_T, ROWS / 4, COLS, NPC>& u_plane,
                   xf::cv::Mat<DST_T, ROWS / 4, COLS, NPC>& v_plane,
                   uint16_t height,
                   uint16_t width) {
    XF_SNAME(WORDWIDTH_SRC) uy, vy;
    bool evenRow = true, evenBlock = true;
    XF_PTNAME(XF_8UP) y1, y2, u, v;
    unsigned long long int idx = 0, idx1 = 0;

RowLoop:
    for (int i = 0; i < height; i++) {
// clang-format off
        #pragma HLS LOOP_FLATTEN off
        #pragma HLS LOOP_TRIPCOUNT min=ROWS max=ROWS
    // clang-format on
    ColLoop:
        for (int j = 0; j < width; j += 2) {
// clang-format off
            #pragma HLS pipeline
            #pragma HLS LOOP_TRIPCOUNT min=TC max=TC
            // clang-format on

            uy = _uyvy.read(i * width + j);
            vy = _uyvy.read(i * width + j + 1);

            y1 = uy.range(15, 8);

            y_plane.write(idx1, y1);
            idx1++;
            if (evenRow) u = uy.range(7, 0);

            y2 = vy.range(15, 8);

            y_plane.write(idx1, y2);
            idx1++;
            if (evenRow) v = vy.range(7, 0);

            if (evenRow) {
                u_plane.write(idx, u);
                v_plane.write(idx, v);
                idx++;
            }
        }

        evenRow = evenRow ? false : true;
    }
}

// Uyvy2Nv12
template <int SRC_T,
          int Y_T,
          int UV_T,
          int ROWS,
          int COLS,
          int NPC,
          int NPC_UV,
          int WORDWIDTH_SRC,
          int WORDWIDTH_Y,
          int WORDWIDTH_UV,
          int TC>
void KernUyvy2Nv12(xf::cv::Mat<SRC_T, ROWS, COLS, NPC>& uyvy,
                   xf::cv::Mat<Y_T, ROWS, COLS, NPC>& y_plane,
                   xf::cv::Mat<UV_T, ROWS / 2, COLS / 2, NPC_UV>& uv_plane,
                   uint16_t height,
                   uint16_t width) {
    XF_SNAME(WORDWIDTH_SRC) uy, vy;
    XF_PTNAME(XF_8UP) y1, y2;
    XF_SNAME(WORDWIDTH_UV) uv;
    bool evenRow = true;
    unsigned long long int idx = 0, idx1 = 0;
RowLoop:
    for (int i = 0; i < height; i++) {
// clang-format off
        #pragma HLS LOOP_FLATTEN off
        #pragma HLS LOOP_TRIPCOUNT min=ROWS max=ROWS
    // clang-format on
    ColLoop:
        for (int j = 0; j < width; j += 2) {
// clang-format off
            #pragma HLS pipeline
            #pragma HLS LOOP_TRIPCOUNT min=TC max=TC
            // clang-format on

            uy = uyvy.read(i * width + j);
            vy = uyvy.read(i * width + j + 1);

            y1 = uy.range(15, 8);
            if (evenRow) uv.range(7, 0) = uy.range(7, 0);

            y2 = vy.range(15, 8);
            if (evenRow) uv.range(15, 8) = vy.range(7, 0);

            y_plane.write(idx1, y1);
            idx1++;
            y_plane.write(idx1, y2);
            idx1++;
            if (evenRow) {
                uv_plane.write(idx, uv);
                idx++;
            }
        }
        evenRow = evenRow ? false : true;
    }
}
// Uyvy2Nv12
template <int SRC_T,
          int Y_T,
          int UV_T,
          int ROWS,
          int COLS,
          int NPC,
          int NPC_UV,
          int WORDWIDTH_SRC,
          int WORDWIDTH_Y,
          int WORDWIDTH_UV,
          int TC>
void KernUyvy2Nv21(xf::cv::Mat<SRC_T, ROWS, COLS, NPC>& uyvy,
                   xf::cv::Mat<Y_T, ROWS, COLS, NPC>& y_plane,
                   xf::cv::Mat<UV_T, ROWS / 2, COLS / 2, NPC_UV>& uv_plane,
                   uint16_t height,
                   uint16_t width) {
    XF_SNAME(WORDWIDTH_SRC) uy, vy;
    XF_PTNAME(XF_8UP) y1, y2;
    XF_SNAME(WORDWIDTH_UV) uv;
    bool evenRow = true;
    unsigned long long int idx = 0, idx1 = 0;
RowLoop:
    for (int i = 0; i < height; i++) {
// clang-format off
        #pragma HLS LOOP_FLATTEN off
        #pragma HLS LOOP_TRIPCOUNT min=ROWS max=ROWS
    // clang-format on
    ColLoop:
        for (int j = 0; j < width; j += 2) {
// clang-format off
            #pragma HLS pipeline
            #pragma HLS LOOP_TRIPCOUNT min=TC max=TC
            // clang-format on

            uy = uyvy.read(i * width + j);
            vy = uyvy.read(i * width + j + 1);

            y1 = uy.range(15, 8);
            if (evenRow) uv.range(7, 0) = vy.range(7, 0);

            y2 = vy.range(15, 8);
            if (evenRow) uv.range(15, 8) = uy.range(7, 0);

            y_plane.write(idx1, y1);
            idx1++;
            y_plane.write(idx1, y2);
            idx1++;
            if (evenRow) {
                uv_plane.write(idx, uv);
                idx++;
            }
        }
        evenRow = evenRow ? false : true;
    }
}
// Uyvy2Rgba
template <int SRC_T, int DST_T, int ROWS, int COLS, int NPC, int WORDWIDTH_SRC, int WORDWIDTH_DST, int TC>
void KernUyvy2Rgba(xf::cv::Mat<SRC_T, ROWS, COLS, NPC>& _uyvy,
                   xf::cv::Mat<DST_T, ROWS, COLS, NPC>& _rgba,
                   uint16_t height,
                   uint16_t width) {
    XF_SNAME(WORDWIDTH_DST) rgba;

    XF_SNAME(WORDWIDTH_SRC) uyvy;

    XF_SNAME(WORDWIDTH_SRC) uy;
    XF_SNAME(WORDWIDTH_SRC) vy;
    unsigned long long int idx = 0;
    XF_PTNAME(XF_8UP) r, g, b;
    int8_t y1, y2, u, v;
    int32_t V2Rtemp, U2Gtemp, V2Gtemp, U2Btemp;

RowLoop:
    for (int i = 0; i < height; i++) {
// clang-format off
        #pragma HLS LOOP_TRIPCOUNT min=ROWS max=ROWS
        #pragma HLS LOOP_FLATTEN off
    // clang-format on
    ColLoop:
        for (int j = 0; j < width; j += 2) {
// clang-format off
            #pragma HLS LOOP_TRIPCOUNT min=TC max=TC
            #pragma HLS pipeline
            // clang-format on

            uy = _uyvy.read(i * width + j);
            vy = _uyvy.read(i * width + j + 1);

            u = (uint8_t)uy.range(7, 0) - 128;

            /*          if(uyvy.range(15,8) > 16)
                                            y1 = (uint8_t)uyvy.range(15,8) - 16;
                                    else
                                            y1 = 0;*/

            y1 = (uy.range(15, 8) > 16) ? ((uint8_t)uy.range(15, 8) - 16) : 0;

            v = (uint8_t)vy.range(7, 0) - 128;

            /*          if(uyvy.range(31,24) > 16)
                                            y2 = ((uint8_t)uyvy.range(31,24) - 16);
                                    else
                                            y2 = 0;*/
            y2 = (vy.range(15, 8) > 16) ? ((uint8_t)vy.range(15, 8) - 16) : 0;

            V2Rtemp = v * (short int)V2R;
            U2Gtemp = (short int)U2G * u;
            V2Gtemp = (short int)V2G * v;
            U2Btemp = u * (short int)U2B;

            r = CalculateR(y1, V2Rtemp, v);
            g = CalculateG(y1, U2Gtemp, V2Gtemp);
            b = CalculateB(y1, U2Btemp, u);

            rgba = ((ap_uint32_t)r) | ((ap_uint32_t)g << 8) | ((ap_uint32_t)b << 16) | (0xFF000000);
            _rgba.write(idx, rgba);
            idx++;

            r = CalculateR(y2, V2Rtemp, v);
            g = CalculateG(y2, U2Gtemp, V2Gtemp);
            b = CalculateB(y2, U2Btemp, u);

            rgba = ((ap_uint32_t)r) | ((ap_uint32_t)g << 8) | ((ap_uint32_t)b << 16) | (0xFF000000);
            _rgba.write(idx, rgba);
            idx++;
        }
    }
}
#endif