Program Listing for File xf_resize_nn_bilinear.hpp

Return to documentation for file (/tmp/ws/src/vitis_common/include/imgproc/xf_resize_nn_bilinear.hpp)

/*
 * Copyright 2019 Xilinx, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef _XF_RESIZE_NN_BILINEAR_
#define _XF_RESIZE_NN_BILINEAR_

#include "hls_stream.h"
#include "ap_int.h"
#include "../common/xf_common.hpp"
#include "../common/xf_utility.hpp"
#ifndef __SYNTHESIS__
#include <iostream>
#endif

/**
 * Produces one output channel value from four neighbouring input channel values.
 *
 * For XF_INTERPOLATION_NN the result is simply A0. For any other interpolation
 * type the bilinear blend is evaluated in the factored form
 *
 *     pixel = A0 + (B0 - A0) * Wy + (A1 - A0) * Wx + (A0 + B1 - B0 - A1) * Wx * Wy
 *
 * @tparam DEPTH              pixel type selector forwarded to XF_CTUNAME.
 * @tparam INTERPOLATION_TYPE XF_INTERPOLATION_NN selects the pass-through path;
 *                            every other value selects the bilinear path.
 * @tparam NPPC               pixels-per-clock selector forwarded to XF_CTUNAME.
 *
 * @param A0    base sample; returned unchanged in nearest-neighbour mode.
 * @param B0    sample blended in with weight Wy.
 * @param A1    sample blended in with weight Wx.
 * @param B1    sample that only contributes through the Wx * Wy cross term.
 * @param Wx    weight applied along the A0 -> A1 direction. The parameter type is
 *              ap_ufixed<12, 2>; computeOutputPixel in this file passes a wider
 *              ap_ufixed, which is converted to this type at the call.
 * @param Wy    weight applied along the A0 -> B0 direction (same type as Wx).
 * @param pixel [out] interpolated channel value.
 */
template <int DEPTH, int INTERPOLATION_TYPE, int NPPC>
void interpolatePixel(XF_CTUNAME(DEPTH, NPPC) A0,
                      XF_CTUNAME(DEPTH, NPPC) B0,
                      XF_CTUNAME(DEPTH, NPPC) A1,
                      XF_CTUNAME(DEPTH, NPPC) B1,
                      ap_ufixed<12, 2> Wx,
                      ap_ufixed<12, 2> Wy,
                      XF_CTUNAME(DEPTH, NPPC) & pixel) {
// clang-format off
    #pragma HLS inline
    // clang-format on
    if (INTERPOLATION_TYPE == XF_INTERPOLATION_NN) {
        pixel = A0;
    } else {
        ap_ufixed<12, 2> Wxy;
        ap_int<16> val0, val1, val2;
        ap_fixed<28, 18> P1, P2, P3, P4;

        Wxy = (Wx * Wy); // product is stored back in the 12-bit weight format
        // NOTE(review): the differences are held in 16 signed bits; val0 spans
        // [-510, 510] for 8-bit channels - confirm wider channel types are not used here.
        val0 = (A0 + B1 - (B0 + A1));
        val1 = (B0 - A0);
        val2 = (A1 - A0);

        P1 = (val0 * Wxy); // cross term:      val0 (16-bit int) * Wxy (ap_ufixed<12, 2>)
        P2 = (val1 * Wy);  // vertical term:   val1 (16-bit int) * Wy  (ap_ufixed<12, 2>)
        P3 = (val2 * Wx);  // horizontal term: val2 (16-bit int) * Wx  (ap_ufixed<12, 2>)
        P4 = (A0);         // base sample promoted to the accumulator format

        // The sum is converted to the integer channel type, which discards the fractional bits.
        pixel = (XF_CTUNAME(DEPTH, NPPC))((ap_fixed<32, 22>)(P1 + P2 + P3 + P4));
    }
}
/**
 * Builds one packed output word from two arrays of packed input words.
 *
 * Both word arrays are unpacked into flat per-pixel arrays. For every output
 * lane and every channel, interpolatePixel is called with the pixels found at
 * offset (indexx[lane] - initIndex) and the following offset in each array.
 * The per-channel results are reassembled into a pixel and the pixels into
 * the output word.
 *
 * @tparam DEPTH              pixel type selector for the XF_* type macros.
 * @tparam INTERPOLATION_TYPE forwarded to interpolatePixel.
 * @tparam NPPC               pixels-per-clock selector for the XF_* macros.
 * @tparam T_INDEX_INT        bit width of the column indices.
 * @tparam NUMBEROFINPUTWORDS number of packed words in A0 and in B0.
 * @tparam WEIGHT_WIDTH       total bit width of the weights.
 * @tparam WEIGHT_INT         integer bit width of the weights.
 *
 * @param A0        packed words whose pixels become the A0/A1 arguments of interpolatePixel.
 * @param B0        packed words whose pixels become the B0/B1 arguments of interpolatePixel.
 * @param initIndex column index that corresponds to the first pixel of A0[0] / B0[0].
 * @param indexx    per-lane source column index.
 * @param Wx        per-lane horizontal weight.
 * @param Wy        vertical weight shared by all lanes.
 * @param pixel     [out] packed output word.
 *
 * The assert requires the last lane's index and its successor to fall inside
 * the unpacked window.
 */
template <int DEPTH,
          int INTERPOLATION_TYPE,
          int NPPC,
          int T_INDEX_INT,
          int NUMBEROFINPUTWORDS,
          int WEIGHT_WIDTH,
          int WEIGHT_INT>
void computeOutputPixel(XF_TNAME(DEPTH, NPPC) A0[NUMBEROFINPUTWORDS],
                        XF_TNAME(DEPTH, NPPC) B0[NUMBEROFINPUTWORDS],
                        ap_uint<T_INDEX_INT> initIndex,
                        ap_uint<T_INDEX_INT> indexx[XF_NPIXPERCYCLE(NPPC)],
                        ap_ufixed<WEIGHT_WIDTH, WEIGHT_INT> Wx[XF_NPIXPERCYCLE(NPPC)],
                        ap_ufixed<WEIGHT_WIDTH, WEIGHT_INT> Wy,
                        XF_TNAME(DEPTH, NPPC) & pixel) {
// clang-format off
    #pragma HLS inline
    // clang-format on
    const int kLanes = XF_NPIXPERCYCLE(NPPC);
    const int kChannels = XF_CHANNELS(DEPTH, NPPC);
    const int kChannelBits = XF_DTPIXELDEPTH(DEPTH, NPPC);
    const int kPixelBits = kChannelBits * kChannels;
    const int kWindowPixels = kLanes * NUMBEROFINPUTWORDS;

    assert((indexx[kLanes - 1] < (initIndex + kWindowPixels - 1)) &&
           "Insufficient number of words to resize in X");

    XF_PTUNAME(DEPTH) rowPixelsA[kWindowPixels];
// clang-format off
    #pragma HLS ARRAY_PARTITION variable=rowPixelsA complete dim=1
    // clang-format on
    XF_PTUNAME(DEPTH) rowPixelsB[kWindowPixels];
// clang-format off
    #pragma HLS ARRAY_PARTITION variable=rowPixelsB complete dim=1
    // clang-format on
    XF_PTUNAME(DEPTH) lanePixels[kLanes];
// clang-format off
    #pragma HLS ARRAY_PARTITION variable=lanePixels complete dim=1
    // clang-format on

    // Split every packed word into its individual pixels.
    for (int word = 0; word < NUMBEROFINPUTWORDS; word++) {
// clang-format off
        #pragma HLS UNROLL
        // clang-format on
        for (int lane = 0; lane < kLanes; lane++) {
// clang-format off
            #pragma HLS UNROLL
            // clang-format on
            const int flat = word * kLanes + lane;
            const int lo = lane * kPixelBits;
            const int hi = (lane + 1) * kPixelBits - 1;
            rowPixelsA[flat] = A0[word].range(hi, lo);
            rowPixelsB[flat] = B0[word].range(hi, lo);
        }
    }

    // Interpolate channel by channel, then place the finished pixel into the output word.
    for (int lane = 0; lane < kLanes; lane++) {
// clang-format off
        #pragma HLS UNROLL
        // clang-format on

        for (int ch = 0; ch < kChannels; ch++) {
// clang-format off
            #pragma HLS UNROLL
            // clang-format on
            const int chLo = ch * kChannelBits;
            const int chHi = (ch + 1) * kChannelBits - 1;

            XF_CTUNAME(DEPTH, NPPC) channelA[kWindowPixels];
// clang-format off
            #pragma HLS ARRAY_PARTITION variable=channelA complete dim=1
            // clang-format on
            XF_CTUNAME(DEPTH, NPPC) channelB[kWindowPixels];
// clang-format off
            #pragma HLS ARRAY_PARTITION variable=channelB complete dim=1
            // clang-format on
            for (int p = 0; p < kWindowPixels; p++) {
// clang-format off
                #pragma HLS UNROLL
                // clang-format on
                channelA[p] = rowPixelsA[p].range(chHi, chLo);
                channelB[p] = rowPixelsB[p].range(chHi, chLo);
            }

            XF_CTUNAME(DEPTH, NPPC) blended;
            interpolatePixel<DEPTH, INTERPOLATION_TYPE, NPPC>(
                channelA[indexx[lane] - initIndex], channelB[indexx[lane] - initIndex],
                channelA[indexx[lane] - initIndex + 1], channelB[indexx[lane] - initIndex + 1], Wx[lane], Wy,
                blended);
            lanePixels[lane].range(chHi, chLo) = blended;
        }

        pixel.range((lane + 1) * kPixelBits - 1, lane * kPixelBits) = lanePixels[lane];
    }
}
/**
 * Unsigned integer division kept as a separate (non-inlined) function.
 *
 * @param in_n dividend.
 * @param in_d divisor; no zero check is performed here, so the caller must
 *             pass a non-zero value.
 * @return the truncated quotient in_n / in_d.
 */
static uint64_t xfUDivResize(uint64_t in_n, unsigned short in_d) {
// clang-format off
    #pragma HLS INLINE OFF
    // clang-format on
    return in_n / static_cast<uint64_t>(in_d);
}

/**
 * Fills scaleXParallel[i] with i * scalex for every lane i.
 *
 * The scale is first converted to the ap_fixed index format and then
 * multiplied by the lane number (taken as an 8-bit unsigned value).
 *
 * @tparam NPPC               pixels-per-clock selector; XF_NPIXPERCYCLE(NPPC) is the lane count.
 * @tparam T_SCALE_WIDTH      total bit width of scalex.
 * @tparam T_SCALE_INT        integer bit width of scalex.
 * @tparam T_COMP_INDEX_WIDTH total bit width of the output entries.
 * @tparam T_COMP_INDEX_INT   integer bit width of the output entries.
 *
 * @param scalex         scale factor to be multiplied by each lane number.
 * @param scaleXParallel [out] per-lane multiples of the converted scale.
 */
template <int NPPC, int T_SCALE_WIDTH, int T_SCALE_INT, int T_COMP_INDEX_WIDTH, int T_COMP_INDEX_INT>
void scaleMult(ap_ufixed<T_SCALE_WIDTH, T_SCALE_INT> scalex,
               ap_fixed<T_COMP_INDEX_WIDTH, T_COMP_INDEX_INT> scaleXParallel[XF_NPIXPERCYCLE(NPPC)]) {
// clang-format off
    #pragma HLS INLINE
    // clang-format on
    typedef ap_fixed<T_COMP_INDEX_WIDTH, T_COMP_INDEX_INT> comp_index_t;

    // The conversion does not depend on the lane, so do it once.
    const comp_index_t step = (comp_index_t)scalex;

    for (int lane = 0; lane < XF_NPIXPERCYCLE(NPPC); lane++) {
// clang-format off
        #pragma HLS PIPELINE
        // clang-format on
        const ap_uint<8> laneNumber = (ap_uint<8>)lane;
        scaleXParallel[lane] = step * laneNumber;
    }
}
template <int T_INDEX_INT,
          int T_COMP_INDEX_WIDTH,
          int T_COMP_INDEX_INT,
          int T_SCALE_WIDTH,
          int T_SCALE_INT,
          int INTERPOLATION_TYPE>
void scaleCompute(int currindex,
                  ap_ufixed<T_SCALE_WIDTH, T_SCALE_INT> inscale,
                  ap_fixed<T_COMP_INDEX_WIDTH, T_COMP_INDEX_INT>& ind_pre) {
    if (INTERPOLATION_TYPE == XF_INTERPOLATION_NN) {
        ind_pre = (ap_fixed<T_COMP_INDEX_WIDTH, T_COMP_INDEX_INT>)currindex * inscale +
                  (ap_fixed<T_COMP_INDEX_WIDTH, T_COMP_INDEX_INT>)0.001;

    } else {
        ind_pre = ((ap_fixed<T_COMP_INDEX_WIDTH, T_COMP_INDEX_INT>)currindex +
                   (ap_fixed<T_COMP_INDEX_WIDTH, T_COMP_INDEX_INT>)0.5) *
                      inscale -
                  (ap_fixed<T_COMP_INDEX_WIDTH, T_COMP_INDEX_INT>)0.5;
    }
}
/**
 * Derives integer source indices and fractional weights for one output word.
 *
 * Vertical: starts from indexy_pre_comp, stores (indexy_pre_comp + scaley) in
 * nextYScale, and writes the integer part of the coordinate to indexy.
 * Horizontal: for every lane i, the coordinate is indexx_pre_comp +
 * scaleXParallel[i]; its integer part goes to indexx[i].
 *
 * For interpolation types other than XF_INTERPOLATION_NN the coordinates are
 * additionally clamped to [0, inrows - 1] / [0, incols - 1] (nextYScale is
 * computed before the clamp) and the fractional parts are written to WeightY
 * and WeightX[i]. In XF_INTERPOLATION_NN mode WeightX and WeightY are left
 * untouched.
 *
 * @param inrows            number of input rows (upper clamp bound is inrows - 1).
 * @param incols            number of input columns (upper clamp bound is incols - 1).
 * @param j                 not referenced by this function.
 * @param output_rows_count not referenced by this function.
 * @param scalex            not referenced by this function.
 * @param scaleXParallel    per-lane horizontal offsets added to indexx_pre_comp.
 * @param scaley            vertical scale, used only for nextYScale.
 * @param indexx            [out] per-lane integer column index.
 * @param indexy            [out] integer row index.
 * @param nextYScale        [out] indexy_pre_comp + scaley converted to an integer index.
 * @param WeightX           [out] per-lane fractional column weight (non-NN only).
 * @param WeightY           [out] fractional row weight (non-NN only).
 * @param indexx_pre_comp   fractional column coordinate of lane 0.
 * @param indexy_pre_comp   fractional row coordinate.
 */
template <int INTERPOLATION_TYPE,
          int T_COMP_INDEX_WIDTH,
          int T_COMP_INDEX_INT,
          int T_INDEX_INT,
          int T_SCALE_WIDTH,
          int T_SCALE_INT,
          int T_WEIGHT_WIDTH,
          int T_WEIGHT_INT,
          int NPPC>
void computeInterpolation(int inrows,
                          int incols,
                          int j,
                          int output_rows_count,
                          ap_ufixed<T_SCALE_WIDTH, T_SCALE_INT> scalex,
                          ap_fixed<T_COMP_INDEX_WIDTH, T_COMP_INDEX_INT> scaleXParallel[XF_NPIXPERCYCLE(NPPC)],
                          ap_ufixed<T_SCALE_WIDTH, T_SCALE_INT> scaley,
                          ap_uint<T_INDEX_INT> indexx[XF_NPIXPERCYCLE(NPPC)],
                          ap_uint<T_INDEX_INT>& indexy,
                          ap_uint<T_INDEX_INT>& nextYScale,
                          ap_ufixed<T_WEIGHT_WIDTH, T_WEIGHT_INT> WeightX[XF_NPIXPERCYCLE(NPPC)],
                          ap_ufixed<T_WEIGHT_WIDTH, T_WEIGHT_INT>& WeightY,
                          ap_fixed<T_COMP_INDEX_WIDTH, T_COMP_INDEX_INT> indexx_pre_comp,
                          ap_fixed<T_COMP_INDEX_WIDTH, T_COMP_INDEX_INT> indexy_pre_comp) {
    const int INDEX_INT = T_INDEX_INT;
    const int COMP_INDEX_WIDTH = T_COMP_INDEX_WIDTH;
    const int COMP_INDEX_INT = T_COMP_INDEX_INT;

    // ---- vertical coordinate -------------------------------------------------
    ap_fixed<COMP_INDEX_WIDTH, COMP_INDEX_INT> indexy_pre = indexy_pre_comp;
    if (INTERPOLATION_TYPE == XF_INTERPOLATION_NN) {
        nextYScale = indexy_pre + scaley;
        indexy = (ap_uint<INDEX_INT>)indexy_pre;
    } else {
        // nextYScale is derived from the unclamped coordinate.
        nextYScale = indexy_pre + (ap_fixed<COMP_INDEX_WIDTH, COMP_INDEX_INT>)scaley;
        if (indexy_pre < 0) {
            indexy_pre = 0;
        } else if (indexy_pre > inrows - 1) {
            indexy_pre = inrows - 1;
        }
        indexy = (ap_uint<INDEX_INT>)indexy_pre;
        // Fractional part of the clamped coordinate.
        WeightY = ((ap_fixed<COMP_INDEX_WIDTH, COMP_INDEX_INT>)indexy_pre -
                   (ap_fixed<COMP_INDEX_WIDTH, COMP_INDEX_INT>)indexy);
    }

    // ---- horizontal coordinates, one per lane --------------------------------
    for (int i = 0; i < XF_NPIXPERCYCLE(NPPC); i++) {
        ap_fixed<COMP_INDEX_WIDTH, COMP_INDEX_INT> indexx_pre = indexx_pre_comp + scaleXParallel[i];
        if (INTERPOLATION_TYPE != XF_INTERPOLATION_NN) {
            if (indexx_pre < 0) {
                indexx_pre = 0;
            } else if (indexx_pre > incols - 1) {
                indexx_pre = incols - 1;
            }
        }
        indexx[i] = (ap_uint<INDEX_INT>)indexx_pre;
        if (INTERPOLATION_TYPE != XF_INTERPOLATION_NN) {
            // Fractional part of the clamped coordinate.
            WeightX[i] = ((ap_fixed<COMP_INDEX_WIDTH, COMP_INDEX_INT>)indexx_pre -
                          (ap_fixed<COMP_INDEX_WIDTH, COMP_INDEX_INT>)indexx[i]);
        }
    }
}

/**
 * Resizes imgInput into imgOutput using nearest-neighbour or bilinear interpolation.
 *
 * Outline of what the body does:
 *   1. Computes the X and Y scale factors as (input size << 32) / output size and
 *      converts them to ap_ufixed<SCALE_WIDTH, SCALE_INT>.
 *   2. Pre-computes i * scalex for every lane with scaleMult.
 *   3. Pre-loads the first two input rows into a three-row line buffer.
 *   4. Iterates over max(input rows, output rows) x max(aligned input cols,
 *      aligned output cols) words. In each iteration it may read one new input
 *      word into the spare line-buffer row and may write one output word.
 *
 * @tparam SRC_TYPE           pixel type of both images.
 * @tparam INHEIGHT           compile-time height of the input Mat.
 * @tparam INWIDTH            compile-time width of the input Mat (sizes the line buffer).
 * @tparam NPPC               pixels processed per clock.
 * @tparam OUTHEIGHT          compile-time height of the output Mat.
 * @tparam OUTWIDTH           compile-time width of the output Mat.
 * @tparam INTERPOLATION_TYPE forwarded to scaleCompute, computeInterpolation and
 *                            computeOutputPixel.
 * @tparam MAX_DOWN_SCALE     used as BUFFER_WORDS: the number of packed input words
 *                            handed to computeOutputPixel per output word.
 *
 * @param imgInput  source image; read sequentially through imgInput.read().
 * @param imgOutput destination image; written sequentially through imgOutput.write().
 */
template <int SRC_TYPE,
          int INHEIGHT,
          int INWIDTH,
          int NPPC,
          int OUTHEIGHT,
          int OUTWIDTH,
          int INTERPOLATION_TYPE,
          int MAX_DOWN_SCALE>
void resizeNNBilinear(xf::cv::Mat<SRC_TYPE, INHEIGHT, INWIDTH, NPPC>& imgInput,
                      xf::cv::Mat<SRC_TYPE, OUTHEIGHT, OUTWIDTH, NPPC>& imgOutput) {
    // Bit widths of the index, weight and scale types used throughout this function.
    // PRE_INDEX_WIDTH and PRE_INDEX_INT are not referenced below.
    const int INDEX_INT = 17;
    const int WEIGHT_WIDTH = 48;
    const int WEIGHT_INT = 16;
    const int SCALE_WIDTH = 48;
    const int SCALE_INT = 16;
    const int PRE_INDEX_WIDTH = 10;
    const int PRE_INDEX_INT = 17;
    const int COMP_INDEX_WIDTH = 42; // SCALE_WIDTH+PRE_INDEX_WIDTH;
    const int COMP_INDEX_INT = 20;   // SCALE_INT+PRE_INDEX_INT;

    // Each duplicated copy of the line buffer supplies two words per cycle (idx and
    // idx_nxt below), so (BUFFER_WORDS + 1) / 2 copies are kept.
    const int BUFFER_WORDS = MAX_DOWN_SCALE;
    const int BUFFER_DUP_FACTOR = (BUFFER_WORDS + 1) >> 1;

    uint64_t xnew, ynew;

    xnew = (imgInput.cols);
    ynew = (imgInput.rows); //(float)(out_height);

    // Shift the input dimensions up by 32 bits so the integer division yields a
    // quotient with 32 fractional bits.
    xnew = xnew << 32;
    ynew = ynew << 32;
    ap_ufixed<SCALE_WIDTH, SCALE_INT> scalex, scaley;
    uint64_t Xscale64, Yscale64; // Q32.32
    // NOTE(review): xfUDivResize takes its divisor as unsigned short and does not
    // check for zero - confirm output dimensions are always in 1..65535.
    Xscale64 = xfUDivResize(xnew, (imgOutput.cols));
    Yscale64 = xfUDivResize(ynew, (imgOutput.rows));
    ap_ufixed<64, 32> temp_scale_conv;

// clang-format off
    #pragma HLS ALLOCATION function instances=scaleCompute<INDEX_INT, COMP_INDEX_WIDTH, COMP_INDEX_INT, SCALE_WIDTH, SCALE_INT, INTERPOLATION_TYPE> limit=1
    #pragma HLS ALLOCATION function instances=xfUDivResize limit=1
    // clang-format on

    // The 64-bit quotient is reinterpreted in place as ap_ufixed<64, 32> and then
    // narrowed to the scale type.
    // NOTE(review): this pointer cast assumes ap_ufixed<64, 32> stores its value with
    // the same layout as uint64_t - confirm against the ap_fixed implementation in use.
    temp_scale_conv = *(ap_ufixed<64, 32>*)&Xscale64;
    scalex = temp_scale_conv;

    temp_scale_conv = *(ap_ufixed<64, 32>*)&Yscale64;
    scaley = temp_scale_conv;

    // Column counts rounded up to a multiple of NPPC.
    int imgInput_cols_align_npc = ((imgInput.cols + (NPPC - 1)) >> XF_BITSHIFT(NPPC)) << XF_BITSHIFT(NPPC);
    int imgOutput_cols_align_npc = ((imgOutput.cols + (NPPC - 1)) >> XF_BITSHIFT(NPPC)) << XF_BITSHIFT(NPPC);

    // scaleXParallel[i] = i * scalex, the horizontal offset of lane i within a word.
    ap_fixed<COMP_INDEX_WIDTH, COMP_INDEX_INT> scaleXParallel[XF_NPIXPERCYCLE(NPPC)];
// clang-format off
    #pragma HLS ARRAY_PARTITION variable=scaleXParallel complete dim=1
    // clang-format on
    scaleMult<NPPC, SCALE_WIDTH, SCALE_INT, COMP_INDEX_WIDTH, COMP_INDEX_INT>(scalex, scaleXParallel);

    // Three input rows, each stored BUFFER_DUP_FACTOR times, one packed word per entry.
    XF_TNAME(SRC_TYPE, NPPC) line_buffer[3][BUFFER_DUP_FACTOR][(INWIDTH + NPPC - 1) >> (XF_BITSHIFT(NPPC))];
// clang-format off
    #pragma HLS ARRAY_PARTITION variable=line_buffer complete dim=1
    #pragma HLS ARRAY_PARTITION variable=line_buffer complete dim=2
    // clang-format on
    int input_read_pointer = 0;
    int read_rows_count = 0;
    int output_write_pointer = 0;
    // Pre-load line-buffer rows 0 and 1; every word is written to all duplicate copies.
    for (int i = 0; i < 2; i++) // read two rows
    {
// clang-format off
        #pragma HLS LOOP_TRIPCOUNT min=1 max=2
        // clang-format on
        for (int j = 0; j < (imgInput_cols_align_npc >> (XF_BITSHIFT(NPPC))); j++) {
// clang-format off
            #pragma HLS PIPELINE
            #pragma HLS LOOP_TRIPCOUNT min=1 max=INWIDTH/NPPC
            XF_TNAME(SRC_TYPE, NPPC) read_word = imgInput.read(input_read_pointer);
            // clang-format on
            for (int k = 0; k < BUFFER_DUP_FACTOR; k++) {
                line_buffer[i][k][j] = read_word;
            }
            input_read_pointer++;
        }
        read_rows_count++;
    }
    int output_rows_count = 0;
    // first_row_index selects which line-buffer rows are used in the main loop.
    // second_row_index and read_row_index are advanced alongside it but are not
    // read anywhere else in this function.
    int first_row_index = 0;
    int second_row_index = 1;
    int read_row_index = 2;
    // The main loop covers the larger of the input and output extents in each dimension.
    int loop_row_count = (imgOutput.rows > imgInput.rows) ? imgOutput.rows : imgInput.rows;
    int loop_col_count =
        (imgOutput_cols_align_npc > imgInput_cols_align_npc) ? imgOutput_cols_align_npc : imgInput_cols_align_npc;
    // Compile-time bounds, used only in the LOOP_TRIPCOUNT pragmas.
    const int LOOPCOUNTROW = (INHEIGHT > OUTHEIGHT) ? INHEIGHT : OUTHEIGHT;
    const int LOOPCOUNTCOL = (INWIDTH > OUTWIDTH) ? INWIDTH : OUTWIDTH;
    ap_uint<INDEX_INT> indexx[XF_NPIXPERCYCLE(NPPC)];
// clang-format off
    #pragma HLS ARRAY_PARTITION variable=indexx complete dim=1
    // clang-format on
    ap_uint<INDEX_INT> indexy = 0;
    ap_uint<INDEX_INT> nextYScale = 0;
    ap_ufixed<WEIGHT_WIDTH, WEIGHT_INT> WeightX[XF_NPIXPERCYCLE(NPPC)];
// clang-format off
    #pragma HLS ARRAY_PARTITION variable=WeightX complete dim=1
    // clang-format on
    ap_ufixed<WEIGHT_WIDTH, WEIGHT_INT> WeightY = 0;
    // P0Buf / P1Buf hold the words taken from the two selected line-buffer rows and
    // are passed to computeOutputPixel as its A0 / B0 arguments.
    XF_TNAME(SRC_TYPE, NPPC) P0Buf[BUFFER_DUP_FACTOR << 1];
// clang-format off
    #pragma HLS ARRAY_PARTITION variable=P0Buf complete dim=1
    // clang-format on
    XF_TNAME(SRC_TYPE, NPPC) P1Buf[BUFFER_DUP_FACTOR << 1];
// clang-format off
    #pragma HLS ARRAY_PARTITION variable=P1Buf complete dim=1
    // clang-format on

    ap_fixed<COMP_INDEX_WIDTH, COMP_INDEX_INT> indexx_pre_comp = 0;
    ap_fixed<COMP_INDEX_WIDTH, COMP_INDEX_INT> indexy_pre_comp = 0;

    for (int i = 0; i < loop_row_count; i++) {
// clang-format off
        #pragma HLS LOOP_TRIPCOUNT min=1 max=LOOPCOUNTROW
        // clang-format on

        // Fractional input row for the output row currently being produced.
        scaleCompute<INDEX_INT, COMP_INDEX_WIDTH, COMP_INDEX_INT, SCALE_WIDTH, SCALE_INT, INTERPOLATION_TYPE>(
            output_rows_count, scaley, indexy_pre_comp);
        for (int j = 0; j < (loop_col_count >> (XF_BITSHIFT(NPPC))); j++) {
// clang-format off
            #pragma HLS PIPELINE
            #pragma HLS LOOP_TRIPCOUNT min=1 max=LOOPCOUNTCOL/NPPC
            // clang-format on

            // Fractional input column for lane 0 of output word j.
            scaleCompute<INDEX_INT, COMP_INDEX_WIDTH, COMP_INDEX_INT, SCALE_WIDTH, SCALE_INT, INTERPOLATION_TYPE>(
                j << (XF_BITSHIFT(NPPC)), scalex, indexx_pre_comp);

            // Integer indices (indexx, indexy, nextYScale) and weights for this word.
            computeInterpolation<INTERPOLATION_TYPE, COMP_INDEX_WIDTH, COMP_INDEX_INT, INDEX_INT, SCALE_WIDTH,
                                 SCALE_INT, WEIGHT_WIDTH, WEIGHT_INT, NPPC>(
                imgInput.rows, imgInput.cols, j << (XF_BITSHIFT(NPPC)), output_rows_count, scalex, scaleXParallel,
                scaley, indexx, indexy, nextYScale, WeightX, WeightY, indexx_pre_comp, indexy_pre_comp);
            int indexstores = first_row_index;
            XF_TNAME(SRC_TYPE, NPPC) read_pixel;
            bool flag_write = 0;
            // A new input word is fetched only when input rows remain, nextYScale has
            // reached the most recently buffered row, and j is inside the input width.
            if (read_rows_count != imgInput.rows) {
                if ((nextYScale >= read_rows_count - 1)) // check if the next index y needed needs to be read.
                {
                    if (j < (imgInput_cols_align_npc >> (XF_BITSHIFT(NPPC)))) {
                        read_pixel = imgInput.read(input_read_pointer);
                        flag_write = 1;
                        input_read_pointer++;
                    } else {
                        flag_write = 0;
                    }
                } else {
                    flag_write = 0;
                }
            } else {
                flag_write = 0;
            }

            // The three branches differ only in which line-buffer rows feed P0Buf/P1Buf
            // and which row receives the newly read word:
            //   indexstores == 0 : P0 <- row 0, P1 <- row 1, new word -> row 2
            //   indexstores == 1 : P0 <- row 1, P1 <- row 2, new word -> row 0
            //   otherwise        : P0 <- row 2, P1 <- row 0, new word -> row 1
            // Copy k supplies the words at idx and idx_nxt, where idx is the word holding
            // indexx[0] plus 2 * k; idx_nxt equals idx when indexx[0] is the last input column.
            if (indexstores == 0) {
                for (int k = 0; k < BUFFER_DUP_FACTOR; k++) {
// clang-format off
                    #pragma HLS UNROLL
                    // clang-format on
                    int idx = (indexx[0] >> XF_BITSHIFT(NPPC)) + (k << 1);
                    int idx_nxt = idx + (indexx[0] == (imgInput.cols - 1) ? 0 : 1);

                    P0Buf[(k << 1)] = line_buffer[0][k][idx];
                    P0Buf[(k << 1) + 1] = line_buffer[0][k][idx_nxt];
                    P1Buf[(k << 1)] = line_buffer[1][k][idx];
                    P1Buf[(k << 1) + 1] = line_buffer[1][k][idx_nxt];
                }
                if (flag_write) {
                    for (int k = 0; k < BUFFER_DUP_FACTOR; k++) {
// clang-format off
                        #pragma HLS UNROLL
                        // clang-format on
                        line_buffer[2][k][j] = read_pixel;
                    }
                }
            } else if (indexstores == 1) {
                for (int k = 0; k < BUFFER_DUP_FACTOR; k++) {
// clang-format off
                    #pragma HLS UNROLL
                    // clang-format on
                    int idx = (indexx[0] >> XF_BITSHIFT(NPPC)) + (k << 1);
                    int idx_nxt = idx + (indexx[0] == (imgInput.cols - 1) ? 0 : 1);

                    P0Buf[(k << 1)] = line_buffer[1][k][idx];
                    P0Buf[(k << 1) + 1] = line_buffer[1][k][idx_nxt];
                    P1Buf[(k << 1)] = line_buffer[2][k][idx];
                    P1Buf[(k << 1) + 1] = line_buffer[2][k][idx_nxt];
                }
                if (flag_write) {
                    for (int k = 0; k < BUFFER_DUP_FACTOR; k++) {
// clang-format off
                        #pragma HLS UNROLL
                        // clang-format on
                        line_buffer[0][k][j] = read_pixel;
                    }
                }
            } else {
                for (int k = 0; k < BUFFER_DUP_FACTOR; k++) {
// clang-format off
                    #pragma HLS UNROLL
                    // clang-format on
                    int idx = (indexx[0] >> XF_BITSHIFT(NPPC)) + (k << 1);
                    int idx_nxt = idx + (indexx[0] == (imgInput.cols - 1) ? 0 : 1);

                    P0Buf[(k << 1)] = line_buffer[2][k][idx];
                    P0Buf[(k << 1) + 1] = line_buffer[2][k][idx_nxt];
                    P1Buf[(k << 1)] = line_buffer[0][k][idx];
                    P1Buf[(k << 1) + 1] = line_buffer[0][k][idx_nxt];
                }
                if (flag_write) {
                    for (int k = 0; k < BUFFER_DUP_FACTOR; k++) {
// clang-format off
                        #pragma HLS UNROLL
                        // clang-format on
                        line_buffer[1][k][j] = read_pixel;
                    }
                }
            }
            // An output word is produced only while output rows remain and indexy refers
            // either to the older of the two buffered rows (read_rows_count - 2) or, once
            // every input row has been read, to the newest one (read_rows_count - 1).
            if ((output_rows_count <= imgOutput.rows - 1) &&
                (((indexy == read_rows_count - 1) && (read_rows_count == imgInput.rows)) ||
                 (indexy == read_rows_count - 2))) {
                if (j < (imgOutput_cols_align_npc >> (XF_BITSHIFT(NPPC)))) {
                    // When indexy is the newest row, the P1Buf words are copied into
                    // P0Buf so both inputs of computeOutputPixel carry the same row.
                    if (indexy == read_rows_count - 1) {
                        for (int k = 0; k < BUFFER_WORDS; k++) {
// clang-format off
                            #pragma HLS UNROLL
                            // clang-format on
                            P0Buf[k] = P1Buf[k];
                        }
                    }
                    XF_TNAME(SRC_TYPE, NPPC) temp_store_output;
                    // The third argument is indexx[0] rounded down to a multiple of NPPC,
                    // i.e. the column index of the first pixel in P0Buf[0] / P1Buf[0].
                    computeOutputPixel<SRC_TYPE, INTERPOLATION_TYPE, NPPC, INDEX_INT, BUFFER_WORDS, WEIGHT_WIDTH,
                                       WEIGHT_INT>(P0Buf, P1Buf,
                                                   ((indexx[0] >> XF_BITSHIFT(NPPC)) << XF_BITSHIFT(NPPC)), indexx,
                                                   WeightX, WeightY, temp_store_output);
                    imgOutput.write(output_write_pointer, temp_store_output);
                    output_write_pointer++;
                }
            }
        }
        // Same condition as inside the column loop: if this pass produced an output row,
        // move on to the next one.
        if ((output_rows_count <= imgOutput.rows - 1) &&
            (((indexy == read_rows_count - 1) && (read_rows_count == imgInput.rows)) ||
             (indexy == read_rows_count - 2))) {
            output_rows_count++;
        }
        // Same condition as the read decision inside the column loop: if this pass
        // consumed an input row, rotate the line-buffer row indices (modulo 3).
        if (read_rows_count != imgInput.rows) {
            if ((nextYScale >= read_rows_count - 1)) // check if the next index y needed needs to be read.
            {
                first_row_index++;
                second_row_index++;
                read_row_index++;
                if (read_row_index == 3) {
                    read_row_index = 0;
                }
                if (first_row_index == 3) {
                    first_row_index = 0;
                }
                if (second_row_index == 3) {
                    second_row_index = 0;
                }
                read_rows_count++;
            }
        }
    }
}
#endif