Program Listing for File xf_resize_down_area.hpp

↰ Return to documentation for file (include/imgproc/xf_resize_down_area.hpp)
/*
 * Copyright 2019 Xilinx, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef _XF_RESIZE_DOWN_AREA_
#define _XF_RESIZE_DOWN_AREA_

#include "hls_stream.h"
#include "ap_int.h"
#include "../common/xf_common.hpp"
#include "../core/xf_math.h"
#include "../common/xf_utility.hpp"

#define AREADOWN_PARTIAL_RESULT_BITS 16

/*
 * Fixed-point ratio helper used to derive the scale factors and one-pixel weights.
 *
 * Widens in_n to 32 bits, multiplies it by POW16 and divides by in_d; the result is
 * converted to uint32_t (fractional remainder of the division is discarded).
 *
 * NOTE(review): in_d is not checked for zero here -- callers presumably pass non-zero
 * image dimensions; confirm.
 */
static uint32_t xFUdivResizeDownArea(unsigned short in_n, unsigned short in_d) {
    return uint32_t(in_n) * POW16 / in_d;
}

/*
 * Control-signal generator evaluated once per input word of the area down-scaler.
 *
 * Positions and weights are fixed point with 16 fractional bits (0x10000 is 1.0). The constant
 * 0x41 (65 fractional LSBs) is the tolerance applied wherever a fraction is tested against zero
 * or against the one-pixel weight.
 *
 * Inputs:
 *   Xscale                    - horizontal scale factor; 0x10000 selects the 1:1 mapping, and for
 *                               NPC != 1 only values below 98304 (1.5) may enable the extra block.
 *   X_1PixelWeight            - horizontal weight of one input pixel (bit 16 set means 1.0).
 *   Y_1PixelWeight            - vertical weight of one input row; only bit 16 is examined here.
 *   row_index                 - current input row, compared against in_height - 1.
 *   in_col_index              - index of the current input word within the row.
 *   Xindex_output[NUM_PB]     - X-direction output index (Q16.16) for each pixel of the word.
 *   Xindex_output_next        - pointer to the X-direction output index of the next word; only
 *                               element 0 is read.
 *   Yaxis_overlap_en          - Y-direction overlap flag supplied by the caller.
 *   Yindex_output / _prev     - Y-direction output index (Q16.16) and its previous value.
 *   ouput_index_write_counter - caller-maintained counter compared against the first output
 *                               index of the next word.
 *   in_height                 - input image height.
 *   inImg_ncpr                - column-loop bound; in_col_index == inImg_ncpr - 1 marks the last
 *                               iteration of a row.
 *   skip_count, in_width      - not referenced in the body.
 *
 * Outputs:
 *   output_buffer_index[]        - consecutive output column indices, one per process block
 *                                  plus one for the extra block.
 *   output_buffer_index_next_out - first output column index of the next input word.
 *   inflag_TA[ta][in]            - true when input pixel `in` is fed to process block `ta`.
 *   Wx[ta][in]                   - X weight used by process block `ta` for input pixel `in`.
 *   inflag_for_Nplus1_Procblock,
 *   Wx_for_Nplus1_Procblock      - enable and weight of the extra (N+1)-th process block.
 *   DDR_wr_en                    - enable for writing a finished output word.
 *   out_buffer_wr_en             - enable for updating the output line buffer.
 */
template <int NUM_INPB, int LOG2_PB, int NUM_PB, int NPC>
static void flag_index_generator(ap_uint<32> Xscale,
                                 ap_uint<32> X_1PixelWeight,
                                 ap_uint<32> Y_1PixelWeight,
                                 ap_uint<16> row_index,
                                 int in_col_index,
                                 ap_uint<32> Xindex_output[NUM_PB],
                                 ap_uint<32>* Xindex_output_next,
                                 ap_uint<16> output_buffer_index[NUM_PB + 1],
                                 bool inflag_TA[NUM_PB][NUM_INPB],
                                 ap_uint<16>* skip_count,
                                 ap_uint<17> Wx[NUM_PB][NUM_INPB],
                                 bool* inflag_for_Nplus1_Procblock,
                                 ap_uint<17>* Wx_for_Nplus1_Procblock,
                                 bool* DDR_wr_en,
                                 bool* out_buffer_wr_en,
                                 bool Yaxis_overlap_en,
                                 ap_uint<32> Yindex_output,
                                 ap_uint<32> Yindex_output_prev,
                                 ap_uint<16> ouput_index_write_counter,
                                 unsigned short in_height,
                                 unsigned short in_width,
                                 unsigned short inImg_ncpr,
                                 ap_uint<16>* output_buffer_index_next_out) {
// clang-format off
    #pragma HLS inline
    // clang-format on

    // NOTE: skip_count_tmp is never assigned and skip_count_tmp_opt is never read; neither
    // influences any output of this function.
    ap_int<16> skip_count_tmp;
    ap_int<16> skip_count_tmp_opt;
    skip_count_tmp_opt = Xindex_output[0].range(31, 16); // - (Xindex_output[0].range(15,0)<X_1PixelWeight);

    //## First output column index touched by this input word.
    // 1:1 scale: the word maps straight onto output pixels in_col_index*NPC onwards.
    // Otherwise: integer part of the first pixel's output position, stepped back by one when the
    // fraction is smaller than the one-pixel weight by more than the tolerance.
    ap_uint<16> output_buffer_index_start;
    if (Xscale == 0x10000)
        output_buffer_index_start = in_col_index * NPC;
    else {
        ap_uint<16> index_fract_value = Xindex_output[0].range(15, 0);
        ap_uint<16> weight_value = X_1PixelWeight.range(15, 0);
        ap_uint<16> sub_value = weight_value - index_fract_value;
        if (index_fract_value < weight_value && sub_value > 0x41)
            output_buffer_index_start = Xindex_output[0].range(31, 16) - 1;
        else
            output_buffer_index_start = Xindex_output[0].range(31, 16);
    }

    // Process block k targets output column (start + k); the last entry belongs to the extra block.
    // NOTE(review): the bound is NUM_INPB + 1 while the array is declared with NUM_PB + 1 entries;
    // the visible caller instantiates both with the same value.
    for (int pb_in = 0; pb_in < NUM_INPB + 1; pb_in++) {
        output_buffer_index[pb_in] = output_buffer_index_start + pb_in;
    }

    //## Same computation for the next input word, based on Xindex_output_next[0].
    ap_uint<16> output_buffer_index_next;
    if (Xscale == 0x10000)
        output_buffer_index_next = (in_col_index + 1) * NPC;
    else {
        ap_uint<16> index_fract_value = Xindex_output_next[0].range(15, 0);
        ap_uint<16> weight_value = X_1PixelWeight.range(15, 0);
        ap_uint<16> sub_value = weight_value - index_fract_value;
        if (index_fract_value < weight_value && sub_value > 0x41)
            output_buffer_index_next = Xindex_output_next[0].range(31, 16) - 1;
        else
            output_buffer_index_next = Xindex_output_next[0].range(31, 16);
    }
    *output_buffer_index_next_out = output_buffer_index_next;

    //## Output position one pixel before the first pixel of this word (zero for the first word
    //## of a row); its integer part is taken after subtracting the tolerance.
    ap_uint<16> int_bits_Xindex_out_previous;
    if (in_col_index == 0)
        int_bits_Xindex_out_previous = 0;
    else
        int_bits_Xindex_out_previous = (Xindex_output[0] - X_1PixelWeight - 0x41) >> 16;

    // NOTE: fract_bits_Xindex_out_previous is computed but not read afterwards.
    ap_uint<16> fract_bits_Xindex_out_previous;
    if (in_col_index == 0)
        fract_bits_Xindex_out_previous = 0;
    else
        fract_bits_Xindex_out_previous = (ap_uint<16>)(Xindex_output[0] - X_1PixelWeight);

    //## Input-to-process-block mapping flags.
    for (int ta_idx = 0; ta_idx < NUM_PB; ta_idx++) {
// clang-format off
        #pragma HLS unroll
        // clang-format on
        for (int pb_in = 0; pb_in < NUM_INPB; pb_in++) {
// clang-format off
            #pragma HLS unroll
            // clang-format on
            // Integer part is taken after subtracting the tolerance, so a position within 0x41
            // above an integer is attributed to the lower output pixel.
            ap_uint<16> int_bits_Xindex_out = (Xindex_output[pb_in] - 0x41) >> 16;
            ap_uint<16> fract_bits_Xindex_out = Xindex_output[pb_in].range(15, 0);
            ap_uint<16> int_bits_Xindex_out_min1;   // = (Xindex_output[pb_in-1]-0x41)>>16;
            ap_uint<16> fract_bits_Xindex_out_min1; // = Xindex_output[pb_in-1].range(15,0);

            // Values of the preceding pixel inside this word (zero for the first pixel).
            // NOTE: fract_bits_Xindex_out_min1 is assigned but not read afterwards.
            if (pb_in == 0) {
                int_bits_Xindex_out_min1 = 0;
                fract_bits_Xindex_out_min1 = 0;
            } else {
                int_bits_Xindex_out_min1 = (Xindex_output[pb_in - 1] - 0x41) >> 16;
                fract_bits_Xindex_out_min1 = Xindex_output[pb_in - 1].range(15, 0);
            }

            ap_uint<16> index_value = output_buffer_index[ta_idx];
            // t1: this block's output column is the one the pixel's position falls in.
            bool t1 = index_value == int_bits_Xindex_out;
            // t2: first pixel of the word; its integer index is one above that of the position
            //     one pixel earlier, this block targets the lower column, and the fraction is
            //     below the one-pixel weight by more than the tolerance.
            bool t2 = ((int_bits_Xindex_out - int_bits_Xindex_out_previous) == 1) && (pb_in == 0) &&
                      (index_value == int_bits_Xindex_out - 1) && fract_bits_Xindex_out < X_1PixelWeight &&
                      ((X_1PixelWeight - fract_bits_Xindex_out) > 0x41);
            // t3: same test as t2 for the remaining pixels, relative to the preceding pixel of
            //     this word.
            bool t3 = ((int_bits_Xindex_out - int_bits_Xindex_out_min1) == 1) && (pb_in > 0) &&
                      (index_value == int_bits_Xindex_out - 1) && fract_bits_Xindex_out < X_1PixelWeight &&
                      ((X_1PixelWeight - fract_bits_Xindex_out) > 0x41);

            // With a 1:1 scale the mapping is the identity (pixel k feeds block k).
            if (((t1 || t2 || t3) && (Xscale != 0x10000)) || ((Xscale == 0x10000) && (pb_in == ta_idx)))
                inflag_TA[ta_idx][pb_in] = 1;
            else
                inflag_TA[ta_idx][pb_in] = 0;
        }
    }

    // NOTE: input_index__for_Nplus1_Procblock, intBits_input_index__for_Nplus1_Procblock and
    // col_idx_x_NPC are not read by anything that reaches an output.
    ap_uint<32> input_index__for_Nplus1_Procblock = output_buffer_index_next * Xscale;
    ap_uint<16> intBits_input_index__for_Nplus1_Procblock = input_index__for_Nplus1_Procblock.range(31, 16);
    ap_uint<16> col_idx_x_NPC = (in_col_index + 1) * NPC;

    // Position of the last pixel of this word; its fraction becomes the extra block's weight.
    ap_uint<32> Xindex_for_Nplus1_Procblock = Xindex_output[NUM_PB - 1]; //+X_1PixelWeight;

    // Difference between the next word's first output column (as an integer position) and the
    // last pixel's position.
    ap_uint<32> overlap_next_pixel = (((ap_uint<32>)output_buffer_index_next) << 16) - Xindex_output[NUM_PB - 1];

    //## Extra (N+1)-th process block enable and weight.
    if (NPC != 1) {
        // When the X scale is less than 1.5, N+1 output pixels (including a partial output) can
        // be generated from N input pixels. Enabled only if the next word starts exactly NPC
        // columns after this one, the scale is neither 1.0 nor >= 1.5, and the overlap exceeds
        // the tolerance.
        if ((output_buffer_index_start + NPC) == output_buffer_index_next && Xscale != 65536 && Xscale < 98304 &&
            overlap_next_pixel > 0x41) {
            *inflag_for_Nplus1_Procblock = 1;
            *Wx_for_Nplus1_Procblock = Xindex_for_Nplus1_Procblock.range(15, 0);
        } else {
            *inflag_for_Nplus1_Procblock = 0;
            *Wx_for_Nplus1_Procblock = 0;
        }
    } else {
        // Single-pixel mode: same test without the restrictions on Xscale.
        if ((output_buffer_index_start + NPC) == output_buffer_index_next && overlap_next_pixel > 0x41) {
            *inflag_for_Nplus1_Procblock = 1;
            *Wx_for_Nplus1_Procblock = Xindex_for_Nplus1_Procblock.range(15, 0);
        } else {
            *inflag_for_Nplus1_Procblock = 0;
            *Wx_for_Nplus1_Procblock = 0;
        }
    }

    // NOTE: of the locals declared from here down to output_row_en, only Yindex_output_tmp and
    // Yindex_output_prev_tmp are read by the write-enable logic below; the others (overlap_*,
    // t1..t3, if_test, *_Yidx_int, scale1_en, write_en_pixel_in_same_row, overlap_en_*,
    // output_row_en) do not reach any output.
    ap_uint<32> Yindex_output_tmp = Yindex_output; // - 0x41;
    ap_uint<32> overlap_with_next_row = 0x10000 - Yindex_output.range(15, 0);
    ap_uint<32> overlap_with_prev_row = 0x10000 - Yindex_output_prev.range(15, 0);
    ap_uint<32> Yindex_output_prev_tmp = Yindex_output_prev; // - 0x41;

    bool t1 = (ouput_index_write_counter <= output_buffer_index_next);
    bool t2 = Yaxis_overlap_en == 1;
    bool t3 = (Yindex_output_tmp.range(31, 16) != Yindex_output_prev_tmp.range(31, 16));
    bool if_test = t1 && (t2 || t3);
    int current_Yidx_int = Yindex_output_tmp.range(31, 16);
    int next_Yidx_int = Yindex_output_prev_tmp.range(31, 16);

    bool scale1_en = X_1PixelWeight[16] == 1;
    bool write_en_pixel_in_same_row = (ouput_index_write_counter <= output_buffer_index_next);
    bool overlap_en_next_row = (overlap_with_next_row > 0x41);
    bool overlap_en_prev_row = (overlap_with_prev_row > 0x41);
    bool output_row_en =
        (Yaxis_overlap_en == 1 || (Yindex_output_tmp.range(31, 16) != Yindex_output_prev_tmp.range(31, 16)));

    //## Yindex precision error when the current index value is close to an integer number, e.g. 8.999928 is
    // close to 9.00000
    //## precision error upto 10^-2 is accepted.
    // Distance of each Y index from the next integer; values <= 65 are treated as "on the integer".
    ap_uint<32> Yindex_output_precision_error = 0x10000 - Yindex_output.range(15, 0);
    ap_uint<32> Yindex_output_prev_precision_error = 0x10000 - Yindex_output_prev.range(15, 0);

    //## DDR write enable.
    // Row condition: Y overlap is active, or the integer Y index changed, or the current Y index
    // is within the tolerance of the next integer, or this is the last input row.
    // Column condition: the write counter has been reached by the next word's first output index
    // (or this is the last column iteration), and the previous Y index is not within the
    // tolerance of an integer.
    bool DDR_wr_en_tmp;
    if ((((Yaxis_overlap_en == 1 || (Yindex_output_tmp.range(31, 16) != Yindex_output_prev_tmp.range(31, 16)))) ||
         (Yindex_output_precision_error <= 65)) ||
        row_index == (in_height - 1)) {
        if ((ouput_index_write_counter <= output_buffer_index_next || (in_col_index == (inImg_ncpr)-1)) &&
            (Yindex_output_prev_precision_error > 65))
            DDR_wr_en_tmp = 1;
        else
            DDR_wr_en_tmp = 0;
    } else {
        DDR_wr_en_tmp = 0;
    }

    // Both one-pixel weights equal to 1.0 (bit 16 set): every input word is written out.
    if (X_1PixelWeight[16] == 1 && Y_1PixelWeight[16] == 1)
        *DDR_wr_en = 1;
    else
        *DDR_wr_en = DDR_wr_en_tmp;

    //## Output line-buffer write enable: always for an X weight of 1.0, otherwise when the write
    //## counter has been reached or on the last column iteration.
    if ((X_1PixelWeight[16] == 1) || (ouput_index_write_counter <= output_buffer_index_next) ||
        (in_col_index == (inImg_ncpr)-1)) {
        *out_buffer_wr_en = 1;
    } else {
        *out_buffer_wr_en = 0;
    }

    //## X weights per (process block, input pixel).
    for (int ta_idx = 0; ta_idx < NUM_PB; ta_idx++) {
// clang-format off
        #pragma HLS unroll
        // clang-format on
        for (int pb_in = 0; pb_in < NUM_INPB; pb_in++) {
// clang-format off
            #pragma HLS unroll
            // clang-format on

            // True when the fraction of the pixel's position does not exceed the one-pixel weight.
            bool rangeA_0_to_scale = Xindex_output[pb_in].range(15, 0) <= X_1PixelWeight.range(15, 0); // Q0.16
            ap_uint<16> sub_result = X_1PixelWeight.range(15, 0) - Xindex_output[pb_in].range(15, 0);

            if (rangeA_0_to_scale == true && Xscale != 0x10000) {
                // The pixel's weight is split in two: the fraction goes to the block whose column
                // equals the integer part of the position, and (weight - fraction) to the others.
                //              if(int_bits_wo_th_for_Wx[pb_in].range(LOG2_PB-1,0) == ta_idx)
                if (output_buffer_index[ta_idx] == Xindex_output[pb_in].range(31, 16))
                    Wx[ta_idx][pb_in] = Xindex_output[pb_in].range(15, 0);
                else
                    Wx[ta_idx][pb_in] = sub_result;
            } else
                // Otherwise the full one-pixel weight is used (17 bits, so 1.0 is representable).
                Wx[ta_idx][pb_in] = X_1PixelWeight.range(16, 0);
        }
    }
}

/*
 * Sums the elements of in1 with a fixed, fully unrolled binary adder tree of up to four levels
 * and stores the 32-bit result in *output.
 *
 * Supported sizes: SIZE must be a power of two in the range 1..16.
 *   - SIZE == 1 returns in1[0].
 *   - For other values each level only adds complete pairs, so a trailing unpaired element
 *     is dropped, and for SIZE >= 32 none of the levels reduces to a single element and the
 *     result falls back to in1[0].
 *
 * The per-level scratch arrays are declared with at least one element: SIZE / 2, SIZE / 4,
 * SIZE / 8 or SIZE / 16 is zero for small SIZE, and zero-length arrays are not valid ISO C++.
 * The extra element is never written or selected for the sizes where the level is empty.
 */
template <int SIZE>
void treeAdder(ap_uint<32> in1[SIZE], ap_uint<32>* output) {
// clang-format off
    #pragma HLS ARRAY_PARTITION variable=in1 complete dim=1
    #pragma HLS inline
    // clang-format on

    // Number of sums produced by each level, and the (non-zero) storage reserved for them.
    enum {
        LEVEL1_COUNT = SIZE / 2,
        LEVEL2_COUNT = SIZE / 4,
        LEVEL3_COUNT = SIZE / 8,
        LEVEL4_COUNT = SIZE / 16,
        LEVEL1_STORAGE = (LEVEL1_COUNT > 0) ? LEVEL1_COUNT : 1,
        LEVEL2_STORAGE = (LEVEL2_COUNT > 0) ? LEVEL2_COUNT : 1,
        LEVEL3_STORAGE = (LEVEL3_COUNT > 0) ? LEVEL3_COUNT : 1,
        LEVEL4_STORAGE = (LEVEL4_COUNT > 0) ? LEVEL4_COUNT : 1
    };

    ap_uint<32> add1_out[LEVEL1_STORAGE];
    ap_uint<32> add2_out[LEVEL2_STORAGE];
    ap_uint<32> add3_out[LEVEL3_STORAGE];
    ap_uint<32> add4_out[LEVEL4_STORAGE];

    // Level 1: pairwise sums of the inputs.
    if ((SIZE / 2) != 0) {
        for (ap_uint<10> idx = 0; idx < (SIZE / 2); idx++) {
// clang-format off
            #pragma HLS unroll
            // clang-format on
            add1_out[idx] = in1[2 * idx] + in1[2 * idx + 1];
        }
    }

    // Level 2: pairwise sums of level 1.
    if ((SIZE / 4) != 0) {
        for (ap_uint<10> idx = 0; idx < (SIZE / 4); idx++) {
// clang-format off
            #pragma HLS unroll
            // clang-format on
            add2_out[idx] = add1_out[2 * idx] + add1_out[2 * idx + 1];
        }
    }

    // Level 3: pairwise sums of level 2.
    if ((SIZE / 8) != 0) {
        for (ap_uint<10> idx = 0; idx < (SIZE / 8); idx++) {
// clang-format off
            #pragma HLS unroll
            // clang-format on
            add3_out[idx] = add2_out[2 * idx] + add2_out[2 * idx + 1];
        }
    }

    // Level 4: pairwise sums of level 3.
    if ((SIZE / 16) != 0) {
        for (ap_uint<10> idx = 0; idx < (SIZE / 16); idx++) {
// clang-format off
            #pragma HLS unroll
            // clang-format on
            add4_out[idx] = add3_out[2 * idx] + add3_out[2 * idx + 1];
        }
    }

    // The result is the single element of the first level that reduced to exactly one sum;
    // with no such level (SIZE == 1, or SIZE >= 32) the first input is passed through.
    ap_uint<32> add_out;
    if ((SIZE / 2) == 1)
        add_out = add1_out[0];
    else if ((SIZE / 4) == 1)
        add_out = add2_out[0];
    else if ((SIZE / 8) == 1)
        add_out = add3_out[0];
    else if ((SIZE / 16) == 1)
        add_out = add4_out[0];
    else
        add_out = in1[0];

    *output = (add_out);
}

template <int NUM_INPB, int NUM_PB, int NPC>
static void processBlock(bool inflag_TA[NUM_INPB],
                         ap_uint<8> input_1plane[NUM_INPB],
                         ap_uint<17> Wx[NUM_INPB],
                         ap_uint<17> Wy,
                         ap_uint<AREADOWN_PARTIAL_RESULT_BITS>* procBlock_out) {
    ap_uint<32> mul_out[NPC];
    for (int pixelproc = 0; pixelproc < NPC; pixelproc++) {
// clang-format off
        #pragma HLS unroll
        // clang-format on
        ap_uint<8> in_data;
        if (inflag_TA[pixelproc] == 1)
            in_data = input_1plane[pixelproc];
        else
            in_data = 0;
        //##x_mul:Q8.16 = Q1.16 x Q8.0
        ap_uint<24> x_mul = Wx[pixelproc] * in_data;
        ap_uint<24> x_mul_round = x_mul + (1 << (8 - 1));
        //##mul_out:Q8.24 = Q1.16 x Q8.8
        mul_out[pixelproc] = Wy * (x_mul_round >> 8);
    }
    //##ta_out Q8.24
    ap_uint<32> ta_out;
    treeAdder<NPC>(mul_out, &ta_out);
    ap_uint<32> ta_out_round = ta_out + (1 << ((32 - AREADOWN_PARTIAL_RESULT_BITS) - 1));
    //##procBlock_out:Q8.8
    *procBlock_out = ta_out_round >> (32 - AREADOWN_PARTIAL_RESULT_BITS);
}

/*
 * Core Processing Block
 *
 *  PixelValue = Wx0*Wy0*data0[0] + Wx1*Wy0*data0[1] + Wx2*Wy0*data0[2] + Wx3*Wy0*data0[3] + Wx4*Wy0*data0[4] +
 *               Wx0*Wy1*data1[0] + Wx1*Wy1*data1[1] + Wx2*Wy1*data1[2] + Wx3*Wy1*data1[3] + Wx4*Wy1*data1[4] +
 *               Wx0*Wy2*data2[0] + Wx1*Wy2*data2[1] + Wx2*Wy2*data2[2] + Wx3*Wy2*data2[3] + Wx4*Wy2*data2[4] +
 *               Wx0*Wy3*data3[0] + Wx1*Wy3*data3[1] + Wx2*Wy3*data3[2] + Wx3*Wy3*data3[3] + Wx4*Wy3*data3[4] +
 *               Wx0*Wy4*data4[0] + Wx1*Wy4*data4[1] + Wx2*Wy4*data4[2] + Wx3*Wy4*data4[3] + Wx4*Wy4*data4[4]
 *
 * This function evaluates the contribution of ONE input word (one row term of the sum above)
 * for every plane and every process block:
 *   - the word is unpacked into 8-bit samples, pixel-major with PLANES samples per pixel;
 *   - process blocks 0..NUM_PB-1 run processBlock with their own flags and X weights;
 *   - the extra block (index NUM_PB) multiplies only the last input pixel by
 *     Wx_for_Nplus1_Procblock and Wy, truncating (no rounding) to the partial-result width.
 *
 *   Wx, inflag_TA            - per process block / per input pixel X weights and enables.
 *   Wy                       - Y weight applied to the whole word (Q1.16).
 *   read_word                - packed input word.
 *   output_PB                - receives one partial result per plane and process block.
 *   Wx_for_Nplus1_Procblock  - X weight of the extra block.
 */
template <int NUM_INPB, int NUM_PB, int DEPTH, int WORDWIDTH, int PLANES, int NPC>
static void CoreProcessDownArea(ap_uint<17> Wx[NUM_PB][NUM_INPB],
                                ap_uint<17> Wy,
                                bool inflag_TA[NUM_PB][NUM_INPB],
                                XF_TNAME(DEPTH, NPC) read_word,
                                ap_uint<AREADOWN_PARTIAL_RESULT_BITS> output_PB[PLANES][NUM_PB + 1],
                                ap_uint<17> Wx_for_Nplus1_Procblock) {
    //## Unpack the word: sample (px, ch) occupies bits [(px*PLANES + ch)*8 + 7 : (px*PLANES + ch)*8].
    ap_uint<8> samples[PLANES][NUM_PB];
    for (int px = 0; px < NUM_PB; px++) {
// clang-format off
        #pragma HLS unroll
        // clang-format on
        for (int ch = 0; ch < PLANES; ch++) {
// clang-format off
            #pragma HLS unroll
            // clang-format on
            const int lsb = (px * PLANES + ch) * 8;
            if (px < NPC)
                samples[ch][px] = read_word.range(lsb + 7, lsb);
            else
                samples[ch][px] = 0; // slots beyond the pixels carried by the word
        }
    }

    for (int pb = 0; pb < NUM_PB + 1; pb++) {
// clang-format off
        #pragma HLS unroll
        // clang-format on
        for (int plane = 0; plane < PLANES; plane++) {
// clang-format off
            #pragma HLS unroll
            // clang-format on

            // Samples of this plane, one per input lane.
            ap_uint<8> plane_in[NUM_INPB];
            for (int lane = 0; lane < NUM_INPB; lane++) {
                plane_in[lane] = samples[plane][lane];
            }

            if (pb == NUM_PB) {
                // Extra (N+1)-th block: last input pixel only.
                //##x_mul:Q8.16 = Q1.16 x Q8.0
                ap_uint<24> last_px_wx = Wx_for_Nplus1_Procblock * plane_in[NUM_INPB - 1];
                //##mul_out:Q8.24 = Q1.16 x Q8.8
                ap_uint<32> last_px_wxy = Wy * (last_px_wx >> 8);
                output_PB[plane][pb] = last_px_wxy >> (32 - AREADOWN_PARTIAL_RESULT_BITS);
            } else {
                ap_uint<AREADOWN_PARTIAL_RESULT_BITS> block_result; // Q8.8
                processBlock<NUM_INPB, NUM_PB, NPC>(inflag_TA[pb], plane_in, Wx[pb], Wy, &block_result);
                output_PB[plane][pb] = block_result;
            }
        }
    }
}

template <int PLANES, int NUM_PB, int LOG2_PB, int DST_COLS, int DEPTH_OUTBUFFER, int NPC>
static void update_output_buffer(bool DDR_write_en,
                                 bool out_buffer_wr_en,
                                 ap_uint<32> write_index,
                                 ap_uint<16> write_index_col,
                                 unsigned short out_width,
                                 ap_uint<AREADOWN_PARTIAL_RESULT_BITS> accum_reg[PLANES][2 * NPC],
                                 ap_uint<AREADOWN_PARTIAL_RESULT_BITS> accum_reg_overlap[PLANES][2 * NPC],
                                 ap_uint<AREADOWN_PARTIAL_RESULT_BITS> ouput_buffer[PLANES][NUM_PB][DEPTH_OUTBUFFER],
                                 ap_uint<16> output_buffer_Colindex[NUM_PB + 1],
                                 ap_uint<AREADOWN_PARTIAL_RESULT_BITS> PB_out[PLANES][NUM_PB + 1],
                                 int in_col_index,
                                 ap_uint<AREADOWN_PARTIAL_RESULT_BITS> PB_out_overlap[PLANES][NUM_PB + 1],
                                 bool Yaxis_overlap_en,
                                 ap_uint<8> DDR_write_data[PLANES][NPC]) {
// clang-format off
    #pragma HLS inline
    // clang-format on
    bool output_col_index_bit0 = write_index_col[0];

    ap_uint<AREADOWN_PARTIAL_RESULT_BITS> DDR_write0_temp[PLANES][NPC];
    ap_uint<AREADOWN_PARTIAL_RESULT_BITS> DDR_write1_temp[PLANES][NPC];

    ap_uint<AREADOWN_PARTIAL_RESULT_BITS> DDR_write0_temp_overlap[PLANES][NPC];
    ap_uint<AREADOWN_PARTIAL_RESULT_BITS> DDR_write1_temp_overlap[PLANES][NPC];

    for (int plane_id = 0; plane_id < PLANES; plane_id++) {
// clang-format off
        #pragma HLS unroll
        // clang-format on

        for (ap_uint<8> accum_idx = 0, index_pixel = 0; accum_idx < 2 * NPC; accum_idx++, index_pixel++) {
// clang-format off
            #pragma HLS unroll
            // clang-format on

            ap_uint<AREADOWN_PARTIAL_RESULT_BITS> data_mux_out = 0;         //
            ap_uint<AREADOWN_PARTIAL_RESULT_BITS> data_mux_out_overlap = 0; //
            ap_uint<NUM_PB + 1> data_mux_out_status = 0;
            for (ap_uint<16> out_idx = 0; out_idx < (NUM_PB + 1); out_idx++) {
// clang-format off
                #pragma HLS unroll
                // clang-format on
                ap_uint<LOG2_PB + 1> out_index_val = output_buffer_Colindex[out_idx].range(LOG2_PB, 0);
                if (out_index_val == accum_idx) {
                    data_mux_out = PB_out[plane_id][out_idx];
                    data_mux_out_overlap = PB_out_overlap[plane_id][out_idx];
                    data_mux_out_status[out_idx] = 1;
                } else
                    data_mux_out_status[out_idx] = 0;
            }

            ap_uint<AREADOWN_PARTIAL_RESULT_BITS> data_previous;
            ap_uint<AREADOWN_PARTIAL_RESULT_BITS> data_previous_overlap;
            if (in_col_index == 0) {
                data_previous = 0;
                data_previous_overlap = 0;
            } else // if(DDR_write_en==0)
            {
                data_previous = accum_reg[plane_id][index_pixel];
                data_previous_overlap = accum_reg_overlap[plane_id][index_pixel];
            }

            ap_uint<AREADOWN_PARTIAL_RESULT_BITS> update;         // = data_mux_out + data_previous;
            ap_uint<AREADOWN_PARTIAL_RESULT_BITS> update_overlap; // = data_mux_out_overlap + data_previous_overlap;

            if (data_mux_out_status != 0) {
                update = data_mux_out + data_previous;
                update_overlap = data_mux_out_overlap + data_previous_overlap;
            } else {
                update = data_previous;
                update_overlap = data_previous_overlap;
            }

            if (((output_col_index_bit0 == 0 && accum_idx < NPC) || (output_col_index_bit0 == 1 && accum_idx >= NPC)) &&
                (DDR_write_en == 1 || out_buffer_wr_en == 1)) {
                accum_reg[plane_id][accum_idx] = 0;
                accum_reg_overlap[plane_id][accum_idx] = 0;
            } else {
                accum_reg[plane_id][accum_idx] = update;
                accum_reg_overlap[plane_id][accum_idx] = update_overlap;
            }

            if (accum_idx < NPC) {
                DDR_write0_temp[plane_id][accum_idx] = update;
                DDR_write0_temp_overlap[plane_id][accum_idx] = update_overlap;
            } else {
                DDR_write1_temp[plane_id][accum_idx - NPC] = update;
                DDR_write1_temp_overlap[plane_id][accum_idx - NPC] = update_overlap;
            }
        }
    }

    for (int plane_id = 0; plane_id < PLANES; plane_id++) {
// clang-format off
        #pragma HLS unroll
        // clang-format on
        for (ap_uint<8> index_pixel = 0; index_pixel < NPC; index_pixel++) {
// clang-format off
            #pragma HLS unroll
            // clang-format on
            ap_uint<AREADOWN_PARTIAL_RESULT_BITS> temp_sum;
            ap_uint<AREADOWN_PARTIAL_RESULT_BITS> buffer_updated_data;
            ap_uint<AREADOWN_PARTIAL_RESULT_BITS> read_buffer_data =
                ouput_buffer[plane_id][index_pixel][write_index_col];
            if (output_col_index_bit0 == 0)
                temp_sum = (read_buffer_data + DDR_write0_temp[plane_id][index_pixel]);
            else
                temp_sum = (read_buffer_data + DDR_write1_temp[plane_id][index_pixel]);

            if (DDR_write_en == 1) {
                ap_uint<16> sum_rounding = temp_sum + (1 << (AREADOWN_PARTIAL_RESULT_BITS - 9));
                DDR_write_data[plane_id][index_pixel] = sum_rounding >> (AREADOWN_PARTIAL_RESULT_BITS - 8);
                ap_uint<AREADOWN_PARTIAL_RESULT_BITS> buffer_data_temp;
                if (Yaxis_overlap_en == 1)
                    if (output_col_index_bit0 == 0)
                        buffer_data_temp = DDR_write0_temp_overlap[plane_id][index_pixel];
                    else
                        buffer_data_temp = DDR_write1_temp_overlap[plane_id][index_pixel];
                else
                    buffer_data_temp = 0;

                ouput_buffer[plane_id][index_pixel][write_index_col] = buffer_data_temp;

            } else if (out_buffer_wr_en == 1) {
                ouput_buffer[plane_id][index_pixel][write_index_col] = temp_sum;
            }
        }
    }
}

template <int SRC_ROWS,
          int SRC_COLS,
          int PLANES,
          int DEPTH,
          int NPC,
          int WORDWIDTH,
          int DST_ROWS,
          int DST_COLS,
          int SRC_TC,
          int DST_TC>
void xFResizeAreaDownScale(xf::cv::Mat<DEPTH, SRC_ROWS, SRC_COLS, NPC>& stream_in,
                           xf::cv::Mat<DEPTH, DST_ROWS, DST_COLS, NPC>& resize_out) {
    unsigned short height = stream_in.rows;
    unsigned short width = stream_in.cols;
    unsigned short out_height = resize_out.rows;
    unsigned short out_width = resize_out.cols;

    unsigned short imgInput_ncpr = (width + (NPC - 1)) >> XF_BITSHIFT(NPC);
    unsigned short imgOutput_ncpr = (out_width + (NPC - 1)) >> XF_BITSHIFT(NPC);
    unsigned short imgOutput_width_align_npc = imgOutput_ncpr << XF_BITSHIFT(NPC);
    unsigned short in_col_loop_bound;
    if (imgOutput_width_align_npc != out_width)
        in_col_loop_bound = imgInput_ncpr + 1;
    else
        in_col_loop_bound = imgInput_ncpr;

    enum { NUM_PB = NPC, NUM_INPB = NPC, LOG2_PB = XF_BITSHIFT(NPC) };

    ap_uint<32> Xscale, Yscale; // Q16.16 format
    Xscale = xFUdivResizeDownArea((width), (out_width));
    Yscale = xFUdivResizeDownArea(height, out_height);
    ap_uint<32> X_1PixelWeight, Y_1PixelWeight; // Q16.16 format
    X_1PixelWeight = xFUdivResizeDownArea(out_width, width);
    Y_1PixelWeight = xFUdivResizeDownArea(out_height, height);

    //## X-direction output index(Q16.16), which is used for each Process block output.
    ap_uint<32> Xindex_output[NUM_PB];
// clang-format off
    #pragma HLS ARRAY_PARTITION variable=Xindex_output complete dim=0
    // clang-format on
    ap_uint<32> Xindex_output_initial[NUM_PB];
// clang-format off
    #pragma HLS ARRAY_PARTITION variable=Xindex_output_initial complete dim=0
    // clang-format on

    //## input flag for each input of last process block.
    //## TRUE - input data is mappped to multiplier
    //## FALSE- multiplier input is zero
    bool inflag_TA_prev[NUM_INPB];
// clang-format off
    #pragma HLS ARRAY_PARTITION variable=inflag_TA_prev complete dim=0
    // clang-format on
    for (ap_uint<8> idx = 1; idx <= NUM_PB; idx++) {
// clang-format off
        #pragma HLS pipeline
        // clang-format on
        Xindex_output_initial[idx - 1] = X_1PixelWeight * idx;
        //      inflag_TA_prev[idx-1] = true;
    }

    ap_uint<32> Xindex_output_initial_next;
    Xindex_output_initial_next = X_1PixelWeight * (1 + NUM_PB);

    ap_uint<32> Xindex_output_next;

    //## Y-direction output index(Q16.16)
    ap_uint<32> Yindex_output = Y_1PixelWeight;
    ap_uint<32> Yindex_output_prev = 0;

    //## skip_count Q16.0, it is used for mapping input data to process block
    ap_uint<16> skip_count = 0;

    //##DDR index
    uint32_t read_index = 0;
    ap_uint<32> write_index = 0;
    ap_uint<16> write_col_index = 0;

    //## overlap flag in Y-direction
    bool Yaxis_overlap_en = 0;
    bool Yaxis_overlap_nextrow_en = 0;
    bool Yaxis_overlap_prevrow_en = 0;

    enum { DEPTH_OUTBUFFER = (DST_COLS + NPC - 1) / NPC };

    //## output buffer
    ap_uint<AREADOWN_PARTIAL_RESULT_BITS> ouput_buffer[PLANES][NUM_PB][DEPTH_OUTBUFFER];
// clang-format off
    #pragma HLS ARRAY_PARTITION variable=ouput_buffer complete dim=1
    #pragma HLS ARRAY_PARTITION variable=ouput_buffer complete dim=2
    // clang-format on

    ap_uint<AREADOWN_PARTIAL_RESULT_BITS> accum_reg[PLANES][NPC * 2];
// clang-format off
    #pragma HLS ARRAY_PARTITION variable=accum_reg complete dim=0
    // clang-format on
    ap_uint<AREADOWN_PARTIAL_RESULT_BITS> accum_reg_overlap[PLANES][NPC * 2];
// clang-format off
    #pragma HLS ARRAY_PARTITION variable=accum_reg_overlap complete dim=0
    // clang-format on
    for (int dim2 = 0; dim2 < NPC * 2; dim2++) {
// clang-format off
        #pragma HLS unroll
        #pragma HLS unroll
        // clang-format on
        for (int dim1 = 0; dim1 < PLANES; dim1++) {
// clang-format off
            #pragma HLS unroll
            // clang-format on
            accum_reg[dim1][dim2] = 0;
            accum_reg_overlap[dim1][dim2] = 0;
        }
    }

    for (ap_uint<16> dim3 = 0; dim3 < DEPTH_OUTBUFFER; dim3++) {
// clang-format off
        #pragma HLS pipeline
        // clang-format on
        for (int dim2 = 0; dim2 < NUM_PB; dim2++) {
// clang-format off
            #pragma HLS unroll
            // clang-format on
            for (int dim1 = 0; dim1 < PLANES; dim1++) {
// clang-format off
                #pragma HLS unroll
                // clang-format on
                ouput_buffer[dim1][dim2][dim3] = 0;
            }
        }
    }

    int out_col_index = 0;
    ap_uint<16> output_row_index_for_pingpong = 0; // Q16.0
    bool prev_output_row_index_for_pingpong_bit0 = 0;
    ap_uint<16> ouput_index_write_counter = NPC;

    XF_TNAME(DEPTH, NPC) read_word;

    int display_write_per_row = 0;
    int display_write_rowID = 0;

LOOP_ROW:
    for (ap_uint<16> row_index = 0; row_index < height; row_index++) {
// clang-format off
        #pragma HLS LOOP_TRIPCOUNT min=1 max=SRC_ROWS
    // clang-format on

    LOOP_COL:
        for (int col_index = 0, col_index_next = 1; col_index < in_col_loop_bound; col_index++, col_index_next++) {
// clang-format off
            #pragma HLS LOOP_TRIPCOUNT min=1 max=SRC_TC
            #pragma HLS pipeline
            #pragma HLS DEPENDENCE variable=ouput_buffer inter false
            // clang-format on

            for (int idx = 0; idx < NUM_PB; idx++) {
// clang-format off
                #pragma HLS unroll
                // clang-format on
                if (col_index == 0) {
                    Xindex_output[idx] = Xindex_output_initial[idx];

                    inflag_TA_prev[idx] = true;
                } else {
                    if (NPC == 1)
                        Xindex_output[idx] += X_1PixelWeight;
                    else
                        Xindex_output[idx] += (X_1PixelWeight * NUM_PB);
                }
            }
            if (col_index == 0) {
                Xindex_output_next = Xindex_output_initial_next;
            } else {
                if (NPC == 1)
                    Xindex_output_next += X_1PixelWeight;
                else
                    Xindex_output_next += (X_1PixelWeight * NUM_PB);
            }

            ap_uint<16> output_buffer_index[NUM_PB + 1];
// clang-format off
            #pragma HLS ARRAY_PARTITION variable=output_buffer_index complete dim=0
            // clang-format on
            //## input flag for each input of process block.
            bool inflag_TA[NUM_PB][NUM_INPB];
// clang-format off
            #pragma HLS ARRAY_PARTITION variable=inflag_TA complete dim=0
            // clang-format on
            bool inflag_for_Nplus1_Procblock;
            ap_uint<17> Wx_for_Nplus1_Procblock;
            ap_uint<17> Wx[NUM_PB][NUM_INPB]; // Q1.16
// clang-format off
            #pragma HLS ARRAY_PARTITION variable=Wx complete dim=0
            // clang-format on

            bool DDR_wr_en;
            bool out_buffer_wr_en;

            ap_uint<16> output_buffer_index_next_out;
            flag_index_generator<NUM_PB, LOG2_PB, NUM_PB, NPC>(
                Xscale, X_1PixelWeight, Y_1PixelWeight, row_index, col_index, Xindex_output, &Xindex_output_next,
                output_buffer_index, inflag_TA, &skip_count, Wx, &inflag_for_Nplus1_Procblock, &Wx_for_Nplus1_Procblock,
                &DDR_wr_en, &out_buffer_wr_en, Yaxis_overlap_en, Yindex_output, Yindex_output_prev,
                ouput_index_write_counter, height, width, in_col_loop_bound, &output_buffer_index_next_out);

            if (col_index == (in_col_loop_bound)-1)
                ouput_index_write_counter = NPC;
            else if (ouput_index_write_counter <= output_buffer_index_next_out)
                ouput_index_write_counter += NPC;

            if (col_index < imgInput_ncpr) read_word = stream_in.read(read_index++);

            // TODO: Wy weight generation
            ap_uint<17> Wy0, Wy1; // Q1.16
            if (Yaxis_overlap_en == 1) {
                Wy0 = Y_1PixelWeight.range(15, 0) - Yindex_output.range(15, 0);
                Wy1 = Yindex_output.range(15, 0);
            } else {
                Wy0 = Y_1PixelWeight;
                Wy1 = 0;
            }
            //## output data of each process block
            ap_uint<AREADOWN_PARTIAL_RESULT_BITS> PB_out[PLANES][NUM_PB + 1]; // Q8.8
// clang-format off
            #pragma HLS ARRAY_PARTITION variable=PB_out complete dim=0
            // clang-format on
            ap_uint<AREADOWN_PARTIAL_RESULT_BITS> PB_out_overlap[PLANES][NUM_PB + 1]; // Q8.8
// clang-format off
            #pragma HLS ARRAY_PARTITION variable=PB_out_overlap complete dim=0
            // clang-format on

            //## CoreProcess has "NUM_PB" process blocks. Each process block has "NUM_INPB" 3-input multipliers and a
            // tree adder that accumulates the multiplier outputs.
            CoreProcessDownArea<NUM_INPB, NUM_PB, DEPTH, WORDWIDTH, PLANES, NPC>(Wx, Wy0, inflag_TA, read_word, PB_out,
                                                                                 Wx_for_Nplus1_Procblock);

            //## Extra CoreProcess call, weighted by Wy1, to process the next output in case of overlap.
            CoreProcessDownArea<NUM_INPB, NUM_PB, DEPTH, WORDWIDTH, PLANES, NPC>(
                Wx, Wy1, inflag_TA, read_word, PB_out_overlap, Wx_for_Nplus1_Procblock);

            ap_uint<8> DDR_write_data[PLANES][NPC];
// clang-format off
            #pragma HLS ARRAY_PARTITION variable=DDR_write_data complete dim=0
            // clang-format on
            update_output_buffer<PLANES, NUM_PB, LOG2_PB, DST_COLS, DEPTH_OUTBUFFER, NPC>(
                DDR_wr_en, out_buffer_wr_en, write_index, write_col_index, out_width, accum_reg, accum_reg_overlap,
                ouput_buffer, output_buffer_index, PB_out, col_index, PB_out_overlap, Yaxis_overlap_en, DDR_write_data);

            if (DDR_wr_en == 1) {
                display_write_per_row++;
                XF_TNAME(DEPTH, NPC) out_pix;
                ap_uint<PLANES * 8> plane_tmp;
                for (int pixel = 0, bit1 = 0; pixel < NPC; pixel++, bit1 += (PLANES * 8)) {
// clang-format off
                    #pragma HLS unroll
                    // clang-format on
                    for (int channel = 0, bit2 = 0; channel < PLANES; channel++, bit2 += 8) {
// clang-format off
                        #pragma HLS unroll
                        // clang-format on
                        plane_tmp.range(bit2 + 7, bit2) = DDR_write_data[channel][pixel];
                    }
                    out_pix.range(bit1 + (PLANES * 8) - 1, bit1) = plane_tmp;
                }
                if (out_col_index < imgOutput_ncpr) resize_out.write(write_index++, out_pix);

                if (col_index == ((in_col_loop_bound)-1))
                    out_col_index = 0;
                else
                    out_col_index++;
            }

            if (col_index == ((in_col_loop_bound)-1))
                write_col_index = 0;
            else if (out_buffer_wr_en)
                write_col_index++;

            // On the last iteration of the col loop, advance both Y indices by Y_1PixelWeight.
            if (col_index == ((in_col_loop_bound)-1)) {
                Yindex_output += Y_1PixelWeight;
                Yindex_output_prev += Y_1PixelWeight;
            }

            int t1 = Yindex_output.range(15, 0);
            int t2 = Y_1PixelWeight;
            int t3 = Yindex_output;

            ap_uint<32> Yindex_threshold = Yindex_output - 0x41;

            if (col_index == ((in_col_loop_bound)-1)) Yaxis_overlap_prevrow_en = Yaxis_overlap_en;

            if (Yindex_output.range(15, 0) < Y_1PixelWeight && (Y_1PixelWeight - Yindex_output.range(15, 0)) > 0x41 &&
                Y_1PixelWeight[16] == 0)
                Yaxis_overlap_en = 1;
            else
                Yaxis_overlap_en = 0;

            if (col_index == ((in_col_loop_bound)-1))
                prev_output_row_index_for_pingpong_bit0 = output_row_index_for_pingpong[0];

            if (Yaxis_overlap_en == 0) {
                if ((Yindex_output.range(15, 0) < Y_1PixelWeight) && (Yindex_output.range(15, 0) > 0x41) &&
                    (col_index == ((in_col_loop_bound)-1)))
                    output_row_index_for_pingpong = Yindex_threshold.range(31, 16) - 1;
                else
                    output_row_index_for_pingpong = Yindex_threshold.range(31, 16);
            }
        } // col loop
    }     // row loop
}

#endif //_XF_RESIZE_DOWN_AREA_