.. _program_listing_file__tmp_ws_src_vitis_common_include_imgproc_xf_resize_down_area.hpp: Program Listing for File xf_resize_down_area.hpp ================================================ |exhale_lsh| :ref:`Return to documentation for file ` (``/tmp/ws/src/vitis_common/include/imgproc/xf_resize_down_area.hpp``) .. |exhale_lsh| unicode:: U+021B0 .. UPWARDS ARROW WITH TIP LEFTWARDS .. code-block:: cpp /* * Copyright 2019 Xilinx, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef _XF_RESIZE_DOWN_AREA_ #define _XF_RESIZE_DOWN_AREA_ #include "hls_stream.h" #include "ap_int.h" #include "../common/xf_common.hpp" #include "../core/xf_math.h" #include "../common/xf_utility.hpp" #define AREADOWN_PARTIAL_RESULT_BITS 16 /* xFUdivResizeDownArea: returns uint32_t(in_n) * POW16 / in_d as a 32-bit value. POW16 is defined outside this listing; the caller below labels the result as Q16.16. There is no guard for in_d == 0. */ static uint32_t xFUdivResizeDownArea(unsigned short in_n, unsigned short in_d) { uint32_t out_div = uint32_t(in_n) * POW16 / in_d; return out_div; } /* flag_index_generator: per input word, derives the control signals for the datapath. Fills output_buffer_index[] (start index plus offset), inflag_TA[][] (which input feeds which process block) and Wx[][] (per-input X weights), and writes *inflag_for_Nplus1_Procblock, *Wx_for_Nplus1_Procblock, *DDR_wr_en, *out_buffer_wr_en and *output_buffer_index_next_out. The constant 0x41 (65) is the tolerance applied to the fractional comparisons; Xscale == 0x10000 selects the direct mapping pb_in == ta_idx. skip_count and in_width are not referenced in the visible body. NOTE(review): the template parameter list is missing in this listing (NUM_PB, NUM_INPB and NPC are used unqualified) and part of the text after skip_count_tmp_opt looks truncated - restore from the original header. */ template static void flag_index_generator(ap_uint<32> Xscale, ap_uint<32> X_1PixelWeight, ap_uint<32> Y_1PixelWeight, ap_uint<16> row_index, int in_col_index, ap_uint<32> Xindex_output[NUM_PB], ap_uint<32>* Xindex_output_next, ap_uint<16> output_buffer_index[NUM_PB + 1], bool inflag_TA[NUM_PB][NUM_INPB], ap_uint<16>* skip_count, ap_uint<17> Wx[NUM_PB][NUM_INPB], bool* inflag_for_Nplus1_Procblock, ap_uint<17>* Wx_for_Nplus1_Procblock, bool* DDR_wr_en, bool* out_buffer_wr_en, bool Yaxis_overlap_en, ap_uint<32> Yindex_output, ap_uint<32> Yindex_output_prev, ap_uint<16> ouput_index_write_counter, unsigned short 
in_height, unsigned short in_width, unsigned short inImg_ncpr, ap_uint<16>* output_buffer_index_next_out) { // clang-format off #pragma HLS inline // clang-format on ap_int<16> skip_count_tmp; ap_int<16> skip_count_tmp_opt; skip_count_tmp_opt = Xindex_output[0].range(31, 16); // - (Xindex_output[0].range(15,0) output_buffer_index_start; if (Xscale == 0x10000) output_buffer_index_start = in_col_index * NPC; else { ap_uint<16> index_fract_value = Xindex_output[0].range(15, 0); ap_uint<16> weight_value = X_1PixelWeight.range(15, 0); ap_uint<16> sub_value = weight_value - index_fract_value; if (index_fract_value < weight_value && sub_value > 0x41) output_buffer_index_start = Xindex_output[0].range(31, 16) - 1; else output_buffer_index_start = Xindex_output[0].range(31, 16); } for (int pb_in = 0; pb_in < NUM_INPB + 1; pb_in++) { output_buffer_index[pb_in] = output_buffer_index_start + pb_in; } ap_uint<16> output_buffer_index_next; if (Xscale == 0x10000) output_buffer_index_next = (in_col_index + 1) * NPC; else { ap_uint<16> index_fract_value = Xindex_output_next[0].range(15, 0); ap_uint<16> weight_value = X_1PixelWeight.range(15, 0); ap_uint<16> sub_value = weight_value - index_fract_value; if (index_fract_value < weight_value && sub_value > 0x41) output_buffer_index_next = Xindex_output_next[0].range(31, 16) - 1; else output_buffer_index_next = Xindex_output_next[0].range(31, 16); } *output_buffer_index_next_out = output_buffer_index_next; ap_uint<16> int_bits_Xindex_out_previous; if (in_col_index == 0) int_bits_Xindex_out_previous = 0; else int_bits_Xindex_out_previous = (Xindex_output[0] - X_1PixelWeight - 0x41) >> 16; ap_uint<16> fract_bits_Xindex_out_previous; if (in_col_index == 0) fract_bits_Xindex_out_previous = 0; else fract_bits_Xindex_out_previous = (ap_uint<16>)(Xindex_output[0] - X_1PixelWeight); for (int ta_idx = 0; ta_idx < NUM_PB; ta_idx++) { // clang-format off #pragma HLS unroll // clang-format on for (int pb_in = 0; pb_in < NUM_INPB; pb_in++) { // 
clang-format off #pragma HLS unroll // clang-format on ap_uint<16> int_bits_Xindex_out = (Xindex_output[pb_in] - 0x41) >> 16; ap_uint<16> fract_bits_Xindex_out = Xindex_output[pb_in].range(15, 0); ap_uint<16> int_bits_Xindex_out_min1; // = (Xindex_output[pb_in-1]-0x41)>>16; ap_uint<16> fract_bits_Xindex_out_min1; // = Xindex_output[pb_in-1].range(15,0); if (pb_in == 0) { int_bits_Xindex_out_min1 = 0; fract_bits_Xindex_out_min1 = 0; } else { int_bits_Xindex_out_min1 = (Xindex_output[pb_in - 1] - 0x41) >> 16; fract_bits_Xindex_out_min1 = Xindex_output[pb_in - 1].range(15, 0); } ap_uint<16> index_value = output_buffer_index[ta_idx]; bool t1 = index_value == int_bits_Xindex_out; bool t2 = ((int_bits_Xindex_out - int_bits_Xindex_out_previous) == 1) && (pb_in == 0) && (index_value == int_bits_Xindex_out - 1) && fract_bits_Xindex_out < X_1PixelWeight && ((X_1PixelWeight - fract_bits_Xindex_out) > 0x41); bool t3 = ((int_bits_Xindex_out - int_bits_Xindex_out_min1) == 1) && (pb_in > 0) && (index_value == int_bits_Xindex_out - 1) && fract_bits_Xindex_out < X_1PixelWeight && ((X_1PixelWeight - fract_bits_Xindex_out) > 0x41); if (((t1 || t2 || t3) && (Xscale != 0x10000)) || ((Xscale == 0x10000) && (pb_in == ta_idx))) inflag_TA[ta_idx][pb_in] = 1; else inflag_TA[ta_idx][pb_in] = 0; } } ap_uint<32> input_index__for_Nplus1_Procblock = output_buffer_index_next * Xscale; ap_uint<16> intBits_input_index__for_Nplus1_Procblock = input_index__for_Nplus1_Procblock.range(31, 16); ap_uint<16> col_idx_x_NPC = (in_col_index + 1) * NPC; ap_uint<32> Xindex_for_Nplus1_Procblock = Xindex_output[NUM_PB - 1]; //+X_1PixelWeight; ap_uint<32> overlap_next_pixel = (((ap_uint<32>)output_buffer_index_next) << 16) - Xindex_output[NUM_PB - 1]; if (NPC != 1) { // x scale is less than 1.5, then N+1 output pixel(including partial output) can be generated using N input // pixel if( (output_buffer_index_start+4) == output_buffer_index_next && Xscale!=65536 && Xscale<98304 && // overlap_next_pixel>0x41){ if 
((output_buffer_index_start + NPC) == output_buffer_index_next && Xscale != 65536 && Xscale < 98304 && overlap_next_pixel > 0x41) { *inflag_for_Nplus1_Procblock = 1; *Wx_for_Nplus1_Procblock = Xindex_for_Nplus1_Procblock.range(15, 0); } else { *inflag_for_Nplus1_Procblock = 0; *Wx_for_Nplus1_Procblock = 0; } } else { if ((output_buffer_index_start + NPC) == output_buffer_index_next && overlap_next_pixel > 0x41) { *inflag_for_Nplus1_Procblock = 1; *Wx_for_Nplus1_Procblock = Xindex_for_Nplus1_Procblock.range(15, 0); } else { *inflag_for_Nplus1_Procblock = 0; *Wx_for_Nplus1_Procblock = 0; } } ap_uint<32> Yindex_output_tmp = Yindex_output; // - 0x41; ap_uint<32> overlap_with_next_row = 0x10000 - Yindex_output.range(15, 0); ap_uint<32> overlap_with_prev_row = 0x10000 - Yindex_output_prev.range(15, 0); ap_uint<32> Yindex_output_prev_tmp = Yindex_output_prev; // - 0x41; bool t1 = (ouput_index_write_counter <= output_buffer_index_next); bool t2 = Yaxis_overlap_en == 1; bool t3 = (Yindex_output_tmp.range(31, 16) != Yindex_output_prev_tmp.range(31, 16)); bool if_test = t1 && (t2 || t3); int current_Yidx_int = Yindex_output_tmp.range(31, 16); int next_Yidx_int = Yindex_output_prev_tmp.range(31, 16); bool scale1_en = X_1PixelWeight[16] == 1; bool write_en_pixel_in_same_row = (ouput_index_write_counter <= output_buffer_index_next); bool overlap_en_next_row = (overlap_with_next_row > 0x41); bool overlap_en_prev_row = (overlap_with_prev_row > 0x41); bool output_row_en = (Yaxis_overlap_en == 1 || (Yindex_output_tmp.range(31, 16) != Yindex_output_prev_tmp.range(31, 16))); //## Yindex precision error when current index value is close to integer number.. like 8.999928 is close // to 9.00000 //## precision error up to 10^-2 is accepted. 
ap_uint<32> Yindex_output_precision_error = 0x10000 - Yindex_output.range(15, 0); ap_uint<32> Yindex_output_prev_precision_error = 0x10000 - Yindex_output_prev.range(15, 0); bool DDR_wr_en_tmp; if ((((Yaxis_overlap_en == 1 || (Yindex_output_tmp.range(31, 16) != Yindex_output_prev_tmp.range(31, 16)))) || (Yindex_output_precision_error <= 65)) || row_index == (in_height - 1)) { if ((ouput_index_write_counter <= output_buffer_index_next || (in_col_index == (inImg_ncpr)-1)) && (Yindex_output_prev_precision_error > 65)) DDR_wr_en_tmp = 1; else DDR_wr_en_tmp = 0; } else { DDR_wr_en_tmp = 0; } if (X_1PixelWeight[16] == 1 && Y_1PixelWeight[16] == 1) *DDR_wr_en = 1; else *DDR_wr_en = DDR_wr_en_tmp; if ((X_1PixelWeight[16] == 1) || (ouput_index_write_counter <= output_buffer_index_next) || (in_col_index == (inImg_ncpr)-1)) { *out_buffer_wr_en = 1; } else { *out_buffer_wr_en = 0; } for (int ta_idx = 0; ta_idx < NUM_PB; ta_idx++) { // clang-format off #pragma HLS unroll // clang-format on for (int pb_in = 0; pb_in < NUM_INPB; pb_in++) { // clang-format off #pragma HLS unroll // clang-format on bool rangeA_0_to_scale = Xindex_output[pb_in].range(15, 0) <= X_1PixelWeight.range(15, 0); // Q0.16 ap_uint<16> sub_result = X_1PixelWeight.range(15, 0) - Xindex_output[pb_in].range(15, 0); if (rangeA_0_to_scale == true && Xscale != 0x10000) { // if(int_bits_wo_th_for_Wx[pb_in].range(LOG2_PB-1,0) == ta_idx) if (output_buffer_index[ta_idx] == Xindex_output[pb_in].range(31, 16)) Wx[ta_idx][pb_in] = Xindex_output[pb_in].range(15, 0); else Wx[ta_idx][pb_in] = sub_result; } else Wx[ta_idx][pb_in] = X_1PixelWeight.range(16, 0); } } } /* treeAdder: pairwise adder tree over in1[SIZE], result written to *output. Up to four stages (SIZE/2, SIZE/4, SIZE/8 and SIZE/16 adders) are built and the stage whose length is 1 is selected, otherwise in1[0] is used; a complete sum is therefore produced only for SIZE of 1, 2, 4, 8 or 16. NOTE(review): the template parameter list is missing in this listing. */ template void treeAdder(ap_uint<32> in1[SIZE], ap_uint<32>* output) { // clang-format off #pragma HLS ARRAY_PARTITION variable=in1 complete dim=1 #pragma HLS inline // clang-format on ap_uint<32> add1_out[SIZE / 2]; ap_uint<32> add2_out[SIZE / 4]; ap_uint<32> add3_out[SIZE / 8]; ap_uint<32> add4_out[SIZE / 16]; if ((SIZE / 2) != 0) { for (ap_uint<10> idx = 0; idx 
< (SIZE / 2); idx++) { // clang-format off #pragma HLS unroll // clang-format on add1_out[idx] = in1[2 * idx] + in1[2 * idx + 1]; } } if ((SIZE / 4) != 0) { for (ap_uint<10> idx = 0; idx < (SIZE / 4); idx++) { // clang-format off #pragma HLS unroll // clang-format on add2_out[idx] = add1_out[2 * idx] + add1_out[2 * idx + 1]; } } if ((SIZE / 8) != 0) { for (ap_uint<10> idx = 0; idx < (SIZE / 8); idx++) { // clang-format off #pragma HLS unroll // clang-format on add3_out[idx] = add2_out[2 * idx] + add2_out[2 * idx + 1]; } } if ((SIZE / 16) != 0) { for (ap_uint<10> idx = 0; idx < (SIZE / 16); idx++) { // clang-format off #pragma HLS unroll // clang-format on add4_out[idx] = add3_out[2 * idx] + add3_out[2 * idx + 1]; } } ap_uint<32> add_out; if ((SIZE / 2) == 1) add_out = add1_out[0]; else if ((SIZE / 4) == 1) add_out = add2_out[0]; else if ((SIZE / 8) == 1) add_out = add3_out[0]; else if ((SIZE / 16) == 1) add_out = add4_out[0]; else add_out = in1[0]; *output = (add_out); } /* processBlock: one weighted accumulation. Each of the NPC inputs is replaced by 0 unless its inflag_TA entry is set, multiplied by Wx, rounded (add 1 << 7) and shifted right by 8, then multiplied by Wy; the products are summed by treeAdder, rounded, and shifted right by 32 - AREADOWN_PARTIAL_RESULT_BITS into *procBlock_out. NOTE(review): template parameters and the ap_uint width of procBlock_out are missing in this listing. */ template static void processBlock(bool inflag_TA[NUM_INPB], ap_uint<8> input_1plane[NUM_INPB], ap_uint<17> Wx[NUM_INPB], ap_uint<17> Wy, ap_uint* procBlock_out) { ap_uint<32> mul_out[NPC]; for (int pixelproc = 0; pixelproc < NPC; pixelproc++) { // clang-format off #pragma HLS unroll // clang-format on ap_uint<8> in_data; if (inflag_TA[pixelproc] == 1) in_data = input_1plane[pixelproc]; else in_data = 0; //##x_mul:Q8.16 = Q1.16 x Q8.0 ap_uint<24> x_mul = Wx[pixelproc] * in_data; ap_uint<24> x_mul_round = x_mul + (1 << (8 - 1)); //##mul_out:Q8.24 = Q1.16 x Q8.8 mul_out[pixelproc] = Wy * (x_mul_round >> 8); } //##ta_out Q8.24 ap_uint<32> ta_out; treeAdder(mul_out, &ta_out); ap_uint<32> ta_out_round = ta_out + (1 << ((32 - AREADOWN_PARTIAL_RESULT_BITS) - 1)); //##procBlock_out:Q8.8 *procBlock_out = ta_out_round >> (32 - AREADOWN_PARTIAL_RESULT_BITS); } /* * Core Processing Block * * PixelValue = Wx0*Wy0*data0[0] + Wx1*Wy0*data0[1] + Wx2*Wy0*data0[2] + Wx3*Wy0*data0[3] + Wx4*Wy0*data0[4] + * 
Wx0*Wy1*data1[0] + Wx1*Wy1*data1[1] + Wx2*Wy1*data1[2] + Wx3*Wy1*data1[3] + Wx4*Wy1*data1[4] + * Wx0*Wy2*data2[0] + Wx1*Wy2*data2[1] + Wx2*Wy2*data2[2] + Wx3*Wy2*data2[3] + Wx4*Wy2*data2[4] + * Wx0*Wy3*data3[0] + Wx1*Wy3*data3[1] + Wx2*Wy3*data3[2] + Wx3*Wy3*data3[3] + Wx4*Wy3*data3[4] + * Wx0*Wy4*data4[0] + Wx1*Wy4*data4[1] + Wx2*Wy4*data4[2] + Wx3*Wy4*data4[3] +; Wx4*Wy4*data4[4] + */ /* CoreProcessDownArea: unpacks read_word into 8-bit samples per plane and pixel (0 for pixel >= NPC), then for every plane runs processBlock for process blocks 0 .. NUM_PB-1 and stores the results in output_PB. The extra block at index NUM_PB uses only the last input sample with Wx_for_Nplus1_Procblock and Wy, and truncates instead of rounding. NOTE(review): template parameters and some ap_uint widths are missing in this listing. */ template static void CoreProcessDownArea(ap_uint<17> Wx[NUM_PB][NUM_INPB], ap_uint<17> Wy, bool inflag_TA[NUM_PB][NUM_INPB], XF_TNAME(DEPTH, NPC) read_word, ap_uint output_PB[PLANES][NUM_PB + 1], ap_uint<17> Wx_for_Nplus1_Procblock) { ap_uint<8> read_word_extract[PLANES][NUM_PB]; for (int pixel = 0, bit1 = 0; pixel < NUM_PB; pixel++, bit1 += (PLANES * 8)) { // clang-format off #pragma HLS unroll // clang-format on for (int channel = 0, bit2 = 0; channel < PLANES; channel++, bit2 += 8) { // clang-format off #pragma HLS unroll // clang-format on if (pixel < NPC) read_word_extract[channel][pixel] = read_word.range(bit1 + (bit2 + 7), bit1 + bit2); else read_word_extract[channel][pixel] = 0; // fprintf(stderr,"\n.range( %d,%d )",bit1+(bit2+7),bit1+bit2); } } for (int procblock_index = 0; procblock_index < NUM_PB + 1; procblock_index++) { // clang-format off #pragma HLS unroll // clang-format on for (int plane_index = 0, bit = 0; plane_index < PLANES; plane_index++, bit += 8) { // clang-format off #pragma HLS unroll // clang-format on ap_uint<8> input_1plane[NUM_INPB]; for (int in_index = 0; in_index < NUM_INPB; in_index++) { input_1plane[in_index] = read_word_extract[plane_index][in_index]; } if (procblock_index != NUM_PB) { ap_uint procBlock_out; // Q8.8 processBlock(inflag_TA[procblock_index], input_1plane, Wx[procblock_index], Wy, &procBlock_out); output_PB[plane_index][procblock_index] = procBlock_out; } else { // if(NPC!=1) { //##x_mul:Q8.16 = Q1.16 x Q8.0 ap_uint<24> x_mul = Wx_for_Nplus1_Procblock * input_1plane[NUM_INPB - 1]; //##mul_out:Q8.24 = Q1.16 x Q8.8 ap_uint<32> mul_out = 
Wy * (x_mul >> 8); output_PB[plane_index][procblock_index] = mul_out >> (32 - AREADOWN_PARTIAL_RESULT_BITS); // fprintf(stderr,"\n last PB: in x Wx x Wy = %d x %f x %f = %f", //(int)input_1plane[NUM_INPB-1],(float)Wx_for_Nplus1_Procblock/(float)(1<<16), //(float)Wy/(float)(1<<16), (float)mul_out/(float)(1<<24)); } } } } } /* update_output_buffer: for each plane, routes PB_out and PB_out_overlap entries into the 2 * NPC accumulator slots selected by the low LOG2_PB + 1 bits of output_buffer_Colindex, adding them to the previous accumulator value (taken as 0 when in_col_index == 0). The accumulator half selected by bit 0 of write_index_col is cleared when DDR_write_en or out_buffer_wr_en is set. That half is added to ouput_buffer[plane][pixel][write_index_col]; with DDR_write_en the sum is rounded to 8 bits into DDR_write_data and the buffer entry is reloaded with the overlap value (when Yaxis_overlap_en) or 0, otherwise with out_buffer_wr_en the sum is stored back. write_index and out_width are not referenced in the visible body. NOTE(review): template parameters, several ap_uint widths and some declarations appear to be missing in this listing. */ template static void update_output_buffer(bool DDR_write_en, bool out_buffer_wr_en, ap_uint<32> write_index, ap_uint<16> write_index_col, unsigned short out_width, ap_uint accum_reg[PLANES][2 * NPC], ap_uint accum_reg_overlap[PLANES][2 * NPC], ap_uint ouput_buffer[PLANES][NUM_PB][DEPTH_OUTBUFFER], ap_uint<16> output_buffer_Colindex[NUM_PB + 1], ap_uint PB_out[PLANES][NUM_PB + 1], int in_col_index, ap_uint PB_out_overlap[PLANES][NUM_PB + 1], bool Yaxis_overlap_en, ap_uint<8> DDR_write_data[PLANES][NPC]) { // clang-format off #pragma HLS inline // clang-format on bool output_col_index_bit0 = write_index_col[0]; ap_uint DDR_write0_temp[PLANES][NPC]; ap_uint DDR_write1_temp[PLANES][NPC]; ap_uint DDR_write0_temp_overlap[PLANES][NPC]; ap_uint DDR_write1_temp_overlap[PLANES][NPC]; for (int plane_id = 0; plane_id < PLANES; plane_id++) { // clang-format off #pragma HLS unroll // clang-format on for (ap_uint<8> accum_idx = 0, index_pixel = 0; accum_idx < 2 * NPC; accum_idx++, index_pixel++) { // clang-format off #pragma HLS unroll // clang-format on ap_uint data_mux_out = 0; // ap_uint data_mux_out_overlap = 0; // ap_uint data_mux_out_status = 0; for (ap_uint<16> out_idx = 0; out_idx < (NUM_PB + 1); out_idx++) { // clang-format off #pragma HLS unroll // clang-format on ap_uint out_index_val = output_buffer_Colindex[out_idx].range(LOG2_PB, 0); if (out_index_val == accum_idx) { data_mux_out = PB_out[plane_id][out_idx]; data_mux_out_overlap = PB_out_overlap[plane_id][out_idx]; data_mux_out_status[out_idx] = 1; } else data_mux_out_status[out_idx] = 0; } ap_uint data_previous; ap_uint data_previous_overlap; if (in_col_index == 0) { data_previous = 0; 
data_previous_overlap = 0; } else // if(DDR_write_en==0) { data_previous = accum_reg[plane_id][index_pixel]; data_previous_overlap = accum_reg_overlap[plane_id][index_pixel]; } ap_uint update; // = data_mux_out + data_previous; ap_uint update_overlap; // = data_mux_out_overlap + data_previous_overlap; if (data_mux_out_status != 0) { update = data_mux_out + data_previous; update_overlap = data_mux_out_overlap + data_previous_overlap; } else { update = data_previous; update_overlap = data_previous_overlap; } if (((output_col_index_bit0 == 0 && accum_idx < NPC) || (output_col_index_bit0 == 1 && accum_idx >= NPC)) && (DDR_write_en == 1 || out_buffer_wr_en == 1)) { accum_reg[plane_id][accum_idx] = 0; accum_reg_overlap[plane_id][accum_idx] = 0; } else { accum_reg[plane_id][accum_idx] = update; accum_reg_overlap[plane_id][accum_idx] = update_overlap; } if (accum_idx < NPC) { DDR_write0_temp[plane_id][accum_idx] = update; DDR_write0_temp_overlap[plane_id][accum_idx] = update_overlap; } else { DDR_write1_temp[plane_id][accum_idx - NPC] = update; DDR_write1_temp_overlap[plane_id][accum_idx - NPC] = update_overlap; } } } for (int plane_id = 0; plane_id < PLANES; plane_id++) { // clang-format off #pragma HLS unroll // clang-format on for (ap_uint<8> index_pixel = 0; index_pixel < NPC; index_pixel++) { // clang-format off #pragma HLS unroll // clang-format on ap_uint temp_sum; ap_uint buffer_updated_data; ap_uint read_buffer_data = ouput_buffer[plane_id][index_pixel][write_index_col]; if (output_col_index_bit0 == 0) temp_sum = (read_buffer_data + DDR_write0_temp[plane_id][index_pixel]); else temp_sum = (read_buffer_data + DDR_write1_temp[plane_id][index_pixel]); if (DDR_write_en == 1) { ap_uint<16> sum_rounding = temp_sum + (1 << (AREADOWN_PARTIAL_RESULT_BITS - 9)); DDR_write_data[plane_id][index_pixel] = sum_rounding >> (AREADOWN_PARTIAL_RESULT_BITS - 8); ap_uint buffer_data_temp; if (Yaxis_overlap_en == 1) if (output_col_index_bit0 == 0) buffer_data_temp = 
DDR_write0_temp_overlap[plane_id][index_pixel]; else buffer_data_temp = DDR_write1_temp_overlap[plane_id][index_pixel]; else buffer_data_temp = 0; ouput_buffer[plane_id][index_pixel][write_index_col] = buffer_data_temp; } else if (out_buffer_wr_en == 1) { ouput_buffer[plane_id][index_pixel][write_index_col] = temp_sum; } } } } /* xFResizeAreaDownScale: top-level area down-scaler. Computes Xscale, Yscale and the per-pixel weights with xFUdivResizeDownArea, clears the accumulators and ouput_buffer, then iterates height rows by in_col_loop_bound words (imgInput_ncpr, plus one extra iteration when imgOutput_width_align_npc != out_width). Each iteration calls flag_index_generator, reads one word from stream_in while col_index < imgInput_ncpr, runs CoreProcessDownArea twice (with Wy0, and with Wy1 for the next output in case of overlap), calls update_output_buffer, and writes a packed word to resize_out when DDR_wr_en is set and out_col_index < imgOutput_ncpr. The Y index and overlap flags are advanced on the last column of each row. NOTE(review): template parameters and the xf::cv::Mat arguments are missing in this listing. */ template void xFResizeAreaDownScale(xf::cv::Mat& stream_in, xf::cv::Mat& resize_out) { unsigned short height = stream_in.rows; unsigned short width = stream_in.cols; unsigned short out_height = resize_out.rows; unsigned short out_width = resize_out.cols; unsigned short imgInput_ncpr = (width + (NPC - 1)) >> XF_BITSHIFT(NPC); unsigned short imgOutput_ncpr = (out_width + (NPC - 1)) >> XF_BITSHIFT(NPC); unsigned short imgOutput_width_align_npc = imgOutput_ncpr << XF_BITSHIFT(NPC); unsigned short in_col_loop_bound; if (imgOutput_width_align_npc != out_width) in_col_loop_bound = imgInput_ncpr + 1; else in_col_loop_bound = imgInput_ncpr; enum { NUM_PB = NPC, NUM_INPB = NPC, LOG2_PB = XF_BITSHIFT(NPC) }; ap_uint<32> Xscale, Yscale; // Q16.16 format Xscale = xFUdivResizeDownArea((width), (out_width)); Yscale = xFUdivResizeDownArea(height, out_height); ap_uint<32> X_1PixelWeight, Y_1PixelWeight; // Q16.16 format X_1PixelWeight = xFUdivResizeDownArea(out_width, width); Y_1PixelWeight = xFUdivResizeDownArea(out_height, height); //## X-direction output index(Q16.16), which is used for each Process block output. ap_uint<32> Xindex_output[NUM_PB]; // clang-format off #pragma HLS ARRAY_PARTITION variable=Xindex_output complete dim=0 // clang-format on ap_uint<32> Xindex_output_initial[NUM_PB]; // clang-format off #pragma HLS ARRAY_PARTITION variable=Xindex_output_initial complete dim=0 // clang-format on //## input flag for each input of last process block. 
//## TRUE - input data is mapped to multiplier //## FALSE- multiplier input is zero bool inflag_TA_prev[NUM_INPB]; // clang-format off #pragma HLS ARRAY_PARTITION variable=inflag_TA_prev complete dim=0 // clang-format on for (ap_uint<8> idx = 1; idx <= NUM_PB; idx++) { // clang-format off #pragma HLS pipeline // clang-format on Xindex_output_initial[idx - 1] = X_1PixelWeight * idx; // inflag_TA_prev[idx-1] = true; } ap_uint<32> Xindex_output_initial_next; Xindex_output_initial_next = X_1PixelWeight * (1 + NUM_PB); ap_uint<32> Xindex_output_next; //## Y-direction output index(Q16.16) ap_uint<32> Yindex_output = Y_1PixelWeight; ap_uint<32> Yindex_output_prev = 0; //## skip_count Q16.0, it is used for mapping input data to process block ap_uint<16> skip_count = 0; //##DDR index uint32_t read_index = 0; ap_uint<32> write_index = 0; ap_uint<16> write_col_index = 0; //## overlap flag in Y-direction bool Yaxis_overlap_en = 0; bool Yaxis_overlap_nextrow_en = 0; bool Yaxis_overlap_prevrow_en = 0; enum { DEPTH_OUTBUFFER = (DST_COLS + NPC - 1) / NPC }; //## output buffer ap_uint ouput_buffer[PLANES][NUM_PB][DEPTH_OUTBUFFER]; // clang-format off #pragma HLS ARRAY_PARTITION variable=ouput_buffer complete dim=1 #pragma HLS ARRAY_PARTITION variable=ouput_buffer complete dim=2 // clang-format on ap_uint accum_reg[PLANES][NPC * 2]; // clang-format off #pragma HLS ARRAY_PARTITION variable=accum_reg complete dim=0 // clang-format on ap_uint accum_reg_overlap[PLANES][NPC * 2]; // clang-format off #pragma HLS ARRAY_PARTITION variable=accum_reg_overlap complete dim=0 // clang-format on for (int dim2 = 0; dim2 < NPC * 2; dim2++) { // clang-format off #pragma HLS unroll #pragma HLS unroll // clang-format on for (int dim1 = 0; dim1 < PLANES; dim1++) { // clang-format off #pragma HLS unroll // clang-format on accum_reg[dim1][dim2] = 0; accum_reg_overlap[dim1][dim2] = 0; } } for (ap_uint<16> dim3 = 0; dim3 < DEPTH_OUTBUFFER; dim3++) { // clang-format off #pragma HLS pipeline // clang-format 
on for (int dim2 = 0; dim2 < NUM_PB; dim2++) { // clang-format off #pragma HLS unroll // clang-format on for (int dim1 = 0; dim1 < PLANES; dim1++) { // clang-format off #pragma HLS unroll // clang-format on ouput_buffer[dim1][dim2][dim3] = 0; } } } int out_col_index = 0; ap_uint<16> output_row_index_for_pingpong = 0; // Q16.0 bool prev_output_row_index_for_pingpong_bit0 = 0; ap_uint<16> ouput_index_write_counter = NPC; XF_TNAME(DEPTH, NPC) read_word; int display_write_per_row = 0; int display_write_rowID = 0; LOOP_ROW: for (ap_uint<16> row_index = 0; row_index < height; row_index++) { // clang-format off #pragma HLS LOOP_TRIPCOUNT min=1 max=SRC_ROWS // clang-format on LOOP_COL: for (int col_index = 0, col_index_next = 1; col_index < in_col_loop_bound; col_index++, col_index_next++) { // clang-format off #pragma HLS LOOP_TRIPCOUNT min=1 max=SRC_TC #pragma HLS pipeline #pragma HLS DEPENDENCE variable=ouput_buffer inter false // clang-format on for (int idx = 0; idx < NUM_PB; idx++) { // clang-format off #pragma HLS unroll // clang-format on if (col_index == 0) { Xindex_output[idx] = Xindex_output_initial[idx]; inflag_TA_prev[idx] = true; } else { if (NPC == 1) Xindex_output[idx] += X_1PixelWeight; else Xindex_output[idx] += (X_1PixelWeight * NUM_PB); } } if (col_index == 0) { Xindex_output_next = Xindex_output_initial_next; } else { if (NPC == 1) Xindex_output_next += X_1PixelWeight; else Xindex_output_next += (X_1PixelWeight * NUM_PB); } ap_uint<16> output_buffer_index[NUM_PB + 1]; // clang-format off #pragma HLS ARRAY_PARTITION variable=output_buffer_index complete dim=0 // clang-format on //## input flag for each input of process block. 
bool inflag_TA[NUM_PB][NUM_INPB]; // clang-format off #pragma HLS ARRAY_PARTITION variable=inflag_TA complete dim=0 // clang-format on bool inflag_for_Nplus1_Procblock; ap_uint<17> Wx_for_Nplus1_Procblock; ap_uint<17> Wx[NUM_PB][NUM_INPB]; // Q1.16 // clang-format off #pragma HLS ARRAY_PARTITION variable=Wx complete dim=0 // clang-format on bool DDR_wr_en; bool out_buffer_wr_en; ap_uint<16> output_buffer_index_next_out; flag_index_generator( Xscale, X_1PixelWeight, Y_1PixelWeight, row_index, col_index, Xindex_output, &Xindex_output_next, output_buffer_index, inflag_TA, &skip_count, Wx, &inflag_for_Nplus1_Procblock, &Wx_for_Nplus1_Procblock, &DDR_wr_en, &out_buffer_wr_en, Yaxis_overlap_en, Yindex_output, Yindex_output_prev, ouput_index_write_counter, height, width, in_col_loop_bound, &output_buffer_index_next_out); if (col_index == (in_col_loop_bound)-1) ouput_index_write_counter = NPC; else if (ouput_index_write_counter <= output_buffer_index_next_out) ouput_index_write_counter += NPC; if (col_index < imgInput_ncpr) read_word = stream_in.read(read_index++); // TODO: Wy weight generation ap_uint<17> Wy0, Wy1; // Q1.16 if (Yaxis_overlap_en == 1) { Wy0 = Y_1PixelWeight.range(15, 0) - Yindex_output.range(15, 0); Wy1 = Yindex_output.range(15, 0); } else { Wy0 = Y_1PixelWeight; Wy1 = 0; } //## output data of each process block ap_uint PB_out[PLANES][NUM_PB + 1]; // Q8.8 // clang-format off #pragma HLS ARRAY_PARTITION variable=PB_out complete dim=0 // clang-format on ap_uint PB_out_overlap[PLANES][NUM_PB + 1]; // Q8.8 // clang-format off #pragma HLS ARRAY_PARTITION variable=PB_out_overlap complete dim=0 // clang-format on //## CoreProcess has "NUM_PB" process blocks. Each process block has "NUM_INPB" 3-input multiplier and Tree // adder to accumulate multiplier output. CoreProcessDownArea(Wx, Wy0, inflag_TA, read_word, PB_out, Wx_for_Nplus1_Procblock); //## Extra CoreProcess to process next output in case of overlap. 
CoreProcessDownArea( Wx, Wy1, inflag_TA, read_word, PB_out_overlap, Wx_for_Nplus1_Procblock); ap_uint<8> DDR_write_data[PLANES][NPC]; // clang-format off #pragma HLS ARRAY_PARTITION variable=DDR_write_data complete dim=0 // clang-format on update_output_buffer( DDR_wr_en, out_buffer_wr_en, write_index, write_col_index, out_width, accum_reg, accum_reg_overlap, ouput_buffer, output_buffer_index, PB_out, col_index, PB_out_overlap, Yaxis_overlap_en, DDR_write_data); if (DDR_wr_en == 1) { display_write_per_row++; XF_TNAME(DEPTH, NPC) out_pix; ap_uint plane_tmp; for (int pixel = 0, bit1 = 0; pixel < NPC; pixel++, bit1 += (PLANES * 8)) { // clang-format off #pragma HLS unroll // clang-format on for (int channel = 0, bit2 = 0; channel < PLANES; channel++, bit2 += 8) { // clang-format off #pragma HLS unroll // clang-format on plane_tmp.range(bit2 + 7, bit2) = DDR_write_data[channel][pixel]; } out_pix.range(bit1 + (PLANES * 8) - 1, bit1) = plane_tmp; } if (out_col_index < imgOutput_ncpr) resize_out.write(write_index++, out_pix); if (col_index == ((in_col_loop_bound)-1)) out_col_index = 0; else out_col_index++; } if (col_index == ((in_col_loop_bound)-1)) write_col_index = 0; else if (out_buffer_wr_en) write_col_index++; // last iteration of col loop if (col_index == ((in_col_loop_bound)-1)) { Yindex_output += Y_1PixelWeight; Yindex_output_prev += Y_1PixelWeight; } int t1 = Yindex_output.range(15, 0); int t2 = Y_1PixelWeight; int t3 = Yindex_output; ap_uint<32> Yindex_threshold = Yindex_output - 0x41; if (col_index == ((in_col_loop_bound)-1)) Yaxis_overlap_prevrow_en = Yaxis_overlap_en; if (Yindex_output.range(15, 0) < Y_1PixelWeight && (Y_1PixelWeight - Yindex_output.range(15, 0)) > 0x41 && Y_1PixelWeight[16] == 0) Yaxis_overlap_en = 1; else Yaxis_overlap_en = 0; if (col_index == ((in_col_loop_bound)-1)) prev_output_row_index_for_pingpong_bit0 = output_row_index_for_pingpong[0]; if (Yaxis_overlap_en == 0) { if ((Yindex_output.range(15, 0) < Y_1PixelWeight) && 
(Yindex_output.range(15, 0) > 0x41) && (col_index == ((in_col_loop_bound)-1))) output_row_index_for_pingpong = Yindex_threshold.range(31, 16) - 1; else output_row_index_for_pingpong = Yindex_threshold.range(31, 16); } } // col loop } // row loop } #endif //_XF_RESIZE_DOWN_AREA_