Program Listing for File xf_resize_up_area.hpp

Return to documentation for file (/tmp/ws/src/vitis_common/include/imgproc/xf_resize_up_area.hpp)

/*
 * Copyright 2019 Xilinx, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef _XF_RESIZE_UP_AREA_
#define _XF_RESIZE_UP_AREA_

#include "hls_stream.h"
#include "ap_int.h"
#include "../common/xf_common.hpp"
#include "../core/xf_math.h"
#include "../common/xf_utility.hpp"
//#define POW32 4294967296   // 2^32

/*
 *  Core processing block: blends four neighbouring samples into one output pixel.
 *
 *  PixelValue = A0*(1-Wx)*(1-Wy) + B0*(Wx)*(1-Wy) + A1*(1-Wx)*(Wy) + B1*(Wx)*(Wy)
 *             = Wx*Wy*(A0+B1-B0-A1) + Wx*(B0-A0) + Wy*(A1-A0) + A0
 *
 *  A0, B0, A1, B1 - the four 8-bit input samples of the formula above
 *  Wx, Wy         - weights in unsigned 0.32 fixed point
 *  pixel          - receives the integer part of the blended value
 */
static void CoreProcessUpArea(
    uchar_t A0, uchar_t B0, uchar_t A1, uchar_t B1, uint32_t Wx, uint32_t Wy, uchar_t* pixel) {
// clang-format off
    #pragma HLS PIPELINE
    // clang-format on

    // Wx * Wy is a 0.64 product; keep only its upper 32 fractional bits (0.32).
    const uint32_t weight_xy = (uint32_t)(((uint64_t)Wx * Wy) >> 32);

    // Signed sample differences (16.0) matching the three weighted terms.
    const int16_t diff_xy = (int16_t)((A0 + B1) - (B0 + A1));
    const int16_t diff_x = (int16_t)(B0 - A0);
    const int16_t diff_y = (int16_t)(A1 - A0);

    // Accumulate every term in a common x.32 fixed-point format.
    int64_t acc = (int64_t)A0 << 32;      // A0(8.0)        -> 8.32
    acc += (int64_t)diff_xy * weight_xy;  // 16.0 * Wxy(0.32) -> 16.32
    acc += (int64_t)diff_x * Wx;          // 16.0 * Wx(0.32)  -> 16.32
    acc += (int64_t)diff_y * Wy;          // 16.0 * Wy(0.32)  -> 16.32

    // Drop the 32 fractional bits to obtain the integer pixel value.
    *pixel = (uchar_t)(acc >> 32);
}

/*
 * Processes one block of (1 << XF_BITSHIFT(NPC)) output pixels and returns
 * them packed into a single XF_TNAME(DEPTH, NPC) word.
 *
 * Offset     - input column index for every output pixel (read at indoffset + i)
 * Weight     - horizontal weight for every output pixel (read at indoffset + i),
 *              forwarded to CoreProcessUpArea as Wx
 * Yweight    - vertical weight, forwarded to CoreProcessUpArea as Wy
 * D0, D1     - two consecutive packed input words of the first row (D0, samples
 *              A0/B0 of CoreProcessUpArea) and of the second row (D1, samples A1/B1)
 * blockstart - input column at which this block starts; it is rounded down to a
 *              multiple of the pixels-per-word to obtain the column of D0[0]/D1[0]
 * indoffset  - index of the block's first output pixel inside Offset/Weight
 *
 * When PLANES != 1, three 8-bit channels are processed per pixel; otherwise a
 * single 8-bit channel is processed.
 */
template <int DEPTH, int NPC, int WORDWIDTH, int PLANES>
static XF_TNAME(DEPTH, NPC) ProcessBlockAreaUp(ap_uint<13> Offset[],
                                               uint32_t Weight[],
                                               uint32_t Yweight,
                                               XF_TNAME(DEPTH, NPC) D0[2],
                                               XF_TNAME(DEPTH, NPC) D1[2],
                                               ap_uint<13> blockstart,
                                               ap_uint<13> indoffset) {
// clang-format off
    #pragma HLS ARRAY_PARTITION variable=D0 dim=1
    #pragma HLS ARRAY_PARTITION variable=D1 dim=1
    #pragma HLS INLINE
    // clang-format on

    // Unpacked pixels of both input words, concatenated per row.
    XF_PTUNAME(DEPTH)
    line0[2 * (1 << XF_BITSHIFT(NPC))], line1[2 * (1 << XF_BITSHIFT(NPC))];
// clang-format off
    #pragma HLS ARRAY_PARTITION variable=line0 complete dim=1
    #pragma HLS ARRAY_PARTITION variable=line1 complete dim=1
    // clang-format on

    uchar_t i, input_read, Pixels;
    // Column of the first pixel held in D0[0]/D1[0]: blockstart aligned down to a word boundary.
    uint16_t block_start_ind = (blockstart >> XF_BITSHIFT(NPC)) << XF_BITSHIFT(NPC);

    // Per-word scratch arrays: each packed word is unpacked on its own and then
    // copied into the concatenated line0/line1 arrays.
    XF_PTUNAME(DEPTH)
    line0_0[(1 << XF_BITSHIFT(NPC))], line0_1[(1 << XF_BITSHIFT(NPC))];
// clang-format off
    #pragma HLS ARRAY_PARTITION variable=line0_0 complete dim=1
    #pragma HLS ARRAY_PARTITION variable=line0_1 complete dim=1
    // clang-format on
    XF_PTUNAME(DEPTH)
    line1_0[(1 << XF_BITSHIFT(NPC))], line1_1[(1 << XF_BITSHIFT(NPC))];
// clang-format off
    #pragma HLS ARRAY_PARTITION variable=line1_0 complete dim=1
    #pragma HLS ARRAY_PARTITION variable=line1_1 complete dim=1
    // clang-format on

    xfExtractPixels<NPC, WORDWIDTH, XF_DEPTH(DEPTH, NPC)>(line0_0, D0[0], 0);
    xfExtractPixels<NPC, WORDWIDTH, XF_DEPTH(DEPTH, NPC)>(line0_1, D0[1], 0);
    xfExtractPixels<NPC, WORDWIDTH, XF_DEPTH(DEPTH, NPC)>(line1_0, D1[0], 0);
    xfExtractPixels<NPC, WORDWIDTH, XF_DEPTH(DEPTH, NPC)>(line1_1, D1[1], 0);

    for (int ii = 0; ii < 2 * (1 << XF_BITSHIFT(NPC)); ii++) {
// clang-format off
        #pragma HLS UNROLL
        // clang-format on
        if (ii < (1 << XF_BITSHIFT(NPC))) {
            line0[ii] = line0_0[ii];
            line1[ii] = line1_0[ii];
        } else {
            line0[ii] = line0_1[ii - (1 << XF_BITSHIFT(NPC))];
            line1[ii] = line1_1[ii - (1 << XF_BITSHIFT(NPC))];
        }
    }

    XF_TNAME(DEPTH, NPC) val = 0;
    int shift = 0;
process_block_loop:
    for (i = 0; i < (1 << XF_BITSHIFT(NPC)); i++) {
// clang-format off
        #pragma HLS UNROLL
        // clang-format on

        // Position of this output pixel's left input sample inside line0/line1.
        if (NPC == XF_NPPC1) {
            input_read = 0;
        } else {
            input_read = Offset[indoffset + i] - block_start_ind;
        }
        for (ap_uint<5> c = 0, k = 0; c < 3; c++, k += 8) {
            if (PLANES != 1) {
                // One 8-bit channel per iteration; k is the channel's bit offset inside the pixel.
                CoreProcessUpArea(line0[input_read].range(k + 7, k), line0[input_read + 1].range(k + 7, k),
                                  line1[input_read].range(k + 7, k), line1[input_read + 1].range(k + 7, k),
                                  Weight[indoffset + i], Yweight, &Pixels);
                val.range((i * XF_PIXELWIDTH(DEPTH, NPC)) + k + 7, (i * XF_PIXELWIDTH(DEPTH, NPC)) + k) = Pixels;
            } else {
                // Single channel: only the first iteration of the channel loop does any work.
                if (c == 0) {
                    CoreProcessUpArea(line0[input_read], line0[input_read + 1], line1[input_read],
                                      line1[input_read + 1], Weight[indoffset + i], Yweight, &Pixels);
                    // Lane position is i pixels into the word. The previous
                    // `i << XF_BITSHIFT(NPC)` only equals i * 8 when NPC is 8 (or 1),
                    // so lanes overlapped for 2 or 4 pixels per clock.
                    shift = i * XF_PIXELWIDTH(DEPTH, NPC);
                    val.range(shift + 7, shift) = Pixels;
                }
            }
        }
    }
    return val;
}
/*
 * Unsigned division helper: returns in_n / in_d (integer quotient).
 *
 * Kept as a separate, non-inlined function; xFResizeAreaUpScale limits it to a
 * single instance with an ALLOCATION pragma.
 *
 * A zero divisor (e.g. an image dimension of 0) yields 0 instead of invoking
 * undefined behaviour.
 */
static uint64_t xFUDivAreaUp(uint64_t in_n, unsigned short in_d) {
// clang-format off
    #pragma HLS INLINE OFF
    // clang-format on
    if (in_d == 0) {
        return 0;
    }
    return in_n / in_d;
}

/*
 * Area-interpolation up-scaler.
 *
 * Reads the source image word by word from stream_in, keeps source rows in a
 * rotating set of three line buffers, and writes one packed output word per
 * inner-loop iteration to resize_out.
 *
 * Steps:
 *   1. Compute the scale factors (source/destination and its inverse) in
 *      x.32 fixed point.
 *   2. Fill the per-column tables (Hoffset, Hweight, Hstart) and the per-row
 *      tables (Voffset, Vweight).
 *   3. Pre-load the first two source rows.
 *   4. For every output row, pick the two line buffers holding source rows
 *      Voffset[j] and Voffset[j] + 1, optionally stream the next source row
 *      into the third buffer, and blend with ProcessBlockAreaUp.
 *
 * Template parameters:
 *   SRC_ROWS, SRC_COLS / DST_ROWS, DST_COLS - maximum source / destination size
 *   PLANES, DEPTH, NPC, WORDWIDTH           - forwarded to ProcessBlockAreaUp
 *   SRC_TC, DST_TC                          - loop trip-count hints only
 *
 * NOTE(review): the row-prefetch condition `i < width` inside the output-column
 * loop presumably requires out_width >= width for a full source row to be
 * read - confirm against the caller.
 */
template <int SRC_ROWS,
          int SRC_COLS,
          int PLANES,
          int DEPTH,
          int NPC,
          int WORDWIDTH,
          int DST_ROWS,
          int DST_COLS,
          int SRC_TC,
          int DST_TC>
void xFResizeAreaUpScale(xf::cv::Mat<DEPTH, SRC_ROWS, SRC_COLS, NPC>& stream_in,
                         xf::cv::Mat<DEPTH, DST_ROWS, DST_COLS, NPC>& resize_out) {
    enum { DEPTH_LBUF = (SRC_COLS + NPC - 1) / NPC };
    XF_TNAME(DEPTH, NPC) lbuf_in0[DEPTH_LBUF];        // input line buffers (three, used in rotation)
    XF_TNAME(DEPTH, NPC) lbuf_in1[DEPTH_LBUF];
    XF_TNAME(DEPTH, NPC) lbuf_in2[DEPTH_LBUF];
    ap_uint<13> Hoffset[DST_COLS], Voffset[DST_ROWS]; // source column / row index for each output column / row
    uint32_t Hweight[DST_COLS],
        Vweight[DST_ROWS + 1]; // 0.32 weights for each output column / row
    if (NPC == XF_NPPC8) {
// clang-format off
        #pragma HLS ARRAY_PARTITION variable=Hoffset cyclic factor=8 dim=1
        #pragma HLS ARRAY_PARTITION variable=Hweight cyclic factor=8 dim=1
        // clang-format on
    } else if (NPC == XF_NPPC4) {
// clang-format off
        #pragma HLS ARRAY_PARTITION variable=Hoffset cyclic factor=4 dim=1
        #pragma HLS ARRAY_PARTITION variable=Hweight cyclic factor=4 dim=1
        // clang-format on
    } else if (NPC == XF_NPPC2) {
// clang-format off
        #pragma HLS ARRAY_PARTITION variable=Hoffset cyclic factor=2 dim=1
        #pragma HLS ARRAY_PARTITION variable=Hweight cyclic factor=2 dim=1
        // clang-format on
    }

    unsigned short height = stream_in.rows;
    unsigned short width = stream_in.cols;
    unsigned short out_height = resize_out.rows;
    unsigned short out_width = resize_out.cols;

    // Number of packed words per source row.
    unsigned short imgInput_ncpr = (width + (NPC - 1)) >> XF_BITSHIFT(NPC);
    unsigned short imgOutput_ncpr = (out_width + (NPC - 1)) >> XF_BITSHIFT(NPC);

    uint16_t Hstart[((DST_COLS + NPC - 1) >> XF_BITSHIFT(NPC)) +
                    1]; // source column of the first pixel of every NPC-pixel output block
    short x, block_ind, block_start, bufferIndex;
    short j = 0, i = 0, k, Yoffset, offset_temp;
    uint32_t Yweight;
    // The scale factors are kept in 64 bits: when an axis is not scaled
    // (source size == destination size) the ratio is exactly 1.0 = 2^32, which
    // wraps to 0 in a 32-bit variable and would map every output pixel of that
    // axis to source index 0.
    uint64_t Xscale, Yscale;
    uint64_t inv_Xscale, inv_Yscale;
    int64_t Xtemp = 0, Ytemp = 0;
    int read_index = 0, write_index = 0;
    XF_TNAME(DEPTH, NPC) D0[2], D1[2]; // packed words of the two source rows handed to ProcessBlockAreaUp
// clang-format off
    #pragma HLS ARRAY_PARTITION variable=D0 dim=1
    #pragma HLS ARRAY_PARTITION variable=D1 dim=1
    // clang-format on

// clang-format off
    #pragma HLS ALLOCATION function instances=xFUDivAreaUp limit=1
    // clang-format on
    // Fixed-point ratios with 32 fractional bits (numerator scaled by 2^32).
    Xscale = xFUDivAreaUp((uint64_t)width << 32, out_width);
    Yscale = xFUDivAreaUp((uint64_t)height << 32, out_height);
    inv_Xscale = xFUDivAreaUp((uint64_t)out_width << 32, width);
    inv_Yscale = xFUDivAreaUp((uint64_t)out_height << 32, height);

    /* Calculating required Horizontal parameters*/
    Hstart[0] = 0;

    short npc_counter = 0;
    short Hstart_index = 0;
Hoffset_loop:
    for (x = 0; x < out_width; x++) {
// clang-format off
        #pragma HLS pipeline
        #pragma HLS LOOP_TRIPCOUNT min=1 max=DST_COLS
        // clang-format on

        // Integer part of x * Xscale. 429496 (about 1e-4 in x.32) is added before
        // truncation - presumably to absorb the truncation error of Xscale.
        offset_temp = ((uint64_t)x * Xscale + 429496) >> 32;
        Xtemp = ((uint64_t)(x + 1) << 32) - (offset_temp + 1) * inv_Xscale; // inv_Xscale 32.32
        if (Xtemp < 0)
            Hweight[x] = 0;
        else
            Hweight[x] = (uint32_t)(Xtemp & 0xFFFFFFFF); // Extracting fractional part

        // Record the source column of the first pixel of each NPC-pixel block.
        if (npc_counter == 0) Hstart[Hstart_index++] = offset_temp;
        Hoffset[x] = offset_temp;

        if (npc_counter == NPC - 1)
            npc_counter = 0;
        else
            npc_counter++;
    }

/* Calculating required Vertical parameters*/
Voffset_loop:
    for (x = 0; x < out_height; x++) {
// clang-format off
        #pragma HLS pipeline
        #pragma HLS LOOP_TRIPCOUNT min=1 max=DST_ROWS
        // clang-format on

        offset_temp = ((uint64_t)x * Yscale + 429496) >> 32; // integer part of x * Yscale
        Ytemp = ((uint64_t)(x + 1) << 32) - (offset_temp + 1) * inv_Yscale;
        if (Ytemp < 0)
            Vweight[x] = 0;
        else
            // The last source row has no row below it, so its weight is forced to 0.
            Vweight[x] = offset_temp < (height - 1) ? (uint32_t)(Ytemp & 0xFFFFFFFF) : 0;

        // Clamp the source row index to the last row.
        if ((offset_temp) < height) {
            Voffset[x] = (offset_temp);
        } else {
            Voffset[x] = height - 1;
        }
    }

    bool read_flag = 0;                               // set when the next output row needs a new source row
    ap_uint<2> l0 = 0, l1 = 1, l2 = 2, read_into = 2; // l0/l1: buffers used as first/second row; read_into: buffer being filled
    // lind0..2: source row index assigned to each line buffer (65535 = none yet);
    // out_j: index of the next source row to read.
    ap_uint<16> lind0 = 0, lind1 = 1, lind2 = 65535, out_j = 0;

    // Pre-load source row 0 into line buffer 0.
    for (x = 0; x < imgInput_ncpr; x++) {
// clang-format off
        #pragma HLS pipeline
        #pragma HLS LOOP_TRIPCOUNT min=1 max = SRC_TC
        // clang-format on
        XF_TNAME(DEPTH, NPC) tmp_in = stream_in.read(read_index++);

        lbuf_in0[x] = tmp_in;
    }
    out_j++;
    // Pre-load source row 1 into line buffer 1.
    for (x = 0; x < imgInput_ncpr; x++) {
// clang-format off
        #pragma HLS pipeline
        #pragma HLS LOOP_TRIPCOUNT min=1 max = SRC_TC
        // clang-format on
        XF_TNAME(DEPTH, NPC) tmp_in = stream_in.read(read_index++);
        lbuf_in1[x] = tmp_in;
    }
    out_j++;

outerloop:
    for (j = 0; j < out_height; j++) {
// clang-format off
        #pragma HLS LOOP_TRIPCOUNT min=1 max = DST_ROWS
        // clang-format on
        // The previous output row requested a new source row: record which
        // source row index the target buffer now stands for.
        if (read_flag) {
            if (read_into == 2) {
                lind2 = out_j;
            } else if (read_into == 1) {
                lind1 = out_j;
            } else {
                lind0 = out_j;
            }
            out_j++;
        }
        Yoffset = Voffset[j]; // row to be read
        Yweight = Vweight[j]; // weight of the next row

        // Select the buffer pair holding rows Yoffset and Yoffset + 1; the
        // remaining buffer becomes the target for the next source row.
        if (Yoffset == lind0 && (Yoffset + 1) == lind1) {
            read_into = 2;
            l0 = 0;
            l1 = 1;
        } else if (Yoffset == lind1 && (Yoffset + 1) == lind2) {
            read_into = 0;
            l0 = 1;
            l1 = 2;
        } else if (Yoffset == lind2 && (Yoffset + 1) == lind0) {
            read_into = 1;
            l0 = 2;
            l1 = 0;
        }

        // A new source row is needed only when the next output row starts
        // from a different source row.
        if (j < out_height - 1) {
            if (Voffset[j + 1] != Voffset[j]) {
                read_flag = 1;
            } else {
                read_flag = 0;
            }
        } else {
            read_flag = 0;
        }

    innerloop:
        for (i = 0; i < out_width;
             i = i + (1 << XF_BITSHIFT(NPC)))
        {
// clang-format off
            #pragma HLS PIPELINE
            #pragma HLS LOOP_FLATTEN OFF
            #pragma HLS LOOP_TRIPCOUNT min=1 max=DST_TC avg=DST_TC
            // clang-format on
            block_ind = i >> XF_BITSHIFT(NPC);
            block_start =
                (NPC == XF_NPPC1)
                    ? (ap_uint<13>)Hoffset[block_ind]
                    : (ap_uint<13>)
                          Hstart[block_ind]; // block_start is the index of the input pixel to be read, in image dimensions
            bufferIndex = block_start >> XF_BITSHIFT(NPC); // word index of that pixel inside a line buffer
            if (read_flag && i < width && out_j < height) {
                // Stream one word of the next source row into the free buffer
                // while fetching the two words per row needed for this block
                // from the other two buffers. Words beyond the row are zeroed.
                if (read_into == 0) {
                    lbuf_in0[i >> XF_BITSHIFT(NPC)] = stream_in.read(read_index++);
                    for (k = 0; k < 2; k++) {
// clang-format off
                        #pragma HLS UNROLL
                        // clang-format on
                        if ((k + bufferIndex) < imgInput_ncpr) {
                            D0[k] = lbuf_in1[bufferIndex + k];
                            D1[k] = lbuf_in2[bufferIndex + k];
                        } else {
                            D0[k] = 0;
                            D1[k] = 0;
                        }
                    }
                } else if (read_into == 1) {
                    lbuf_in1[i >> XF_BITSHIFT(NPC)] = stream_in.read(read_index++);
                    for (k = 0; k < 2; k++) {
// clang-format off
                        #pragma HLS UNROLL
                        // clang-format on
                        if ((k + bufferIndex) < imgInput_ncpr) {
                            D0[k] = lbuf_in2[bufferIndex + k];
                            D1[k] = lbuf_in0[bufferIndex + k];
                        } else {
                            D0[k] = 0;
                            D1[k] = 0;
                        }
                    }
                } else {
                    lbuf_in2[i >> XF_BITSHIFT(NPC)] = stream_in.read(read_index++);
                    for (k = 0; k < 2; k++) {
// clang-format off
                        #pragma HLS UNROLL
                        // clang-format on
                        if ((k + bufferIndex) < imgInput_ncpr) {
                            D0[k] = lbuf_in0[bufferIndex + k];
                            D1[k] = lbuf_in1[bufferIndex + k];
                        } else {
                            D0[k] = 0;
                            D1[k] = 0;
                        }
                    }
                }
            } else {
                // No input read in this iteration: fetch the words from the
                // buffers selected by l0 (first row) and l1 (second row).
                for (k = 0; k < 2; k++) {
// clang-format off
                    #pragma HLS UNROLL
                    // clang-format on
                    if ((k + bufferIndex) < imgInput_ncpr) {
                        if (l0 == 0) {
                            D0[k] = lbuf_in0[bufferIndex + k];
                            if (l1 == 1)
                                D1[k] = lbuf_in1[bufferIndex + k];
                            else
                                D1[k] = lbuf_in2[bufferIndex + k];
                        } else if (l0 == 1) {
                            D0[k] = lbuf_in1[bufferIndex + k];
                            if (l1 == 0)
                                D1[k] = lbuf_in0[bufferIndex + k];
                            else
                                D1[k] = lbuf_in2[bufferIndex + k];
                        } else {
                            D0[k] = lbuf_in2[bufferIndex + k];
                            if (l1 == 0)
                                D1[k] = lbuf_in0[bufferIndex + k];
                            else
                                D1[k] = lbuf_in1[bufferIndex + k];
                        }
                    } else {
                        D0[k] = 0;
                        D1[k] = 0;
                    }
                }
            }
            // Blend one block of output pixels and emit it as one packed word.
            XF_TNAME(DEPTH, NPC)
            out_pix =
                ProcessBlockAreaUp<DEPTH, NPC, WORDWIDTH, PLANES>(Hoffset, Hweight, Yweight, D0, D1, block_start, i);
            resize_out.write(write_index++, out_pix);
        }
    }
}

#endif