Program Listing for File xf_bpc.hpp

Return to documentation for file (/tmp/ws/src/vitis_common/include/imgproc/xf_bpc.hpp)

/*
 * Copyright 2019 Xilinx, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef _XF_BPC_HPP_
#define _XF_BPC_HPP_

#include "ap_int.h"
#include "../common/xf_utility.hpp"
#include "../common/xf_video_mem.hpp"
//#include "xf_extra_utility.h"

// int glpbal=0;

namespace xf {

namespace cv {

// ======================================================================================
// A generic structure for BPC operation
// --------------------------------------------------------------------------------------
// Template Args:-
//        SRC_T : Data type of soruce image element
//         ROWS : Image height
//         COLS : Image width
//         NPPC : No.of pixels per clock
//     BORDER_T : Type of boder to be used for edge pixel(s) computation
// ......................................................................................

// Some macros related to template (for easiness of coding)
#define _GENERIC_BPC_TPLT_DEC                                                                  \
    template <typename F, int SRC_T, int ROWS, int COLS, int K_ROWS, int K_COLS, int NPPC = 1, \
              int BORDER_T = XF_BORDER_CONSTANT, int USE_URAM = 0>
#define _GENERIC_BPC_TPLT \
    template <typename F, int SRC_T, int ROWS, int COLS, int K_ROWS, int K_COLS, int NPPC, int BORDER_T, int USE_URAM>
#define _GENERIC_BPC GenericBPC<F, SRC_T, ROWS, COLS, K_ROWS, K_COLS, NPPC, BORDER_T, USE_URAM>

// Some global constants
#define CH_IDX_T uint8_t
#define K_ROW_IDX_T uint8_t
#define K_COL_IDX_T uint8_t
#define COL_IDX_T uint16_t // Support upto 65,535
#define ROW_IDX_T uint16_t // Support upto 65,535
#define SIZE_IDX_T uint32_t

// Some internal constants
#define _NPPC (XF_NPIXPERCYCLE(NPPC))                   // Number of pixel per clock to be processed
#define _NPPC_SHIFT_VAL (XF_BITSHIFT(NPPC))             // Gives log base 2 on NPPC; Used for shifting purpose in
                                                        // case of division
#define _ECPR ((((K_COLS >> 1) + (_NPPC - 1)) / _NPPC)) // Extra clocks required for processing a row
#define _NP_IN_PREV \
    (_NPPC - ((K_COLS >> 1) - (((K_COLS >> 1) / _NPPC) * _NPPC))) // No.of valid destination pixels in previous clock
#define _DST_PIX_WIDTH (XF_PIXELDEPTH(XF_DEPTH(SRC_T, NPPC)))     // destination pixel width

_GENERIC_BPC_TPLT_DEC class GenericBPC {
   public:
    // Internal regsiters/buffers
    xf::cv::Window<K_ROWS, XF_NPIXPERCYCLE(NPPC) + (K_COLS - 1), XF_DTUNAME(SRC_T, NPPC)>
        src_blk;                                 // Kernel sized image block with pixel parallelism
    xf::cv::Scalar<K_ROWS, K_ROW_IDX_T> row_idx; // To store row index for circular buffer access
    xf::cv::LineBuffer<K_ROWS,
                       (COLS >> _NPPC_SHIFT_VAL),
                       XF_TNAME(SRC_T, NPPC),
                       (USE_URAM ? RAM_S2P_URAM : RAM_S2P_BRAM),
                       (USE_URAM ? K_ROWS : 1)>
        buff; // Line Buffer for K_ROWS from the image

    // Internal Registers
    COL_IDX_T num_clks_per_row; // No.of clocks required for processing one row
    SIZE_IDX_T rd_ptr;          // Read pointer
    SIZE_IDX_T wr_ptr;          // Write pointer
    // uint8_t          threshold;        // Threshold value used to classify as
    // similar (0) / brighter(1) / darker(2)

    // Default Constructor
    GenericBPC() {
// clang-format off
#pragma HLS INLINE
        // clang-format on
        num_clks_per_row = 0;
        rd_ptr = 0;
        wr_ptr = 0;
    }

    // Internal functions
    void initialize(xf::cv::Mat<SRC_T, ROWS, COLS, NPPC>& _src);

    void update_row_idx();
    void process_row(ROW_IDX_T r,
                     xf::cv::Mat<SRC_T, ROWS, COLS, NPPC>& _src,
                     xf::cv::Mat<SRC_T, ROWS, COLS, NPPC>& _dst);
    void process_image(xf::cv::Mat<SRC_T, ROWS, COLS, NPPC>& _src, xf::cv::Mat<SRC_T, ROWS, COLS, NPPC>& _dst);
};

// -----------------------------------------------------------------------------------
// Function to initialize internal regsiters and buffers
// -----------------------------------------------------------------------------------
_GENERIC_BPC_TPLT void _GENERIC_BPC::initialize(xf::cv::Mat<SRC_T, ROWS, COLS, NPPC>& _src) {
#pragma HLS INLINE

    // Computing no.of clocks required for processing a row of given image
    // dimensions
    num_clks_per_row = (_src.cols + _NPPC - 1) >> _NPPC_SHIFT_VAL;

    // Read/Write pointer set to start location of input image
    rd_ptr = 0;
    wr_ptr = 0;

    // Initialize row-index values
    for (K_ROW_IDX_T kr = 0; kr < K_ROWS; kr++) {
#pragma HLS UNROLL
        row_idx.val[kr] = kr;
    }

    return;
} // End of initialize()

// -----------------------------------------------------------------------------------
// Function to process a row
// -----------------------------------------------------------------------------------
_GENERIC_BPC_TPLT void _GENERIC_BPC::process_row(ROW_IDX_T r,
                                                 xf::cv::Mat<SRC_T, ROWS, COLS, NPPC>& _src,
                                                 xf::cv::Mat<SRC_T, ROWS, COLS, NPPC>& _dst) {
#pragma HLS INLINE OFF

    // --------------------------------------
    // Constants
    // --------------------------------------
    const uint32_t _TC = (COLS >> _NPPC_SHIFT_VAL) + (K_COLS >> 1); // MAX Trip Count per row

    // --------------------------------------
    // Internal variables
    // --------------------------------------
    // Loop count variable
    COL_IDX_T col_loop_cnt = num_clks_per_row + _ECPR;
    ap_uint<32> pix_pos;
    short col = -(K_COLS >> 1);

    // To store out pixels in packed format
    XF_TNAME(SRC_T, NPPC) out_pixels, prev_out_pixels;

// --------------------------------------
// Initialize source block buffer to all zeros
// --------------------------------------
SRC_INIT_LOOP:
    for (K_ROW_IDX_T kr = 0; kr < K_ROWS; kr++) {
#pragma HLS UNROLL

        for (K_COL_IDX_T kc = 0; kc < (_NPPC + K_COLS - 1); kc++) {
#pragma HLS UNROLL
            src_blk.val[kr][kc] = 0;
        }
    }

// --------------------------------------
// Process columns of the row
// --------------------------------------
COL_LOOP:
    for (COL_IDX_T c = 0; c < col_loop_cnt; c++) {
#pragma HLS PIPELINE II = 1
#pragma HLS LOOP_TRIPCOUNT min = 1 max = _TC
        //#pragma HLS LOOP_FLATTEN OFF

        // Fetch next row of source image and store in internal RAMs
        // .........................................................
        if ((r < _src.rows) && (c < num_clks_per_row)) {
            buff.val[row_idx.val[K_ROWS - 1]][c] = _src.read(rd_ptr++);
        }

    // Fetch data from RAMs and store in 'src_blk' for processing
    // .........................................................
    BUFF_RD_LOOP:
        for (K_ROW_IDX_T kr = 0; kr < K_ROWS; kr++) {
#pragma HLS UNROLL
            XF_TNAME(SRC_T, NPPC) tmp_rd_buff;

            // Read packed data
            tmp_rd_buff = buff.val[row_idx.val[kr]][c]; // tmp_rd_buff = (c < num_clks_per_row)
                                                        // ? buff.val[row_idx.val[kr]][c] :
                                                        // (XF_TNAME(SRC_T, NPPC))0;

            // Extract pixels from packed data and store in 'src_blk'
            xfExtractPixels<NPPC, XF_WORDWIDTH(SRC_T, NPPC), XF_DEPTH(SRC_T, NPPC)>(src_blk.val[kr], tmp_rd_buff,
                                                                                    (K_COLS - 1));
        }
    // if (c >= _ECPR) {
    //   xFSetBorder<K_ROWS, (_NPPC + K_COLS-1), XF_DTUNAME(SRC_T, NPPC),
    //   BORDER_T>(src_blk, r, (c<<_NPPC_SHIFT_VAL),
    //   _src.rows, _src.cols);
    //}

    // Process the kernel block
    // ........................
    PROCESS_BLK_LOOP:
        for (int pix_idx = 0; pix_idx < _NPPC; pix_idx++) {
#pragma HLS UNROLL
            XF_DTUNAME(SRC_T, NPPC) NxM_src_blk[K_ROWS][K_COLS];
// clang-format off
#pragma HLS ARRAY_PARTITION variable = NxM_src_blk complete
            // clang-format on
            XF_DTUNAME(SRC_T, NPPC) out_pix;

        // Extract _NPPC, NxM-blocks from 'src_blk'
        REARRANGE_LOOP:
            for (K_ROW_IDX_T kr = 0; kr < K_ROWS; kr++) {
#pragma HLS UNROLL
                for (K_COL_IDX_T kc = 0; kc < K_COLS; kc++) {
#pragma HLS UNROLL
                    NxM_src_blk[kr][kc] = src_blk.val[kr][pix_idx + kc];
                }
            }

            // Apply the filter on the NxM_src_blk
            F oper;
            oper.apply(NxM_src_blk, &out_pix);

            // Start packing the out pixel value every clock of NPPC
            out_pixels.range(((pix_idx + 1) * _DST_PIX_WIDTH) - 1, (pix_idx * _DST_PIX_WIDTH)) = out_pix;
        }

        col = col + _NPPC;

        // Write the data out to DDR
        // .........................
        if (c >= _ECPR) {
            if (_NP_IN_PREV == _NPPC) { // Case of (K_COLS / 2) is divisible by NPPC
                _dst.write(wr_ptr++, out_pixels);
            } else {
                // Taking '_NP_IN_PREV' pixels from 'prev_out_pixels' (MSB side) and
                // (_NPPC - _NP_IN_PREV) from
                // 'out_pixels' (LSB)
                prev_out_pixels.range((_NP_IN_PREV * _DST_PIX_WIDTH) - 1, 0) =
                    prev_out_pixels.range((_NPPC * _DST_PIX_WIDTH) - 1, ((_NPPC - _NP_IN_PREV) * _DST_PIX_WIDTH));
                prev_out_pixels.range((_NPPC * _DST_PIX_WIDTH) - 1, (_NP_IN_PREV * _DST_PIX_WIDTH)) =
                    out_pixels.range(((_NPPC - _NP_IN_PREV) * _DST_PIX_WIDTH) - 1, 0);

                _dst.write(wr_ptr++, prev_out_pixels);
            }
        }
        prev_out_pixels = out_pixels;

    // Now get ready for next cycle of coputation. So copy the last K_COLS-1 data
    // to start location of 'src_blk'
    // ...........................................
    SHIFT_LOOP:
        for (K_ROW_IDX_T kr = 0; kr < K_ROWS; kr++) {
#pragma HLS UNROLL
            for (K_COL_IDX_T kc = 0; kc < K_COLS - 1; kc++) {
#pragma HLS UNROLL
                src_blk.val[kr][kc] = src_blk.val[kr][_NPPC + kc];
            }
        }
    }

    return;
} // End of process_row()

// -----------------------------------------------------------------------------------
// Function to update row index (Cyclic shift)
// -----------------------------------------------------------------------------------
_GENERIC_BPC_TPLT void _GENERIC_BPC::update_row_idx() {
#pragma HLS INLINE OFF

    K_ROW_IDX_T tmp_idx = row_idx.val[0];

    for (K_ROW_IDX_T kr = 0; kr < K_ROWS - 1; kr++) {
#pragma HLS UNROLL
        row_idx.val[kr] = row_idx.val[kr + 1];
    }
    row_idx.val[K_ROWS - 1] = tmp_idx;

    return;
} // End of update_row_idx

// -----------------------------------------------------------------------------------
// Main function that runs the filter over the image
// -----------------------------------------------------------------------------------
_GENERIC_BPC_TPLT void _GENERIC_BPC::process_image(xf::cv::Mat<SRC_T, ROWS, COLS, NPPC>& _src,
                                                   xf::cv::Mat<SRC_T, ROWS, COLS, NPPC>& _dst) {
#pragma HLS INLINE OFF
    // Constant declaration
    const uint32_t _TC =
        ((COLS >> _NPPC_SHIFT_VAL) + (K_COLS >> 1)) / NPPC; // MAX Trip Count per row considering N-Pixel parallelsim

    // ----------------------------------
    // Start process with initialization
    // ----------------------------------
    initialize(_src);

// ----------------------------------
// Initialize Line Buffer
// ----------------------------------
// Part1: Initialize the buffer with 1st (kernel height)/2 rows of image
//        Start filling rows from (kernel height)/2 and rest depending on border
//        type
READ_LINES_INIT:
    for (K_ROW_IDX_T r = (K_ROWS >> 1); r < (K_ROWS - 1); r++) { // Note: Ignoring last row
#pragma HLS UNROLL
        for (COL_IDX_T c = 0; c < num_clks_per_row; c++) {
#pragma HLS PIPELINE
#pragma HLS LOOP_TRIPCOUNT min = 1 max = _TC
            buff.val[r][c] = _src.read(rd_ptr++); // Reading the rows of image
        }
    }
// Part2: Take care of borders depending on border type.
//        In border replicate mode, fill with 1st row of the image.
BORDER_INIT:
    for (K_ROW_IDX_T r = 0; r < (K_ROWS >> 1); r++) {
#pragma HLS UNROLL
        for (COL_IDX_T c = 0; c < num_clks_per_row; c++) {
#pragma HLS PIPELINE
#pragma HLS LOOP_TRIPCOUNT min = 1 max = _TC
            buff.val[r][c] = (BORDER_T == XF_BORDER_REPLICATE) ? buff.val[K_ROWS >> 1][c] : (XF_TNAME(SRC_T, NPPC))0;
        }
    }

// ----------------------------------
// Processing each row of the image
// ----------------------------------
ROW_LOOP:
    for (ROW_IDX_T r = (K_ROWS >> 1); r < _src.rows + (K_ROWS >> 1); r++) {
//#pragma HLS PIPELINE
#pragma HLS LOOP_TRIPCOUNT min = 1 max = ROWS
        process_row(r, _src, _dst);
        update_row_idx();
    }

    return;
} // End of process_image()

// ======================================================================================

// ======================================================================================
// Class for BPC computation
// ======================================================================================
#define _BPC_P_SIZE 5

template <int SRC_T, int NPPC>
class BPC {
   public:
    // -------------------------------------------------------------------------
    // Creating apply function
    // Inputs: patch of NxN size
    // Ouputs: out_pix
    // -------------------------------------------------------------------------
    void apply(XF_DTUNAME(SRC_T, NPPC) patch[_BPC_P_SIZE][_BPC_P_SIZE], XF_DTUNAME(SRC_T, NPPC) * out_pix) {
#pragma HLS INLINE

        XF_DTUNAME(SRC_T, NPPC) out_val;
        XF_DTUNAME(SRC_T, NPPC) array[9];
        XF_DTUNAME(SRC_T, NPPC) array_channel[8];
#pragma HLS ARRAY_PARTITION variable = array complete dim = 1

        int array_ptr = 0;
    Compute_Grad_Loop:
        for (int copy_arr = 0; copy_arr < _BPC_P_SIZE; copy_arr = copy_arr + 2) {
#pragma HLS LOOP_TRIPCOUNT min = 1 max = 5
#pragma HLS UNROLL
            for (int copy_in = 0; copy_in < _BPC_P_SIZE; copy_in = copy_in + 2) {
#pragma HLS LOOP_TRIPCOUNT min = 1 max = 5
#pragma HLS UNROLL
                array[array_ptr] = patch[copy_arr][copy_in];
                array_ptr++;
            }
        }
        // for(int channel=0,k=0;channel<PLANES;channel++,k+=8)
        //  {
        //#pragma HLS LOOP_TRIPCOUNT min=1 max=PLANES
        //#pragma HLS UNROLL

        for (int p = 0; p < 4; p++) {
#pragma HLS LOOP_TRIPCOUNT min = 1 max = 9
#pragma HLS UNROLL
            array_channel[p] = array[p];
        }
        for (int l = 4; l < 8; l++) {
            array_channel[l] = array[l + 1];
        }
        XF_DTUNAME(SRC_T, NPPC) min = array_channel[0];
        XF_DTUNAME(SRC_T, NPPC) max = array_channel[0];
    xFApplyMaskLoop:
        for (int16_t j = 1; j < 8; j++) {
#pragma HLS LOOP_TRIPCOUNT min = 1 max = 9
            //#pragma HLS LOOP_FLATTEN off

            if (array_channel[j] > max) {
                max = array_channel[j];
            }
            if (array_channel[j] < min) {
                min = array_channel[j];
            }
        }
        XF_DTUNAME(SRC_T, NPPC) finalout = 0;

        if (array[4] < min)
            finalout = min;
        else if (array[4] > max)
            finalout = max;
        else
            finalout = array[4];

        out_val = finalout;
        //  }

        *out_pix = out_val;

        return;
    }
};

// ======================================================================================

// ======================================================================================
// Top BPC API
// --------------------------------------------------------------------------------------
// Template Args:-
//         TYPE : Data type of soruce image element
//         ROWS : Image height
//         COLS : Image width
//         NPPC : No.of pixels per clock
//     BORDER_T : Type of boder to be used for edge pixel(s) computation
//                (XF_BORDER_REPLICATE, XF_BORDER_CONSTANT,
//                XF_BORDER_REFLECT_101, XF_BORDER_REFLECT)
// ......................................................................................
#define _BPC_ BPC<TYPE, NPPC>

// --------------------------------------------------------------------------------------
// Below function will generate list of corners
// --------------------------------------------------------------------------------------
template <int TYPE, int ROWS, int COLS, int NPPC = 1, int BORDER_T = XF_BORDER_CONSTANT, int USE_URAM = 0>
void badpixelcorrection(xf::cv::Mat<TYPE, ROWS, COLS, NPPC>& _src, xf::cv::Mat<TYPE, ROWS, COLS, NPPC>& _dst) {
// clang-format off
#pragma HLS INLINE OFF
    // clang-format on
    GenericBPC<_BPC_, TYPE, ROWS, COLS, _BPC_P_SIZE, _BPC_P_SIZE, NPPC, BORDER_T, USE_URAM> bpc;

    bpc.process_image(_src, _dst);

    return;
}

#undef _BPC_

// ======================================================================================

// Some clean up for macros used
#undef _BPC_P_SIZE
#undef _BPC_NMS_P_SIZE
#undef _GENERIC_BPC_TPLT_DEC
#undef _GENERIC_BPC_TPLT
#undef _GENERIC_BPC

#undef CH_IDX_T
#undef K_ROW_IDX_T
#undef K_COL_IDX_T
#undef COL_IDX_T
#undef ROW_IDX_T
#undef SIZE_IDX_T

#undef _NPPC
#undef _NPPC_SHIFT_VAL
#undef _ECPR
#undef _NP_IN_PREV
#undef _DST_PIX_WIDTH

} // namespace xf
}

#endif //_XF_BPC_HPP_