Program Listing for File xf_edge_tracing.hpp

Return to documentation for file (/tmp/ws/src/vitis_common/include/imgproc/xf_edge_tracing.hpp)

/*
 * Copyright 2019 Xilinx, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef _XF_EDGE_TRACING_HPP_
#define _XF_EDGE_TRACING_HPP_

#ifndef __cplusplus
#error C++ is needed to use this file!
#endif

#include "hls_stream.h"
#include "../common/xf_common.hpp"
#include "../common/xf_utility.hpp"
#include <ap_int.h>
#include <string.h>

// Number of TopDown() invocations applied to each slice once it is loaded into the bank buffer.
#define INTRA_ITERATIONS 8
// Number of full sweeps over all slices performed by xfEdgeTracing().
#define INTER_ITERATIONS 2
// Number of horizontal slices the image is divided into.
// NOTE(review): xfEdgeTracing() also hard-codes this as `height >> 2`, `slice == 3` and `3 - slice`.
#define SLICES 4
// Number of 2-bit pixels handled per PixelProcessNew() call (PIXEL_PROCESS_BITS / 2).
#define PIXELS 34
// Extra rows added to the slice height when sizing the number of row banks.
#define MIN_OVERLAP 10
// Bit width of one packed row segment handed to PixelProcessNew(): 64-bit word + 4 look-ahead bits.
#define PIXEL_PROCESS_BITS 68

// Smaller of two values. Arguments are evaluated more than once.
#define MIN(x, y) (((x) < (y)) ? (x) : (y))
// Integer division rounded up (for positive operands). Arguments are evaluated more than once.
#define DIV_CEIL(x, y) (((x) + (y)-1) / (y))
// Rounds x up to the next multiple of y.
#define ADJUST_MULTIPLE(x, y) (DIV_CEIL(x, y) * (y))

namespace xf {
namespace cv {

/**
 * Single 3x3 update step on 2-bit pixels.
 *
 * `a` is the centre pixel and x0..x7 are its eight neighbours
 * (PixelProcessNew passes them row by row: x0,x1,x2 / x3,a,x4 / x5,x6,x7).
 *
 *  - The centre's high bit becomes 1 when the centre's low bit is 1 and the
 *    high bit of the centre or of any neighbour is already 1; otherwise it
 *    becomes 0.
 *  - When the centre's new high bit is 1, every neighbour whose low bit is 1
 *    gets its high bit set. Neighbour high bits are never cleared.
 *  - Low bits are left untouched.
 *
 * Presumably the low bit marks an edge candidate and the high bit a confirmed
 * edge (the final output stage only emits 255 for the value 3) -- TODO confirm.
 */
static void applyEqn(ap_uint<2>& x0,
                     ap_uint<2>& x1,
                     ap_uint<2>& x2,
                     ap_uint<2>& x3,
                     ap_uint<2>& a,
                     ap_uint<2>& x4,
                     ap_uint<2>& x5,
                     ap_uint<2>& x6,
                     ap_uint<2>& x7) {
// clang-format off
    #pragma HLS inline
    // clang-format on

    //# Gate: the centre can only be raised if its low bit is set
    const bool centre_low = a.range(0, 0);

    //# High bit seen anywhere in the 3x3 window (centre included)
    const bool window_high = (x0.range(1, 1) | x1.range(1, 1) | x2.range(1, 1) | x3.range(1, 1) | x4.range(1, 1) |
                              x5.range(1, 1) | x6.range(1, 1) | x7.range(1, 1) | a.range(1, 1));

    //# New high bit of the centre pixel
    const bool centre_high = window_high & centre_low;

    //# Propagate to the neighbours: set their high bit when the centre is high
    //# and their own low bit is set; an already-set high bit is kept.
    x0.range(1, 1) = (centre_high & x0.range(0, 0)) | x0.range(1, 1);
    x1.range(1, 1) = (centre_high & x1.range(0, 0)) | x1.range(1, 1);
    x2.range(1, 1) = (centre_high & x2.range(0, 0)) | x2.range(1, 1);
    x3.range(1, 1) = (centre_high & x3.range(0, 0)) | x3.range(1, 1);
    x4.range(1, 1) = (centre_high & x4.range(0, 0)) | x4.range(1, 1);
    x5.range(1, 1) = (centre_high & x5.range(0, 0)) | x5.range(1, 1);
    x6.range(1, 1) = (centre_high & x6.range(0, 0)) | x6.range(1, 1);
    x7.range(1, 1) = (centre_high & x7.range(0, 0)) | x7.range(1, 1);

    //# Centre pixel update (low bit keeps its value)
    a.range(1, 1) = centre_high;
}

/**
 * Applies applyEqn() across three vertically adjacent, packed row segments.
 *
 * Each input carries PIXELS pixels of 2 bits (pixel p occupies bits
 * [2p+1 : 2p]). The segments are unpacked, swept three times with
 * non-overlapping 3x3 windows whose centres sit on the middle row, and
 * repacked into the outputs:
 *
 *   pass PL_1: centres at columns 1, 4, 7, ... (while column + 1 < PIXELS)
 *   pass PL_2: centres at columns 2, 5, 8, ... (while column     < PIXELS)
 *   pass PL_3: centres at columns 3, 6, 9, ... (while column + 1 < PIXELS)
 *
 * Each pass works on a copy of the previous pass' result.
 *
 * @tparam n   Not referenced in the body.
 * @param k1   Packed upper row segment (input).
 * @param k2   Packed middle row segment (input).
 * @param k3   Packed lower row segment (input).
 * @param l1   Packed upper row segment after the three passes (output).
 * @param l2   Packed middle row segment after the three passes (output).
 * @param l3   Packed lower row segment after the three passes (output).
 */
template <int n>
void PixelProcessNew(ap_uint<PIXEL_PROCESS_BITS> k1,
                     ap_uint<PIXEL_PROCESS_BITS> k2,
                     ap_uint<PIXEL_PROCESS_BITS> k3,
                     ap_uint<PIXEL_PROCESS_BITS>& l1,
                     ap_uint<PIXEL_PROCESS_BITS>& l2,
                     ap_uint<PIXEL_PROCESS_BITS>& l3) {
// clang-format off
    #pragma HLS inline off
    // clang-format on

    //# One set of row buffers per pass
    ap_uint<2> upA[PIXELS], midA[PIXELS], lowA[PIXELS];
    ap_uint<2> upB[PIXELS], midB[PIXELS], lowB[PIXELS];
    ap_uint<2> upC[PIXELS], midC[PIXELS], lowC[PIXELS];

    //# Unpack: 2 bits per pixel
    for (int px = 0; (2 * px) < PIXEL_PROCESS_BITS; px++) {
// clang-format off
        #pragma HLS unroll
        // clang-format on
        const int lsb = 2 * px;
        upA[px] = k1.range(lsb + 1, lsb);
        midA[px] = k2.range(lsb + 1, lsb);
        lowA[px] = k3.range(lsb + 1, lsb);
    }

PL_1:
    for (int c = 1; (c + 1) < PIXELS; c += 3) {
// clang-format off
        #pragma HLS unroll
        // clang-format on
        applyEqn(upA[c - 1], upA[c], upA[c + 1], midA[c - 1], midA[c], midA[c + 1], lowA[c - 1], lowA[c],
                 lowA[c + 1]);
    }

    //# Hand the result of pass 1 to pass 2
    for (int px = 0; px < PIXELS; px++) {
// clang-format off
        #pragma HLS unroll
        // clang-format on
        upB[px] = upA[px];
        midB[px] = midA[px];
        lowB[px] = lowA[px];
    }

PL_2:
    for (int c = 2; c < PIXELS; c += 3) {
// clang-format off
        #pragma HLS unroll
        // clang-format on
        applyEqn(upB[c - 1], upB[c], upB[c + 1], midB[c - 1], midB[c], midB[c + 1], lowB[c - 1], lowB[c],
                 lowB[c + 1]);
    }

    //# Hand the result of pass 2 to pass 3
    for (int px = 0; px < PIXELS; px++) {
// clang-format off
        #pragma HLS unroll
        // clang-format on
        upC[px] = upB[px];
        midC[px] = midB[px];
        lowC[px] = lowB[px];
    }

PL_3:
    for (int c = 3; (c + 1) < PIXELS; c += 3) {
// clang-format off
        #pragma HLS unroll
        // clang-format on
        applyEqn(upC[c - 1], upC[c], upC[c + 1], midC[c - 1], midC[c], midC[c + 1], lowC[c - 1], lowC[c],
                 lowC[c + 1]);
    }

    //# Repack the final pass into the output words
    for (int px = 0; (2 * px) < PIXEL_PROCESS_BITS; px++) {
        const int lsb = 2 * px;
        l1.range(lsb + 1, lsb) = upC[px];
        l2.range(lsb + 1, lsb) = midC[px];
        l3.range(lsb + 1, lsb) = lowC[px];
    }
}

/**
 * One in-place sweep over the bank buffer `iBuff`.
 *
 * Every bank (first index of iBuff) is streamed word by word. For each word
 * position, groups of three consecutive banks are handed to PixelProcessNew()
 * together with the low 4 bits of the following word of the same banks, and
 * the processed words are written back to iBuff.
 *
 * The sweep is repeated three times (j = 0, 1, 2); each repetition shifts the
 * three-bank groups by one bank, so groups are (j, j+1, j+2), (j+3, j+4, j+5), ...
 *
 * @tparam BRAMS            Number of banks (first dimension of iBuff).
 * @tparam BRAMS_SETS_BY_3  Not referenced in the body.
 * @tparam DEPTH            Words per bank (second dimension of iBuff).
 * @param iBuff             Bank buffer, read and updated in place.
 * @param width             Not referenced in the body.
 * @param height            Not referenced in the body.
 * @param bramsetsval       Not referenced in the body.
 * @param bramtotal         Run-time bank count; only `bramtotal - 1` is used, as the index of the last bank.
 * @param bdrows            Multiplied with ram_row_depth to obtain the number of words walked per bank.
 * @param ram_row_depth     Words per image row inside a bank (per the caller: width / 32).
 */
template <int BRAMS, int BRAMS_SETS_BY_3, int DEPTH>
void TopDown(ap_uint<64> iBuff[BRAMS][DEPTH],
             uint16_t width,
             uint16_t height,
             int bramsetsval,
             int bramtotal,
             int bdrows,
             int ram_row_depth) {
    // arr1: word most recently read from each bank (the "next" word).
    // arr2: previous word of each bank, with its low 4 bits taken from arr3.
    // arr4: word to be written back for each bank.
    ap_uint<64> arr1[BRAMS], arr2[BRAMS], arr4[BRAMS];
// clang-format off
    #pragma HLS array_partition variable=arr1 complete
    #pragma HLS array_partition variable=arr2 complete
    #pragma HLS array_partition variable=arr4 complete
    // clang-format on

    // arr3: low 4 bits (two pixels) carried from one word to the next; after processing it
    //       holds the updated look-ahead bits produced by PixelProcessNew().
    // arr5: declared and partitioned but never read or written below.
    ap_uint<4> arr3[BRAMS], arr5[BRAMS];
// clang-format off
    #pragma HLS array_partition variable=arr3 complete
    #pragma HLS array_partition variable=arr5 complete
    // clang-format on

    // Unused in this function.
    int countind = 0;

    // Three repetitions; j shifts the three-bank grouping used in RD1 by one bank each time.
    for (int j = 0; j < 3; j++) {
    // Prime the pipeline with word 0 of every bank.
    RD_INIT:
        for (int i = 0; i < BRAMS; i++) {
// clang-format off
            #pragma HLS unroll
            #pragma HLS loop_tripcount min=38 max=38
            // clang-format on

            arr1[i] = iBuff[i][0];
            arr3[i] = arr1[i].range(3, 0);
        }

    // Elements per RAM
    // Iteration `el` reads word `el` and processes / writes back word `el - 1`.
    // NOTE(review): the word at index (ram_row_depth * bdrows - 1) is read but never
    // processed or written back by this loop -- confirm this is intended.
    ELEMENTS_P_RAM:
        for (int el = 1; el < (ram_row_depth * bdrows);
             el++) { // (width/32)*number of rows possible in one bram(512 depth)//(ram_row_depth * bdrows)
                     // clang-format off
                     #pragma HLS loop_tripcount min=480 max=480
                     #pragma HLS pipeline II=1
                     #pragma HLS loop_flatten off
                     #pragma HLS DEPENDENCE variable=arr1 inter false
                     #pragma HLS DEPENDENCE variable=arr2 inter false
                     // clang-format on

        // Shift: the previously read word becomes the current word (its low nibble comes from
        // arr3, which may have been updated by the previous iteration), then fetch the next word.
        RD:
            for (int i = 0; i < BRAMS; i++) {
// clang-format off
                #pragma HLS unroll
                #pragma HLS loop_tripcount min=38 max=38
                // clang-format on

                arr2[i].range(3, 0) = arr3[i];
                arr2[i].range(63, 4) = arr1[i].range(63, 4);
                arr1[i] = iBuff[i][el];
                arr3[i] = arr1[i].range(3, 0);
                // Default write-back value; overwritten in RD1 for banks that belong to a group.
                arr4[i] = arr2[i];
            }

        // Process groups of three consecutive banks (i+j-1, i+j, i+j+1). `k` is not used.
        RD1:
            for (int i = 1, k = 0; i < BRAMS - 3; i += 3, k++) {
// clang-format off
                #pragma HLS unroll
                #pragma HLS loop_tripcount min=38 max=38
                // clang-format on

                ap_uint<PIXEL_PROCESS_BITS> k1, k2, k3;
                ap_uint<PIXEL_PROCESS_BITS> l1, l2, l3;

                // Bits [63:0]: current word of each of the three banks.
                k1.range(63, 0) = arr2[i + j - 1];
                k2.range(63, 0) = arr2[i + j + 0];
                k3.range(63, 0) = arr2[i + j + 1];

                // Bits [67:64]: first two pixels of the following word (look-ahead).
                k1.range(PIXEL_PROCESS_BITS - 1, 64) = arr1[i + j - 1].range(3, 0);
                k2.range(PIXEL_PROCESS_BITS - 1, 64) = arr1[i + j + 0].range(3, 0);
                k3.range(PIXEL_PROCESS_BITS - 1, 64) = arr1[i + j + 1].range(3, 0);

                PixelProcessNew<1>(k1, k2, k3, l1, l2, l3);

                // Lower 64 bits are the processed current word.
                arr4[i + j - 1] = l1.range(63, 0);
                arr4[i + j + 0] = l2.range(63, 0);
                arr4[i + j + 1] = l3.range(63, 0);

                // Upper 4 bits are the processed look-ahead pixels; they become the low nibble
                // of the current word in the next iteration (see RD).
                arr3[i + j - 1] = l1.range(PIXEL_PROCESS_BITS - 1, 64);
                arr3[i + j + 0] = l2.range(PIXEL_PROCESS_BITS - 1, 64);
                arr3[i + j + 1] = l3.range(PIXEL_PROCESS_BITS - 1, 64);
            }

        // Write back word `el - 1` of every bank.
        RD2:
            for (int ii = 0; ii < BRAMS; ii++) {
// clang-format off
                #pragma HLS unroll
                #pragma HLS loop_tripcount min=38 max=38
                // clang-format on

                // Bank 0 is special: while a following row group exists, it receives the result of
                // the last bank, stored ram_row_depth words further on, instead of its own result.
                // Presumably this keeps bank 0 a copy of the row preceding bank 1 in the next row
                // group -- TODO confirm against the fill logic in xfEdgeTracing().
                if ((ii == 0) && (el <= ram_row_depth * (bdrows - 1))) {
                    iBuff[0][ram_row_depth + el - 1] = arr4[bramtotal - 1];
                } else {
                    iBuff[ii][el - 1] = arr4[ii];
                }
            }
        }
    }
}

/**
 * Edge-tracing kernel body.
 *
 * Works in three stages:
 *   1. For INTER_ITERATIONS sweeps and SLICES slices per sweep, a slice of `_src` (64-bit words,
 *      32 two-bit pixels each) is copied into the bank buffer `iBuff`.
 *   2. TopDown() is run INTRA_ITERATIONS times on the buffer and the result is written back
 *      into `_src` at the same offsets (so `_src` is modified in place).
 *   3. After all sweeps, `_src` is read row by row and expanded to 8 bits per pixel in `_dst`:
 *      a 2-bit value of 3 becomes 255, every other value becomes 0.
 *
 * @tparam HEIGHT, WIDTH  Compile-time maximum image size; used to size iBuff and oBuff.
 * @tparam USE_URAM       Selects bank depth (4096 vs 1024 words) and the RESOURCE core pragma.
 * @param _dst      Output image, written 64 bits (8 pixels) per write.
 * @param _src      Input image, read and overwritten with the traced result.
 * @param height    Run-time image height (the caller passes _src.rows).
 * @param width     Run-time image width (the caller passes _src.cols).
 * @param width_8   Width used for output addressing (the caller passes _dst.cols).
 *
 * NOTE(review): `width >> 5` is used as a divisor below, so width < 32 divides by zero.
 * NOTE(review): the slice arithmetic hard-codes four slices (`height >> 2`, `slice == 3`,
 * `3 - slice`) instead of using SLICES.
 */
template <int SRC_T, int DST_T, int NPC_SRC, int NPC_DST, int HEIGHT, int WIDTH, bool USE_URAM, int depthm = -1>
static void xfEdgeTracing(xf::cv::Mat<DST_T, HEIGHT, WIDTH, NPC_DST, depthm>& _dst,
                          xf::cv::Mat<SRC_T, HEIGHT, WIDTH, NPC_SRC, depthm>& _src,
                          uint16_t height,
                          uint16_t width,
                          uint16_t width_8) {
// clang-format off
#pragma HLS INLINE
// clang-format on
// Words per bank. Being a preprocessor macro, it stays defined for the rest of the translation
// unit and only expands correctly where a name `USE_URAM` is in scope.
#define BRAM_DEPTH (USE_URAM ? 4096 : 1024)

    // Compile-time sizing derived from the maximum image dimensions.
    enum {
        RAM_ROW_DEPTH = (WIDTH / 32),                // 64-bit width = 32 pixels; Gives depth of ram a row occupies
        NUM_ROWS_RAM = (BRAM_DEPTH / RAM_ROW_DEPTH), // Gives No.of rows per BRAM
        SLICE_H = (HEIGHT / SLICES),                 // Gives height of each Slice
        BRAM_SETS_TEMP = DIV_CEIL((SLICE_H + MIN_OVERLAP), NUM_ROWS_RAM),
        BRAMS_SETS = ADJUST_MULTIPLE(BRAM_SETS_TEMP, 3), // Making BRAM_CNT divisible by 3
        ACTUAL_ROWS = BRAMS_SETS * NUM_ROWS_RAM,
        OVERLAP = ACTUAL_ROWS - SLICE_H,
        BRAMS_TOTAL = BRAMS_SETS + 2 // two extra banks: index 0 and index BRAMS_SETS + 1 (edge rows)
    };

    // Bank buffer: banks 1..bramsetsval hold image rows, banks 0 and bramsetsval+1 hold edge rows.
    ap_uint<64> iBuff[BRAMS_TOTAL][BRAM_DEPTH];

    if (USE_URAM) {
// clang-format off
        #pragma HLS RESOURCE variable=iBuff core=RAM_T2P_URAM
        #pragma HLS array_partition variable=iBuff dim=1
        // clang-format on
    } else {
// clang-format off
        #pragma HLS RESOURCE variable=iBuff core=RAM_T2P_BRAM
        #pragma HLS array_partition variable=iBuff dim=1
        // clang-format on
    }

    //# I/P & O/P Registers
    ap_uint<64> iReg[1];
    ap_uint<64> oReg[1];

    // Run-time counterparts of the enum values above, derived from the actual image size.
    int slice_h = (height >> 2);      // = height / SLICES
    int ram_row_depth = (width >> 5); // = width / 32
    int bdrows = BRAM_DEPTH / ram_row_depth; // rows per bank; divides by zero when width < 32
    int bramsetsval = ADJUST_MULTIPLE(DIV_CEIL((slice_h + MIN_OVERLAP), bdrows), 3);
    int overlap = (bramsetsval * bdrows) - slice_h;
    int bramtotal = bramsetsval + 2;

//# Inter Iterations

INTER_ITERATION_LOOP:
    for (int inter_i = 0; inter_i < INTER_ITERATIONS; inter_i++) {
        //# Loop for Reading chunks of NMS output
        unsigned int offset = 0;
        unsigned int lBound = 0;

    SLICE_LOOP:
        for (int slice = 0; slice < SLICES; slice++) {
            // Words to transfer: slice height plus the overlap rows, except for slice 3.
            lBound = ram_row_depth * (slice_h + ((slice == 3) ? 0 : overlap));

            // Sweep 0 starts each chunk at the top of slice `slice` (chunks extend downwards by
            // `overlap` rows). Later sweeps visit the chunks in the opposite order: slice 0 ends at
            // the bottom of the image and starts `overlap` rows above the last slice; slice 3
            // starts at word 0.
            if (inter_i == 0) {
                offset = slice * slice_h * ram_row_depth;
            } else {
                offset = (slice == 3) ? 0 : ((((3 - slice) * slice_h) - overlap) * ram_row_depth);
            }

            // idx1: word within the current row; idx2: current bank (1-based); dep: word offset
            // of the current row group inside a bank. `cnt` is unused.
            ap_uint<16> idx1 = 0, dep = 0;
            ap_uint<16> idx2 = 1;
            int cnt = 0;

        // Distribute the chunk over the banks: consecutive rows go to consecutive banks
        // 1..bramsetsval; after bramsetsval rows the next row group starts ram_row_depth words deeper.
        Read_N_Arrange:
            for (unsigned int i = 0; i < lBound; i++) {
// clang-format off
                #pragma HLS loop_tripcount min=16200 max=16800
                #pragma HLS pipeline II=1
                #pragma HLS DEPENDENCE variable=iBuff inter false
                #pragma HLS DEPENDENCE variable=iBuff intra false
                // clang-format on
                // ind_1, ind_2 and val_ind are unused.
                int ind_1 = 0, ind_2 = 0, val_ind = 0;
                iReg[0] = _src.read(offset + i); // Reading Input

                // End of row: move to the next bank.
                if (idx1 == ram_row_depth) {
                    idx1 = 0;
                    idx2++;
                }

                // Past the last row bank: wrap to bank 1 and advance to the next row group.
                if (idx2 == bramsetsval + 1) {
                    idx2 = 1;
                    dep += ram_row_depth;
                }

                ap_uint<16> index = idx1 + dep;
                iBuff[idx2][index] = iReg[0];

                // Filling edge row buffers (i.e., iBuff[0] and iBuff[bramsetsval+1])
                // This is done by replicating the rows
                if (idx2 == 1) {
                    if (dep == 0) {
                        // Very first row of the chunk: bank 0 at the same position is zeroed.
                        iBuff[0][index] = 0;
                    } else {
                        // A row stored in bank 1 is also stored in the last edge bank,
                        // one row group earlier.
                        iBuff[bramsetsval + 1][index - ram_row_depth] = iReg[0];
                    }
                } else if (idx2 == bramsetsval) {
                    if (dep == ((bdrows - 1) * ram_row_depth)) {
                        // Last row group of the bank: the last edge bank is zeroed at this position.
                        iBuff[bramsetsval + 1][index] = 0;

                    } else {
                        // A row stored in the last row bank is also stored in bank 0,
                        // one row group later.
                        iBuff[0][index + ram_row_depth] = iReg[0];
                    }
                }

                idx1++;
            }

        //# Intra Iterations
        INTRA_ITERATION_LOOP:
            for (int intra_i = 0; intra_i < INTRA_ITERATIONS; intra_i++) {
                TopDown<BRAMS_TOTAL, BRAMS_SETS / 3, BRAM_DEPTH>(iBuff, width, height, bramsetsval, bramtotal, bdrows,
                                                                 ram_row_depth);
            }

            // Restart the same address walk for the write-back.
            idx1 = 0;
            idx2 = 1;
            dep = 0;

        // Copy the processed rows (banks 1..bramsetsval only) back into _src at the same offsets.
        Write:
            for (unsigned int i = 0; i < lBound; i++) {
// clang-format off
                #pragma HLS loop_tripcount min=16200 max=16800
                #pragma HLS pipeline
                // clang-format on

                if (idx1 == ram_row_depth) {
                    idx1 = 0;
                    idx2++;
                }
                if (idx2 == bramsetsval + 1) {
                    idx2 = 1;
                    dep += ram_row_depth;
                }

                oReg[0] = iBuff[idx2][idx1 + dep];
                _src.write((offset + i), oReg[0]);

                idx1++;
            }
        }
    }

    // printf("widthby8:%d\n",width / 8);

    // printf("widthby8:%d %d\n",width_8,(width_8 / 8));

    // oBuff: one image row of packed 2-bit pixels; oRegF: one output word of eight 8-bit pixels.
    ap_uint<64> oBuff[RAM_ROW_DEPTH], oRegF[1];
//# Write the final output as 8-bit / pixel
FIN_WR_LOOP:
    for (int ii = 0; ii < height; ii++) {
// memcpy(oBuff, nms_in + ii * (width >> 2), width << 1);
// clang-format off
        #pragma HLS loop_tripcount min=HEIGHT max=HEIGHT
        // clang-format on
        // Fetch row ii of the traced image.
        for (int k = 0; k < ram_row_depth; k++) {
// clang-format off
            #pragma HLS pipeline
            #pragma HLS loop_tripcount min=RAM_ROW_DEPTH max=RAM_ROW_DEPTH
            // clang-format on
            oBuff[k] = _src.read((ii * ram_row_depth) + k);
        }

        // id: which 16-bit quarter of the current input word is being expanded (0..3).
        // pixel: index of the current input word in oBuff.
        ap_uint<3> id = 0;
        ap_uint<9> pixel = 0;
    // Each iteration expands 16 input bits (8 pixels) into one 64-bit output word.
    WR_FIN_PIPE:
        for (int j = 0, bit = 0; j < width / 8; j++, bit += 16) {
// clang-format off
            #pragma HLS loop_tripcount min=WIDTH/8 max=WIDTH/8
            #pragma HLS pipeline
            // clang-format on
            // All four quarters of the input word consumed: advance to the next word.
            if (id == 4) {
                id = 0;
                pixel++;
                bit = 0;
            }
            // k walks the 2-bit input pixels, l the matching 8-bit output lanes.
            for (int k = 0, l = 0; k < 16; k += 2, l += 8) {
                ap_uint<2> pix = oBuff[pixel].range(bit + k + 1, bit + k);
                // Only the value 3 (both bits set) is emitted as an edge.
                if (pix == 3)
                    oRegF[0].range(l + 7, l) = 255;
                else
                    oRegF[0].range(l + 7, l) = 0;
            }
            id++;

            // Output addressing uses width_8; words beyond width_8 / 8 per row are dropped.
            if (j < (width_8 / 8)) _dst.write((ii * (width_8 / 8) + j), oRegF[0]);
        }
    }
}

/**
 * Public entry point for edge tracing.
 *
 * Forwards to xfEdgeTracing() with the run-time dimensions taken from the
 * matrices: rows and columns of `_src`, and columns of `_dst`.
 *
 * @tparam USE_URAM  Forwarded to xfEdgeTracing(); defaults to false.
 * @param _src  Input image. xfEdgeTracing() also writes to it.
 * @param _dst  Output image.
 */
template <int SRC_T, int DST_T, int ROWS, int COLS, int NPC_SRC, int NPC_DST, bool USE_URAM = false, int depthm = -1>
void EdgeTracing(xf::cv::Mat<SRC_T, ROWS, COLS, NPC_SRC, depthm>& _src,
                 xf::cv::Mat<DST_T, ROWS, COLS, NPC_DST, depthm>& _dst) {
// clang-format off
    #pragma HLS INLINE
    // clang-format on

    // Same narrowing to uint16_t that the callee's parameter list applies.
    const uint16_t img_height = _src.rows;
    const uint16_t img_width = _src.cols;
    const uint16_t out_width = _dst.cols;

    xfEdgeTracing<SRC_T, DST_T, NPC_SRC, NPC_DST, ROWS, COLS, USE_URAM, depthm>(_dst, _src, img_height, img_width,
                                                                                out_width);
}

} // namespace cv
} // namespace xf
#endif