Program Listing for File xf_sgbm.hpp
↰ Return to documentation for file (/tmp/ws/src/vitis_common/include/imgproc/xf_sgbm.hpp
)
/*
* Copyright 2019 Xilinx, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef _XF_SGBM_HPP_
#define _XF_SGBM_HPP_
#ifndef __cplusplus
#error C++ is needed to include this header
#endif
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
#include "hls_stream.h"
#include "../common/xf_common.hpp"
#include "../common/xf_utility.hpp"
#define MAX_UCHAR 255
namespace xf {
namespace cv {
template <int DEPTH_SRC, int DEPTH_DST>
XF_PTNAME(DEPTH_DST)
xFComputeTransform5x5(XF_PTNAME(DEPTH_SRC) src_buf[5][5]) {
// clang-format off
#pragma HLS INLINE off
// clang-format on
XF_PTNAME(DEPTH_SRC) target = src_buf[2][2];
XF_PTNAME(DEPTH_DST) val = 0;
int idx = 0;
for (int i = 0; i < 5; i++) {
// clang-format off
#pragma HLS UNROLL
// clang-format on
for (int j = 0; j < 5; j++) {
// clang-format off
#pragma HLS UNROLL
// clang-format on
XF_PTNAME(DEPTH_SRC) ref = src_buf[i][j];
if ((i != 2) || (j != 2)) {
val.range(23 - idx, 23 - idx) = (ref < target) ? 1 : 0;
idx++;
}
}
}
return val;
}
template <int ROWS, int COLS, int DEPTH_SRC, int DEPTH_DST, int NPC, int WORDWIDTH_SRC, int WORDWIDTH_DST>
void xFProcessCensusTransform5x5(hls::stream<XF_SNAME(WORDWIDTH_SRC)>& _src_mat,
hls::stream<XF_SNAME(WORDWIDTH_DST)>& _dst_mat,
XF_SNAME(WORDWIDTH_SRC) buf[5][COLS],
XF_PTNAME(DEPTH_SRC) src_buf[5][5],
XF_PTNAME(DEPTH_DST) & CensusVal,
uint16_t img_width,
uint16_t img_height,
ap_uint<13> row_ind,
ap_uint<4> tp1,
ap_uint<4> tp2,
ap_uint<4> mid,
ap_uint<4> bottom1,
ap_uint<4> bottom2,
ap_uint<13> row) {
// clang-format off
#pragma HLS INLINE
// clang-format on
XF_SNAME(WORDWIDTH_SRC) buf0, buf1, buf2, buf3, buf4;
Col_Loop:
for (ap_uint<13> col = 0; col < img_width; col++) {
// clang-format off
#pragma HLS LOOP_TRIPCOUNT min=COLS max=COLS
#pragma HLS pipeline
// clang-format on
if (row < img_height)
buf[row_ind][col] = _src_mat.read();
else
buf[bottom2][col] = 0;
src_buf[0][4] = buf[tp1][col];
src_buf[1][4] = buf[tp2][col];
src_buf[2][4] = buf[mid][col];
src_buf[3][4] = buf[bottom1][col];
src_buf[4][4] = buf[bottom2][col];
CensusVal = xFComputeTransform5x5<DEPTH_SRC, DEPTH_DST>(src_buf);
for (ap_uint<4> i = 0; i < 5; i++) {
for (ap_uint<4> j = 0; j < 4; j++) {
// clang-format off
#pragma HLS unroll
// clang-format on
src_buf[i][j] = src_buf[i][j + 1];
}
}
if (col >= 2) {
_dst_mat.write(CensusVal);
}
} // Col_Loop
}
template <int ROWS, int COLS, int DEPTH_SRC, int DEPTH_DST, int NPC, int WORDWIDTH_SRC, int WORDWIDTH_DST>
void xFCensus5x5(hls::stream<XF_SNAME(WORDWIDTH_SRC)>& _src_mat,
hls::stream<XF_SNAME(WORDWIDTH_DST)>& _dst_mat,
uint16_t img_height,
uint16_t img_width) {
ap_uint<13> row_ind, row, col;
ap_uint<4> tp1, tp2, mid, bottom1, bottom2;
XF_PTNAME(DEPTH_DST) censusVal;
// Temporary buffers to hold image data from five rows
XF_PTNAME(DEPTH_SRC) src_buf[5][5];
// clang-format off
#pragma HLS ARRAY_PARTITION variable=src_buf complete dim=0
// clang-format on
// Temporary buffer to hold image data from five rows
XF_SNAME(WORDWIDTH_SRC) buf[5][COLS];
// clang-format off
#pragma HLS RESOURCE variable=buf core=RAM_S2P_BRAM
#pragma HLS ARRAY_PARTITION variable=buf complete dim=1
// clang-format on
row_ind = 2;
Clear_Row_Loop:
for (col = 0; col < img_width; col++) {
// clang-format off
#pragma HLS LOOP_TRIPCOUNT min=COLS max=COLS
#pragma HLS pipeline
// clang-format on
buf[0][col] = 0;
buf[1][col] = 0;
buf[row_ind][col] = _src_mat.read();
}
row_ind++;
Read_Row2_Loop:
for (col = 0; col < img_width; col++) {
// clang-format off
#pragma HLS LOOP_TRIPCOUNT min=COLS max=COLS
#pragma HLS pipeline
// clang-format on
buf[row_ind][col] = _src_mat.read();
}
row_ind++;
Row_Loop:
for (row = 2; row < img_height + 2; row++) {
// clang-format off
#pragma HLS LOOP_TRIPCOUNT min=ROWS max=ROWS
// clang-format on
// modify the buffer indices to re use
if (row_ind == 4) {
tp1 = 0;
tp2 = 1;
mid = 2;
bottom1 = 3;
bottom2 = 4;
} else if (row_ind == 0) {
tp1 = 1;
tp2 = 2;
mid = 3;
bottom1 = 4;
bottom2 = 0;
} else if (row_ind == 1) {
tp1 = 2;
tp2 = 3;
mid = 4;
bottom1 = 0;
bottom2 = 1;
} else if (row_ind == 2) {
tp1 = 3;
tp2 = 4;
mid = 0;
bottom1 = 1;
bottom2 = 2;
} else if (row_ind == 3) {
tp1 = 4;
tp2 = 0;
mid = 1;
bottom1 = 2;
bottom2 = 3;
}
for (int i = 0; i < 5; i++) {
// clang-format off
#pragma HLS UNROLL
// clang-format on
for (int j = 0; j < 4; j++) {
// clang-format off
#pragma HLS UNROLL
// clang-format on
src_buf[i][j] = 0;
}
}
xFProcessCensusTransform5x5<ROWS, COLS, DEPTH_SRC, DEPTH_DST, NPC, WORDWIDTH_SRC, WORDWIDTH_DST>(
_src_mat, _dst_mat, buf, src_buf, censusVal, img_width, img_height, row_ind, tp1, tp2, mid, bottom1,
bottom2, row);
for (int i = 0; i < 5; i++) {
// clang-format off
#pragma HLS UNROLL
// clang-format on
src_buf[i][4] = 0;
}
// clang-format off
#pragma HLS ALLOCATION function instances=xFComputeTransform5x5<DEPTH_SRC, DEPTH_DST> limit=1
// clang-format on
censusVal = xFComputeTransform5x5<DEPTH_SRC, DEPTH_DST>(src_buf);
_dst_mat.write(censusVal);
for (ap_uint<4> i = 0; i < 5; i++) {
for (ap_uint<4> j = 0; j < 4; j++) {
// clang-format off
#pragma HLS unroll
// clang-format on
src_buf[i][j] = src_buf[i][j + 1];
}
}
for (int i = 0; i < 5; i++) {
// clang-format off
#pragma HLS UNROLL
// clang-format on
src_buf[i][4] = 0;
}
censusVal = xFComputeTransform5x5<DEPTH_SRC, DEPTH_DST>(src_buf);
_dst_mat.write(censusVal);
row_ind++;
if (row_ind == 5) {
row_ind = 0;
}
} // Row_Loop
} // end of xFCensus5x5
template <int SIZE>
class xFMinSAD {
public:
template <typename T, typename T_idx>
static void find(T a[SIZE], T_idx& loc, T& val) {
// clang-format off
#pragma HLS INLINE
#pragma HLS array_partition variable=a complete dim=0
// clang-format on
T a1[SIZE / 2];
T a2[SIZE - SIZE / 2];
for (int i = 0; i < SIZE / 2; i++) {
// clang-format off
#pragma HLS UNROLL
// clang-format on
a1[i] = a[i];
}
for (int i = 0; i < SIZE - SIZE / 2; i++) {
// clang-format off
#pragma HLS UNROLL
// clang-format on
a2[i] = a[i + SIZE / 2];
}
T_idx l1, l2;
T v1, v2;
xFMinSAD<SIZE / 2>::find(a1, l1, v1);
xFMinSAD<SIZE - SIZE / 2>::find(a2, l2, v2);
if (v2 < v1) {
val = v2;
loc = l2 + SIZE / 2;
} else {
val = v1;
loc = l1;
}
}
};
template <>
class xFMinSAD<1> {
public:
template <typename T, typename T_idx>
static void find(T a[1], T_idx& loc, T& val) {
// clang-format off
#pragma HLS INLINE
// clang-format on
loc = 0;
val = a[0];
}
};
template <>
class xFMinSAD<2> {
public:
template <typename T, typename T_idx>
static void find(T a[2], T_idx& loc, T& val) {
// clang-format off
#pragma HLS INLINE
#pragma HLS array_partition variable=a complete dim=0
// clang-format on
T_idx l1 = 0, l2 = 1;
T v1 = a[0], v2 = a[1];
if (v2 < v1) {
val = v2;
loc = l2;
} else {
val = v1;
loc = l1;
}
}
};
template <int ROWS, int COLS, int DEPTH_SRC, int DEPTH_DST, int NPC, int WORDWIDTH_SRC, int WORDWIDTH_DST>
void xFCensusTransformKernel(hls::stream<XF_SNAME(WORDWIDTH_SRC)>& _src,
hls::stream<XF_SNAME(WORDWIDTH_DST)>& _dst,
uint8_t _window_size,
uint8_t _border_type,
uint16_t img_height,
uint16_t img_width) {
#ifndef _SYNTHESIS_
assert(((_window_size == XF_FILTER_3X3) || (_window_size == XF_FILTER_5X5)) &&
("Filter width must be either 3 or 5"));
assert(_border_type == XF_BORDER_CONSTANT && "Only XF_BORDER_CONSTANT is supported");
assert(((img_height <= ROWS) && (img_width <= COLS)) && "ROWS and COLS should be greater than input image");
assert((NPC == XF_NPPC1) && ("NPC must be XF_NPPC1"));
#endif
xFCensus5x5<ROWS, COLS, DEPTH_SRC, DEPTH_DST, NPC, WORDWIDTH_SRC, WORDWIDTH_DST>(_src, _dst, img_height, img_width);
} // end of wrapper function
template <int NDISP, int PU, int ROWS, int COLS>
void xFSGBMcomputecost(hls::stream<ap_uint<24> >& _src_census24_l,
hls::stream<ap_uint<24> >& _src_census24_r,
hls::stream<ap_uint<8> > _cost[PU],
int height,
int width) {
// clang-format off
#pragma HLS INLINE OFF
#pragma HLS ARRAY_PARTITION variable=_cost complete dim=1 // TODO
// clang-format on
ap_uint<24> l_val;
ap_uint<24> r_val;
ap_uint<24> r_buff[NDISP];
// clang-format off
#pragma HLS ARRAY_PARTITION variable=r_buff complete dim=1
// clang-format on
loop_height:
for (int r = 0; r < height; r++) {
// clang-format off
#pragma HLS LOOP_TRIPCOUNT min=1 max=ROWS
// clang-format on
loop_sweep:
for (int i = 0; i < NDISP; i++) {
// clang-format off
#pragma HLS UNROLL
// clang-format on
r_buff[i] = 0;
}
loop_width:
for (int c = 0; c < width; c++) {
// clang-format off
#pragma HLS LOOP_TRIPCOUNT min=1 max=COLS
// clang-format on
if (PU == NDISP) {
// clang-format off
#pragma HLS PIPELINE II=1
// clang-format on
}
loop_sweep_inside:
for (int i = 0; i < NDISP / PU; i++) {
// clang-format off
#pragma HLS PIPELINE II=1
#pragma HLS loop_flatten
// clang-format on
if (i == 0) {
l_val = _src_census24_l.read();
r_val = _src_census24_r.read();
// shift the buffer left
loop_shift:
for (int i = NDISP - 1; i > 0; i--) r_buff[i] = r_buff[i - 1];
// insert the new value at the end
r_buff[0] = r_val;
}
loop_parallel_unit:
for (int j = 0; j < PU; j++) {
// clang-format off
#pragma HLS UNROLL
// clang-format on
ap_uint<24> xor_val = l_val ^ r_buff[i * PU + j];
uint8_t sum = 0;
loop_hamming_sum:
for (int k = 0; k < 24; k++) {
// clang-format off
#pragma HLS LOOP_TRIPCOUNT min=1 max=24
// clang-format on
uint8_t c = (uint8_t)(xor_val & 0x1);
sum += xor_val.range(k, k);
}
_cost[j].write((ap_uint8_t)sum);
}
}
}
}
}
static uint8_t min_of_4(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
// clang-format off
#pragma HLS INLINE
// clang-format on
uint8_t res, res1, res2;
res1 = a < b ? a : b;
res2 = c < d ? c : d;
res = res1 < res2 ? res1 : res2;
return res;
}
static uint8_t fn_reg(uint8_t value) {
//#pragma HLS inline off
//#pragma HLS interface register port=return
return value;
}
template <typename T>
static T fn_reg_scalar(T scalar) {
// clang-format off
#pragma HLS inline //off
// clang-format on
//#pragma HLS interface register port=return
return scalar;
}
template <int NDISP, int PU, int R, int ROWS, int COLS>
void xFSGBMoptimization(hls::stream<ap_uint<8> > _cost[PU],
hls::stream<ap_uint<16> > _agg_cost[PU],
int height,
int width,
uint8_t p1,
uint8_t p2) {
// array for the Lr data
// int CYC_PART = PU+1; // PU+1 or PU+2, TODO need to check and fix this
// array to store the Lr computed values for the Lr computation of neighboring pixels, we don't need for 180 degree
// so R-1. This array consumes BRAMs first dimension completely partitioned, while the second dimension of the Array
// was partitioned in a cyclic manner, which will allow us to access
// PU consecutive array indexes to be accessed in parallel
uint8_t Lr[R - 1][NDISP][COLS];
// clang-format off
#pragma HLS RESOURCE variable=Lr core=RAM_T2P_BRAM
#pragma HLS ARRAY_PARTITION variable=Lr complete dim=1
#pragma HLS ARRAY_PARTITION variable=Lr complete dim=2
// clang-format on
// #pragma HLS ARRAY_PARTITION variable=Lr cyclic factor=CYC_PART dim=2
uint8_t Lr_r1[NDISP];
// clang-format off
#pragma HLS ARRAY_PARTITION variable=Lr_r1 complete dim=1
// clang-format on
uint8_t Lr_r1_tmp[PU];
// clang-format off
#pragma HLS ARRAY_PARTITION variable=Lr_r1_tmp complete dim=1
// clang-format on
// array to store r0 data for the computation of next pixel in the raster scan manner, so one pixel's Lr data is
// sufficient
uint8_t Lr_r0[NDISP];
// clang-format off
#pragma HLS ARRAY_PARTITION variable=Lr_r0 complete dim=0
// clang-format on
// temporary array which reads in the data from Lr array. This array will be processed
// this array works as a queue, after initialization, reads the data from BRAM to the last index while the first
// index is off loaded.
uint8_t tmp_store_Lr[R][PU + 2];
// clang-format off
#pragma HLS ARRAY_PARTITION variable=tmp_store_Lr complete dim=0
// clang-format on
// array to hold the first four min values, first dimension R is for directions, second one to hold first four min
// values. This array consumes BRAMs holds for all direction except 180 degree
uint8_t Lr_min[R - 1][COLS];
// clang-format off
#pragma HLS ARRAY_PARTITION variable=Lr_min complete dim=1
// clang-format on
// array to temporarily hold the data for r1 direction
uint8_t r1_min;
// array to hold the first four min values for the first direction, 180 degree
uint8_t r0_min;
// temporary array which reads in the data from Lr_min array. This array will be using in the processing block. The
// min arrays will always be in a sorted
uint8_t tmp_Lr_min[R];
// clang-format off
#pragma HLS ARRAY_PARTITION variable=tmp_Lr_min complete dim=1
// clang-format on
// suffix post to temporarily store the min values before writing to the BRAM.
// Post buffers are used for computation of minimum on the run, and the data are finally dumped into Lr_min array
uint8_t tmp_Lr_min_post[R];
// clang-format off
#pragma HLS ARRAY_PARTITION variable=tmp_Lr_min_post complete dim=1
// clang-format on
// dim 1 is for directions, and dim 2 for parallel units. This is an intermediary array which hold the computed Lr
// for min computation
uint8_t store_lr_for_min[R][PU];
// clang-format off
#pragma HLS ARRAY_PARTITION variable=store_lr_for_min complete dim=0
// clang-format on
for (int i = 0; i < R - 1; i++) {
for (int j = 0; j < NDISP; j++) {
for (int k = 0; k < COLS; k++) {
Lr[i][j][k] = 0;
}
}
}
for (int j = 0; j < NDISP; j++) {
Lr_r0[j] = 0;
}
for (int i = 0; i < R - 1; i++) {
for (int k = 0; k < COLS; k++) {
Lr_min[i][k] = 0;
}
}
tmp_Lr_min_post[0] = 0;
loop_row:
for (int ro = 0; ro < height; ro++) {
// clang-format off
#pragma HLS LOOP_TRIPCOUNT min=1 max=ROWS
// clang-format on
loop_col:
for (int co = 0; co < width; co++) {
// clang-format off
#pragma HLS LOOP_TRIPCOUNT min=1 max=COLS
// clang-format on
if (PU == NDISP) {
// clang-format off
#pragma HLS PIPELINE II=2
// clang-format on
}
// process loop
uint8_t min_d0, min_cost0; // vs
disp_loop:
for (int d = 0; d < NDISP / PU; d++) {
// clang-format off
#pragma HLS PIPELINE II=2
#pragma HLS DEPENDENCE variable=Lr array intra false
//#pragma HLS DEPENDENCE variable=Lr array inter false
#pragma HLS DEPENDENCE variable=Lr_min array inter false
#pragma HLS LOOP_FLATTEN
// clang-format on
if (d == 0) {
for (int r = 0; r < R; r++) // previous disparity for d=0 is initialized with zero
{
// clang-format off
#pragma HLS UNROLL
// clang-format on
tmp_store_Lr[r][0] = 0;
}
for (int pu = 0; pu < PU; pu++) {
// clang-format off
#pragma HLS UNROLL
// clang-format on
tmp_store_Lr[0][pu + 1] = Lr_r0[pu];
for (int r = 1; r < R; r++) {
// clang-format off
#pragma HLS UNROLL
// clang-format on
tmp_store_Lr[r][pu + 1] = Lr[r - 1][pu][co + r - 2];
}
}
// border disparity case, when PU is same as number of disparity
// if parallel units are same as the maximum disparity then no next disparity for the final
// disparity computation
if (PU < NDISP)
tmp_store_Lr[0][PU + 1] = Lr_r0[PU];
else
tmp_store_Lr[0][PU + 1] = 0;
for (int r = 1; r < R; r++) {
// clang-format off
#pragma HLS UNROLL
// clang-format on
if (PU < NDISP)
tmp_store_Lr[r][PU + 1] = Lr[r - 1][PU][co + r - 2];
else
tmp_store_Lr[r][PU + 1] = 0;
}
// Copy Lr min values from the BRAM to temporary array which is used for processing
// vs tmp_Lr_min[0] = r0_min;
tmp_Lr_min[0] = tmp_Lr_min_post[0];
for (int r = 1; r < R; r++) {
// clang-format off
#pragma HLS UNROLL
// clang-format on
tmp_Lr_min[r] = Lr_min[r - 1][co + r - 2];
}
// initialize the post buffer with max values, helps in comparisons while sorting
for (int r = 0; r < R; r++) {
// clang-format off
#pragma HLS UNROLL
// clang-format on
tmp_Lr_min_post[r] = MAX_UCHAR;
}
} else {
for (int r = 0; r < R; r++) {
// clang-format off
#pragma HLS UNROLL
// clang-format on
tmp_store_Lr[r][0] = tmp_store_Lr[r][PU];
tmp_store_Lr[r][1] = tmp_store_Lr[r][PU + 1];
}
for (int pu = 1; pu < PU; pu++) {
// clang-format off
#pragma HLS UNROLL
// clang-format on
uint16_t disp_idx = (d * PU + pu);
tmp_store_Lr[0][pu + 1] = Lr_r0[disp_idx];
for (int r = 1; r < R; r++) {
// clang-format off
#pragma HLS UNROLL
// clang-format on
tmp_store_Lr[r][pu + 1] = Lr[r - 1][disp_idx][co + r - 2];
}
}
uint16_t disp_idx = d * PU + PU;
if (disp_idx < NDISP)
tmp_store_Lr[0][PU + 1] = Lr_r0[disp_idx];
else
tmp_store_Lr[0][PU + 1] = 0;
for (int r = 1; r < R; r++) {
// clang-format off
#pragma HLS UNROLL
// clang-format on
if (disp_idx < NDISP)
tmp_store_Lr[r][PU + 1] = Lr[r - 1][disp_idx][co + r - 2];
else
tmp_store_Lr[r][PU + 1] = 0;
}
}
loop_pu:
for (int pu = 0; pu < PU; pu++) {
// clang-format off
#pragma HLS UNROLL
// clang-format on
uint8_t cpd = (uint8_t)_cost[pu].read();
uint16_t agg_val = 0;
loop_directions:
for (int r = 0; r < R; r++) {
// clang-format off
#pragma HLS UNROLL
// clang-format on
// mink minimum of all disparity, mini minimum of all disparity except d-1, d, d+1
uint8_t lr_dp, lr_d, lr_dn, lr_mink = MAX_UCHAR;
lr_dp = tmp_store_Lr[r][pu];
lr_d = tmp_store_Lr[r][pu + 1];
lr_dn = tmp_store_Lr[r][pu + 2];
lr_mink = tmp_Lr_min[r];
// vs
uint8_t p1reg = fn_reg_scalar<uint8_t>(p1);
// border disparity cases for storing the lr values
int disp_idx = d * PU + pu;
// border case with respect to disparity
if (disp_idx == 0) lr_dp = MAX_UCHAR - p1reg;
if (disp_idx >= (NDISP - 1)) lr_dn = MAX_UCHAR - p1reg;
uint8_t tmini, tminv;
uint8_t tmp_arr[4];
// clang-format off
#pragma HLS ARRAY_PARTITION variable=tmp_arr complete dim=1
// clang-format on
tmp_arr[0] = lr_d;
tmp_arr[1] = lr_dp + p1;
tmp_arr[2] = lr_dn + p1;
uint8_t p2reg;
if ((r == 0) && (co == 0)) {
p2reg = 0;
} else {
p2reg = p2;
}
tmp_arr[3] = lr_mink + p2reg;
xFMinSAD<4>::find(tmp_arr, tmini, tminv);
// process block
uint8_t lr_tmp;
// clang-format off
#pragma HLS RESOURCE variable=lr_tmp core=AddSub_DSP
// clang-format on
lr_tmp = cpd - (uint8_t)lr_mink;
uint8_t lr;
// clang-format off
#pragma HLS RESOURCE variable=lr_tmp core=AddSub_DSP
// clang-format on
lr = lr_tmp + tminv;
// row or col border case
if (((r == 1) && (co == 0)) || (((r == 1) || (r == 2) || (r == 3)) && (ro == 0)) ||
((r == 3) && (co == width - 1)))
lr = cpd;
// assignment
if (r == 0)
Lr_r0[disp_idx] = lr;
else if (r == 1)
Lr_r1_tmp[pu] = lr;
else
Lr[r - 1][disp_idx][co] = lr;
store_lr_for_min[r][pu] = lr;
agg_val += lr;
}
_agg_cost[pu].write((ap_uint16_t)agg_val);
}
// uint8_t store_lr_for_min_reg[R][PU];
//#pragma HLS array_partition variable=store_lr_for_min complete dim=0
// for (int p=0; p<PU; p++) {
// for (int r=0; r<R; r++) {
// store_lr_for_min_reg[r][p] =
// fn_reg(store_lr_for_min[r][p]);
// }
// }
// compute min value for all sets of disparities
xFMinSAD<PU>::find(store_lr_for_min[0], min_d0, min_cost0);
if (min_cost0 < tmp_Lr_min_post[0]) tmp_Lr_min_post[0] = min_cost0;
for (int r = 1; r < R; r++) {
// clang-format off
#pragma HLS UNROLL
// clang-format on
uint8_t min_d, min_cost;
xFMinSAD<PU>::find(store_lr_for_min[r], min_d, min_cost);
if (min_cost < tmp_Lr_min_post[r]) tmp_Lr_min_post[r] = min_cost;
}
// updating the previous
for (int pu = 0; pu < PU; pu++) {
// clang-format off
#pragma HLS UNROLL
// clang-format on
int disp_idx = d * PU + pu;
if (co >= 1) {
Lr[0][disp_idx][co - 1] = Lr_r1[disp_idx];
}
Lr_r1[disp_idx] = Lr_r1_tmp[pu];
}
if (d >= (NDISP / PU -
1)) // when its the last set of disparities update the min arrays from the min post arrays
{
// for the last pixel in the col update the min values
if (co > 0) {
Lr_min[0][co - 1] = r1_min;
}
// vs r0_min = tmp_Lr_min_post[0];
r1_min = tmp_Lr_min_post[1];
for (int r = 2; r < R; r++) {
// clang-format off
#pragma HLS UNROLL
// clang-format on
Lr_min[r - 1][co] = tmp_Lr_min_post[r];
}
}
// update the tmp_store array for the next set of disparity computation, so not necessary for the last
// set
}
}
}
}
template <int NDISP, int PU, int ROWS, int COLS>
void xfSGBMcomputedisparity(hls::stream<ap_uint<16> > _agg_cost[PU],
hls::stream<ap_uint<8> >& _dst,
int height,
int width) {
// clang-format off
#pragma HLS INLINE OFF
// clang-format on
const int TOTAL_ITER = NDISP / PU;
for (int r = 0; r < height; r++) {
// clang-format off
#pragma HLS LOOP_TRIPCOUNT min=1 max=ROWS
// clang-format on
for (int c = 0; c < width; c++) {
// clang-format off
#pragma HLS LOOP_TRIPCOUNT min=1 max=COLS
// clang-format on
if (PU == NDISP) {
// clang-format off
#pragma HLS PIPELINE II=1
// clang-format on
}
ap_uint<8> lmin_d;
ap_uint<16> lmin_cost;
ap_uint<16> min_cost = 32768;
ap_uint<8> min_disp;
for (int i = 0; i < TOTAL_ITER; i++) {
// clang-format off
#pragma HLS PIPELINE II=1
#pragma HLS LOOP_FLATTEN
// clang-format on
ap_uint<16> tmp[PU];
for (int j = 0; j < PU; j++) {
tmp[j] = _agg_cost[j].read();
}
xFMinSAD<PU>::find(tmp, lmin_d, lmin_cost);
if (lmin_cost < min_cost) {
min_disp = i * PU + lmin_d;
min_cost = lmin_cost;
}
}
_dst.write(min_disp);
}
}
}
template <int BORDER_TYPE, int WINDOW_SIZE, int NDISP, int PU, int R, int SRC_T, int DST_T, int ROWS, int COLS, int NPC>
void SemiGlobalBM(xf::cv::Mat<SRC_T, ROWS, COLS, NPC>& _src_mat_l,
xf::cv::Mat<SRC_T, ROWS, COLS, NPC>& _src_mat_r,
xf::cv::Mat<DST_T, ROWS, COLS, NPC>& _dst_mat,
uint8_t p1,
uint8_t p2) {
#ifndef _SYNTHESIS_
assert((SRC_T == XF_8UC1) && " WORDWIDTH_SRC must be XF_8UC1 ");
assert((DST_T == XF_8UC1) && " WORDWIDTH_DST must be XF_8UC1 ");
assert((NPC == XF_NPPC1) && " NPC must be XF_NPPC1 ");
assert((WINDOW_SIZE == 5) && " WSIZE must be set to '5' ");
assert(((NDISP > 1) && (NDISP <= 256)) && " NDISP must be greater than '1' and less than or equal to '256' ");
assert((NDISP >= PU) && " NDISP must not be lesser than PU (parallel units)");
assert((((NDISP / PU) * PU) == NDISP) && " NDISP/PU must be a non-fractional number ");
assert(((R == 2) || (R == 3) || (R == 4)) && "Number of directions R must be '2', '3' or '4' ");
assert((p1 < p2) && "p1 must be always less than p2");
assert((p2 <= 100) && "Maximum value of p2 must be 100 ");
#endif
hls::stream<XF_TNAME(SRC_T, NPC)> _src_l;
hls::stream<XF_TNAME(SRC_T, NPC)> _src_r;
hls::stream<ap_uint<32> > _src_census_l;
hls::stream<ap_uint<32> > _src_census_r;
hls::stream<ap_uint<24> > _src_census24_l;
hls::stream<ap_uint<24> > _src_census24_r;
hls::stream<ap_uint<8> > _cost[PU];
hls::stream<ap_uint<16> > _agg_cost[PU];
hls::stream<XF_TNAME(DST_T, NPC)> _dst;
// clang-format off
#pragma HLS INLINE OFF
#pragma HLS DATAFLOW
// clang-format on
int height = _src_mat_l.rows;
int width = _src_mat_l.cols;
int dheight = _dst_mat.rows;
int dwidth = _dst_mat.cols;
// Reading data from Mat to stream
for (int i = 0; i < height; i++) {
// clang-format off
#pragma HLS LOOP_TRIPCOUNT min=1 max=ROWS
// clang-format on
for (int j = 0; j < width; j++) {
// clang-format off
#pragma HLS LOOP_TRIPCOUNT min=1 max=COLS
#pragma HLS LOOP_FLATTEN off
#pragma HLS PIPELINE
// clang-format on
_src_l.write(_src_mat_l.read(i * width + j));
_src_r.write(_src_mat_r.read(i * width + j));
}
}
xFCensusTransformKernel<ROWS, COLS, XF_DEPTH(SRC_T, NPC), XF_DEPTH(XF_32UC1, NPC), NPC, XF_WORDWIDTH(SRC_T, NPC),
XF_WORDWIDTH(XF_32UC1, NPC)>(_src_l, _src_census_l, WINDOW_SIZE, BORDER_TYPE, height,
width);
xFCensusTransformKernel<ROWS, COLS, XF_DEPTH(SRC_T, NPC), XF_DEPTH(XF_32UC1, NPC), NPC, XF_WORDWIDTH(SRC_T, NPC),
XF_WORDWIDTH(XF_32UC1, NPC)>(_src_r, _src_census_r, WINDOW_SIZE, BORDER_TYPE, height,
width);
for (int i = 0; i < height; i++) {
// clang-format off
#pragma HLS LOOP_TRIPCOUNT min=1 max=ROWS
// clang-format on
for (int j = 0; j < width; j++) {
// clang-format off
#pragma HLS LOOP_TRIPCOUNT min=1 max=COLS
#pragma HLS LOOP_FLATTEN off
#pragma HLS PIPELINE
// clang-format on
_src_census24_l.write((ap_uint<24>)_src_census_l.read());
_src_census24_r.write((ap_uint<24>)_src_census_r.read());
}
}
xFSGBMcomputecost<NDISP, PU, ROWS, COLS>(_src_census24_l, _src_census24_r, _cost, height, width);
xFSGBMoptimization<NDISP, PU, R, ROWS, COLS>(_cost, _agg_cost, height, width, p1, p2);
xfSGBMcomputedisparity<NDISP, PU, ROWS, COLS>(_agg_cost, _dst, height, width);
// write back from stream to Mat
for (int i = 0; i < dheight; i++) {
// clang-format off
#pragma HLS LOOP_TRIPCOUNT min=1 max=ROWS
// clang-format on
for (int j = 0; j < dwidth; j++) {
// clang-format off
#pragma HLS LOOP_TRIPCOUNT min=1 max=COLS
#pragma HLS LOOP_FLATTEN off
#pragma HLS PIPELINE
// clang-format on
_dst_mat.write(i * dwidth + j, _dst.read());
}
}
}
} // namespace cv
} // namespace xf
#endif