Program Listing for File xf_pyr_dense_optical_flow_scale.hpp
↰ Return to documentation for file (/tmp/ws/src/vitis_common/include/video/xf_pyr_dense_optical_flow_scale.hpp)
/*
* Copyright 2019 Xilinx, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef __XF_PYR_DENSE_OPTICAL_FLOW_SCALE__
#define __XF_PYR_DENSE_OPTICAL_FLOW_SCALE__
// Conditionally pulls one row of samples from inStrm into the line buffer buf.
//
// The output row inCurrRow maps to the (fractional) input-row position
// inCurrRow * scaleI. The fractional part of that position is the vertical
// bilinear-interpolation weight and is always written to fracI. A row of
// `cols` samples is read only when both of the following hold:
//   * the row below the mapped position has not been fetched yet
//     (floor + 1 > prevIceil), or one of the first two output rows is being
//     prepared (inCurrRow < 2);
//   * floor < rows - 1.
//
// inStrm     : source stream; `cols` samples are consumed when a row is read
// buf        : line buffer; entries [0, cols) are overwritten when a row is read
// rows       : row count used to bound the reads (see the condition above)
// cols       : number of samples read per row
// flagLoaded : out - true when a row was read during this call, false otherwise
// inCurrRow  : index of the output row being prepared
// scaleI     : factor mapping an output-row index to an input-row position
// fracI      : out - fractional part of the mapped input-row position
// prevIceil  : in/out - set to floor + 1 whenever a row is read
template <int MAXWIDTH, int FLOW_WIDTH, int FLOW_INT, int SCCMP_WIDTH, int SCCMP_INT, int SCALE_WIDTH, int SCALE_INT>
void load_data(hls::stream<ap_fixed<FLOW_WIDTH, FLOW_INT> >& inStrm,
               ap_fixed<FLOW_WIDTH, FLOW_INT> buf[MAXWIDTH],
               int rows,
               int cols,
               bool& flagLoaded,
               int inCurrRow,
               ap_ufixed<SCALE_WIDTH, SCALE_INT> scaleI,
               ap_fixed<SCCMP_WIDTH, SCCMP_INT>& fracI,
               int& prevIceil) {
// clang-format off
#pragma HLS inline off
    // clang-format on

    // Position of the current output row expressed in input-row coordinates,
    // split into its integer index and its fractional remainder.
    ap_fixed<SCCMP_WIDTH, SCCMP_INT> srcRowPos = inCurrRow * scaleI;
    const int srcRowFloor = static_cast<int>(srcRowPos);
    fracI = srcRowPos - ap_fixed<SCCMP_WIDTH, SCCMP_INT>(srcRowFloor);

    // Bilinear interpolation needs two rows: fetch another one when the lower
    // row is not buffered yet (or during start-up), as long as the bound on
    // `rows` still allows it.
    const bool lowerRowMissing = (inCurrRow < 2) || (srcRowFloor + 1 > prevIceil);
    const bool withinInput = srcRowFloor < rows - 1;
    const bool doRead = lowerRowMissing && withinInput;

    // Report to the caller whether buf was refreshed by this call.
    flagLoaded = doRead;

    if (doRead) {
        for (int col = 0; col < cols; ++col) {
// clang-format off
#pragma HLS LOOP_TRIPCOUNT min=1 max=MAXWIDTH
#pragma HLS pipeline ii=1
#pragma HLS LOOP_FLATTEN OFF
            // clang-format on
            buf[col] = inStrm.read();
        }
        // Remember the index of the row that is now the lower neighbour.
        prevIceil = srcRowFloor + 1;
    }
} // end load_data()
// Bilinear blend of four neighbouring samples.
//
// With fi = fracI and fj = fracJ the weights are
//   p0 = 1 - p1 - p2 - p3   (== (1 - fi) * (1 - fj))   applied to i0
//   p1 = fj - fi * fj       (== (1 - fi) * fj)         applied to i1
//   p2 = fi - fi * fj       (== fi * (1 - fj))         applied to i2
//   p3 = fi * fj                                       applied to i3
// and the return value is i0*p0 + i1*p1 + i2*p2 + i3*p3 converted back to
// ap_fixed<FLOW_WIDTH, FLOW_INT>.
//
// fracI, fracJ : interpolation fractions; callers in this file pass values in
//                [0, 1) and also the exact value 1 for border rows/columns
// i0 .. i3     : the four samples being blended (see weights above)
//
// The weights carry 17 fractional bits (35 for the fi*fj product). They use
// TWO integer bits (sign + one magnitude bit): with a single integer bit, as
// in ap_fixed<18, 1>, the representable range is [-1, 1), so a fraction of
// exactly 1.0 wraps to -1.0 under the default AP_WRAP overflow mode and the
// blended sample comes out with the wrong sign. The number of fractional bits
// is unchanged, so results for fractions in [0, 1) are unaffected.
//
// RMAPPX_WIDTH and RMAPPX_INT are not used by this function; they are kept so
// existing explicit instantiations continue to compile.
template <int FLOW_WIDTH, int FLOW_INT, int SCCMP_WIDTH, int SCCMP_INT, int RMAPPX_WIDTH, int RMAPPX_INT>
ap_fixed<FLOW_WIDTH, FLOW_INT> compute_result(ap_fixed<SCCMP_WIDTH, SCCMP_INT> fracI,
                                              ap_fixed<SCCMP_WIDTH, SCCMP_INT> fracJ,
                                              ap_fixed<FLOW_WIDTH, FLOW_INT> i0,
                                              ap_fixed<FLOW_WIDTH, FLOW_INT> i1,
                                              ap_fixed<FLOW_WIDTH, FLOW_INT> i2,
                                              ap_fixed<FLOW_WIDTH, FLOW_INT> i3) {
// clang-format off
#pragma HLS inline off
    // clang-format on

    // Fractions narrowed to the weight format (17 fractional bits, range [-2, 2)).
    ap_fixed<19, 2> fi = (fracI);
    ap_fixed<19, 2> fj = (fracJ);

    // Product kept at 35 fractional bits before being narrowed to a weight.
    ap_fixed<37, 2> fij = (ap_fixed<37, 2>)fi * (ap_fixed<37, 2>)fj;

    ap_fixed<19, 2> p3 = (ap_fixed<19, 2>)fij;
    ap_fixed<19, 2> p2 = (ap_fixed<19, 2>)((ap_fixed<37, 2>)fi - fij);
    ap_fixed<19, 2> p1 = (ap_fixed<19, 2>)((ap_fixed<37, 2>)fj - fij);
    // Derive p0 from the narrowed weights so that the four weights sum to exactly 1.
    ap_fixed<21, 4> p0 = ap_fixed<21, 4>(1.0) - ap_fixed<21, 4>(p1) - ap_fixed<21, 4>(p2) - ap_fixed<21, 4>(p3);

    // Accumulate with two extra integer bits, then convert to the flow format.
    ap_fixed<FLOW_WIDTH + 2, FLOW_INT + 2> resIf =
        (ap_fixed<FLOW_WIDTH + 2, FLOW_INT + 2>)i0 * p0 + (ap_fixed<FLOW_WIDTH + 2, FLOW_INT + 2>)i1 * p1 +
        (ap_fixed<FLOW_WIDTH + 2, FLOW_INT + 2>)i2 * p2 + (ap_fixed<FLOW_WIDTH + 2, FLOW_INT + 2>)i3 * p3;
    return (ap_fixed<FLOW_WIDTH, FLOW_INT>)resIf;
} // end compute_result()
// Produces one output row: for each of the outCols output columns it selects up to four input samples
// (i0..i3), interpolates them with compute_result() and writes the result, shifted left by one bit, to outStrm.
//
// buf        : line buffer holding the most recently loaded input row (read only when needed, see below)
// buffer     : two-row history; buffer[1] receives the samples copied from buf and buffer[0] receives the
//              samples previously held in buffer[1]
// outRows    : number of output rows; only used to detect the last row (row >= outRows - 1)
// outCols    : number of output samples written by this call
// outStrm    : destination stream
// flagLoaded : when true (and 0 < row < outRows - 1) samples are taken from buf and the history in buffer is
//              shifted; when false the samples already held in buffer are reused
// row        : index of the output row being produced
// scaleI     : factor mapping the output-row index to an input-row position
// scaleJ     : factor mapping the output-column index to an input-column position
// fracI      : NOTE(review): this parameter is shadowed by the loop-local fracI declared inside L3, so the
//              value passed by the caller is never read
// mul        : NOTE(review): not referenced anywhere in this function
//
// MAXHEIGHT is likewise not referenced in the body.
template <unsigned short MAXHEIGHT,
unsigned short MAXWIDTH,
int FLOW_WIDTH,
int FLOW_INT,
int SCCMP_WIDTH,
int SCCMP_INT,
int RMAPPX_WIDTH,
int RMAPPX_INT,
int SCALE_WIDTH,
int SCALE_INT>
void process(ap_fixed<FLOW_WIDTH, FLOW_INT> buf[MAXWIDTH],
ap_fixed<FLOW_WIDTH, FLOW_INT> buffer[2][MAXWIDTH],
unsigned short int outRows,
unsigned short int outCols,
hls::stream<ap_fixed<FLOW_WIDTH, FLOW_INT> >& outStrm,
bool flagLoaded,
int row,
ap_ufixed<SCALE_WIDTH, SCALE_INT> scaleI,
ap_ufixed<SCALE_WIDTH, SCALE_INT> scaleJ,
ap_fixed<SCCMP_WIDTH, SCCMP_INT> fracI,
int mul) {
// clang-format off
#pragma HLS array_partition variable=buffer dim=1 complete
#pragma HLS inline off
// clang-format on
// index of the next input column to fetch from buf / buffer
int bufCount = 0;
// NOTE(review): regLoad is declared but never used
ap_fixed<FLOW_WIDTH, FLOW_INT> regLoad;
// input-column index at which the next fetch is due; a fetch happens when it equals the current jSmallFloor
int prevJceil = -1;
// the four interpolation taps; they persist across loop iterations and start at 0 for every call
ap_fixed<FLOW_WIDTH, FLOW_INT> i0 = 0, i1 = 0, i2 = 0, i3 = 0;
L3:
for (ap_uint<16> j = 0; j < outCols; j++) {
// clang-format off
#pragma HLS LOOP_TRIPCOUNT min=1 max=MAXWIDTH
#pragma HLS pipeline
#pragma HLS LOOP_FLATTEN OFF
#pragma HLS DEPENDENCE variable=buffer array inter false
// clang-format on
// calculate the current input column index needed for the current output
ap_fixed<SCCMP_WIDTH, SCCMP_INT> jSmall = j * scaleJ;
// integer part
int jSmallFloor = (int)jSmall;
// calculate the current input row index needed for the current output
// (depends only on row and scaleI, so it has the same value in every iteration)
ap_fixed<SCCMP_WIDTH, SCCMP_INT> iSmall = row * scaleI;
// integer part
int iSmallFloor = (int)iSmall;
// fractional index
// (this local fracI hides the function parameter of the same name)
ap_fixed<SCCMP_WIDTH, SCCMP_INT> fracI = iSmall - (ap_fixed<SCCMP_WIDTH, SCCMP_INT>)iSmallFloor;
ap_fixed<SCCMP_WIDTH, SCCMP_INT> fracJ = jSmall - (ap_fixed<SCCMP_WIDTH, SCCMP_INT>)jSmallFloor;
// copy the input buffer buf into the internal buffer 'buffer' while shifting the row of the buffer up
// i.e., buffer[0][column] = buffer[1][column]; buffer[1][column] = current read value
// for the first row
if (row == 0) {
// only one row is available to process hence fractional index is 1
fracI = 1;
// when column count is 0, for the first pixel the left pixel i1 = 0 and all the other pixels are 0
// only when the prevJceil is equal to the current column index, i.e., when another pixel is needed for
// computing the next pixel, a pixel is read or no pixel is read from the input. each iteration, i2 = i3 and
// i3 = current read value, top row is 0, hence i1 and i0 are always 0
if (j == 0) {
ap_fixed<FLOW_WIDTH, FLOW_INT> reg = buf[bufCount];
buffer[1][bufCount] = reg;
i3 = reg;
fracI = 1;
fracJ = 1;
bufCount++;
prevJceil = 0;
} else if (j < outCols) {
// always taken for j != 0, because j < outCols is the loop condition
if (prevJceil == jSmallFloor) {
i2 = i3;
ap_fixed<FLOW_WIDTH, FLOW_INT> reg = buf[bufCount];
buffer[1][bufCount] = reg;
i3 = reg;
bufCount++;
prevJceil = jSmallFloor + 1;
}
} else {
// NOTE(review): unreachable - j < outCols always holds inside the loop body
i3 = buffer[1][bufCount - 1];
fracI = 1;
fracJ = 1;
}
}
// rows > 0 are processed, i0 and i2 are previous i1 and i3 and the current i1 and i3 are the current column
// reads. again, the internal buffer is loaded with the input buf values. This happens only when a input row is
// read during the previous iteration
else if (row < outRows - 1) {
if (j == 0) {
// first column: the left-hand taps are zero and the column fraction is forced to 1
i0 = 0;
i2 = 0;
fracJ = 1;
if (flagLoaded) {
// a new input row is available in buf: shift the history up by one row while reading
ap_fixed<FLOW_WIDTH, FLOW_INT> reg = buf[bufCount];
ap_fixed<FLOW_WIDTH, FLOW_INT> tmp = buffer[1][bufCount];
buffer[0][bufCount] = tmp;
i1 = tmp;
buffer[1][bufCount] = reg;
i3 = reg;
bufCount++;
} else {
// no new input row: reuse the two rows already held in buffer
i1 = buffer[0][bufCount];
i3 = buffer[1][bufCount];
bufCount++;
}
prevJceil = 0;
} else if (j < outCols) {
// always taken for j != 0, because j < outCols is the loop condition
if (prevJceil == jSmallFloor) {
// advance one input column: the current taps become the left-hand taps
i0 = i1;
i2 = i3;
if (flagLoaded) {
ap_fixed<FLOW_WIDTH, FLOW_INT> reg = buf[bufCount];
ap_fixed<FLOW_WIDTH, FLOW_INT> tmp = buffer[1][bufCount];
buffer[0][bufCount] = tmp;
i1 = tmp;
buffer[1][bufCount] = reg;
i3 = reg;
bufCount++;
} else {
i1 = buffer[0][bufCount];
i3 = buffer[1][bufCount];
bufCount++;
}
prevJceil = jSmallFloor + 1;
}
} else {
// NOTE(review): unreachable - j < outCols always holds inside the loop body
fracJ = 1;
}
}
// for the final row, only one row is processed, the fracI index is always 1. i2 = previous iteration's i3 and
// i3 is the current buf read.
// (in this branch samples are read from buffer[1] only; buf and buffer[0] are not accessed)
else {
if (j == 0) {
i3 = buffer[1][bufCount];
fracI = 1;
fracJ = 1;
bufCount++;
prevJceil = 0;
} else if (j < outCols) {
// always taken for j != 0, because j < outCols is the loop condition
if (prevJceil == jSmallFloor) {
i2 = i3;
ap_fixed<FLOW_WIDTH, FLOW_INT> reg = buffer[1][bufCount];
i3 = reg;
bufCount++;
prevJceil = jSmallFloor + 1;
}
fracI = 1;
} else {
// NOTE(review): unreachable - j < outCols always holds inside the loop body
i3 = buffer[1][bufCount - 1];
fracI = 1;
fracJ = 1;
}
} // end else
// bilinear interpolation equation.
ap_fixed<FLOW_WIDTH, FLOW_INT> resIf =
compute_result<FLOW_WIDTH, FLOW_INT, SCCMP_WIDTH, SCCMP_INT, RMAPPX_WIDTH, RMAPPX_INT>(fracI, fracJ, i0, i1,
i2, i3);
// multiply the interpolation result by 2 as the image is scaled up by a factor of about 2 and the pixel
// displacements are scaled up by a factor of 2 too.
outStrm.write(resIf << 1);
} // end L3
} // end process()
// Top-level scaling stage for a stream of flow samples.
//
// When scale_up_flag is false the stage is a bypass: outRows * outCols samples
// are copied unchanged from inStrm to outStrm.
//
// When scale_up_flag is true the stage alternates two line buffers (buf0/buf1):
// in each iteration load_data() may fetch the next input row into one buffer
// while process() emits an output row from the other one, and a final
// process() call emits the last output row (outRows calls in total).
//
// inStrm        : input sample stream
// outStrm       : output sample stream
// inRows/inCols : forwarded to load_data() as its row and column counts
// outRows       : number of output rows to emit
// outCols       : number of samples per output row
// mul           : forwarded unchanged to process()
// scale_up_flag : false selects the bypass path, true the scaling path
// scale_comp    : converted to ap_ufixed<SCALE_WIDTH, SCALE_INT> and used as
//                 both the row and the column scale factor
//
// USE_URAM selects array_reshape plus RAM_S2P_URAM resources for the buffers;
// otherwise `buffer` is completely partitioned along its first dimension.
template <unsigned short MAXHEIGHT,
          unsigned short MAXWIDTH,
          int FLOW_WIDTH,
          int FLOW_INT,
          int SCCMP_WIDTH,
          int SCCMP_INT,
          int RMAPPX_WIDTH,
          int RMAPPX_INT,
          int SCALE_WIDTH,
          int SCALE_INT,
          bool USE_URAM>
void scale_up(hls::stream<ap_fixed<FLOW_WIDTH, FLOW_INT> >& inStrm,
              hls::stream<ap_fixed<FLOW_WIDTH, FLOW_INT> >& outStrm,
              unsigned short int inRows,
              unsigned short int inCols,
              unsigned short int outRows,
              unsigned short int outCols,
              int mul,
              const bool scale_up_flag,
              float scale_comp) {
// clang-format off
#pragma HLS inline off
    // clang-format on
    // Buffer to store two rows of the input image. These rows are updated in the process function
    ap_fixed<FLOW_WIDTH, FLOW_INT> buffer[2][MAXWIDTH];
    if (USE_URAM) {
// clang-format off
#pragma HLS array_reshape variable=buffer dim=1 complete
        // clang-format on
    } else {
// clang-format off
#pragma HLS array_partition variable=buffer dim=1 complete
        // clang-format on
    }
    // buf0 and buf1 are used as ping pong buffers to read and process. While one buffer is used to read the input
    // image, the other buffer is copied into the buffer variable declared above
    ap_fixed<FLOW_WIDTH, FLOW_INT> buf0[MAXWIDTH], buf1[MAXWIDTH];
    if (USE_URAM) {
// clang-format off
#pragma HLS array_reshape variable=buf0 dim=1 complete
#pragma HLS array_reshape variable=buf1 dim=1 complete
#pragma HLS RESOURCE variable=buffer core=RAM_S2P_URAM
#pragma HLS RESOURCE variable=buf0 core=RAM_S2P_URAM
#pragma HLS RESOURCE variable=buf1 core=RAM_S2P_URAM
        // clang-format on
    }
    // Copy input scale into the following variables (the same factor is used for rows and columns)
    ap_ufixed<SCALE_WIDTH, SCALE_INT> scaleI = (ap_ufixed<SCALE_WIDTH, SCALE_INT>)scale_comp;
    ap_ufixed<SCALE_WIDTH, SCALE_INT> scaleJ = (ap_ufixed<SCALE_WIDTH, SCALE_INT>)scale_comp;
#if DEBUG
    cout << "Scale Flag: " << scale_up_flag << "\n";
    cout << "Scale Comp: " << scale_comp << "\n";
    cout << "Scale: " << float(scaleJ) << " " << float(scaleI) << "\n";
#endif
    // Variables to store the bilinear interpolation weights reported by load_data for each ping-pong buffer
    ap_fixed<SCCMP_WIDTH, SCCMP_INT> fracI0, fracI1;
    // flags to mark if the corresponding buffer was refreshed by the last load_data call
    bool flagLoaded0, flagLoaded1;
    // ping-pong operation flag: 0 -> process buf0 / load buf1, 1 -> process buf1 / load buf0
    bool flag = 0;
    // if the input scale-up flag is 0, i.e., if this module needs to be bypassed, the input stream is copied to the
    // output stream
    if (scale_up_flag == 0) {
        for (ap_uint<16> i = 0; i < outRows; i++) {
// clang-format off
#pragma HLS LOOP_TRIPCOUNT min=1 max=MAXHEIGHT
            // clang-format on
            for (ap_uint<16> j = 0; j < outCols; j++) {
// clang-format off
#pragma HLS LOOP_TRIPCOUNT min=1 max=MAXWIDTH
#pragma HLS pipeline II=1
#pragma HLS LOOP_FLATTEN OFF
                // clang-format on
                outStrm.write((ap_fixed<FLOW_WIDTH, FLOW_INT>)inStrm.read());
            }
        }
    }
    // Scale up enabled
    else {
        int prevIceil = -1;
        // load first row into the buf0 so that the output processing can have two rows at the same time.
        load_data<MAXWIDTH, FLOW_WIDTH, FLOW_INT, SCCMP_WIDTH, SCCMP_INT, SCALE_WIDTH, SCALE_INT>(
            inStrm, buf0, inRows, inCols, flagLoaded0, 0, scaleI, fracI0, prevIceil);
    // run the ping pong buffer for outRows -1 times
    L2:
        for (ap_uint<16> i = 0; i < outRows - 1; i++) {
// clang-format off
#pragma HLS LOOP_TRIPCOUNT min=1 max=MAXHEIGHT
            // clang-format on
            if (flag == 0) {
                // prefetch for output row i + 1 into buf1 while output row i is produced from buf0
                load_data<MAXWIDTH, FLOW_WIDTH, FLOW_INT, SCCMP_WIDTH, SCCMP_INT, SCALE_WIDTH, SCALE_INT>(
                    inStrm, buf1, inRows, inCols, flagLoaded1, i + 1, scaleI, fracI1, prevIceil);
                process<MAXHEIGHT, MAXWIDTH, FLOW_WIDTH, FLOW_INT, SCCMP_WIDTH, SCCMP_INT, RMAPPX_WIDTH, RMAPPX_INT,
                        SCALE_WIDTH, SCALE_INT>(buf0, buffer, outRows, outCols, outStrm, flagLoaded0, i, scaleI, scaleJ,
                                                fracI0, mul);
                flag = 1;
            } else {
                // roles swapped: prefetch into buf0 while output row i is produced from buf1
                load_data<MAXWIDTH, FLOW_WIDTH, FLOW_INT, SCCMP_WIDTH, SCCMP_INT, SCALE_WIDTH, SCALE_INT>(
                    inStrm, buf0, inRows, inCols, flagLoaded0, i + 1, scaleI, fracI0, prevIceil);
                process<MAXHEIGHT, MAXWIDTH, FLOW_WIDTH, FLOW_INT, SCCMP_WIDTH, SCCMP_INT, RMAPPX_WIDTH, RMAPPX_INT,
                        SCALE_WIDTH, SCALE_INT>(buf1, buffer, outRows, outCols, outStrm, flagLoaded1, i, scaleI, scaleJ,
                                                fracI1, mul);
                flag = 0;
            }
        } // end L2
        // Emit the last output row from whichever buffer is due. SCALE_WIDTH / SCALE_INT are spelled out
        // here as well so that every process<> call in this function names the same specialization.
        if (flag == 0) {
            process<MAXHEIGHT, MAXWIDTH, FLOW_WIDTH, FLOW_INT, SCCMP_WIDTH, SCCMP_INT, RMAPPX_WIDTH, RMAPPX_INT,
                    SCALE_WIDTH, SCALE_INT>(buf0, buffer, outRows, outCols, outStrm, flagLoaded0, outRows - 1, scaleI,
                                            scaleJ, fracI0, mul);
        } else {
            process<MAXHEIGHT, MAXWIDTH, FLOW_WIDTH, FLOW_INT, SCCMP_WIDTH, SCCMP_INT, RMAPPX_WIDTH, RMAPPX_INT,
                    SCALE_WIDTH, SCALE_INT>(buf1, buffer, outRows, outCols, outStrm, flagLoaded1, outRows - 1, scaleI,
                                            scaleJ, fracI1, mul);
        }
    }
} // end scale_up
#endif