
.. _program_listing_file__tmp_ws_src_vitis_common_include_video_xf_pyr_dense_optical_flow_scale.hpp:

Program Listing for File xf_pyr_dense_optical_flow_scale.hpp
============================================================

|exhale_lsh| :ref:`Return to documentation for file <file__tmp_ws_src_vitis_common_include_video_xf_pyr_dense_optical_flow_scale.hpp>` (``/tmp/ws/src/vitis_common/include/video/xf_pyr_dense_optical_flow_scale.hpp``)

.. |exhale_lsh| unicode:: U+021B0 .. UPWARDS ARROW WITH TIP LEFTWARDS

.. Review note - the previous revision of this page had lost its line breaks and
   every angle-bracket span that starts with a letter (template parameter lists,
   ap_fixed/ap_ufixed/hls::stream arguments, explicit template arguments at call
   sites, the ref target above). They are reconstructed below; the template
   parameter names and their order, in particular for scale_up which is called
   from outside this file, must be confirmed against the real header.

.. code-block:: cpp

   /*
    * Copyright 2019 Xilinx, Inc.
    *
    * Licensed under the Apache License, Version 2.0 (the "License");
    * you may not use this file except in compliance with the License.
    * You may obtain a copy of the License at
    *
    *     http://www.apache.org/licenses/LICENSE-2.0
    *
    * Unless required by applicable law or agreed to in writing, software
    * distributed under the License is distributed on an "AS IS" BASIS,
    * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    * See the License for the specific language governing permissions and
    * limitations under the License.
    */

   #ifndef __XF_PYR_DENSE_OPTICAL_FLOW_SCALE__
   #define __XF_PYR_DENSE_OPTICAL_FLOW_SCALE__

   // Conditionally reads one input row (cols samples) from inStrm into buf.
   //
   //   inStrm     input sample stream
   //   buf        destination row buffer, filled for indices [0, cols)
   //   rows/cols  input image dimensions
   //   flagLoaded out: 1 when a row was read from inStrm during this call, else 0
   //   inCurrRow  output row index the load is being prepared for
   //   scaleI     row scale factor; inCurrRow * scaleI is the input row coordinate
   //   fracI      out: fractional part of that input row coordinate
   //   prevIceil  in/out: set to floor(coordinate) + 1 whenever a row is read
   template <unsigned short MAXHEIGHT,
             unsigned short MAXWIDTH,
             int FLOW_WIDTH,
             int FLOW_INT,
             int SCCMP_WIDTH,
             int SCCMP_INT,
             int SCALE_WIDTH,
             int SCALE_INT>
   void load_data(hls::stream<ap_fixed<FLOW_WIDTH, FLOW_INT> >& inStrm,
                  ap_fixed<FLOW_WIDTH, FLOW_INT> buf[MAXWIDTH],
                  int rows,
                  int cols,
                  bool& flagLoaded,
                  int inCurrRow,
                  ap_ufixed<SCALE_WIDTH, SCALE_INT> scaleI,
                  ap_fixed<SCCMP_WIDTH, SCCMP_INT>& fracI,
                  int& prevIceil) {
   // clang-format off
   #pragma HLS inline off
       // clang-format on
       // Calculate the input row needed to compute the current output
       ap_fixed<SCCMP_WIDTH, SCCMP_INT> iSmall = inCurrRow * scaleI;
       // integer index of the input row needed to compute the output row
       int iSmallFloor = (int)iSmall;
       // fractional value of the input row, i.e., weight needed for bilinear interpolation
       fracI = iSmall - (ap_fixed<SCCMP_WIDTH, SCCMP_INT>)iSmallFloor;
       // two rows are needed for bilinear interpolation. So, if the second row is not already in the buffer, read another
       // row. this is also enabled when the row count is less than 2
       if ((iSmallFloor + 1 > prevIceil || inCurrRow < 2) && (iSmallFloor < rows - 1)) {
           // setting a flag that the input is read
           flagLoaded = 1;
           for (int i = 0; i < cols; i++) {
   // clang-format off
   #pragma HLS LOOP_TRIPCOUNT min=1 max=MAXWIDTH
   #pragma HLS pipeline ii=1
   #pragma HLS LOOP_FLATTEN OFF
               // clang-format on
               buf[i] = inStrm.read();
           }
           // after the read, increment the input row by 1
           prevIceil = iSmallFloor + 1;
       } else {
           // setting a flag that the input is not read
           flagLoaded = 0;
       }
   } // end load_data()

   // Bilinear blend of four samples.
   //
   // With fi = fracI and fj = fracJ the weights are
   //   p3 = fi * fj, p2 = fi - fi * fj, p1 = fj - fi * fj, p0 = 1 - p1 - p2 - p3
   // and the result is i0 * p0 + i1 * p1 + i2 * p2 + i3 * p3. In process(), i0/i1 are
   // taken from buffer[0] (previous/current column) and i2/i3 from buffer[1].
   //
   // NOTE(review): process() passes fracI/fracJ == 1 on border rows and columns. If the
   // single integer bit of ap_fixed<18, 1> is the sign bit (as the Vitis HLS ap_fixed
   // documentation describes), +1.0 is not representable in fi/fj - confirm the intended
   // overflow behaviour.
   template <int FLOW_WIDTH, int FLOW_INT, int SCCMP_WIDTH, int SCCMP_INT, int RMAPPX_WIDTH, int RMAPPX_INT>
   ap_fixed<FLOW_WIDTH, FLOW_INT> compute_result(ap_fixed<SCCMP_WIDTH, SCCMP_INT> fracI,
                                                 ap_fixed<SCCMP_WIDTH, SCCMP_INT> fracJ,
                                                 ap_fixed<FLOW_WIDTH, FLOW_INT> i0,
                                                 ap_fixed<FLOW_WIDTH, FLOW_INT> i1,
                                                 ap_fixed<FLOW_WIDTH, FLOW_INT> i2,
                                                 ap_fixed<FLOW_WIDTH, FLOW_INT> i3) {
   // clang-format off
   #pragma HLS inline off
       // clang-format on
       ap_fixed<18, 1> fi = (fracI);
       ap_fixed<18, 1> fj = (fracJ);
       ap_fixed<36, 1> fij = (ap_fixed<36, 1>)fi * (ap_fixed<36, 1>)fj;
       ap_fixed<18, 1> p3 = (ap_fixed<18, 1>)fij;
       ap_fixed<18, 1> p2 = (ap_fixed<18, 1>)((ap_fixed<36, 1>)fi - fij);
       ap_fixed<18, 1> p1 = (ap_fixed<18, 1>)((ap_fixed<36, 1>)fj - fij);
       ap_fixed<21, 4> p0 = ap_fixed<21, 4>(1.0) - ap_fixed<21, 4>(p1) - ap_fixed<21, 4>(p2) - ap_fixed<21, 4>(p3);

       ap_fixed<RMAPPX_WIDTH, RMAPPX_INT> resIf =
           (ap_fixed<RMAPPX_WIDTH, RMAPPX_INT>)i0 * p0 + (ap_fixed<RMAPPX_WIDTH, RMAPPX_INT>)i1 * p1 +
           (ap_fixed<RMAPPX_WIDTH, RMAPPX_INT>)i2 * p2 + (ap_fixed<RMAPPX_WIDTH, RMAPPX_INT>)i3 * p3;
       return (ap_fixed<FLOW_WIDTH, FLOW_INT>)resIf;
   } // end compute_result()

   // Produces one output row: writes outCols interpolated samples to outStrm.
   //
   //   buf        row most recently filled by load_data()
   //   buffer     two-row working store; buffer[1] receives buf and buffer[0] the row
   //              that was previously in buffer[1], but only when flagLoaded is set
   //   outRows/outCols  output image dimensions
   //   outStrm    output sample stream; every result is written shifted left by 1
   //   flagLoaded whether buf holds a freshly read row
   //   row        output row index being produced
   //   scaleI/scaleJ    row / column scale factors
   //   fracI      NOTE(review): shadowed by a local of the same name inside the loop,
   //              so the value passed by the caller is never read
   //   mul        NOTE(review): not used in this function
   template <unsigned short MAXHEIGHT,
             unsigned short MAXWIDTH,
             int FLOW_WIDTH,
             int FLOW_INT,
             int SCCMP_WIDTH,
             int SCCMP_INT,
             int RMAPPX_WIDTH,
             int RMAPPX_INT,
             int SCALE_WIDTH,
             int SCALE_INT>
   void process(ap_fixed<FLOW_WIDTH, FLOW_INT> buf[MAXWIDTH],
                ap_fixed<FLOW_WIDTH, FLOW_INT> buffer[2][MAXWIDTH],
                unsigned short int outRows,
                unsigned short int outCols,
                hls::stream<ap_fixed<FLOW_WIDTH, FLOW_INT> >& outStrm,
                bool flagLoaded,
                int row,
                ap_ufixed<SCALE_WIDTH, SCALE_INT> scaleI,
                ap_ufixed<SCALE_WIDTH, SCALE_INT> scaleJ,
                ap_fixed<SCCMP_WIDTH, SCCMP_INT> fracI,
                int mul) {
   // clang-format off
   #pragma HLS array_partition variable=buffer dim=1 complete
   #pragma HLS inline off
       // clang-format on
       int bufCount = 0;
       // NOTE(review): regLoad is never used below
       ap_fixed<FLOW_WIDTH, FLOW_INT> regLoad;
       int prevJceil = -1;
       ap_fixed<FLOW_WIDTH, FLOW_INT> i0 = 0, i1 = 0, i2 = 0, i3 = 0;
   L3:
       for (ap_uint<16> j = 0; j < outCols; j++) {
   // clang-format off
   #pragma HLS LOOP_TRIPCOUNT min=1 max=MAXWIDTH
   #pragma HLS pipeline
   #pragma HLS LOOP_FLATTEN OFF
   #pragma HLS DEPENDENCE variable=buffer array inter false
           // clang-format on
           // calculate the current input column index needed for the current output
           ap_fixed<SCCMP_WIDTH, SCCMP_INT> jSmall = j * scaleJ;
           // integer part
           int jSmallFloor = (int)jSmall;
           // calculate the current input row index needed for the current output
           ap_fixed<SCCMP_WIDTH, SCCMP_INT> iSmall = row * scaleI;
           // integer part
           int iSmallFloor = (int)iSmall;
           // fractional index
           ap_fixed<SCCMP_WIDTH, SCCMP_INT> fracI = iSmall - (ap_fixed<SCCMP_WIDTH, SCCMP_INT>)iSmallFloor;
           ap_fixed<SCCMP_WIDTH, SCCMP_INT> fracJ = jSmall - (ap_fixed<SCCMP_WIDTH, SCCMP_INT>)jSmallFloor;

           // copy the input buffer buf into the internal buffer 'buffer' while shifting the row of the buffer up
           // i.e., buffer[0][column] = buffer[1][column]; buffer[1][column] = current read value
           // for the first row
           if (row == 0) {
               // only one row is available to process hence fractional index is 1
               fracI = 1;
               // when column count is 0, for the first pixel the left pixel i1 = 0 and all the other pixels are 0
               // only when the prevJceil is equal to the current column index, i.e., when another pixel is needed for
               // computing the next pixel, a pixel is read or no pixel is read from the input. each iteration, i2 = i3 and
               // i3 = current read value, top row is 0, hence i1 and i0 are always 0
               if (j == 0) {
                   ap_fixed<FLOW_WIDTH, FLOW_INT> reg = buf[bufCount];
                   buffer[1][bufCount] = reg;
                   i3 = reg;
                   fracI = 1;
                   fracJ = 1;
                   bufCount++;
                   prevJceil = 0;
               } else if (j < outCols) {
                   if (prevJceil == jSmallFloor) {
                       i2 = i3;
                       ap_fixed<FLOW_WIDTH, FLOW_INT> reg = buf[bufCount];
                       buffer[1][bufCount] = reg;
                       i3 = reg;
                       bufCount++;
                       prevJceil = jSmallFloor + 1;
                   }
               } else {
                   // NOTE(review): `j < outCols` is also the loop condition, so this arm looks unreachable
                   i3 = buffer[1][bufCount - 1];
                   fracI = 1;
                   fracJ = 1;
               }
           }
           // rows > 0 are processed, i0 and i2 are previous i1 and i3 and the current i1 and i3 are the current column
           // reads. again, the internal buffer is loaded with the input buf values. This happens only when an input row is
           // read during the previous iteration
           else if (row < outRows - 1) {
               if (j == 0) {
                   i0 = 0;
                   i2 = 0;
                   fracJ = 1;
                   if (flagLoaded) {
                       ap_fixed<FLOW_WIDTH, FLOW_INT> reg = buf[bufCount];
                       ap_fixed<FLOW_WIDTH, FLOW_INT> tmp = buffer[1][bufCount];
                       buffer[0][bufCount] = tmp;
                       i1 = tmp;
                       buffer[1][bufCount] = reg;
                       i3 = reg;
                       bufCount++;
                   } else {
                       i1 = buffer[0][bufCount];
                       i3 = buffer[1][bufCount];
                       bufCount++;
                   }
                   prevJceil = 0;
               } else if (j < outCols) {
                   if (prevJceil == jSmallFloor) {
                       i0 = i1;
                       i2 = i3;
                       if (flagLoaded) {
                           ap_fixed<FLOW_WIDTH, FLOW_INT> reg = buf[bufCount];
                           ap_fixed<FLOW_WIDTH, FLOW_INT> tmp = buffer[1][bufCount];
                           buffer[0][bufCount] = tmp;
                           i1 = tmp;
                           buffer[1][bufCount] = reg;
                           i3 = reg;
                           bufCount++;
                       } else {
                           i1 = buffer[0][bufCount];
                           i3 = buffer[1][bufCount];
                           bufCount++;
                       }
                       prevJceil = jSmallFloor + 1;
                   }
               } else {
                   // NOTE(review): looks unreachable for the same reason as in the first-row case
                   fracJ = 1;
               }
           }
           // for the final row, only one row is processed, the fracI index is always 1. i2 = previous iteration's i3 and
           // i3 is the current buf read.
           else {
               if (j == 0) {
                   i3 = buffer[1][bufCount];
                   fracI = 1;
                   fracJ = 1;
                   bufCount++;
                   prevJceil = 0;
               } else if (j < outCols) {
                   if (prevJceil == jSmallFloor) {
                       i2 = i3;
                       ap_fixed<FLOW_WIDTH, FLOW_INT> reg = buffer[1][bufCount];
                       i3 = reg;
                       bufCount++;
                       prevJceil = jSmallFloor + 1;
                   }
                   fracI = 1;
               } else {
                   // NOTE(review): looks unreachable for the same reason as in the first-row case
                   i3 = buffer[1][bufCount - 1];
                   fracI = 1;
                   fracJ = 1;
               }
           } // end else

           // bilinear interpolation equation.
           ap_fixed<FLOW_WIDTH, FLOW_INT> resIf =
               compute_result<FLOW_WIDTH, FLOW_INT, SCCMP_WIDTH, SCCMP_INT, RMAPPX_WIDTH, RMAPPX_INT>(fracI, fracJ, i0,
                                                                                                      i1, i2, i3);
           // multiply the interpolation result by 2 as the image is scaled up by a factor of about 2 and the pixel
           // displacements are scaled up by a factor of 2 too.
           outStrm.write(resIf << 1);
       } // end L3
   } // end process()

   // Top-level scaler: streams an inRows x inCols input from inStrm and writes an
   // outRows x outCols output to outStrm.
   //
   //   scale_up_flag  0: bypass, outRows * outCols samples are copied from inStrm to outStrm
   //                  otherwise: rows are loaded with load_data() and emitted with process(),
   //                  alternating between two row buffers
   //   scale_comp     scale factor, converted to fixed point and used for both axes
   //   mul            only forwarded to process()
   //   USE_URAM       selects array_reshape plus URAM RESOURCE pragmas instead of
   //                  array_partition for the local buffers
   template <unsigned short MAXHEIGHT,
             unsigned short MAXWIDTH,
             int FLOW_WIDTH,
             int FLOW_INT,
             int SCCMP_WIDTH,
             int SCCMP_INT,
             int RMAPPX_WIDTH,
             int RMAPPX_INT,
             int SCALE_WIDTH,
             int SCALE_INT,
             bool USE_URAM>
   void scale_up(hls::stream<ap_fixed<FLOW_WIDTH, FLOW_INT> >& inStrm,
                 hls::stream<ap_fixed<FLOW_WIDTH, FLOW_INT> >& outStrm,
                 unsigned short int inRows,
                 unsigned short int inCols,
                 unsigned short int outRows,
                 unsigned short int outCols,
                 int mul,
                 const bool scale_up_flag,
                 float scale_comp) {
   // clang-format off
   #pragma HLS inline off
       // clang-format on
       // Buffer to store two rows of the input image. These rows are updated in the process function
       ap_fixed<FLOW_WIDTH, FLOW_INT> buffer[2][MAXWIDTH];
       if (USE_URAM) {
   // clang-format off
   #pragma HLS array_reshape variable=buffer dim=1 complete
           // clang-format on
       } else {
   // clang-format off
   #pragma HLS array_partition variable=buffer dim=1 complete
           // clang-format on
       }
       // buf0 and buf1 are used as ping pong buffers to read and process. While one buffer is used to read the input
       // image, the other buffer is copied into the buffer variable declared above
       ap_fixed<FLOW_WIDTH, FLOW_INT> buf0[MAXWIDTH], buf1[MAXWIDTH];
       if (USE_URAM) {
   // clang-format off
   #pragma HLS array_reshape variable=buf0 dim=1 complete
   #pragma HLS array_reshape variable=buf1 dim=1 complete
   #pragma HLS RESOURCE variable=buffer core=RAM_S2P_URAM
   #pragma HLS RESOURCE variable=buf0 core=RAM_S2P_URAM
   #pragma HLS RESOURCE variable=buf1 core=RAM_S2P_URAM
           // clang-format on
       }
       // Copy input scale into the following variable
       ap_ufixed<SCALE_WIDTH, SCALE_INT> scaleI = (ap_ufixed<SCALE_WIDTH, SCALE_INT>)scale_comp;
       ap_ufixed<SCALE_WIDTH, SCALE_INT> scaleJ = (ap_ufixed<SCALE_WIDTH, SCALE_INT>)scale_comp;
   #if DEBUG
       cout << "Scale Flag: " << scale_up_flag << "\n";
       cout << "Scale Comp: " << scale_comp << "\n";
       cout << "Scale: " << float(scaleJ) << " " << float(scaleI) << "\n";
   #endif
       // Variables to store the bilinear interpolation weights
       ap_fixed<SCCMP_WIDTH, SCCMP_INT> fracI0, fracI1;
       // flags to mark if the buffer is read
       bool flagLoaded0, flagLoaded1;
       // ping-pong operation flag
       bool flag = 0;

       // if the input scale-up flag is 0, i.e., if this module needs to be bypassed, the input stream is copied to the
       // output stream
       if (scale_up_flag == 0) {
           for (ap_uint<16> i = 0; i < outRows; i++) {
   // clang-format off
   #pragma HLS LOOP_TRIPCOUNT min=1 max=MAXHEIGHT
               // clang-format on
               for (ap_uint<16> j = 0; j < outCols; j++) {
   // clang-format off
   #pragma HLS LOOP_TRIPCOUNT min=1 max=MAXWIDTH
   #pragma HLS pipeline II=1
   #pragma HLS LOOP_FLATTEN OFF
                   // clang-format on
                   outStrm.write((ap_fixed<FLOW_WIDTH, FLOW_INT>)inStrm.read());
               }
           }
       }
       // Scale up enabled
       else {
           int prevIceil = -1;
           // load first row into the buf0 so that the output processing can have two rows at the same time.
           load_data<MAXHEIGHT, MAXWIDTH, FLOW_WIDTH, FLOW_INT, SCCMP_WIDTH, SCCMP_INT, SCALE_WIDTH, SCALE_INT>(
               inStrm, buf0, inRows, inCols, flagLoaded0, 0, scaleI, fracI0, prevIceil);
       // run the ping pong buffer for outRows -1 times
       L2:
           for (ap_uint<16> i = 0; i < outRows - 1; i++) {
   // clang-format off
   #pragma HLS LOOP_TRIPCOUNT min=1 max=MAXHEIGHT
               // clang-format on
               if (flag == 0) {
                   load_data<MAXHEIGHT, MAXWIDTH, FLOW_WIDTH, FLOW_INT, SCCMP_WIDTH, SCCMP_INT, SCALE_WIDTH, SCALE_INT>(
                       inStrm, buf1, inRows, inCols, flagLoaded1, i + 1, scaleI, fracI1, prevIceil);
                   process<MAXHEIGHT, MAXWIDTH, FLOW_WIDTH, FLOW_INT, SCCMP_WIDTH, SCCMP_INT, RMAPPX_WIDTH, RMAPPX_INT,
                           SCALE_WIDTH, SCALE_INT>(buf0, buffer, outRows, outCols, outStrm, flagLoaded0, i, scaleI,
                                                   scaleJ, fracI0, mul);
                   flag = 1;
               } else {
                   load_data<MAXHEIGHT, MAXWIDTH, FLOW_WIDTH, FLOW_INT, SCCMP_WIDTH, SCCMP_INT, SCALE_WIDTH, SCALE_INT>(
                       inStrm, buf0, inRows, inCols, flagLoaded0, i + 1, scaleI, fracI0, prevIceil);
                   process<MAXHEIGHT, MAXWIDTH, FLOW_WIDTH, FLOW_INT, SCCMP_WIDTH, SCCMP_INT, RMAPPX_WIDTH, RMAPPX_INT,
                           SCALE_WIDTH, SCALE_INT>(buf1, buffer, outRows, outCols, outStrm, flagLoaded1, i, scaleI,
                                                   scaleJ, fracI1, mul);
                   flag = 0;
               }
           } // end L2
           // emit the final output row from whichever buffer was loaded last
           if (flag == 0) {
               process<MAXHEIGHT, MAXWIDTH, FLOW_WIDTH, FLOW_INT, SCCMP_WIDTH, SCCMP_INT, RMAPPX_WIDTH, RMAPPX_INT,
                       SCALE_WIDTH, SCALE_INT>(
                   buf0, buffer, outRows, outCols, outStrm, flagLoaded0, outRows - 1, scaleI, scaleJ, fracI0, mul);
           } else {
               process<MAXHEIGHT, MAXWIDTH, FLOW_WIDTH, FLOW_INT, SCCMP_WIDTH, SCCMP_INT, RMAPPX_WIDTH, RMAPPX_INT,
                       SCALE_WIDTH, SCALE_INT>(
                   buf1, buffer, outRows, outCols, outStrm, flagLoaded1, outRows - 1, scaleI, scaleJ, fracI1, mul);
           }
       }
   } // end scale_up
   #endif