.. _program_listing_file__tmp_ws_src_vitis_common_include_video_xf_dense_npyr_optical_flow.hpp: Program Listing for File xf_dense_npyr_optical_flow.hpp ======================================================= |exhale_lsh| :ref:`Return to documentation for file ` (``/tmp/ws/src/vitis_common/include/video/xf_dense_npyr_optical_flow.hpp``) .. |exhale_lsh| unicode:: U+021B0 .. UPWARDS ARROW WITH TIP LEFTWARDS .. code-block:: cpp /* * Copyright 2019 Xilinx, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef __XF_DENSE_NONPYR_OPTICAL_FLOW__ #define __XF_DENSE_NONPYR_OPTICAL_FLOW__ #include #include #include #include #include "assert.h" #include "video/xf_dense_npyr_optical_flow_types.h" #include "common/xf_common.hpp" namespace xf { namespace cv { // enable to run c-sim //#define HLS_SIM template static void readMatRows16(xf::cv::Mat& src, hls::stream >& pixStream, int rows, int cols, int size) { unsigned int count = 0; for (int i = 0; i < size; i++) { // clang-format off #pragma HLS LOOP_TRIPCOUNT min=1 max=COLS*ROWS/NPC #pragma HLS PIPELINE // clang-format on unsigned short t; t = src.read(i); mywide_t tmpData; tmpData.data[0] = t & 0x00FF; tmpData.data[1] = t >> 8; pixStream.write(tmpData); } } template static void writeMatRowsRGBA16( hls::stream& pixStream0, hls::stream& pixStream1, ap_uint<64>* dst, int rows, int cols, int size) { for (int i = 0; i < size; ++i) { // clang-format off #pragma HLS LOOP_TRIPCOUNT min=1 max=ROWS*COLS/NPC #pragma HLS PIPELINE // clang-format on rgba_t d0 = pixStream0.read(); rgba_t d1 = pixStream1.read(); // unsigned int t1 = (unsigned int)d1.a << 24 | (unsigned int)d1.b << 16 | (unsigned int)d1.g << 8 | (unsigned // int)d1.r; unsigned int t0 = (unsigned int)d0.a << 24 | (unsigned int)d0.b << 16 | (unsigned int)d0.g << 8 | // (unsigned int)d0.r; // who is at MSB? t0 or t1 TODO // unsigned long long l = (unsigned long long) t1 << 32 | (unsigned long long) t0; unsigned long long l = (unsigned long long)d1.a << 56 | (unsigned long long)d1.b << 48 | (unsigned long long)d1.g << 40 | (unsigned long long)d1.r << 32 | (unsigned long long)d0.a << 24 | (unsigned long long)d0.b << 16 | (unsigned long long)d0.g << 8 | (unsigned long long)d0.r; *(dst + i) = l; } } // write rgba stream to external array dst. The "a" is just padding and is // unused template static void pack2Vectors(hls::stream& flow0, hls::stream& flow1, xf::cv::Mat& out_flow, int rows, int cols, int size) { for (int i = 0; i < size; i++) { // clang-format off #pragma HLS LOOP_TRIPCOUNT min=1 max=ROWS*COLS/NPC #pragma HLS PIPELINE // clang-format on float d0 = flow0.read(); float d1 = flow1.read(); ap_uint<32>* d0_int; d0_int = (ap_uint<32>*)&d0; ap_uint<32>* d1_int; d1_int = (ap_uint<32>*)&d1; // as 0th word will have d0_int and 1st word will have d1_int ap_uint<64> l = ((unsigned long long)(*d1_int) << 32) | (unsigned long long)(*d0_int); out_flow.write(i, l); } } // Compute sums for bottom-right and top-right pixel and update the column sums. // Use column sums to update the integrals. Implements O(1) sliding window. // // TODO: // 1. Dont need the entire column for img1Win and img2Win. Need only the kernel // 2. Full line buffer is not needed template static void computeSums16(hls::stream > img1Col[(WINDOW_SIZE + 1)], hls::stream > img2Col[(WINDOW_SIZE + 1)], hls::stream& ixix_out0, hls::stream& ixiy_out0, hls::stream& iyiy_out0, hls::stream& dix_out0, hls::stream& diy_out0, hls::stream& ixix_out1, hls::stream& ixiy_out1, hls::stream& iyiy_out1, hls::stream& dix_out1, hls::stream& diy_out1, int rows, int cols, int size) { pix_t img1Col0[(WINDOW_SIZE + 1)], img2Col0[(WINDOW_SIZE + 1)]; pix_t img1Col1[(WINDOW_SIZE + 1)], img2Col1[(WINDOW_SIZE + 1)]; // clang-format off #pragma HLS ARRAY_PARTITION variable=img1Col0 complete dim=0 #pragma HLS ARRAY_PARTITION variable=img2Col0 complete dim=0 #pragma HLS ARRAY_PARTITION variable=img1Col1 complete dim=0 #pragma HLS ARRAY_PARTITION variable=img2Col1 complete dim=0 // clang-format on static pix_t img1Win[2 * (WINDOW_SIZE + 1)], img2Win[1 * (WINDOW_SIZE + 1)]; // clang-format off #pragma HLS ARRAY_PARTITION variable=img1Win complete dim=0 #pragma HLS ARRAY_PARTITION variable=img2Win complete dim=0 // clang-format on // static pix_t img1Win1 [2 * (WINDOW_SIZE+1)], img2Win1 [1 * (WINDOW_SIZE+1)]; //#pragma HLS ARRAY_PARTITION variable=img1Win1 complete dim=0 //#pragma HLS ARRAY_PARTITION variable=img2Win1 complete dim=0 static int ixix = 0, ixiy = 0, iyiy = 0, dix = 0, diy = 0; // column sums: // need left-shift. Array-Part leads to FF with big Muxes. Try to do with // classic array and pointer. Need current and current-WINDOW_SIZE ptrs // For II=1 pipelining, need two read and 1 write ports. Simulating it with // two arrays that have their write ports tied together. // TODO need only MAX_WODTH/2. Have to adjust zIdx and nIdx as well static int csIxixO[COLS / 2], csIxiyO[COLS / 2], csIyiyO[COLS / 2], csDixO[COLS / 2], csDiyO[COLS / 2]; static int csIxixE[COLS / 2], csIxiyE[COLS / 2], csIyiyE[COLS / 2], csDixE[COLS / 2], csDiyE[COLS / 2]; static int cbIxixO[COLS / 2], cbIxiyO[COLS / 2], cbIyiyO[COLS / 2], cbDixO[COLS / 2], cbDiyO[COLS / 2]; static int cbIxixE[COLS / 2], cbIxiyE[COLS / 2], cbIyiyE[COLS / 2], cbDixE[COLS / 2], cbDiyE[COLS / 2]; int zIdx = -(WINDOW_SIZE / 2 - 1); int nIdx = zIdx + WINDOW_SIZE / 2 - 1; // clang-format off #pragma HLS ARRAY_MAP variable=csIxixO instance=csO vertical #pragma HLS ARRAY_MAP variable=csIxiyO instance=csO vertical #pragma HLS ARRAY_MAP variable=csIyiyO instance=csO vertical #pragma HLS ARRAY_MAP variable=csDixO instance=csO vertical #pragma HLS ARRAY_MAP variable=csDiyO instance=csO vertical // clang-format on // clang-format off #pragma HLS ARRAY_MAP variable=csIxixE instance=csE vertical #pragma HLS ARRAY_MAP variable=csIxiyE instance=csE vertical #pragma HLS ARRAY_MAP variable=csIyiyE instance=csE vertical #pragma HLS ARRAY_MAP variable=csDixE instance=csE vertical #pragma HLS ARRAY_MAP variable=csDiyE instance=csE vertical // clang-format on // clang-format off #pragma HLS ARRAY_MAP variable=cbIxixO instance=cb vertical #pragma HLS ARRAY_MAP variable=cbIxiyO instance=cb vertical #pragma HLS ARRAY_MAP variable=cbIyiyO instance=cb vertical #pragma HLS ARRAY_MAP variable=cbDixO instance=cb vertical #pragma HLS ARRAY_MAP variable=cbDiyO instance=cb vertical #pragma HLS ARRAY_MAP variable=cbIxixE instance=cb vertical #pragma HLS ARRAY_MAP variable=cbIxiyE instance=cb vertical #pragma HLS ARRAY_MAP variable=cbIyiyE instance=cb vertical #pragma HLS ARRAY_MAP variable=cbDixE instance=cb vertical #pragma HLS ARRAY_MAP variable=cbDiyE instance=cb vertical // clang-format on if (USE_URAM) { // clang-format off #pragma HLS RESOURCE variable=csIxixO core=RAM_2P_URAM #pragma HLS RESOURCE variable=csIxixE core=RAM_2P_URAM #pragma HLS RESOURCE variable=cbIxixO core=RAM_2P_URAM // clang-format on } else { // clang-format off #pragma HLS RESOURCE variable=csIxixO core=RAM_2P_BRAM #pragma HLS RESOURCE variable=csIxixE core=RAM_2P_BRAM #pragma HLS RESOURCE variable=cbIxixO core=RAM_2P_BRAM // clang-format on } // clang-format off #pragma HLS DEPENDENCE variable=csIxixO inter RAW false #pragma HLS DEPENDENCE variable=csIxiyO inter RAW false #pragma HLS DEPENDENCE variable=csIyiyO inter RAW false #pragma HLS DEPENDENCE variable=csDixO inter RAW false #pragma HLS DEPENDENCE variable=csDiyO inter RAW false // clang-format on // clang-format off #pragma HLS DEPENDENCE variable=csIxixE inter WAR false #pragma HLS DEPENDENCE variable=csIxiyE inter WAR false #pragma HLS DEPENDENCE variable=csIyiyE inter WAR false #pragma HLS DEPENDENCE variable=csDixE inter WAR false #pragma HLS DEPENDENCE variable=csDiyE inter WAR false // clang-format on // clang-format off #pragma HLS DEPENDENCE variable=cbIxixO inter RAW false #pragma HLS DEPENDENCE variable=cbIxiyO inter RAW false #pragma HLS DEPENDENCE variable=cbIyiyO inter RAW false #pragma HLS DEPENDENCE variable=cbDixO inter RAW false #pragma HLS DEPENDENCE variable=cbDiyO inter RAW false // clang-format on // clang-format off #pragma HLS DEPENDENCE variable=cbIxixE inter RAW false #pragma HLS DEPENDENCE variable=cbIxiyE inter RAW false #pragma HLS DEPENDENCE variable=cbIyiyE inter RAW false #pragma HLS DEPENDENCE variable=cbDixE inter RAW false #pragma HLS DEPENDENCE variable=cbDiyE inter RAW false // clang-format on int csIxixR0, csIxiyR0, csIyiyR0, csDixR0, csDiyR0; int csIxixR1, csIxiyR1, csIyiyR1, csDixR1, csDiyR1; for (int r = 0; r < rows; r++) { // clang-format off #pragma HLS LOOP_TRIPCOUNT min=1 max=ROWS // clang-format on for (int c = 0; c < cols / 2; c++) { // clang-format off #pragma HLS LOOP_TRIPCOUNT min=1 max=COLS/2 #pragma HLS PIPELINE // clang-format on int csIxixL0 = 0, csIxiyL0 = 0, csIyiyL0 = 0, csDixL0 = 0, csDiyL0 = 0; int csIxixL1 = 0, csIxiyL1 = 0, csIyiyL1 = 0, csDixL1 = 0, csDiyL1 = 0; for (int wr = 0; wr < (WINDOW_SIZE + 1); ++wr) { // clang-format off #pragma HLS LOOP_TRIPCOUNT min=1 max=WINDOW_SIZE+1 // clang-format on mywide_t tmp1 = img1Col[wr].read(); img1Col0[wr] = tmp1.data[0]; img1Col1[wr] = tmp1.data[1]; mywide_t tmp2 = img2Col[wr].read(); img2Col0[wr] = tmp2.data[0]; img2Col1[wr] = tmp2.data[1]; } // p(x+1,y) and p(x-1,y) int wrt = 1; int cIxTopR0 = (img1Col0[wrt] - img1Win[wrt * 2 + 2 - 2]) / 2; // p(x,y+1) and p(x,y-1) int cIyTopR0 = (img1Win[(wrt + 1) * 2 + 2 - 1] - img1Win[(wrt - 1) * 2 + 2 - 1]) / 2; // p1(x,y) and p2(x,y) int delTopR0 = img1Win[wrt * 2 + 2 - 1] - img2Win[wrt * 1 + 1 - 1]; int wrb = WINDOW_SIZE - 1; int cIxBotR0 = (img1Col0[wrb] - img1Win[wrb * 2 + 2 - 2]) / 2; int cIyBotR0 = (img1Win[(wrb + 1) * 2 + 2 - 1] - img1Win[(wrb - 1) * 2 + 2 - 1]) / 2; int delBotR0 = img1Win[wrb * 2 + 2 - 1] - img2Win[wrb * 1 + 1 - 1]; if (0 && r < WINDOW_SIZE) { cIxTopR0 = 0; cIyTopR0 = 0; delTopR0 = 0; } // p(x+1,y) and p(x-1,y) wrt = 1; int cIxTopR1 = (img1Col1[wrt] - img1Win[wrt * 2 + 2 - 1]) / 2; // p(x,y+1) and p(x,y-1) int cIyTopR1 = (img1Col0[wrt + 1] - img1Col0[wrt - 1]) / 2; // p1(x,y) and p2(x,y) int delTopR1 = (img1Col0[wrt] - img2Col0[wrt]); wrb = WINDOW_SIZE - 1; int cIxBotR1 = (img1Col1[wrb] - img1Win[wrb * 2 + 2 - 1]) / 2; int cIyBotR1 = (img1Col0[wrb + 1] - img1Col0[wrb - 1]) / 2; int delBotR1 = (img1Col0[wrb] - img2Col0[wrb]); csIxixR0 = cbIxixE[nIdx] + cIxBotR0 * cIxBotR0 - cIxTopR0 * cIxTopR0; csIxiyR0 = cbIxiyE[nIdx] + cIxBotR0 * cIyBotR0 - cIxTopR0 * cIyTopR0; csIyiyR0 = cbIyiyE[nIdx] + cIyBotR0 * cIyBotR0 - cIyTopR0 * cIyTopR0; csDixR0 = cbDixE[nIdx] + delBotR0 * cIxBotR0 - delTopR0 * cIxTopR0; csDiyR0 = cbDiyE[nIdx] + delBotR0 * cIyBotR0 - delTopR0 * cIyTopR0; csIxixR1 = cbIxixO[nIdx] + cIxBotR1 * cIxBotR1 - cIxTopR1 * cIxTopR1; csIxiyR1 = cbIxiyO[nIdx] + cIxBotR1 * cIyBotR1 - cIxTopR1 * cIyTopR1; csIyiyR1 = cbIyiyO[nIdx] + cIyBotR1 * cIyBotR1 - cIyTopR1 * cIyTopR1; csDixR1 = cbDixO[nIdx] + delBotR1 * cIxBotR1 - delTopR1 * cIxTopR1; csDiyR1 = cbDiyO[nIdx] + delBotR1 * cIyBotR1 - delTopR1 * cIyTopR1; csIxixE[nIdx] = csIxixR0; csIxiyE[nIdx] = csIxiyR0; csIyiyE[nIdx] = csIyiyR0; csDixE[nIdx] = csDixR0; csDiyE[nIdx] = csDiyR0; if (zIdx >= 0) { int const zIdxPrev = zIdx == 0 ? cols / 2 - 1 : zIdx - 1; csIxixL0 = csIxixO[zIdxPrev]; csIxiyL0 = csIxiyO[zIdxPrev]; csIyiyL0 = csIyiyO[zIdxPrev]; csDixL0 = csDixO[zIdxPrev]; csDiyL0 = csDiyO[zIdxPrev]; csIxixL1 = csIxixE[zIdx]; csIxiyL1 = csIxiyE[zIdx]; csIyiyL1 = csIyiyE[zIdx]; csDixL1 = csDixE[zIdx]; csDiyL1 = csDiyE[zIdx]; } int tmpixix0 = (csIxixR0 - csIxixL0); int tmpixix1 = (csIxixR0 - csIxixL0) + (csIxixR1 - csIxixL1); int tmpixiy0 = (csIxiyR0 - csIxiyL0); int tmpixiy1 = (csIxiyR0 - csIxiyL0) + (csIxiyR1 - csIxiyL1); int tmpiyiy0 = (csIyiyR0 - csIyiyL0); int tmpiyiy1 = (csIyiyR0 - csIyiyL0) + (csIyiyR1 - csIyiyL1); int tmpdix0 = (csDixR0 - csDixL0); int tmpdix1 = (csDixR0 - csDixL0) + (csDixR1 - csDixL1); int tmpdiy0 = (csDiyR0 - csDiyL0); int tmpdiy1 = (csDiyR0 - csDiyL0) + (csDiyR1 - csDiyL1); // ixix += (csIxixR0 - csIxixL0); // ixiy += (csIxiyR0 - csIxiyL0); // iyiy += (csIyiyR0 - csIyiyL0); // dix += (csDixR0 - csDixL0); // diy += (csDiyR0 - csDiyL0); // ixix_out0. write (ixix); // ixiy_out0. write (ixiy); // iyiy_out0. write (iyiy); // dix_out0. write (dix); // diy_out0. write (diy); ixix_out0.write(ixix + tmpixix0); ixiy_out0.write(ixiy + tmpixiy0); iyiy_out0.write(iyiy + tmpiyiy0); dix_out0.write(dix + tmpdix0); diy_out0.write(diy + tmpdiy0); // now compute the second pixel // ixix += (csIxixR1 - csIxixL1); // ixiy += (csIxiyR1 - csIxiyL1); // iyiy += (csIyiyR1 - csIyiyL1); // dix += (csDixR1 - csDixL1); // diy += (csDiyR1 - csDiyL1); ixix += tmpixix1; ixiy += tmpixiy1; iyiy += tmpiyiy1; dix += tmpdix1; diy += tmpdiy1; ixix_out1.write(ixix); ixiy_out1.write(ixiy); iyiy_out1.write(iyiy); dix_out1.write(dix); diy_out1.write(diy); for (int i = 0; i < (WINDOW_SIZE + 1); i++) { // clang-format off #pragma HLS LOOP_TRIPCOUNT min=1 max=WINDOW_SIZE+1 // clang-format on img1Win[i * 2] = img1Col0[i]; img1Win[i * 2 + 1] = img1Col1[i]; img2Win[i] = img2Col1[i]; } cbIxixE[nIdx] = csIxixR0; cbIxiyE[nIdx] = csIxiyR0; cbIyiyE[nIdx] = csIyiyR0; cbDixE[nIdx] = csDixR0; cbDiyE[nIdx] = csDiyR0; /* csIxixE [nIdx] = csIxixR0; csIxiyE [nIdx] = csIxiyR0; csIyiyE [nIdx] = csIyiyR0; csDixE [nIdx] = csDixR0; csDiyE [nIdx] = csDiyR0;*/ cbIxixO[nIdx] = csIxixR1; cbIxiyO[nIdx] = csIxiyR1; cbIyiyO[nIdx] = csIyiyR1; cbDixO[nIdx] = csDixR1; cbDiyO[nIdx] = csDiyR1; csIxixO[nIdx] = csIxixR1; csIxiyO[nIdx] = csIxiyR1; csIyiyO[nIdx] = csIyiyR1; csDixO[nIdx] = csDixR1; csDiyO[nIdx] = csDiyR1; zIdx++; if (zIdx == cols / 2) zIdx = 0; nIdx++; if (nIdx == cols / 2) nIdx = 0; } } // Cleanup. If kernel is called multiple times with different inputs, not // cleaning these vars would pollute the subsequent frames. // TODO zero in the line buffer instead, for r < WINDOW_SIZE for (int r = 0; r < (WINDOW_SIZE + 1); r++) { // clang-format off #pragma HLS LOOP_TRIPCOUNT min=1 max=WINDOW_SIZE+1 #pragma HLS UNROLL // clang-format on img1Win[r] = 0; img1Win[r + (WINDOW_SIZE + 1)] = 0; img2Win[r] = 0; img1Col0[r] = 0; img2Col0[r] = 0; img1Col1[r] = 0; img2Col1[r] = 0; } for (int r = 0; r < cols / 2; ++r) { // clang-format off #pragma HLS LOOP_TRIPCOUNT min=1 max=COLS #pragma HLS PIPELINE // clang-format on csIxixO[r] = 0; csIxiyO[r] = 0; csIyiyO[r] = 0; csDixO[r] = 0; csDiyO[r] = 0; cbIxixO[r] = 0; cbIxiyO[r] = 0; cbIyiyO[r] = 0; cbDixO[r] = 0; cbDiyO[r] = 0; csIxixE[r] = 0; csIxiyE[r] = 0; csIyiyE[r] = 0; csDixE[r] = 0; csDiyE[r] = 0; cbIxixE[r] = 0; cbIxiyE[r] = 0; cbIyiyE[r] = 0; cbDixE[r] = 0; cbDiyE[r] = 0; } ixix = 0; ixiy = 0; iyiy = 0; dix = 0; diy = 0; } // consume the integrals and compute flow vectors template static void computeFlow16(hls::stream& ixix, hls::stream& ixiy, hls::stream& iyiy, hls::stream& dix, hls::stream& diy, hls::stream& fx_out, hls::stream& fy_out, int rows, int cols, int size) { for (int r = 0; r < rows; r++) { // clang-format off #pragma HLS LOOP_TRIPCOUNT min=1 max=ROWS // clang-format on for (int c = 0; c < cols / 2; c++) { // clang-format off #pragma HLS LOOP_TRIPCOUNT min=1 max=COLS/2 #pragma HLS PIPELINE // clang-format on int ixix_ = ixix.read(); int ixiy_ = ixiy.read(); int iyiy_ = iyiy.read(); int dix_ = dix.read(); int diy_ = diy.read(); float fx_ = 0, fy_ = 0; // matrix inv float det = (float)ixix_ * iyiy_ - (float)ixiy_ * ixiy_; if (det <= 1.0f || r < (WINDOW_SIZE) || c < ((WINDOW_SIZE + 1) / 2)) { fx_ = 0.0; fy_ = 0.0; } else { // res est: (dsp,ff,lut) // fdiv (0,748,800), fmul (3,143,139), fadd (2,306,246), fsub (2,306,246) // sitofp (0,229,365), fcmp (0,66,72), imul(1,0,0) (in cs) // float detInv = 1.0/det; float i00 = (float)iyiy_ / det; float i01 = (float)(-ixiy_) / det; float i10 = (float)(-ixiy_) / det; float i11 = (float)ixix_ / det; fx_ = i00 * dix_ + i01 * diy_; fy_ = i10 * dix_ + i11 * diy_; } fx_out.write(fx_); fy_out.write(fy_); } } } // line buffer for both input images. Can be split to a fn that models a single // linebuffer template static void lbWrapper16(hls::stream >& f0Stream, hls::stream >& f1Stream, hls::stream > img1Col[(WINDOW_SIZE + 1)], hls::stream > img2Col[(WINDOW_SIZE + 1)], int rows, int cols, int size) { static pix_t lb1[(WINDOW_SIZE + 1)][COLS / XF_NPIXPERCYCLE(NPC)][XF_NPIXPERCYCLE(NPC)], lb2[(WINDOW_SIZE + 1)][COLS / XF_NPIXPERCYCLE(NPC)][XF_NPIXPERCYCLE(NPC)]; // clang-format off #pragma HLS ARRAY_MAP variable=lb1 instance=lbMap vertical #pragma HLS ARRAY_MAP variable=lb2 instance=lbMap vertical // clang-format on // clang-format off #pragma HLS ARRAY_RESHAPE variable=lb1 complete dim=1 #pragma HLS ARRAY_RESHAPE variable=lb2 complete dim=1 #pragma HLS ARRAY_RESHAPE variable=lb1 complete dim=3 #pragma HLS ARRAY_RESHAPE variable=lb2 complete dim=3 // clang-format on if (USE_URAM) { // clang-format off #pragma HLS RESOURCE variable=lb1 core=RAM_T2P_URAM #pragma HLS RESOURCE variable=lb2 core=RAM_T2P_URAM // clang-format on } for (int r = 0; r < rows; r++) { // clang-format off #pragma HLS LOOP_TRIPCOUNT min=1 max=ROWS #pragma HLS LOOP_FLATTEN OFF // clang-format on for (int c = 0; c < cols / 2; c++) { // clang-format off #pragma HLS LOOP_TRIPCOUNT min=1 max=COLS/2 #pragma HLS pipeline // clang-format on // shift up both linebuffers at col=c for (int i = 0; i < ((WINDOW_SIZE + 1) - 1); i++) { mywide_t lb; for (int k = 0; k < XF_NPIXPERCYCLE(NPC); k++) { lb.data[k] = lb1[i + 1][c][k]; lb1[i][c][k] = lb.data[k]; } img1Col[i].write(lb); for (int k = 0; k < XF_NPIXPERCYCLE(NPC); k++) { lb.data[k] = lb2[i + 1][c][k]; lb2[i][c][k] = lb.data[k]; } img2Col[i].write(lb); } // read in the new pixels at col=c and row=bottom_of_lb mywide_t pix0 = f0Stream.read(); img1Col[(WINDOW_SIZE + 1) - 1].write(pix0); mywide_t pix1 = f1Stream.read(); img2Col[(WINDOW_SIZE + 1) - 1].write(pix1); for (int k = 0; k < XF_NPIXPERCYCLE(NPC); k++) { lb1[(WINDOW_SIZE + 1) - 1][c][k] = pix0.data[k]; lb2[(WINDOW_SIZE + 1) - 1][c][k] = pix1.data[k]; } } } // cleanup for (int c = 0; c < cols / 2; c++) { // clang-format off #pragma HLS LOOP_TRIPCOUNT min=1 max=COLS/2 #pragma HLS PIPELINE // clang-format on for (int r = 0; r < (WINDOW_SIZE + 1); r++) { // clang-format off #pragma HLS LOOP_TRIPCOUNT min=1 max=WINDOW_SIZE+1 // clang-format on for (int k = 0; k < XF_NPIXPERCYCLE(NPC); k++) { lb1[r][c][k] = 0; lb2[r][c][k] = 0; } } } } // top level wrapper to avoid dataflow problems // void flowWrap (mywide_t frame0[NUM_WORDS], mywide_t frame1[NUM_WORDS], rgba2_t framef[NUM_WORDS]) template static void flowWrap16(xf::cv::Mat& frame0, xf::cv::Mat& frame1, xf::cv::Mat& flowx, xf::cv::Mat& flowy, int rows, int cols, int size) { //#pragma HLS data_pack variable=frame0 //#pragma HLS data_pack variable=frame1 //#pragma HLS data_pack variable=framef // clang-format off #pragma HLS DATAFLOW // clang-format on // ddr <-> kernel streams. Stream depths are probably too large and can be // trimmed hls::stream > f0Stream, f1Stream; // clang-format off #pragma HLS data_pack variable=f0Stream #pragma HLS data_pack variable=f1Stream #pragma HLS STREAM variable=f0Stream depth=16 #pragma HLS STREAM variable=f1Stream depth=16 // clang-format on // hls::stream ff0Stream, ff1Stream; // #pragma HLS data_pack variable=ff0Stream // #pragma HLS data_pack variable=ff1Stream // #pragma HLS STREAM variable=ff0Stream depth=16 // #pragma HLS STREAM variable=ff1Stream depth=16 hls::stream > img1Col[(WINDOW_SIZE + 1)], img2Col[(WINDOW_SIZE + 1)]; // clang-format off #pragma HLS data_pack variable=img1Col #pragma HLS data_pack variable=img2Col #pragma HLS STREAM variable=img1Col depth=16 #pragma HLS STREAM variable=img2Col depth=16 #pragma HLS ARRAY_PARTITION variable=img1Col complete dim=0 #pragma HLS ARRAY_PARTITION variable=img2Col complete dim=0 // clang-format on hls::stream ixix0, ixiy0, iyiy0, dix0, diy0; hls::stream fx0("fx0"), fy0("fy0"); // clang-format off #pragma HLS STREAM variable=ixix0 depth=16 #pragma HLS STREAM variable=ixiy0 depth=16 #pragma HLS STREAM variable=iyiy0 depth=16 #pragma HLS STREAM variable=dix0 depth=16 #pragma HLS STREAM variable=diy0 depth=16 #pragma HLS STREAM variable=fx0 depth=16 #pragma HLS STREAM variable=fy0 depth=16 // clang-format on hls::stream ixix1, ixiy1, iyiy1, dix1, diy1; hls::stream fx1("fx1"), fy1("fy1"); // clang-format off #pragma HLS STREAM variable=ixix1 depth=16 #pragma HLS STREAM variable=ixiy1 depth=16 #pragma HLS STREAM variable=iyiy1 depth=16 #pragma HLS STREAM variable=dix1 depth=16 #pragma HLS STREAM variable=diy1 depth=16 #pragma HLS STREAM variable=fx1 depth=16 #pragma HLS STREAM variable=fy1 depth=16 // clang-format on readMatRows16(frame0, f0Stream, rows, cols, size); readMatRows16(frame1, f1Stream, rows, cols, size); lbWrapper16(f0Stream, f1Stream, img1Col, img2Col, rows, cols, size); computeSums16(img1Col, img2Col, ixix0, ixiy0, iyiy0, dix0, diy0, ixix1, ixiy1, iyiy1, dix1, diy1, rows, cols, size); computeFlow16(ixix0, ixiy0, iyiy0, dix0, diy0, fx0, fy0, rows, cols, size); computeFlow16(ixix1, ixiy1, iyiy1, dix1, diy1, fx1, fy1, rows, cols, size); pack2Vectors(fx0, fx1, flowx, rows, cols, size); pack2Vectors(fy0, fy1, flowy, rows, cols, size); } //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- // external interface to the kernel. // frame0 - First input frame (grayscale 1 byte per pixel) // frame1 - Second input frame (grayscale 1 byte per pixel) // framef - Output frame with flows visualized. 3 bytes per pixel + 1 byte padding // void fpga_optflow ( mywide_t< XF_NPIXPERCYCLE(NPC) > *frame0, mywide_t< XF_NPIXPERCYCLE(NPC) > *frame1, rgba2_t // *framef); // ushort = 16bits, 8 bits per grayscale pix, so two pix // ulonglong = 64 bits, 32 bits per color pixel (rgba), so two color pix // void fpga_optflow (unsigned short *frame0, unsigned short *frame1, unsigned long long *framef) // void fpga_optflow (unsigned short frame0[NUM_WORDS], unsigned short frame1[NUM_WORDS], unsigned long long // framef[NUM_WORDS]) template static void fpga_optflow16(xf::cv::Mat& frame0, xf::cv::Mat& frame1, xf::cv::Mat& flowx, xf::cv::Mat& flowy, int rows, int cols, int size) { // clang-format off #pragma HLS inline off // clang-format on flowWrap16(frame0, frame1, flowx, flowy, rows, cols, size); return; } // read external array matB and stream. // Can be simplified to a single loop with II=1 TODO, hls::stream< mywide_t< XF_NPIXPERCYCLE(NPC) > > &frame1, // hls::stream &framef template static void readMatRows( xf::cv::Mat& matB, hls::stream& pixStream, int rows, int cols, int size) { const int WORD_SIZE = (NPC == XF_NPPC1) ? 1 : 2; for (int i = 0; i < size; i++) { // clang-format off #pragma HLS LOOP_TRIPCOUNT min=1 max=COLS*ROWS/NPC #pragma HLS PIPELINE // clang-format on mywide_t tmpData; tmpData.data[0] = matB.read(i); // for (int k = 0; k < WORD_SIZE; ++k) { // pixStream.write(tmpData.data[k]); // } pixStream.write(tmpData.data[0]); } } // write rgba stream to external array dst. The "a" is just padding and is // unused /*template static void writeMatRowsRGBA(hls::stream& pixStream, unsigned int* dst, int rows, int cols, int size) { for (int i = 0; i < size; i++) { // clang-format off #pragma HLS LOOP_TRIPCOUNT min=1 max=ROWS*COLS/NPC #pragma HLS PIPELINE // clang-format on rgba_t tmpData = pixStream.read(); *(dst + i) = (unsigned int)tmpData.a << 24 | (unsigned int)tmpData.b << 16 | (unsigned int)tmpData.g << 8 | (unsigned int)tmpData.r; } }*/ // Compute sums for bottom-right and top-right pixel and update the column sums. // Use column sums to update the integrals. Implements O(1) sliding window. // // TODO: // 1. Dont need the entire column for img1Win and img2Win. Need only the kernel // 2. Full line buffer is not needed template static void computeSums(hls::stream img1Col[(WINDOW_SIZE + 1)], hls::stream img2Col[(WINDOW_SIZE + 1)], hls::stream& ixix_out, hls::stream& ixiy_out, hls::stream& iyiy_out, hls::stream& dix_out, hls::stream& diy_out, int rows, int cols, int size) { pix_t img1Col_[(WINDOW_SIZE + 1)], img2Col_[(WINDOW_SIZE + 1)]; // clang-format off #pragma HLS ARRAY_PARTITION variable=img1Col_ complete dim=0 #pragma HLS ARRAY_PARTITION variable=img2Col_ complete dim=0 // clang-format on static pix_t img1Win[2 * (WINDOW_SIZE + 1)], img2Win[1 * (WINDOW_SIZE + 1)]; static int ixix = 0, ixiy = 0, iyiy = 0, dix = 0, diy = 0; // clang-format off #pragma HLS ARRAY_PARTITION variable=img1Win complete dim=0 #pragma HLS ARRAY_PARTITION variable=img2Win complete dim=0 // clang-format on // column sums: // need left-shift. Array-Part leads to FF with big Muxes. Try to do with // classic array and pointer. Need current and current-WINDOW_SIZE ptrs // For II=1 pipelining, need two read and 1 write ports. Simulating it with // two arrays that have their write ports tied together. static int csIxix[COLS], csIxiy[COLS], csIyiy[COLS], csDix[COLS], csDiy[COLS]; static int cbIxix[COLS], cbIxiy[COLS], cbIyiy[COLS], cbDix[COLS], cbDiy[COLS]; int zIdx = -(WINDOW_SIZE - 2); int nIdx = zIdx + WINDOW_SIZE - 2; // clang-format off #pragma HLS ARRAY_MAP variable=csIxix instance=cs vertical #pragma HLS ARRAY_MAP variable=csIxiy instance=cs vertical #pragma HLS ARRAY_MAP variable=csIyiy instance=cs vertical #pragma HLS ARRAY_MAP variable=csDix instance=cs vertical #pragma HLS ARRAY_MAP variable=csDiy instance=cs vertical // clang-format on if (USE_URAM) { // clang-format off #pragma HLS RESOURCE variable=csIxix core=RAM_2P_URAM // clang-format on } else { // clang-format off #pragma HLS RESOURCE variable=csIxix core=RAM_2P_BRAM // clang-format on } // clang-format off #pragma HLS DEPENDENCE variable=csIxix inter RAW false #pragma HLS DEPENDENCE variable=csIxiy inter RAW false #pragma HLS DEPENDENCE variable=csIyiy inter RAW false #pragma HLS DEPENDENCE variable=csDix inter RAW false #pragma HLS DEPENDENCE variable=csDiy inter RAW false // clang-format on // clang-format off #pragma HLS ARRAY_MAP variable=cbIxix instance=cb vertical #pragma HLS ARRAY_MAP variable=cbIxiy instance=cb vertical #pragma HLS ARRAY_MAP variable=cbIyiy instance=cb vertical #pragma HLS ARRAY_MAP variable=cbDix instance=cb vertical #pragma HLS ARRAY_MAP variable=cbDiy instance=cb vertical // clang-format on if (USE_URAM) { // clang-format off #pragma HLS RESOURCE variable=cbIxix core=RAM_2P_URAM // clang-format on } else { // clang-format off #pragma HLS RESOURCE variable=cbIxix core=RAM_2P_BRAM // clang-format on } // clang-format off #pragma HLS DEPENDENCE variable=cbIxix inter RAW false #pragma HLS DEPENDENCE variable=cbIxiy inter RAW false #pragma HLS DEPENDENCE variable=cbIyiy inter RAW false #pragma HLS DEPENDENCE variable=cbDix inter RAW false #pragma HLS DEPENDENCE variable=cbDiy inter RAW false // clang-format on int csIxixR, csIxiyR, csIyiyR, csDixR, csDiyR; for (int r = 0; r < rows; r++) { // clang-format off #pragma HLS LOOP_TRIPCOUNT min=1 max=ROWS // clang-format on for (int c = 0; c < cols; c++) { // clang-format off #pragma HLS LOOP_TRIPCOUNT min=1 max=COLS #pragma HLS PIPELINE II=1 // clang-format on int csIxixL = 0; int csIxiyL = 0; int csIyiyL = 0; int csDixL = 0; int csDiyL = 0; if (zIdx >= 0) { csIxixL = csIxix[zIdx]; csIxiyL = csIxiy[zIdx]; csIyiyL = csIyiy[zIdx]; csDixL = csDix[zIdx]; csDiyL = csDiy[zIdx]; } for (int wr = 0; wr < (WINDOW_SIZE + 1); ++wr) { // clang-format off #pragma HLS LOOP_TRIPCOUNT min=1 max=WINDOW_SIZE+1 // clang-format on img1Col_[wr] = img1Col[wr].read(); img2Col_[wr] = img2Col[wr].read(); } // p(x+1,y) and p(x-1,y) int wrt = 1; int cIxTopR = (img1Col_[wrt] - img1Win[wrt * 2 + 2 - 2]) / 2; // p(x,y+1) and p(x,y-1) int cIyTopR = (img1Win[(wrt + 1) * 2 + 2 - 1] - img1Win[(wrt - 1) * 2 + 2 - 1]) / 2; // p1(x,y) and p2(x,y) int delTopR = img1Win[wrt * 2 + 2 - 1] - img2Win[wrt * 1 + 1 - 1]; int wrb = WINDOW_SIZE - 1; int cIxBotR = (img1Col_[wrb] - img1Win[wrb * 2 + 2 - 2]) / 2; int cIyBotR = (img1Win[(wrb + 1) * 2 + 2 - 1] - img1Win[(wrb - 1) * 2 + 2 - 1]) / 2; int delBotR = img1Win[wrb * 2 + 2 - 1] - img2Win[wrb * 1 + 1 - 1]; if (0 && r < WINDOW_SIZE) { cIxTopR = 0; cIyTopR = 0; delTopR = 0; } csIxixR = cbIxix[nIdx] + cIxBotR * cIxBotR - cIxTopR * cIxTopR; csIxiyR = cbIxiy[nIdx] + cIxBotR * cIyBotR - cIxTopR * cIyTopR; csIyiyR = cbIyiy[nIdx] + cIyBotR * cIyBotR - cIyTopR * cIyTopR; csDixR = cbDix[nIdx] + delBotR * cIxBotR - delTopR * cIxTopR; csDiyR = cbDiy[nIdx] + delBotR * cIyBotR - delTopR * cIyTopR; ixix += (csIxixR - csIxixL); ixiy += (csIxiyR - csIxiyL); iyiy += (csIyiyR - csIyiyL); dix += (csDixR - csDixL); diy += (csDiyR - csDiyL); ixix_out.write(ixix); ixiy_out.write(ixiy); iyiy_out.write(iyiy); dix_out.write(dix); diy_out.write(diy); // we dont have the shifted pixel anymore to do overlay TODO // img1Delayed. write (0); for (int i = 0; i < (WINDOW_SIZE + 1); i++) { // clang-format off #pragma HLS LOOP_TRIPCOUNT min=1 max=WINDOW_SIZE+1 // clang-format on img1Win[i * 2] = img1Win[i * 2 + 1]; } for (int i = 0; i < (WINDOW_SIZE + 1); ++i) { // clang-format off #pragma HLS LOOP_TRIPCOUNT min=1 max=WINDOW_SIZE+1 // clang-format on img1Win[i * 2 + 1] = img1Col_[i]; img2Win[i] = img2Col_[i]; } cbIxix[nIdx] = csIxixR; cbIxiy[nIdx] = csIxiyR; cbIyiy[nIdx] = csIyiyR; cbDix[nIdx] = csDixR; cbDiy[nIdx] = csDiyR; csIxix[nIdx] = csIxixR; csIxiy[nIdx] = csIxiyR; csIyiy[nIdx] = csIyiyR; csDix[nIdx] = csDixR; csDiy[nIdx] = csDiyR; zIdx++; if (zIdx == cols) zIdx = 0; nIdx++; if (nIdx == cols) nIdx = 0; } } // Cleanup. If kernel is called multiple times with different inputs, not // cleaning these vars would pollute the subsequent frames. // TODO zero in the line buffer instead, for r < WINDOW_SIZE for (int r = 0; r < (WINDOW_SIZE + 1); r++) { // clang-format off #pragma HLS LOOP_TRIPCOUNT min=1 max=WINDOW_SIZE+1 #pragma HLS UNROLL // clang-format on img1Win[r] = 0; img1Win[r + (WINDOW_SIZE + 1)] = 0; img2Win[r] = 0; img1Col_[r] = 0; img2Col_[r] = 0; } for (int r = 0; r < cols; ++r) { // clang-format off #pragma HLS LOOP_TRIPCOUNT min=1 max=COLS #pragma HLS PIPELINE II=1 // clang-format on csIxix[r] = 0; csIxiy[r] = 0; csIyiy[r] = 0; csDix[r] = 0; csDiy[r] = 0; cbIxix[r] = 0; cbIxiy[r] = 0; cbIyiy[r] = 0; cbDix[r] = 0; cbDiy[r] = 0; } ixix = 0; ixiy = 0; iyiy = 0; dix = 0; diy = 0; } // consume the integrals and compute flow vectors template static void computeFlow(hls::stream& ixix, hls::stream& ixiy, hls::stream& iyiy, hls::stream& dix, hls::stream& diy, hls::stream& fx_out, hls::stream& fy_out, int rows, int cols, int size) { for (int r = 0; r < rows; r++) { // clang-format off #pragma HLS LOOP_TRIPCOUNT min=1 max=ROWS // clang-format on for (int c = 0; c < cols; c++) { // clang-format off #pragma HLS LOOP_TRIPCOUNT min=1 max=COLS #pragma HLS PIPELINE // clang-format on int ixix_ = ixix.read(); int ixiy_ = ixiy.read(); int iyiy_ = iyiy.read(); int dix_ = dix.read(); int diy_ = diy.read(); float fx_ = 0, fy_ = 0; // matrix inv float det = (float)ixix_ * iyiy_ - (float)ixiy_ * ixiy_; if (det <= 1.0f || r < (WINDOW_SIZE) || c < (WINDOW_SIZE + 1)) { fx_ = 0.0; fy_ = 0.0; } else { // res est: (dsp,ff,lut) // fdiv (0,748,800), fmul (3,143,139), fadd (2,306,246), fsub (2,306,246) // sitofp (0,229,365), fcmp (0,66,72), imul(1,0,0) (in cs) // float detInv = 1.0/det; float i00 = (float)iyiy_ / det; float i01 = (float)(-ixiy_) / det; float i10 = (float)(-ixiy_) / det; float i11 = (float)ixix_ / det; fx_ = i00 * dix_ + i01 * diy_; fy_ = i10 * dix_ + i11 * diy_; } fx_out.write(fx_); fy_out.write(fy_); } } } template static void writeOutput8(hls::stream& fx_in, hls::stream& fy_in, xf::cv::Mat& flowx, xf::cv::Mat& flowy, int size) { for (int r = 0; r < size; r++) { // clang-format off #pragma HLS LOOP_TRIPCOUNT min=1 max=ROWS*COLS #pragma HLS PIPELINE // clang-format on float fx_out = fx_in.read(); float fy_out = fy_in.read(); ap_uint<32>* fx_out_int; fx_out_int = (ap_uint<32>*)&fx_out; ap_uint<32>* fy_out_int; fy_out_int = (ap_uint<32>*)&fy_out; flowx.write(r, *fx_out_int); flowy.write(r, *fy_out_int); // ap_uint<32> a32 = flowx.read(r); // ap_uint<32> b32 = flowy.read(r); } } // line buffer for both input images. Can be split to a fn that models a single // linebuffer template static void lbWrapper(hls::stream& f0Stream, hls::stream& f1Stream, hls::stream img1Col[(WINDOW_SIZE + 1)], hls::stream img2Col[(WINDOW_SIZE + 1)], int rows, int cols, int size) { static pix_t lb1[(WINDOW_SIZE + 1)][COLS], lb2[(WINDOW_SIZE + 1)][COLS]; // clang-format off #pragma HLS ARRAY_MAP variable=lb1 instance=lbMap vertical #pragma HLS ARRAY_MAP variable=lb2 instance=lbMap vertical #pragma HLS ARRAY_RESHAPE variable=lb1 complete dim=1 #pragma HLS ARRAY_RESHAPE variable=lb2 complete dim=1 // clang-format on if (USE_URAM) { // clang-format off #pragma HLS RESOURCE variable=lb1 core=RAM_T2P_URAM #pragma HLS RESOURCE variable=lb2 core=RAM_T2P_URAM // clang-format on } loop_rows: for (int r = 0; r < rows; r++) { // clang-format off #pragma HLS LOOP_TRIPCOUNT min=1 max=ROWS #pragma HLS LOOP_FLATTEN OFF // clang-format on loop_cols: for (int c = 0; c < cols; c++) { // clang-format off #pragma HLS LOOP_TRIPCOUNT min=1 max=COLS #pragma HLS pipeline // clang-format on // shift up both linebuffers at col=c loop_ws: for (int i = 0; i < (WINDOW_SIZE + 1) - 1; i++) { lb1[i][c] = lb1[i + 1][c]; img1Col[i].write(lb1[i][c]); lb2[i][c] = lb2[i + 1][c]; img2Col[i].write(lb2[i][c]); } // read in the new pixels at col=c and row=bottom_of_lb pix_t pix0 = f0Stream.read(); lb1[(WINDOW_SIZE + 1) - 1][c] = pix0; img1Col[(WINDOW_SIZE + 1) - 1].write(pix0); pix_t pix1 = f1Stream.read(); lb2[(WINDOW_SIZE + 1) - 1][c] = pix1; img2Col[(WINDOW_SIZE + 1) - 1].write(pix1); } } // cleanup for (int c = 0; c < cols; c++) { // clang-format off #pragma HLS LOOP_TRIPCOUNT min=1 max=COLS #pragma HLS PIPELINE // clang-format on for (int r = 0; r < (WINDOW_SIZE + 1); r++) { // clang-format off #pragma HLS LOOP_TRIPCOUNT min=1 max=WINDOW_SIZE+1 // clang-format on lb1[r][c] = 0; lb2[r][c] = 0; } } } // top level wrapper to avoid dataflow problems template static void flowWrap(xf::cv::Mat& frame0, xf::cv::Mat& frame1, xf::cv::Mat& flowx, xf::cv::Mat& flowy, int rows, int cols, int size) { // clang-format off #pragma HLS inline off #pragma HLS DATAFLOW // clang-format on // ddr <-> kernel streams. Stream depths are probably too large and can be // trimmed hls::stream f0Stream("f0Stream"), f1Stream("f1Stream"); hls::stream f0Delayed("f0Delayed"); // clang-format off #pragma HLS STREAM variable=f0Stream depth=16 #pragma HLS STREAM variable=f1Stream depth=16 // clang-format on // #pragma HLS STREAM variable=f0Delayed depth=128 // hls::stream ffStream ("ffStream"); // #pragma HLS data_pack variable=ffStream // #pragma HLS STREAM variable=ffStream depth=16 hls::stream img1Col[(WINDOW_SIZE + 1)], img2Col[(WINDOW_SIZE + 1)]; hls::stream ixix, ixiy, iyiy, dix, diy; hls::stream fx, fy; // clang-format off #pragma HLS STREAM variable=ixix depth=16 #pragma HLS STREAM variable=ixiy depth=16 #pragma HLS STREAM variable=iyiy depth=16 #pragma HLS STREAM variable=dix depth=16 #pragma HLS STREAM variable=diy depth=16 // clang-format on // #pragma HLS STREAM variable=fx depth=16 // #pragma HLS STREAM variable=fy depth=16 // clang-format off #pragma HLS STREAM variable=img1Col depth=16 #pragma HLS STREAM variable=img2Col depth=16 // clang-format on // clang-format off #pragma HLS ARRAY_PARTITION variable=img1Col complete dim=0 #pragma HLS ARRAY_PARTITION variable=img2Col complete dim=0 // clang-format on readMatRows(frame0, f0Stream, rows, cols, size); readMatRows(frame1, f1Stream, rows, cols, size); lbWrapper(f0Stream, f1Stream, img1Col, img2Col, rows, cols, size); computeSums(img1Col, img2Col, ixix, ixiy, iyiy, dix, diy, rows, cols, size); computeFlow(ixix, ixiy, iyiy, dix, diy, fx, fy, rows, cols, size); writeOutput8(fx, fy, flowx, flowy, size); } //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- // external interface to the kernel. // frame0 - First input frame (grayscale 1 byte per pixel) // frame1 - Second input frame (grayscale 1 byte per pixel) // framef - Output frame with flows visualized. 3 bytes per pixel + 1 byte padding template static void fpga_optflow8(xf::cv::Mat& frame0, xf::cv::Mat& frame1, xf::cv::Mat& flowx, xf::cv::Mat& flowy, int rows, int cols, int size) { // clang-format off #pragma HLS inline off // clang-format on flowWrap(frame0, frame1, flowx, flowy, rows, cols, size); return; } template void DenseNonPyrLKOpticalFlow(xf::cv::Mat& frame0, xf::cv::Mat& frame1, xf::cv::Mat& flowx, xf::cv::Mat& flowy) { if (NPC == XF_NPPC1) { fpga_optflow8(frame0, frame1, flowx, flowy, frame0.rows, frame0.cols, frame0.size); } else { fpga_optflow16(frame0, frame1, flowx, flowy, frame0.rows, frame0.cols, frame0.size); } } } // namespace cv } // namespace xf #endif