Program Listing for File xf_dense_npyr_optical_flow.hpp

Return to documentation for file (/tmp/ws/src/vitis_common/include/video/xf_dense_npyr_optical_flow.hpp)

/*
 * Copyright 2019 Xilinx, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef __XF_DENSE_NONPYR_OPTICAL_FLOW__
#define __XF_DENSE_NONPYR_OPTICAL_FLOW__
#include <stdio.h>
#include <string.h>
#include <ap_int.h>
#include <hls_stream.h>
#include "assert.h"
#include "video/xf_dense_npyr_optical_flow_types.h"
#include "common/xf_common.hpp"

namespace xf {
namespace cv {

// enable to run c-sim
//#define HLS_SIM

// Unpack a source Mat into a stream of two-pixel words.
//
//  src        - input image; every src.read(i) is handled as one 16-bit word
//  pixStream  - receives one mywide_t per word: data[0] = low byte,
//               data[1] = high byte of that word
//  rows, cols - not used in this stage
//  size       - number of words read from src (loop trip count)
//
// NOTE(review): data[1] is written unconditionally, so this presumably expects
// XF_NPIXPERCYCLE(NPC) == 2 -- confirm against the callers.
template <int TYPE, int ROWS, int COLS, int NPC, int WINDOW_SIZE>
static void readMatRows16(xf::cv::Mat<TYPE, ROWS, COLS, NPC>& src,
                          hls::stream<mywide_t<XF_NPIXPERCYCLE(NPC)> >& pixStream,
                          int rows,
                          int cols,
                          int size) {
    for (int i = 0; i < size; i++) {
// clang-format off
        #pragma HLS LOOP_TRIPCOUNT min=1 max=COLS*ROWS/NPC
        #pragma HLS PIPELINE
        // clang-format on
        const unsigned short word = src.read(i);

        // split the 16-bit word into its two 8-bit lanes
        mywide_t<XF_NPIXPERCYCLE(NPC)> pixPair;
        pixPair.data[0] = word & 0x00FF;
        pixPair.data[1] = word >> 8;
        pixStream.write(pixPair);
    }
}

// Drain two rgba streams in lock-step and store one 64-bit word per pixel pair.
//
//  pixStream0 - pixel packed into bits [31:0]  of each output word
//  pixStream1 - pixel packed into bits [63:32] of each output word
//  dst        - output array; dst[i] is written for i in [0, size)
//  rows, cols - not used in this stage
//  size       - number of 64-bit words to produce
//
// Within each 32-bit half the byte order is a:b:g:r from MSB to LSB.
// The original author left an open TODO about which stream belongs in the
// upper half; the ordering below is unchanged from that version.
template <int ROWS, int COLS, int NPC, int WINDOW_SIZE>
static void writeMatRowsRGBA16(
    hls::stream<rgba_t>& pixStream0, hls::stream<rgba_t>& pixStream1, ap_uint<64>* dst, int rows, int cols, int size) {
    for (int idx = 0; idx < size; ++idx) {
// clang-format off
        #pragma HLS LOOP_TRIPCOUNT min=1 max=ROWS*COLS/NPC
        #pragma HLS PIPELINE
        // clang-format on
        const rgba_t px0 = pixStream0.read();
        const rgba_t px1 = pixStream1.read();

        // one pixel per 32-bit lane, widened first so the later shift cannot truncate
        const unsigned long long lowLane = (unsigned long long)px0.a << 24 | (unsigned long long)px0.b << 16 |
                                           (unsigned long long)px0.g << 8 | (unsigned long long)px0.r;
        const unsigned long long highLane = (unsigned long long)px1.a << 24 | (unsigned long long)px1.b << 16 |
                                            (unsigned long long)px1.g << 8 | (unsigned long long)px1.r;

        const unsigned long long packed = (highLane << 32) | lowLane;
        dst[idx] = packed;
    }
}

// Pack two float streams into one 64-bit word per iteration and write it to
// out_flow.
//
//  flow0      - float whose raw 32-bit pattern lands in bits [31:0]
//  flow1      - float whose raw 32-bit pattern lands in bits [63:32]
//  out_flow   - destination Mat; word i is written with out_flow.write(i, ...)
//  rows, cols - not used in this stage
//  size       - number of 64-bit words to write
//
// The floats are not converted numerically: their storage is reinterpreted as
// ap_uint<32> through a pointer cast.
template <int TYPE, int ROWS, int COLS, int NPC, int WINDOW_SIZE>
static void pack2Vectors(hls::stream<float>& flow0,
                         hls::stream<float>& flow1,
                         xf::cv::Mat<XF_32FC1, ROWS, COLS, NPC>& out_flow,
                         int rows,
                         int cols,
                         int size) {
    for (int idx = 0; idx < size; idx++) {
// clang-format off
        #pragma HLS LOOP_TRIPCOUNT min=1 max=ROWS*COLS/NPC
        #pragma HLS PIPELINE
        // clang-format on
        float lowVal = flow0.read();
        float highVal = flow1.read();

        // view the float storage as raw 32-bit words
        ap_uint<32>* lowBits = (ap_uint<32>*)&lowVal;
        ap_uint<32>* highBits = (ap_uint<32>*)&highVal;

        // word 0 (LSBs) carries flow0, word 1 (MSBs) carries flow1
        const unsigned long long lowWord = (unsigned long long)(*lowBits);
        const unsigned long long highWord = (unsigned long long)(*highBits);
        ap_uint<64> packed = (highWord << 32) | lowWord;
        out_flow.write(idx, packed);
    }
}

// Compute sums for bottom-right and top-right pixel and update the column sums.
// Use column sums to update the integrals. Implements O(1) sliding window.
//
// Two-lane variant: every iteration consumes one word (two pixels) from each
// row stream of both images and emits two sets of window sums.
//
//  img1Col / img2Col - (WINDOW_SIZE + 1) row streams per image; one word is read
//                      from every stream in each inner-loop iteration
//  *_out0            - window sums written before the second lane is committed
//  *_out1            - window sums written after both lanes are committed
//  rows, cols        - loop bounds; the inner loop runs cols / 2 iterations
//  size              - not used in this stage
//
// All static state (windows, column sums, running sums) is zeroed again at the
// end of the call.
//
// TODO:
// 1. Don't need the entire column for img1Win and img2Win. Need only the kernel
// 2. Full line buffer is not needed
template <int ROWS, int COLS, int NPC, int WINDOW_SIZE, bool USE_URAM>
static void computeSums16(hls::stream<mywide_t<XF_NPIXPERCYCLE(NPC)> > img1Col[(WINDOW_SIZE + 1)],
                          hls::stream<mywide_t<XF_NPIXPERCYCLE(NPC)> > img2Col[(WINDOW_SIZE + 1)],
                          hls::stream<int>& ixix_out0,
                          hls::stream<int>& ixiy_out0,
                          hls::stream<int>& iyiy_out0,
                          hls::stream<int>& dix_out0,
                          hls::stream<int>& diy_out0,

                          hls::stream<int>& ixix_out1,
                          hls::stream<int>& ixiy_out1,
                          hls::stream<int>& iyiy_out1,
                          hls::stream<int>& dix_out1,
                          hls::stream<int>& diy_out1,
                          int rows,
                          int cols,
                          int size)

{
    // Current column, split by lane: *Col0 holds data[0], *Col1 holds data[1]
    // of the word read from each row stream.
    pix_t img1Col0[(WINDOW_SIZE + 1)], img2Col0[(WINDOW_SIZE + 1)];
    pix_t img1Col1[(WINDOW_SIZE + 1)], img2Col1[(WINDOW_SIZE + 1)];
// clang-format off
    #pragma HLS ARRAY_PARTITION variable=img1Col0 complete dim=0
    #pragma HLS ARRAY_PARTITION variable=img2Col0 complete dim=0
    #pragma HLS ARRAY_PARTITION variable=img1Col1 complete dim=0
    #pragma HLS ARRAY_PARTITION variable=img2Col1 complete dim=0
    // clang-format on

    // Column seen in the previous iteration. img1Win is interleaved per row:
    // [2 * i] = data[0], [2 * i + 1] = data[1]. img2Win keeps only data[1] of
    // image 2 (see the copy loop at the end of the pipelined body).
    static pix_t img1Win[2 * (WINDOW_SIZE + 1)], img2Win[1 * (WINDOW_SIZE + 1)];
// clang-format off
    #pragma HLS ARRAY_PARTITION variable=img1Win complete dim=0
    #pragma HLS ARRAY_PARTITION variable=img2Win complete dim=0
    // clang-format on
    // static pix_t img1Win1 [2 * (WINDOW_SIZE+1)], img2Win1 [1 * (WINDOW_SIZE+1)];
    //#pragma HLS ARRAY_PARTITION variable=img1Win1 complete dim=0
    //#pragma HLS ARRAY_PARTITION variable=img2Win1 complete dim=0

    // Running window sums, carried from one iteration to the next.
    static int ixix = 0, ixiy = 0, iyiy = 0, dix = 0, diy = 0;

    // column sums:
    // need left-shift. Array-Part leads to FF with big Muxes. Try to do with
    // classic array and pointer. Need current and current-WINDOW_SIZE ptrs
    // For II=1 pipelining, need two read and 1 write ports. Simulating it with
    // two arrays that have their write ports tied together.
    // TODO need only MAX_WIDTH/2. Have to adjust zIdx and nIdx as well
    //
    // Suffix E: sums of the "R0" lane, suffix O: sums of the "R1" lane.
    // cs* are read back at zIdx (sums leaving the window), cb* are read back at
    // nIdx (value stored at the same slot cols / 2 iterations earlier).
    static int csIxixO[COLS / 2], csIxiyO[COLS / 2], csIyiyO[COLS / 2], csDixO[COLS / 2], csDiyO[COLS / 2];
    static int csIxixE[COLS / 2], csIxiyE[COLS / 2], csIyiyE[COLS / 2], csDixE[COLS / 2], csDiyE[COLS / 2];

    static int cbIxixO[COLS / 2], cbIxiyO[COLS / 2], cbIyiyO[COLS / 2], cbDixO[COLS / 2], cbDiyO[COLS / 2];
    static int cbIxixE[COLS / 2], cbIxiyE[COLS / 2], cbIyiyE[COLS / 2], cbDixE[COLS / 2], cbDiyE[COLS / 2];

    // nIdx starts at 0 and is the slot written in the current iteration.
    // zIdx trails nIdx by (WINDOW_SIZE / 2 - 1) slots; while it is still
    // negative no "leaving" column sum is subtracted.
    int zIdx = -(WINDOW_SIZE / 2 - 1);
    int nIdx = zIdx + WINDOW_SIZE / 2 - 1;

    // Map the five odd-lane column-sum arrays into one instance (csO), the five
    // even-lane ones into another (csE), and all ten cb arrays into a third (cb).
// clang-format off
    #pragma HLS ARRAY_MAP variable=csIxixO instance=csO vertical
    #pragma HLS ARRAY_MAP variable=csIxiyO instance=csO vertical
    #pragma HLS ARRAY_MAP variable=csIyiyO instance=csO vertical
    #pragma HLS ARRAY_MAP variable=csDixO  instance=csO vertical
    #pragma HLS ARRAY_MAP variable=csDiyO  instance=csO vertical
// clang-format on

// clang-format off
    #pragma HLS ARRAY_MAP variable=csIxixE instance=csE vertical
    #pragma HLS ARRAY_MAP variable=csIxiyE instance=csE vertical
    #pragma HLS ARRAY_MAP variable=csIyiyE instance=csE vertical
    #pragma HLS ARRAY_MAP variable=csDixE  instance=csE vertical
    #pragma HLS ARRAY_MAP variable=csDiyE  instance=csE vertical
// clang-format on

// clang-format off
    #pragma HLS ARRAY_MAP variable=cbIxixO instance=cb vertical
    #pragma HLS ARRAY_MAP variable=cbIxiyO instance=cb vertical
    #pragma HLS ARRAY_MAP variable=cbIyiyO instance=cb vertical
    #pragma HLS ARRAY_MAP variable=cbDixO  instance=cb vertical
    #pragma HLS ARRAY_MAP variable=cbDiyO  instance=cb vertical
    #pragma HLS ARRAY_MAP variable=cbIxixE instance=cb vertical
    #pragma HLS ARRAY_MAP variable=cbIxiyE instance=cb vertical
    #pragma HLS ARRAY_MAP variable=cbIyiyE instance=cb vertical
    #pragma HLS ARRAY_MAP variable=cbDixE  instance=cb vertical
    #pragma HLS ARRAY_MAP variable=cbDiyE  instance=cb vertical
    // clang-format on

    // Memory core for the mapped arrays: URAM when USE_URAM is set, BRAM otherwise.
    if (USE_URAM) {
// clang-format off
        #pragma HLS RESOURCE variable=csIxixO core=RAM_2P_URAM
        #pragma HLS RESOURCE variable=csIxixE core=RAM_2P_URAM
        #pragma HLS RESOURCE variable=cbIxixO core=RAM_2P_URAM
        // clang-format on
    } else {
// clang-format off
        #pragma HLS RESOURCE variable=csIxixO core=RAM_2P_BRAM
        #pragma HLS RESOURCE variable=csIxixE core=RAM_2P_BRAM
        #pragma HLS RESOURCE variable=cbIxixO core=RAM_2P_BRAM
        // clang-format on
    }

    // Declare the inter-iteration dependences on the column-sum arrays as false.
    // NOTE(review): the csE group is marked WAR while every other group is
    // marked RAW -- confirm that this asymmetry is intentional.
// clang-format off
    #pragma HLS DEPENDENCE variable=csIxixO inter RAW false
    #pragma HLS DEPENDENCE variable=csIxiyO inter RAW false
    #pragma HLS DEPENDENCE variable=csIyiyO inter RAW false
    #pragma HLS DEPENDENCE variable=csDixO  inter RAW false
    #pragma HLS DEPENDENCE variable=csDiyO  inter RAW false
// clang-format on

// clang-format off
    #pragma HLS DEPENDENCE variable=csIxixE inter WAR false
    #pragma HLS DEPENDENCE variable=csIxiyE inter WAR false
    #pragma HLS DEPENDENCE variable=csIyiyE inter WAR false
    #pragma HLS DEPENDENCE variable=csDixE  inter WAR false
    #pragma HLS DEPENDENCE variable=csDiyE  inter WAR false
// clang-format on

// clang-format off
    #pragma HLS DEPENDENCE variable=cbIxixO inter RAW false
    #pragma HLS DEPENDENCE variable=cbIxiyO inter RAW false
    #pragma HLS DEPENDENCE variable=cbIyiyO inter RAW false
    #pragma HLS DEPENDENCE variable=cbDixO  inter RAW false
    #pragma HLS DEPENDENCE variable=cbDiyO  inter RAW false
// clang-format on

// clang-format off
    #pragma HLS DEPENDENCE variable=cbIxixE inter RAW false
    #pragma HLS DEPENDENCE variable=cbIxiyE inter RAW false
    #pragma HLS DEPENDENCE variable=cbIyiyE inter RAW false
    #pragma HLS DEPENDENCE variable=cbDixE  inter RAW false
    #pragma HLS DEPENDENCE variable=cbDiyE  inter RAW false
    // clang-format on

    // Column sums entering the window for lane R0 and lane R1.
    int csIxixR0, csIxiyR0, csIyiyR0, csDixR0, csDiyR0;
    int csIxixR1, csIxiyR1, csIyiyR1, csDixR1, csDiyR1;

    for (int r = 0; r < rows; r++) {
// clang-format off
        #pragma HLS LOOP_TRIPCOUNT min=1 max=ROWS
        // clang-format on
        for (int c = 0; c < cols / 2; c++) {
// clang-format off
            #pragma HLS LOOP_TRIPCOUNT min=1 max=COLS/2
            #pragma HLS PIPELINE
            // clang-format on

            // Column sums leaving the window; they stay 0 until zIdx >= 0.
            int csIxixL0 = 0, csIxiyL0 = 0, csIyiyL0 = 0, csDixL0 = 0, csDiyL0 = 0;
            int csIxixL1 = 0, csIxiyL1 = 0, csIyiyL1 = 0, csDixL1 = 0, csDiyL1 = 0;

            // Read one word from every row stream of both images and split it
            // into its two lanes.
            for (int wr = 0; wr < (WINDOW_SIZE + 1); ++wr) {
// clang-format off
                #pragma HLS LOOP_TRIPCOUNT min=1 max=WINDOW_SIZE+1
                // clang-format on
                mywide_t<XF_NPIXPERCYCLE(NPC)> tmp1 = img1Col[wr].read();
                img1Col0[wr] = tmp1.data[0];
                img1Col1[wr] = tmp1.data[1];

                mywide_t<XF_NPIXPERCYCLE(NPC)> tmp2 = img2Col[wr].read();
                img2Col0[wr] = tmp2.data[0];
                img2Col1[wr] = tmp2.data[1];
            }

            // Lane R0: built from the previous word (img1Win / img2Win) plus
            // data[0] of the current word. wrt is the row whose contribution is
            // subtracted from the column sum, wrb the row whose contribution
            // is added.
            // p(x+1,y) and p(x-1,y)
            int wrt = 1;
            int cIxTopR0 = (img1Col0[wrt] - img1Win[wrt * 2 + 2 - 2]) / 2;
            // p(x,y+1) and p(x,y-1)
            int cIyTopR0 = (img1Win[(wrt + 1) * 2 + 2 - 1] - img1Win[(wrt - 1) * 2 + 2 - 1]) / 2;
            // p1(x,y) and p2(x,y)
            int delTopR0 = img1Win[wrt * 2 + 2 - 1] - img2Win[wrt * 1 + 1 - 1];

            int wrb = WINDOW_SIZE - 1;
            int cIxBotR0 = (img1Col0[wrb] - img1Win[wrb * 2 + 2 - 2]) / 2;
            int cIyBotR0 = (img1Win[(wrb + 1) * 2 + 2 - 1] - img1Win[(wrb - 1) * 2 + 2 - 1]) / 2;
            int delBotR0 = img1Win[wrb * 2 + 2 - 1] - img2Win[wrb * 1 + 1 - 1];
            // Disabled: the leading 0 makes this condition constant false. If
            // enabled it would zero the top-row terms for the first WINDOW_SIZE rows.
            if (0 && r < WINDOW_SIZE) {
                cIxTopR0 = 0;
                cIyTopR0 = 0;
                delTopR0 = 0;
            }

            // Lane R1: built from the current word plus data[1] of the previous word.
            // p(x+1,y) and p(x-1,y)
            wrt = 1;
            int cIxTopR1 = (img1Col1[wrt] - img1Win[wrt * 2 + 2 - 1]) / 2;
            // p(x,y+1) and p(x,y-1)
            int cIyTopR1 = (img1Col0[wrt + 1] - img1Col0[wrt - 1]) / 2;
            // p1(x,y) and p2(x,y)
            int delTopR1 = (img1Col0[wrt] - img2Col0[wrt]);

            wrb = WINDOW_SIZE - 1;
            int cIxBotR1 = (img1Col1[wrb] - img1Win[wrb * 2 + 2 - 1]) / 2;
            int cIyBotR1 = (img1Col0[wrb + 1] - img1Col0[wrb - 1]) / 2;
            int delBotR1 = (img1Col0[wrb] - img2Col0[wrb]);

            // New column sum = value stored at this slot one row earlier
            //                  + bottom-row product - top-row product.
            csIxixR0 = cbIxixE[nIdx] + cIxBotR0 * cIxBotR0 - cIxTopR0 * cIxTopR0;
            csIxiyR0 = cbIxiyE[nIdx] + cIxBotR0 * cIyBotR0 - cIxTopR0 * cIyTopR0;
            csIyiyR0 = cbIyiyE[nIdx] + cIyBotR0 * cIyBotR0 - cIyTopR0 * cIyTopR0;
            csDixR0 = cbDixE[nIdx] + delBotR0 * cIxBotR0 - delTopR0 * cIxTopR0;
            csDiyR0 = cbDiyE[nIdx] + delBotR0 * cIyBotR0 - delTopR0 * cIyTopR0;

            csIxixR1 = cbIxixO[nIdx] + cIxBotR1 * cIxBotR1 - cIxTopR1 * cIxTopR1;
            csIxiyR1 = cbIxiyO[nIdx] + cIxBotR1 * cIyBotR1 - cIxTopR1 * cIyTopR1;
            csIyiyR1 = cbIyiyO[nIdx] + cIyBotR1 * cIyBotR1 - cIyTopR1 * cIyTopR1;
            csDixR1 = cbDixO[nIdx] + delBotR1 * cIxBotR1 - delTopR1 * cIxTopR1;
            csDiyR1 = cbDiyO[nIdx] + delBotR1 * cIyBotR1 - delTopR1 * cIyTopR1;

            // The even-lane sums are stored before the zIdx read below.
            csIxixE[nIdx] = csIxixR0;
            csIxiyE[nIdx] = csIxiyR0;
            csIyiyE[nIdx] = csIyiyR0;
            csDixE[nIdx] = csDixR0;
            csDiyE[nIdx] = csDiyR0;

            // Fetch the column sums that drop out of the window. Lane 0 takes
            // the odd-lane sums of the slot before zIdx (wrapping to the last
            // slot when zIdx == 0); lane 1 takes the even-lane sums at zIdx.
            if (zIdx >= 0) {
                int const zIdxPrev = zIdx == 0 ? cols / 2 - 1 : zIdx - 1;
                csIxixL0 = csIxixO[zIdxPrev];
                csIxiyL0 = csIxiyO[zIdxPrev];
                csIyiyL0 = csIyiyO[zIdxPrev];
                csDixL0 = csDixO[zIdxPrev];
                csDiyL0 = csDiyO[zIdxPrev];

                csIxixL1 = csIxixE[zIdx];
                csIxiyL1 = csIxiyE[zIdx];
                csIyiyL1 = csIyiyE[zIdx];
                csDixL1 = csDixE[zIdx];
                csDiyL1 = csDiyE[zIdx];
            }

            // tmp*0: change contributed by lane 0 only.
            // tmp*1: change contributed by both lanes together.
            int tmpixix0 = (csIxixR0 - csIxixL0);
            int tmpixix1 = (csIxixR0 - csIxixL0) + (csIxixR1 - csIxixL1);
            int tmpixiy0 = (csIxiyR0 - csIxiyL0);
            int tmpixiy1 = (csIxiyR0 - csIxiyL0) + (csIxiyR1 - csIxiyL1);
            int tmpiyiy0 = (csIyiyR0 - csIyiyL0);
            int tmpiyiy1 = (csIyiyR0 - csIyiyL0) + (csIyiyR1 - csIyiyL1);
            int tmpdix0 = (csDixR0 - csDixL0);
            int tmpdix1 = (csDixR0 - csDixL0) + (csDixR1 - csDixL1);
            int tmpdiy0 = (csDiyR0 - csDiyL0);
            int tmpdiy1 = (csDiyR0 - csDiyL0) + (csDiyR1 - csDiyL1);

            // ixix += (csIxixR0 - csIxixL0);
            // ixiy += (csIxiyR0 - csIxiyL0);
            // iyiy += (csIyiyR0 - csIyiyL0);
            // dix += (csDixR0 - csDixL0);
            // diy += (csDiyR0 - csDiyL0);

            // ixix_out0. write (ixix);
            // ixiy_out0. write (ixiy);
            // iyiy_out0. write (iyiy);
            // dix_out0. write (dix);
            // diy_out0. write (diy);
            // First output: running sums plus the lane-0 change, without
            // updating the running sums yet.
            ixix_out0.write(ixix + tmpixix0);
            ixiy_out0.write(ixiy + tmpixiy0);
            iyiy_out0.write(iyiy + tmpiyiy0);
            dix_out0.write(dix + tmpdix0);
            diy_out0.write(diy + tmpdiy0);

            // now compute the second pixel
            // ixix += (csIxixR1 - csIxixL1);
            // ixiy += (csIxiyR1 - csIxiyL1);
            // iyiy += (csIyiyR1 - csIyiyL1);
            // dix += (csDixR1 - csDixL1);
            // diy += (csDiyR1 - csDiyL1);
            // Commit both lanes to the running sums in a single update.
            ixix += tmpixix1;
            ixiy += tmpixiy1;
            iyiy += tmpiyiy1;
            dix += tmpdix1;
            diy += tmpdiy1;

            ixix_out1.write(ixix);
            ixiy_out1.write(ixiy);
            iyiy_out1.write(iyiy);
            dix_out1.write(dix);
            diy_out1.write(diy);

            // Remember the current column as "previous" for the next iteration.
            for (int i = 0; i < (WINDOW_SIZE + 1); i++) {
// clang-format off
                #pragma HLS LOOP_TRIPCOUNT min=1 max=WINDOW_SIZE+1
                // clang-format on
                img1Win[i * 2] = img1Col0[i];
                img1Win[i * 2 + 1] = img1Col1[i];
                img2Win[i] = img2Col1[i];
            }

            // Store the new column sums at slot nIdx.
            cbIxixE[nIdx] = csIxixR0;
            cbIxiyE[nIdx] = csIxiyR0;
            cbIyiyE[nIdx] = csIyiyR0;
            cbDixE[nIdx] = csDixR0;
            cbDiyE[nIdx] = csDiyR0;

            /* csIxixE [nIdx] = csIxixR0;
             csIxiyE [nIdx] = csIxiyR0;
             csIyiyE [nIdx] = csIyiyR0;
             csDixE  [nIdx] = csDixR0;
             csDiyE  [nIdx] = csDiyR0;*/

            cbIxixO[nIdx] = csIxixR1;
            cbIxiyO[nIdx] = csIxiyR1;
            cbIyiyO[nIdx] = csIyiyR1;
            cbDixO[nIdx] = csDixR1;
            cbDiyO[nIdx] = csDiyR1;

            csIxixO[nIdx] = csIxixR1;
            csIxiyO[nIdx] = csIxiyR1;
            csIyiyO[nIdx] = csIyiyR1;
            csDixO[nIdx] = csDixR1;
            csDiyO[nIdx] = csDiyR1;

            // Advance both slot indices; each wraps to 0 after cols / 2 slots.
            zIdx++;
            if (zIdx == cols / 2) zIdx = 0;
            nIdx++;
            if (nIdx == cols / 2) nIdx = 0;
        }
    }

    // Cleanup. If kernel is called multiple times with different inputs, not
    // cleaning these vars would pollute the subsequent frames.
    // TODO zero in the line buffer instead, for r < WINDOW_SIZE
    for (int r = 0; r < (WINDOW_SIZE + 1); r++) {
// clang-format off
        #pragma HLS LOOP_TRIPCOUNT min=1 max=WINDOW_SIZE+1
        #pragma HLS UNROLL
        // clang-format on
        // img1Win has 2 * (WINDOW_SIZE + 1) entries, so clear both halves
        img1Win[r] = 0;
        img1Win[r + (WINDOW_SIZE + 1)] = 0;
        img2Win[r] = 0;
        img1Col0[r] = 0;
        img2Col0[r] = 0;
        img1Col1[r] = 0;
        img2Col1[r] = 0;
    }
    // NOTE(review): the tripcount hint below says max=COLS although the loop
    // runs cols / 2 iterations.
    for (int r = 0; r < cols / 2; ++r) {
// clang-format off
        #pragma HLS LOOP_TRIPCOUNT min=1 max=COLS
        #pragma HLS PIPELINE
        // clang-format on
        csIxixO[r] = 0;
        csIxiyO[r] = 0;
        csIyiyO[r] = 0;
        csDixO[r] = 0;
        csDiyO[r] = 0;
        cbIxixO[r] = 0;
        cbIxiyO[r] = 0;
        cbIyiyO[r] = 0;
        cbDixO[r] = 0;
        cbDiyO[r] = 0;

        csIxixE[r] = 0;
        csIxiyE[r] = 0;
        csIyiyE[r] = 0;
        csDixE[r] = 0;
        csDiyE[r] = 0;
        cbIxixE[r] = 0;
        cbIxiyE[r] = 0;
        cbIyiyE[r] = 0;
        cbDixE[r] = 0;
        cbDiyE[r] = 0;
    }
    // reset the running window sums as well
    ixix = 0;
    ixiy = 0;
    iyiy = 0;
    dix = 0;
    diy = 0;
}

// Consume one set of window sums per iteration and emit one flow vector.
//
//  ixix, ixiy, iyiy - entries of the 2x2 matrix [[ixix, ixiy], [ixiy, iyiy]]
//  dix, diy         - right-hand-side sums
//  fx_out, fy_out   - flow components; one float each per iteration
//  rows, cols       - loop bounds; the inner loop runs cols / 2 iterations
//  size             - not used in this stage
//
// A zero vector is written when the determinant is <= 1, or while the position
// is still inside the first WINDOW_SIZE rows / first (WINDOW_SIZE + 1) / 2
// inner-loop columns.
template <int ROWS, int COLS, int NPC, int WINDOW_SIZE>
static void computeFlow16(hls::stream<int>& ixix,
                          hls::stream<int>& ixiy,
                          hls::stream<int>& iyiy,
                          hls::stream<int>& dix,
                          hls::stream<int>& diy,
                          hls::stream<float>& fx_out,
                          hls::stream<float>& fy_out,
                          int rows,
                          int cols,
                          int size) {
    const int firstValidCol = (WINDOW_SIZE + 1) / 2;
    const int wordsPerRow = cols / 2;

    for (int row = 0; row < rows; ++row) {
// clang-format off
        #pragma HLS LOOP_TRIPCOUNT min=1 max=ROWS
        // clang-format on
        for (int col = 0; col < wordsPerRow; ++col) {
// clang-format off
            #pragma HLS LOOP_TRIPCOUNT min=1 max=COLS/2
            #pragma HLS PIPELINE
            // clang-format on
            // every stream is read exactly once per iteration, valid or not
            const int sumXX = ixix.read();
            const int sumXY = ixiy.read();
            const int sumYY = iyiy.read();
            const int sumDX = dix.read();
            const int sumDY = diy.read();

            // determinant of the 2x2 matrix, evaluated in float
            const float det = (float)sumXX * sumYY - (float)sumXY * sumXY;
            const bool suppress = det <= 1.0f || row < (WINDOW_SIZE) || col < firstValidCol;

            float flowX = 0.0;
            float flowY = 0.0;
            if (!suppress) {
                // res est: (dsp,ff,lut)
                // fdiv (0,748,800), fmul (3,143,139), fadd (2,306,246), fsub (2,306,246)
                // sitofp (0,229,365), fcmp (0,66,72), imul(1,0,0) (in cs)
                // Inverse = adjugate / det; the matrix is symmetric, so both
                // off-diagonal entries of the inverse are the same value.
                const float inv00 = (float)sumYY / det;
                const float invOff = (float)(-sumXY) / det;
                const float inv11 = (float)sumXX / det;

                flowX = inv00 * sumDX + invOff * sumDY;
                flowY = invOff * sumDX + inv11 * sumDY;
            }
            fx_out.write(flowX);
            fy_out.write(flowY);
        }
    }
}

// line buffer for both input images. Can be split to a fn that models a single
// linebuffer
//
// For every incoming word this stage shifts column c of both line buffers up
// by one row and emits the resulting column as (WINDOW_SIZE + 1) words per
// image, one word on each row stream.
//
//  f0Stream / f1Stream - incoming words; f0Stream feeds lb1 / img1Col and
//                        f1Stream feeds lb2 / img2Col
//  img1Col / img2Col   - one output stream per line-buffer row; index
//                        WINDOW_SIZE carries the word that was just read
//  rows, cols          - loop bounds; the inner loop runs cols / 2 iterations
//  size                - not used in this stage
//
// NOTE(review): the cols / 2 bound presumably assumes
// XF_NPIXPERCYCLE(NPC) == 2 -- confirm against the callers.
template <int ROWS, int COLS, int NPC, int WINDOW_SIZE, bool USE_URAM>
static void lbWrapper16(hls::stream<mywide_t<XF_NPIXPERCYCLE(NPC)> >& f0Stream,
                        hls::stream<mywide_t<XF_NPIXPERCYCLE(NPC)> >& f1Stream,
                        hls::stream<mywide_t<XF_NPIXPERCYCLE(NPC)> > img1Col[(WINDOW_SIZE + 1)],
                        hls::stream<mywide_t<XF_NPIXPERCYCLE(NPC)> > img2Col[(WINDOW_SIZE + 1)],
                        int rows,
                        int cols,
                        int size) {
    // Line buffers indexed [row][word column][pixel within word]. They are
    // static, and are zeroed again by the cleanup loop at the end of the call.
    static pix_t lb1[(WINDOW_SIZE + 1)][COLS / XF_NPIXPERCYCLE(NPC)][XF_NPIXPERCYCLE(NPC)],
        lb2[(WINDOW_SIZE + 1)][COLS / XF_NPIXPERCYCLE(NPC)][XF_NPIXPERCYCLE(NPC)];

// clang-format off
    #pragma HLS ARRAY_MAP variable=lb1 instance=lbMap vertical
    #pragma HLS ARRAY_MAP variable=lb2 instance=lbMap vertical
// clang-format on

    // Reshape the row dimension (1) and the pixel dimension (3) completely,
    // leaving the word-column dimension (2) as the addressed one.
// clang-format off
    #pragma HLS ARRAY_RESHAPE variable=lb1 complete dim=1
    #pragma HLS ARRAY_RESHAPE variable=lb2 complete dim=1
    #pragma HLS ARRAY_RESHAPE variable=lb1 complete dim=3
    #pragma HLS ARRAY_RESHAPE variable=lb2 complete dim=3
    // clang-format on

    // Only the URAM case requests an explicit core; there is no else branch.
    if (USE_URAM) {
// clang-format off
        #pragma HLS RESOURCE variable=lb1 core=RAM_T2P_URAM
        #pragma HLS RESOURCE variable=lb2 core=RAM_T2P_URAM
        // clang-format on
    }

    for (int r = 0; r < rows; r++) {
// clang-format off
        #pragma HLS LOOP_TRIPCOUNT min=1 max=ROWS
        #pragma HLS LOOP_FLATTEN OFF
        // clang-format on
        for (int c = 0; c < cols / 2; c++) {
// clang-format off
            #pragma HLS LOOP_TRIPCOUNT min=1 max=COLS/2
            #pragma HLS pipeline
            // clang-format on
            // shift up both linebuffers at col=c
            // Row i receives the old content of row i + 1, and that same word
            // is what gets written to row stream i.
            for (int i = 0; i < ((WINDOW_SIZE + 1) - 1); i++) {
                mywide_t<XF_NPIXPERCYCLE(NPC)> lb;

                for (int k = 0; k < XF_NPIXPERCYCLE(NPC); k++) {
                    lb.data[k] = lb1[i + 1][c][k];
                    lb1[i][c][k] = lb.data[k];
                }
                img1Col[i].write(lb);

                // same temporary is reused for the second image
                for (int k = 0; k < XF_NPIXPERCYCLE(NPC); k++) {
                    lb.data[k] = lb2[i + 1][c][k];
                    lb2[i][c][k] = lb.data[k];
                }
                img2Col[i].write(lb);
            }

            // read in the new pixels at col=c and row=bottom_of_lb
            mywide_t<XF_NPIXPERCYCLE(NPC)> pix0 = f0Stream.read();
            img1Col[(WINDOW_SIZE + 1) - 1].write(pix0);

            mywide_t<XF_NPIXPERCYCLE(NPC)> pix1 = f1Stream.read();
            img2Col[(WINDOW_SIZE + 1) - 1].write(pix1);

            // store the new words in the bottom row so they shift up next time
            for (int k = 0; k < XF_NPIXPERCYCLE(NPC); k++) {
                lb1[(WINDOW_SIZE + 1) - 1][c][k] = pix0.data[k];
                lb2[(WINDOW_SIZE + 1) - 1][c][k] = pix1.data[k];
            }
        }
    }

    // cleanup
    // Zero every row of both line buffers for the cols / 2 word columns used above.
    for (int c = 0; c < cols / 2; c++) {
// clang-format off
        #pragma HLS LOOP_TRIPCOUNT min=1 max=COLS/2
        #pragma HLS PIPELINE
        // clang-format on
        for (int r = 0; r < (WINDOW_SIZE + 1); r++) {
// clang-format off
            #pragma HLS LOOP_TRIPCOUNT min=1 max=WINDOW_SIZE+1
            // clang-format on
            for (int k = 0; k < XF_NPIXPERCYCLE(NPC); k++) {
                lb1[r][c][k] = 0;
                lb2[r][c][k] = 0;
            }
        }
    }
}

// top level wrapper to avoid dataflow problems
// void flowWrap (mywide_t frame0[NUM_WORDS], mywide_t frame1[NUM_WORDS], rgba2_t framef[NUM_WORDS])
//
// Dataflow region of the two-pixel-per-cycle path. The stages are connected
// by streams in this order:
//   readMatRows16 (x2) -> lbWrapper16 -> computeSums16 -> computeFlow16 (x2)
//   -> pack2Vectors (x2)
//
//  frame0, frame1   - input frames
//  flowx, flowy     - output Mats; flowx is filled from the fx streams and
//                     flowy from the fy streams
//  rows, cols, size - forwarded unchanged to every stage
template <int TYPE, int ROWS, int COLS, int NPC, int WINDOW_SIZE, bool USE_URAM>
static void flowWrap16(xf::cv::Mat<TYPE, ROWS, COLS, NPC>& frame0,
                       xf::cv::Mat<TYPE, ROWS, COLS, NPC>& frame1,
                       xf::cv::Mat<XF_32FC1, ROWS, COLS, NPC>& flowx,
                       xf::cv::Mat<XF_32FC1, ROWS, COLS, NPC>& flowy,
                       int rows,
                       int cols,
                       int size) {
//#pragma HLS data_pack variable=frame0
//#pragma HLS data_pack variable=frame1
//#pragma HLS data_pack variable=framef

// clang-format off
    #pragma HLS DATAFLOW
    // clang-format on

    // ddr <-> kernel streams. Stream depths are probably too large and can be
    // trimmed
    hls::stream<mywide_t<XF_NPIXPERCYCLE(NPC)> > f0Stream, f1Stream;
// clang-format off
    #pragma HLS data_pack variable=f0Stream
    #pragma HLS data_pack variable=f1Stream
    #pragma HLS STREAM variable=f0Stream depth=16
    #pragma HLS STREAM variable=f1Stream depth=16
    // clang-format on

    // hls::stream <rgba_t> ff0Stream, ff1Stream;
    // #pragma HLS data_pack variable=ff0Stream
    // #pragma HLS data_pack variable=ff1Stream
    // #pragma HLS STREAM variable=ff0Stream depth=16
    // #pragma HLS STREAM variable=ff1Stream depth=16

    // line buffer -> sums: one stream per line-buffer row, for each image
    hls::stream<mywide_t<XF_NPIXPERCYCLE(NPC)> > img1Col[(WINDOW_SIZE + 1)], img2Col[(WINDOW_SIZE + 1)];
// clang-format off
    #pragma HLS data_pack variable=img1Col
    #pragma HLS data_pack variable=img2Col
    #pragma HLS STREAM variable=img1Col  depth=16
    #pragma HLS STREAM variable=img2Col  depth=16
    #pragma HLS ARRAY_PARTITION variable=img1Col complete dim=0
    #pragma HLS ARRAY_PARTITION variable=img2Col complete dim=0
    // clang-format on

    // sums and flow components for the first output of each iteration (suffix 0)
    hls::stream<int> ixix0, ixiy0, iyiy0, dix0, diy0;
    hls::stream<float> fx0("fx0"), fy0("fy0");
// clang-format off
    #pragma HLS STREAM variable=ixix0 depth=16
    #pragma HLS STREAM variable=ixiy0 depth=16
    #pragma HLS STREAM variable=iyiy0 depth=16
    #pragma HLS STREAM variable=dix0 depth=16
    #pragma HLS STREAM variable=diy0 depth=16
    #pragma HLS STREAM variable=fx0  depth=16
    #pragma HLS STREAM variable=fy0  depth=16
    // clang-format on

    // same set for the second output of each iteration (suffix 1)
    hls::stream<int> ixix1, ixiy1, iyiy1, dix1, diy1;
    hls::stream<float> fx1("fx1"), fy1("fy1");
// clang-format off
    #pragma HLS STREAM variable=ixix1 depth=16
    #pragma HLS STREAM variable=ixiy1 depth=16
    #pragma HLS STREAM variable=iyiy1 depth=16
    #pragma HLS STREAM variable=dix1 depth=16
    #pragma HLS STREAM variable=diy1 depth=16
    #pragma HLS STREAM variable=fx1  depth=16
    #pragma HLS STREAM variable=fy1  depth=16
    // clang-format on

    // stage 1: read both frames into word streams
    readMatRows16<TYPE, ROWS, COLS, NPC, WINDOW_SIZE>(frame0, f0Stream, rows, cols, size);
    readMatRows16<TYPE, ROWS, COLS, NPC, WINDOW_SIZE>(frame1, f1Stream, rows, cols, size);

    // stage 2: line buffer turns each word into a column of WINDOW_SIZE + 1 words
    // stage 3: column sums / window sums, two results per iteration
    lbWrapper16<ROWS, COLS, NPC, WINDOW_SIZE, USE_URAM>(f0Stream, f1Stream, img1Col, img2Col, rows, cols, size);
    computeSums16<ROWS, COLS, NPC, WINDOW_SIZE, USE_URAM>(img1Col, img2Col, ixix0, ixiy0, iyiy0, dix0, diy0, ixix1,
                                                          ixiy1, iyiy1, dix1, diy1, rows, cols, size);

    // stage 4: one flow solver per result set
    computeFlow16<ROWS, COLS, NPC, WINDOW_SIZE>(ixix0, ixiy0, iyiy0, dix0, diy0, fx0, fy0, rows, cols, size);
    computeFlow16<ROWS, COLS, NPC, WINDOW_SIZE>(ixix1, ixiy1, iyiy1, dix1, diy1, fx1, fy1, rows, cols, size);

    // stage 5: pair up the two x components into flowx and the two y components into flowy
    pack2Vectors<TYPE, ROWS, COLS, NPC, WINDOW_SIZE>(fx0, fx1, flowx, rows, cols, size);
    pack2Vectors<TYPE, ROWS, COLS, NPC, WINDOW_SIZE>(fy0, fy1, flowy, rows, cols, size);
}

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

// External interface to the two-pixel-per-cycle kernel.
//  frame0 - first input frame (grayscale, 1 byte per pixel)
//  frame1 - second input frame (grayscale, 1 byte per pixel)
//  flowx  - output Mat (XF_32FC1) passed to flowWrap16 as its flowx argument
//  flowy  - output Mat (XF_32FC1) passed to flowWrap16 as its flowy argument
//  rows, cols, size - forwarded unchanged to flowWrap16
//
// Notes kept from the earlier pointer-based interface (which produced a single
// rgba frame "framef" instead of flowx / flowy):
//   ushort = 16 bits, 8 bits per grayscale pix, so two pix per input word
//   ulonglong = 64 bits per output word
template <int TYPE, int ROWS, int COLS, int NPC, int WINDOW_SIZE, bool USE_URAM>
static void fpga_optflow16(xf::cv::Mat<TYPE, ROWS, COLS, NPC>& frame0,
                           xf::cv::Mat<TYPE, ROWS, COLS, NPC>& frame1,
                           xf::cv::Mat<XF_32FC1, ROWS, COLS, NPC>& flowx,
                           xf::cv::Mat<XF_32FC1, ROWS, COLS, NPC>& flowy,
                           int rows,
                           int cols,
                           int size) {
// clang-format off
    #pragma HLS inline off
    // clang-format on

    // All of the work happens inside the dataflow wrapper; this function only
    // provides a non-inlined boundary around it.
    flowWrap16<TYPE, ROWS, COLS, NPC, WINDOW_SIZE, USE_URAM>(frame0, frame1, flowx, flowy, rows, cols, size);
}

// Read the external array matB and stream it out one pixel per iteration.
//
//  matB       - input image; matB.read(i) supplies the value for iteration i
//  pixStream  - receives exactly one pix_t per iteration
//  rows, cols - not used in this stage
//  size       - number of reads / writes performed
//
// Only data[0] of the temporary word is filled and forwarded, regardless of
// NPC. An earlier revision sketched a per-word loop over all pixels; it was
// never enabled and has been removed together with its unused WORD_SIZE
// constant.
template <int TYPE, int ROWS, int COLS, int NPC, int WINDOW_SIZE>
static void readMatRows(
    xf::cv::Mat<TYPE, ROWS, COLS, NPC>& matB, hls::stream<pix_t>& pixStream, int rows, int cols, int size) {
    for (int i = 0; i < size; i++) {
// clang-format off
        #pragma HLS LOOP_TRIPCOUNT min=1 max=COLS*ROWS/NPC
        #pragma HLS PIPELINE
        // clang-format on
        // go through the word type so the value is narrowed exactly as before
        mywide_t<XF_NPIXPERCYCLE(NPC)> tmpData;
        tmpData.data[0] = matB.read(i);
        pixStream.write(tmpData.data[0]);
    }
}

// write rgba stream to external array dst. The "a" is just padding and is
// unused
/*template <int ROWS, int COLS, int NPC, int WINDOW_SIZE>
static void writeMatRowsRGBA(hls::stream<rgba_t>& pixStream, unsigned int* dst, int rows, int cols, int size) {
    for (int i = 0; i < size; i++) {
// clang-format off
        #pragma HLS LOOP_TRIPCOUNT min=1 max=ROWS*COLS/NPC
        #pragma HLS PIPELINE
        // clang-format on
        rgba_t tmpData = pixStream.read();
        *(dst + i) = (unsigned int)tmpData.a << 24 | (unsigned int)tmpData.b << 16 | (unsigned int)tmpData.g << 8 |
                     (unsigned int)tmpData.r;
    }
}*/

// Sliding-window accumulation of the five product sums consumed by
// computeFlow (Ix*Ix, Ix*Iy, Iy*Iy, del*Ix, del*Iy, where del = img1 - img2).
//
// For every (r, c) the function:
//   1. reads one pixel from each of the (WINDOW_SIZE + 1) streams of img1Col
//      and img2Col,
//   2. computes the gradient products for a "top" row (index 1) and a "bottom"
//      row (index WINDOW_SIZE - 1) of the column window,
//   3. updates the per-column sum at nIdx by adding the bottom products and
//      subtracting the top products,
//   4. updates the running window sums by adding the new column sum and
//      subtracting the column sum stored at zIdx,
//   5. writes the five running sums to the output streams.
// Each update is O(1) per pixel regardless of WINDOW_SIZE.
//
// Parameters:
//   img1Col, img2Col - arrays of (WINDOW_SIZE + 1) pixel streams; one value is
//                      read from every stream per (r, c) iteration.
//   ixix_out .. diy_out - one int is written to each per (r, c) iteration.
//   rows, cols       - loop bounds. cols is also the wrap point of the column
//                      sum indices and the length of the cleanup loop; the
//                      column-sum arrays have COLS entries, so cols must not
//                      exceed COLS.
//   size             - not used in this function.
//
// All persistent state is held in function-local statics and is zeroed again
// before returning.
//
// TODO:
// 1. Don't need the entire column for img1Win and img2Win. Need only the kernel
// 2. Full line buffer is not needed
template <int ROWS, int COLS, int NPC, int WINDOW_SIZE, bool USE_URAM>
static void computeSums(hls::stream<pix_t> img1Col[(WINDOW_SIZE + 1)],
                        hls::stream<pix_t> img2Col[(WINDOW_SIZE + 1)],
                        hls::stream<int>& ixix_out,
                        hls::stream<int>& ixiy_out,
                        hls::stream<int>& iyiy_out,
                        hls::stream<int>& dix_out,
                        hls::stream<int>& diy_out,
                        int rows,
                        int cols,
                        int size) {
    // Pixels read from the column streams in the current iteration.
    pix_t img1Col_[(WINDOW_SIZE + 1)], img2Col_[(WINDOW_SIZE + 1)];
// clang-format off
    #pragma HLS ARRAY_PARTITION variable=img1Col_ complete dim=0
    #pragma HLS ARRAY_PARTITION variable=img2Col_ complete dim=0
    // clang-format on

    // Pixel history from earlier iterations.
    // img1Win keeps two entries per window row i: [i * 2 + 1] is the column
    // read in the previous iteration and [i * 2] the one read before that.
    // img2Win keeps only the previous iteration's column, one entry per row.
    static pix_t img1Win[2 * (WINDOW_SIZE + 1)], img2Win[1 * (WINDOW_SIZE + 1)];
    // Running window sums; these are the values written to the output streams.
    static int ixix = 0, ixiy = 0, iyiy = 0, dix = 0, diy = 0;
// clang-format off
    #pragma HLS ARRAY_PARTITION variable=img1Win complete dim=0
    #pragma HLS ARRAY_PARTITION variable=img2Win complete dim=0
    // clang-format on

    // column sums:
    // need left-shift. Array-Part leads to FF with big Muxes. Try to do with
    // classic array and pointer. Need current and current-WINDOW_SIZE ptrs
    // For II=1 pipelining, need two read and 1 write ports. Simulating it with
    // two arrays that have their write ports tied together.
    // The cs* arrays are read at zIdx, the cb* arrays are read at nIdx, and
    // both sets are written with the same value at nIdx.
    static int csIxix[COLS], csIxiy[COLS], csIyiy[COLS], csDix[COLS], csDiy[COLS];
    static int cbIxix[COLS], cbIxiy[COLS], cbIyiy[COLS], cbDix[COLS], cbDiy[COLS];

    // nIdx starts at 0 and addresses the column sum being updated. zIdx trails
    // it by (WINDOW_SIZE - 2) entries and addresses the column sum that is
    // subtracted from the running sums; while zIdx is negative nothing is
    // subtracted. Both indices wrap to 0 when they reach cols.
    int zIdx = -(WINDOW_SIZE - 2);
    int nIdx = zIdx + WINDOW_SIZE - 2;

// clang-format off
    #pragma HLS ARRAY_MAP variable=csIxix instance=cs vertical
    #pragma HLS ARRAY_MAP variable=csIxiy instance=cs vertical
    #pragma HLS ARRAY_MAP variable=csIyiy instance=cs vertical
    #pragma HLS ARRAY_MAP variable=csDix  instance=cs vertical
    #pragma HLS ARRAY_MAP variable=csDiy  instance=cs vertical
    // clang-format on

    if (USE_URAM) {
// clang-format off
        #pragma HLS RESOURCE variable=csIxix core=RAM_2P_URAM
        // clang-format on
    } else {
// clang-format off
        #pragma HLS RESOURCE variable=csIxix core=RAM_2P_BRAM
        // clang-format on
    }

// clang-format off
    #pragma HLS DEPENDENCE variable=csIxix inter RAW false
    #pragma HLS DEPENDENCE variable=csIxiy inter RAW false
    #pragma HLS DEPENDENCE variable=csIyiy inter RAW false
    #pragma HLS DEPENDENCE variable=csDix  inter RAW false
    #pragma HLS DEPENDENCE variable=csDiy  inter RAW false
// clang-format on

// clang-format off
    #pragma HLS ARRAY_MAP variable=cbIxix instance=cb vertical
    #pragma HLS ARRAY_MAP variable=cbIxiy instance=cb vertical
    #pragma HLS ARRAY_MAP variable=cbIyiy instance=cb vertical
    #pragma HLS ARRAY_MAP variable=cbDix  instance=cb vertical
    #pragma HLS ARRAY_MAP variable=cbDiy  instance=cb vertical
    // clang-format on

    if (USE_URAM) {
// clang-format off
        #pragma HLS RESOURCE variable=cbIxix core=RAM_2P_URAM
        // clang-format on
    } else {
// clang-format off
        #pragma HLS RESOURCE variable=cbIxix core=RAM_2P_BRAM
        // clang-format on
    }

// clang-format off
    #pragma HLS DEPENDENCE variable=cbIxix inter RAW false
    #pragma HLS DEPENDENCE variable=cbIxiy inter RAW false
    #pragma HLS DEPENDENCE variable=cbIyiy inter RAW false
    #pragma HLS DEPENDENCE variable=cbDix  inter RAW false
    #pragma HLS DEPENDENCE variable=cbDiy  inter RAW false
    // clang-format on

    // Updated column sums for the column at nIdx.
    int csIxixR, csIxiyR, csIyiyR, csDixR, csDiyR;

    for (int r = 0; r < rows; r++) {
// clang-format off
        #pragma HLS LOOP_TRIPCOUNT min=1 max=ROWS
        // clang-format on
        for (int c = 0; c < cols; c++) {
// clang-format off
            #pragma HLS LOOP_TRIPCOUNT min=1 max=COLS
            #pragma HLS PIPELINE II=1
            // clang-format on

            // Column sums to subtract from the running sums; they stay 0
            // until zIdx becomes non-negative.
            int csIxixL = 0;
            int csIxiyL = 0;
            int csIyiyL = 0;
            int csDixL = 0;
            int csDiyL = 0;
            if (zIdx >= 0) {
                csIxixL = csIxix[zIdx];
                csIxiyL = csIxiy[zIdx];
                csIyiyL = csIyiy[zIdx];
                csDixL = csDix[zIdx];
                csDiyL = csDiy[zIdx];
            }

            // Pull the next pixel from every row stream of both images.
            for (int wr = 0; wr < (WINDOW_SIZE + 1); ++wr) {
// clang-format off
                #pragma HLS LOOP_TRIPCOUNT min=1 max=WINDOW_SIZE+1
                // clang-format on
                img1Col_[wr] = img1Col[wr].read();
                img2Col_[wr] = img2Col[wr].read();
            }

            // Top row (wrt). The gradients are centred on the pixel read in
            // the previous iteration: x uses the current pixel and the one
            // from two iterations ago, y uses the rows above and below.
            // p(x+1,y) and p(x-1,y)
            int wrt = 1;
            int cIxTopR = (img1Col_[wrt] - img1Win[wrt * 2 + 2 - 2]) / 2;
            // p(x,y+1) and p(x,y-1)
            int cIyTopR = (img1Win[(wrt + 1) * 2 + 2 - 1] - img1Win[(wrt - 1) * 2 + 2 - 1]) / 2;
            // p1(x,y) and p2(x,y)
            int delTopR = img1Win[wrt * 2 + 2 - 1] - img2Win[wrt * 1 + 1 - 1];

            // Bottom row (wrb): same three quantities.
            int wrb = WINDOW_SIZE - 1;
            int cIxBotR = (img1Col_[wrb] - img1Win[wrb * 2 + 2 - 2]) / 2;
            int cIyBotR = (img1Win[(wrb + 1) * 2 + 2 - 1] - img1Win[(wrb - 1) * 2 + 2 - 1]) / 2;
            int delBotR = img1Win[wrb * 2 + 2 - 1] - img2Win[wrb * 1 + 1 - 1];
            // The leading 0 makes this condition constant-false, so the top
            // terms are never zeroed here.
            if (0 && r < WINDOW_SIZE) {
                cIxTopR = 0;
                cIyTopR = 0;
                delTopR = 0;
            }

            // New column sum = stored column sum + bottom products - top products.
            csIxixR = cbIxix[nIdx] + cIxBotR * cIxBotR - cIxTopR * cIxTopR;
            csIxiyR = cbIxiy[nIdx] + cIxBotR * cIyBotR - cIxTopR * cIyTopR;
            csIyiyR = cbIyiy[nIdx] + cIyBotR * cIyBotR - cIyTopR * cIyTopR;
            csDixR = cbDix[nIdx] + delBotR * cIxBotR - delTopR * cIxTopR;
            csDiyR = cbDiy[nIdx] + delBotR * cIyBotR - delTopR * cIyTopR;

            // Running sums: add the updated column, subtract the trailing one.
            ixix += (csIxixR - csIxixL);
            ixiy += (csIxiyR - csIxiyL);
            iyiy += (csIyiyR - csIyiyL);
            dix += (csDixR - csDixL);
            diy += (csDiyR - csDiyL);

            ixix_out.write(ixix);
            ixiy_out.write(ixiy);
            iyiy_out.write(iyiy);
            dix_out.write(dix);
            diy_out.write(diy);

            // we don't have the shifted pixel anymore to do overlay TODO
            // img1Delayed. write (0);

            // Age the img1 history: the previous column becomes the older one.
            for (int i = 0; i < (WINDOW_SIZE + 1); i++) {
// clang-format off
                #pragma HLS LOOP_TRIPCOUNT min=1 max=WINDOW_SIZE+1
                // clang-format on
                img1Win[i * 2] = img1Win[i * 2 + 1];
            }
            // Store the pixels just read as the new "previous" column.
            for (int i = 0; i < (WINDOW_SIZE + 1); ++i) {
// clang-format off
                #pragma HLS LOOP_TRIPCOUNT min=1 max=WINDOW_SIZE+1
                // clang-format on
                img1Win[i * 2 + 1] = img1Col_[i];
                img2Win[i] = img2Col_[i];
            }

            // Write the updated column sum into both copies.
            cbIxix[nIdx] = csIxixR;
            cbIxiy[nIdx] = csIxiyR;
            cbIyiy[nIdx] = csIyiyR;
            cbDix[nIdx] = csDixR;
            cbDiy[nIdx] = csDiyR;

            csIxix[nIdx] = csIxixR;
            csIxiy[nIdx] = csIxiyR;
            csIyiy[nIdx] = csIyiyR;
            csDix[nIdx] = csDixR;
            csDiy[nIdx] = csDiyR;
            // Advance both indices, wrapping at the end of the row.
            zIdx++;
            if (zIdx == cols) zIdx = 0;
            nIdx++;
            if (nIdx == cols) nIdx = 0;
        }
    }

    // Cleanup. If kernel is called multiple times with different inputs, not
    // cleaning these vars would pollute the subsequent frames.
    // TODO zero in the line buffer instead, for r < WINDOW_SIZE
    // The two img1Win assignments below together cover all 2 * (WINDOW_SIZE + 1)
    // entries of img1Win.
    for (int r = 0; r < (WINDOW_SIZE + 1); r++) {
// clang-format off
        #pragma HLS LOOP_TRIPCOUNT min=1 max=WINDOW_SIZE+1
        #pragma HLS UNROLL
        // clang-format on
        img1Win[r] = 0;
        img1Win[r + (WINDOW_SIZE + 1)] = 0;
        img2Win[r] = 0;
        img1Col_[r] = 0;
        img2Col_[r] = 0;
    }
    // Only the first cols entries of the column-sum arrays are cleared; those
    // are the only entries the main loop above writes.
    for (int r = 0; r < cols; ++r) {
// clang-format off
        #pragma HLS LOOP_TRIPCOUNT min=1 max=COLS
        #pragma HLS PIPELINE II=1
        // clang-format on
        csIxix[r] = 0;
        csIxiy[r] = 0;
        csIyiy[r] = 0;
        csDix[r] = 0;
        csDiy[r] = 0;
        cbIxix[r] = 0;
        cbIxiy[r] = 0;
        cbIyiy[r] = 0;
        cbDix[r] = 0;
        cbDiy[r] = 0;
    }
    ixix = 0;
    ixiy = 0;
    iyiy = 0;
    dix = 0;
    diy = 0;
}

// consume the integrals and compute flow vectors
// Converts the five per-pixel window sums into a flow vector (fx, fy).
//
// For each (row, col) one value is read from each input stream and the 2x2
// matrix [[ixix, ixiy], [ixiy, iyiy]] is inverted and applied to (dix, diy).
// The flow is forced to (0, 0) when the determinant is <= 1.0 or when the
// pixel lies in the first WINDOW_SIZE rows or the first WINDOW_SIZE + 1
// columns. Exactly one value is written to fx_out and fy_out per pixel.
//
// `size`, ROWS/COLS (beyond the trip-count pragmas) and NPC are not used here.
template <int ROWS, int COLS, int NPC, int WINDOW_SIZE>
static void computeFlow(hls::stream<int>& ixix,
                        hls::stream<int>& ixiy,
                        hls::stream<int>& iyiy,
                        hls::stream<int>& dix,
                        hls::stream<int>& diy,
                        hls::stream<float>& fx_out,
                        hls::stream<float>& fy_out,
                        int rows,
                        int cols,
                        int size) {
    for (int row = 0; row < rows; row++) {
// clang-format off
        #pragma HLS LOOP_TRIPCOUNT min=1 max=ROWS
        // clang-format on
        for (int col = 0; col < cols; col++) {
// clang-format off
            #pragma HLS LOOP_TRIPCOUNT min=1 max=COLS
            #pragma HLS PIPELINE
            // clang-format on
            const int sumXX = ixix.read();
            const int sumXY = ixiy.read();
            const int sumYY = iyiy.read();
            const int sumDX = dix.read();
            const int sumDY = diy.read();

            // Determinant of the 2x2 system matrix.
            const float det = (float)sumXX * sumYY - (float)sumXY * sumXY;

            // Pixels with a small determinant or inside the top/left border
            // keep the default zero flow.
            const bool smallDet = det <= 1.0f;
            const bool inBorder = row < (WINDOW_SIZE) || col < (WINDOW_SIZE + 1);

            float flowX = 0.0;
            float flowY = 0.0;
            if (!(smallDet || inBorder)) {
                // res est: (dsp,ff,lut)
                // fdiv (0,748,800), fmul (3,143,139), fadd (2,306,246), fsub (2,306,246)
                // sitofp (0,229,365), fcmp (0,66,72), imul(1,0,0) (in cs)
                //
                // Entries of the inverse matrix. The matrix is symmetric, so
                // the single off-diagonal entry serves both positions.
                const float inv00 = (float)sumYY / det;
                const float invOff = (float)(-sumXY) / det;
                const float inv11 = (float)sumXX / det;

                flowX = inv00 * sumDX + invOff * sumDY;
                flowY = invOff * sumDX + inv11 * sumDY;
            }

            fx_out.write(flowX);
            fy_out.write(flowY);
        }
    }
}

// Drains the two flow streams into the output matrices.
//
// For each index in [0, size) one float is read from fx_in and one from
// fy_in; the raw 32-bit pattern of each float is then written to flowx and
// flowy at that index. The TYPE and WINDOW_SIZE template parameters are not
// used in the body.
template <int TYPE, int ROWS, int COLS, int NPC, int WINDOW_SIZE>
static void writeOutput8(hls::stream<float>& fx_in,
                         hls::stream<float>& fy_in,
                         xf::cv::Mat<XF_32FC1, ROWS, COLS, NPC>& flowx,
                         xf::cv::Mat<XF_32FC1, ROWS, COLS, NPC>& flowy,
                         int size) {
    for (int idx = 0; idx < size; idx++) {
// clang-format off
        #pragma HLS LOOP_TRIPCOUNT min=1 max=ROWS*COLS
        #pragma HLS PIPELINE
        // clang-format on

        float flowXVal = fx_in.read();
        float flowYVal = fy_in.read();

        // Reinterpret the float storage as a 32-bit word (no numeric
        // conversion takes place).
        // NOTE(review): this relies on ap_uint<32> having the same size and
        // bit layout as float -- confirm for the target toolchain.
        ap_uint<32>* flowXBits = (ap_uint<32>*)&flowXVal;
        ap_uint<32>* flowYBits = (ap_uint<32>*)&flowYVal;

        flowx.write(idx, *flowXBits);
        flowy.write(idx, *flowYBits);
    }
}

// Line buffers for the two input images (could be split into a function that
// models a single line buffer).
//
// Each image has a static buffer of (WINDOW_SIZE + 1) lines of COLS pixels.
// For every incoming pixel pair at column `col` the buffer column is shifted
// up by one line, the new pixel is stored in the last line, and the complete
// column of (WINDOW_SIZE + 1) pixels is emitted, one pixel per stream, on
// img1Col[] / img2Col[]. After the frame, the first `cols` columns of both
// buffers are reset to 0. The `size` parameter is not used.
template <int ROWS, int COLS, int NPC, int WINDOW_SIZE, bool USE_URAM>
static void lbWrapper(hls::stream<pix_t>& f0Stream,
                      hls::stream<pix_t>& f1Stream,
                      hls::stream<pix_t> img1Col[(WINDOW_SIZE + 1)],
                      hls::stream<pix_t> img2Col[(WINDOW_SIZE + 1)],
                      int rows,
                      int cols,
                      int size) {
    static pix_t lb1[(WINDOW_SIZE + 1)][COLS], lb2[(WINDOW_SIZE + 1)][COLS];
// clang-format off
    #pragma HLS ARRAY_MAP variable=lb1 instance=lbMap vertical
    #pragma HLS ARRAY_MAP variable=lb2 instance=lbMap vertical
    #pragma HLS ARRAY_RESHAPE variable=lb1 complete dim=1
    #pragma HLS ARRAY_RESHAPE variable=lb2 complete dim=1
    // clang-format on
    if (USE_URAM) {
// clang-format off
        #pragma HLS RESOURCE variable=lb1 core=RAM_T2P_URAM
        #pragma HLS RESOURCE variable=lb2 core=RAM_T2P_URAM
        // clang-format on
    }

loop_rows:
    for (int row = 0; row < rows; row++) {
// clang-format off
        #pragma HLS LOOP_TRIPCOUNT min=1 max=ROWS
        #pragma HLS LOOP_FLATTEN OFF
        // clang-format on
    loop_cols:
        for (int col = 0; col < cols; col++) {
// clang-format off
            #pragma HLS LOOP_TRIPCOUNT min=1 max=COLS
            #pragma HLS pipeline
            // clang-format on

        // Move every stored line one slot up at this column and forward the
        // moved pixel on the stream of its new line. Line WINDOW_SIZE is the
        // last line and is filled from the input below.
        loop_ws:
            for (int line = 0; line < WINDOW_SIZE; line++) {
                const pix_t moved1 = lb1[line + 1][col];
                lb1[line][col] = moved1;
                img1Col[line].write(moved1);

                const pix_t moved2 = lb2[line + 1][col];
                lb2[line][col] = moved2;
                img2Col[line].write(moved2);
            }

            // New pixel of the first image goes into the last line.
            const pix_t newPix0 = f0Stream.read();
            lb1[WINDOW_SIZE][col] = newPix0;
            img1Col[WINDOW_SIZE].write(newPix0);

            // Same for the second image.
            const pix_t newPix1 = f1Stream.read();
            lb2[WINDOW_SIZE][col] = newPix1;
            img2Col[WINDOW_SIZE].write(newPix1);
        }
    }

    // Reset the static buffers so that a later call starts from zeros.
    for (int col = 0; col < cols; col++) {
// clang-format off
        #pragma HLS LOOP_TRIPCOUNT min=1 max=COLS
        #pragma HLS PIPELINE
        // clang-format on
        for (int line = 0; line < (WINDOW_SIZE + 1); line++) {
// clang-format off
            #pragma HLS LOOP_TRIPCOUNT min=1 max=WINDOW_SIZE+1
            // clang-format on
            lb1[line][col] = 0;
            lb2[line][col] = 0;
        }
    }
}

// Top level wrapper to avoid dataflow problems.
//
// Chains the processing stages inside one DATAFLOW region, connected by
// hls::stream channels:
//   readMatRows (frame0, frame1) -> lbWrapper -> computeSums -> computeFlow
//   -> writeOutput8 (flowx, flowy)
//
// Parameters:
//   frame0, frame1 - input matrices, read through readMatRows.
//   flowx, flowy   - output matrices, written by writeOutput8.
//   rows, cols     - image dimensions forwarded to every stage that iterates
//                    over rows and columns.
//   size           - element count forwarded to the stages that iterate
//                    linearly over the matrices.
template <int TYPE, int ROWS, int COLS, int NPC, int WINDOW_SIZE, bool USE_URAM>
static void flowWrap(xf::cv::Mat<TYPE, ROWS, COLS, NPC>& frame0,
                     xf::cv::Mat<TYPE, ROWS, COLS, NPC>& frame1,
                     xf::cv::Mat<XF_32FC1, ROWS, COLS, NPC>& flowx,
                     xf::cv::Mat<XF_32FC1, ROWS, COLS, NPC>& flowy,
                     int rows,
                     int cols,
                     int size) {
// clang-format off
    #pragma HLS inline off
    #pragma HLS DATAFLOW
    // clang-format on

    // ddr <-> kernel streams. Stream depths are probably too large and can be
    // trimmed
    hls::stream<pix_t> f0Stream("f0Stream"), f1Stream("f1Stream");
// clang-format off
    #pragma HLS STREAM variable=f0Stream depth=16
    #pragma HLS STREAM variable=f1Stream depth=16
    // clang-format on

    // Column windows produced by the line buffers, the five window sums, and
    // the resulting flow components. fx and fy carry no explicit STREAM depth
    // pragma.
    hls::stream<pix_t> img1Col[(WINDOW_SIZE + 1)], img2Col[(WINDOW_SIZE + 1)];
    hls::stream<int> ixix, ixiy, iyiy, dix, diy;
    hls::stream<float> fx, fy;

// clang-format off
    #pragma HLS STREAM variable=ixix depth=16
    #pragma HLS STREAM variable=ixiy depth=16
    #pragma HLS STREAM variable=iyiy depth=16
    #pragma HLS STREAM variable=dix depth=16
    #pragma HLS STREAM variable=diy depth=16
// clang-format on
// clang-format off
    #pragma HLS STREAM variable=img1Col  depth=16
    #pragma HLS STREAM variable=img2Col  depth=16
// clang-format on

// clang-format off
    #pragma HLS ARRAY_PARTITION variable=img1Col complete dim=0
    #pragma HLS ARRAY_PARTITION variable=img2Col complete dim=0
    // clang-format on

    // Stage 1: stream both input frames out of their matrices.
    readMatRows<TYPE, ROWS, COLS, NPC, WINDOW_SIZE>(frame0, f0Stream, rows, cols, size);
    readMatRows<TYPE, ROWS, COLS, NPC, WINDOW_SIZE>(frame1, f1Stream, rows, cols, size);

    // Stage 2: line buffers turn the pixel streams into column windows.
    lbWrapper<ROWS, COLS, NPC, WINDOW_SIZE, USE_URAM>(f0Stream, f1Stream, img1Col, img2Col, rows, cols, size);

    // Stage 3: sliding-window gradient product sums.
    computeSums<ROWS, COLS, NPC, WINDOW_SIZE, USE_URAM>(img1Col, img2Col, ixix, ixiy, iyiy, dix, diy, rows, cols, size);

    // Stage 4: solve for the flow vector at every pixel.
    computeFlow<ROWS, COLS, NPC, WINDOW_SIZE>(ixix, ixiy, iyiy, dix, diy, fx, fy, rows, cols, size);

    // Stage 5: store the flow components into the output matrices.
    writeOutput8<TYPE, ROWS, COLS, NPC, WINDOW_SIZE>(fx, fy, flowx, flowy, size);
}

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

// External interface to the kernel; called by DenseNonPyrLKOpticalFlow when
// NPC == XF_NPPC1. It forwards every argument unchanged to flowWrap and is
// kept as a separate, non-inlined function.
//  frame0 - First input frame (grayscale 1 byte per pixel)
//  frame1 - Second input frame (grayscale 1 byte per pixel)
//  flowx  - Output matrix (XF_32FC1) receiving the x flow component
//  flowy  - Output matrix (XF_32FC1) receiving the y flow component
//  rows, cols, size - frame dimensions and element count passed to flowWrap
template <int TYPE, int ROWS, int COLS, int NPC, int WINDOW_SIZE, bool USE_URAM>
static void fpga_optflow8(xf::cv::Mat<TYPE, ROWS, COLS, NPC>& frame0,
                          xf::cv::Mat<TYPE, ROWS, COLS, NPC>& frame1,
                          xf::cv::Mat<XF_32FC1, ROWS, COLS, NPC>& flowx,
                          xf::cv::Mat<XF_32FC1, ROWS, COLS, NPC>& flowy,
                          int rows,
                          int cols,
                          int size) {
// clang-format off
    #pragma HLS inline off
    // clang-format on

    flowWrap<TYPE, ROWS, COLS, NPC, WINDOW_SIZE, USE_URAM>(
        frame0, frame1, flowx, flowy, rows, cols, size);
}
// Public entry point for dense non-pyramidal LK optical flow.
//
// Selects the kernel variant from the NPC template parameter: fpga_optflow8
// when NPC == XF_NPPC1, fpga_optflow16 otherwise. The dimensions and element
// count passed to the kernel are taken from frame0 (rows, cols, size).
//
//  frame0, frame1 - input frames
//  flowx, flowy   - output matrices (XF_32FC1) for the two flow components
template <int WINDOW_SIZE, int TYPE, int ROWS, int COLS, int NPC, bool USE_URAM = false>
void DenseNonPyrLKOpticalFlow(xf::cv::Mat<TYPE, ROWS, COLS, NPC>& frame0,
                              xf::cv::Mat<TYPE, ROWS, COLS, NPC>& frame1,
                              xf::cv::Mat<XF_32FC1, ROWS, COLS, NPC>& flowx,
                              xf::cv::Mat<XF_32FC1, ROWS, COLS, NPC>& flowy) {
    if (NPC != XF_NPPC1) {
        // Any configuration other than one pixel per clock.
        fpga_optflow16<TYPE, ROWS, COLS, NPC, WINDOW_SIZE, USE_URAM>(
            frame0, frame1, flowx, flowy, frame0.rows, frame0.cols, frame0.size);
    } else {
        // One pixel per clock.
        fpga_optflow8<TYPE, ROWS, COLS, NPC, WINDOW_SIZE, USE_URAM>(
            frame0, frame1, flowx, flowy, frame0.rows, frame0.cols, frame0.size);
    }
}
} // namespace cv
} // namespace xf
#endif