Program Listing for File xfcvDataMovers.h

Return to documentation for file (/tmp/ws/src/vitis_common/include/aie/common/xfcvDataMovers.h)

/*
 * Copyright 2021 Xilinx, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef _XFCVDATAMOVERS_
#define _XFCVDATAMOVERS_

#include <array>
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <functional>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>
#include <thread>
#include <type_traits>
#include <vector>

#include <adf/adf_api/XRTConfig.h>
#include <common/smartTilerStitcher.hpp>
#include <experimental/xrt_kernel.h>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <common/xf_aie_const.hpp>

// Forward declarations only: neither function is defined in this file.
// Both are called by the GMIO specialization of xF::xfcvDataMovers below.

// Called from the GMIO start() methods, once per core, with a buffer handle,
// a GMIO port name, a transfer direction and a byte size / offset.
// NOTE(review): the "NB" suffix suggests a non-blocking call - confirm against XRT.
int xrtSyncBOAIENB(xrtDeviceHandle handle,
                   xrtBufferHandle bohdl,
                   const char* gmioName,
                   enum xclBOSyncDirection dir,
                   size_t size,
                   size_t offset);
// Called from the GMIO wait() methods, once per core, with that core's port name.
int xrtGMIOWait(xrtDeviceHandle handle, const char* gmioName);

namespace xF {

// Compile-time role selector for xfcvDataMovers.
enum DataMoverKind {
    TILER,   // enables the host2aie* members (host image -> device)
    STITCHER // enables the aie2host* members (device -> host image)
};

/**
 * Fixed-size byte container of BITWIDTH / 8 bytes.
 *
 * A value of any type T that fits is copied byte-for-byte (host byte order)
 * into the low bytes of `data`; the remaining bytes are zero filled.
 * xfcvDataMovers stores its tile metadata as a vector of these objects and
 * copies that vector verbatim into a device buffer.
 */
template <int BITWIDTH>
class EmulAxiData {
    static constexpr int BYTEWIDTH = BITWIDTH / 8;

   public:
    char data[BYTEWIDTH];

    /**
     * Stores the object representation of `m`, zero padded to BYTEWIDTH bytes.
     * Deliberately not `explicit`: callers emplace plain scalars.
     */
    template <typename T>
    EmulAxiData(T m) {
        // The size relation is known at compile time, so reject an oversized T
        // there instead of relying on a runtime assert that NDEBUG removes.
        static_assert(sizeof(T) <= static_cast<unsigned int>(BYTEWIDTH),
                      "EmulAxiData: value type is wider than the emulated AXI word");
        const char* src = reinterpret_cast<const char*>(&m);
        for (unsigned int i = 0; i < static_cast<unsigned int>(BYTEWIDTH); i++) {
            data[i] = (i < sizeof(T)) ? src[i] : 0;
        }
    }

    // The previous assignment operator was a template whose parameter could not
    // be deduced, so it was never used. Plain member-wise copies are what was
    // intended (and what the compiler already generated for the constructor).
    EmulAxiData(const EmulAxiData&) = default;
    EmulAxiData& operator=(const EmulAxiData&) = default;
};

/**
 * Compile-time lookup from a C++ scalar type T to an OpenCV depth constant.
 *
 * The candidates are tested in the order listed and the first match wins;
 * a type that matches none of them yields CV_8U.
 */
template <typename T>
class CtypeToCVMatType {
   public:
    // clang-format off
    static constexpr uchar type =
        (std::is_same<T, float>::value)       ? CV_32F :
        (std::is_same<T, double>::value)      ? CV_64F :
        (std::is_same<T, int32_t>::value)     ? CV_32S :
        (std::is_same<T, int16_t>::value)     ? CV_16S :
        (std::is_same<T, uint16_t>::value)    ? CV_16U :
        (std::is_same<T, int8_t>::value)      ? CV_8S  :
        (std::is_same<T, uint8_t>::value)     ? CV_8U  :
        (std::is_same<T, signed char>::value) ? CV_8S  :
                                                CV_8U;
    // clang-format on
};

// File-level state shared by deviceInit() and every xfcvDataMovers instance.
// These are `static`, i.e. each translation unit including this header gets
// its own copy.

// Device handle opened by deviceInit(); reset to nullptr by the data mover destructors.
static xrtDeviceHandle gpDhdl = nullptr;
// Raw bytes of the xclbin file read by deviceInit(); gpTop points into this buffer.
static std::vector<char> gHeader;
// View of gHeader as an axlf header; its uuid is used when opening PL kernels.
static const axlf* gpTop = nullptr;
// Running counts of constructed data movers; the PL variant uses them to build
// kernel instance names ("Tiler_top_<n>" / "stitcher_top_<n>").
static uint16_t gnTilerInstCount = 0;
static uint16_t gnStitcherInstCount = 0;

void deviceInit(const char* xclBin) {
    if (xclBin != nullptr) {
        if (gpDhdl == nullptr) {
            assert(gpTop == nullptr);

            gpDhdl = xrtDeviceOpen(0);
            if (gpDhdl == nullptr) {
                throw std::runtime_error("No valid device handle found. Make sure using right xclOpen index.");
            }

            std::ifstream stream(xclBin);
            stream.seekg(0, stream.end);
            size_t size = stream.tellg();
            stream.seekg(0, stream.beg);

            gHeader.resize(size);
            stream.read(gHeader.data(), size);

            gpTop = reinterpret_cast<const axlf*>(gHeader.data());
            if (xrtDeviceLoadXclbin(gpDhdl, gpTop)) {
                throw std::runtime_error("Xclbin loading failed");
            }

            adf::registerXRT(gpDhdl, gpTop->m_header.uuid);
        }
    }

    if (gpDhdl == nullptr) {
        throw std::runtime_error("No valid device handle found. Make sure using right xclOpen index.");
    }

    if (gpTop == nullptr) {
        throw std::runtime_error("Xclbin loading failed");
    }
}

template <DataMoverKind KIND,
          typename DATA_TYPE,
          int TILE_HEIGHT_MAX,
          int TILE_WIDTH_MAX,
          int AIE_VECTORIZATION_FACTOR,
          int CORES = 1,
          int PL_AXI_BITWIDTH = 32,
          bool USE_GMIO = false>
class xfcvDataMovers {
   private:
    uint16_t mOverlapH;
    uint16_t mOverlapV;
    uint16_t mTileRows;
    uint16_t mTileCols;
    bool mbUserHndl;

    cv::Mat* mpImage;
    std::array<uint16_t, 3> mImageSize;

    std::vector<smartTileMetaData> mMetaDataList;
    std::vector<EmulAxiData<PL_AXI_BITWIDTH> > mMetaDataVec;

    xrtBufferHandle mMetadataBOHndl;
    xrtBufferHandle mImageBOHndl;

    std::array<xrtKernelHandle, CORES> mPLKHandleArr;
    std::array<xrtRunHandle, CORES> mPLRHandleArr;

    int imgSize() { return (mImageSize[0] * mImageSize[1] * mImageSize[2]); }

    template <DataMoverKind _t = KIND, typename std::enable_if<(_t == TILER)>::type* = nullptr>
    int metadataSize() {
        return mMetaDataVec.size() * sizeof(EmulAxiData<PL_AXI_BITWIDTH>);
    }

    template <DataMoverKind _t = KIND, typename std::enable_if<(_t == STITCHER)>::type* = nullptr>
    int metadataSize() {
        return 0;
    }

    // Tiler copy {
    template <DataMoverKind _t = KIND, typename std::enable_if<(_t == TILER)>::type* = nullptr>
    void copy() {
        // Pack meta-data and image buffer in device buffer handle
        assert(mMetadataBOHndl);
        void* metadata_buffer = xrtBOMap(mMetadataBOHndl);
        memcpy(metadata_buffer, mMetaDataVec.data(), metadataSize());

        if (mbUserHndl == false) {
            assert(mpImage);
            assert(mImageBOHndl);
            void* buffer = xrtBOMap(mImageBOHndl);
            memcpy(buffer, mpImage->data, imgSize());
        }
    }
    //}

    // Stitcher copy {
    template <DataMoverKind _t = KIND, typename std::enable_if<(_t == STITCHER)>::type* = nullptr>
    void copy() {
        // No meta-data
        assert(mImageBOHndl);

        void* buffer = xrtBOMap(mImageBOHndl);
        if (mbUserHndl == false) {
            assert(mpImage);
            memcpy(mpImage->data, buffer, imgSize());
        } else {
            xrtBOSync(mImageBOHndl, XCL_BO_SYNC_BO_TO_DEVICE, imgSize(), 0);
        }
    }
    //}

    void free_metadata_buffer() {
        if (mMetadataBOHndl != nullptr) {
            xrtBOFree(mMetadataBOHndl);
        }
        mMetadataBOHndl = nullptr;
    }

    void alloc_metadata_buffer() {
        if (mMetadataBOHndl == nullptr) {
            assert(metadataSize() > 0);
            std::cout << "Allocating metadata device buffer (Tiler), "
                      << " Size : " << metadataSize() << " bytes" << std::endl;
            mMetadataBOHndl = xrtBOAlloc(gpDhdl, metadataSize(), 0, 0);
        }
    }

    void free_buffer() {
        if (mbUserHndl == false) {
            if (mImageBOHndl != nullptr) {
                xrtBOFree(mImageBOHndl);
            }
            mImageBOHndl = nullptr;
        }
    }

    void alloc_buffer() {
        if (mImageBOHndl == nullptr) {
            assert(imgSize() > 0);
            std::cout << "Allocating image device buffer (Tiler), "
                      << " Size : " << imgSize() << " bytes" << std::endl;
            mImageBOHndl = xrtBOAlloc(gpDhdl, imgSize(), 0, 0);
        }
    }

    template <DataMoverKind _t = KIND, typename std::enable_if<(_t == TILER)>::type* = nullptr>
    std::string krnl_inst_name(int n) {
        std::ostringstream ss;
        ss << "Tiler_top:{Tiler_top_" << n << "}";
        return ss.str();
    }

    template <DataMoverKind _t = KIND, typename std::enable_if<(_t == STITCHER)>::type* = nullptr>
    std::string krnl_inst_name(int n) {
        std::ostringstream ss;
        ss << "stitcher_top:{stitcher_top_" << n << "}";
        return ss.str();
    }

    void load_krnl() {
        for (int i = 0; i < CORES; i++) {
            std::string name =
                (KIND == TILER) ? krnl_inst_name(++gnTilerInstCount) : krnl_inst_name(++gnStitcherInstCount);
            std::cout << "Loading kernel " << name.c_str() << std::endl;
            mPLKHandleArr[i] = xrtPLKernelOpen(gpDhdl, gpTop->m_header.uuid, name.c_str());
            mPLRHandleArr[i] = xrtRunOpen(mPLKHandleArr[i]);
        }
    }

    template <DataMoverKind _t = KIND, typename std::enable_if<(_t == TILER)>::type* = nullptr>
    void setArgs() {
        std::cout << "Setting kernel args (Tiler) ..." << std::endl;
        for (int i = 0; i < CORES; i++) {
            (void)xrtRunSetArg(mPLRHandleArr[i], 1, mMetadataBOHndl);
            (void)xrtRunSetArg(mPLRHandleArr[i], 2, mImageBOHndl);
            (void)xrtRunSetArg(mPLRHandleArr[i], 3, mTileRows);
            (void)xrtRunSetArg(mPLRHandleArr[i], 4, mTileCols);
            (void)xrtRunSetArg(mPLRHandleArr[i], 5, 1);
            (void)xrtRunSetArg(mPLRHandleArr[i], 6, mImageSize[1]);
        }
    }

    template <DataMoverKind _t = KIND, typename std::enable_if<(_t == STITCHER)>::type* = nullptr>
    void setArgs() {
        std::cout << "Setting kernel args (Stitcher) ..." << std::endl;
        for (int i = 0; i < CORES; i++) {
            (void)xrtRunSetArg(mPLRHandleArr[i], 1, mImageBOHndl);
            (void)xrtRunSetArg(mPLRHandleArr[i], 2, mTileRows);
            (void)xrtRunSetArg(mPLRHandleArr[i], 3, mTileCols);
            (void)xrtRunSetArg(mPLRHandleArr[i], 4, 1);
            (void)xrtRunSetArg(mPLRHandleArr[i], 5, mImageSize[1]);
            (void)xrtRunSetArg(mPLRHandleArr[i], 6, mImageSize[0]);
        }
    }

   public:
    void start() {
        for (auto& r : mPLRHandleArr) {
            xrtRunStart(r);
        }
    }

    template <DataMoverKind _t = KIND, typename std::enable_if<(_t == TILER)>::type* = nullptr>
    xfcvDataMovers(uint16_t overlapH, uint16_t overlapV) {
        if (gpDhdl == nullptr) {
            throw std::runtime_error("No valid device handle found. Make sure using xF::deviceInit(...) is called.");
        }

        mpImage = nullptr;
        mImageSize = {0, 0, 0};

        // Initialize overlaps
        mOverlapH = overlapH;
        mOverlapV = overlapV;

        mTileRows = 0;
        mTileCols = 0;
        mbUserHndl = false;

        mMetadataBOHndl = nullptr;
        mImageBOHndl = nullptr;

        // Load the PL kernel
        load_krnl();
    }

    template <DataMoverKind _t = KIND, typename std::enable_if<(_t == STITCHER)>::type* = nullptr>
    xfcvDataMovers() {
        if (gpDhdl == nullptr) {
            throw std::runtime_error("No valid device handle found. Make sure using xF::deviceInit(...) is called.");
        }

        mpImage = nullptr;
        mImageSize = {0, 0, 0};

        // Initialize overlaps
        mOverlapH = 0;
        mOverlapV = 0;

        mTileRows = 0;
        mTileCols = 0;
        mbUserHndl = false;

        mMetadataBOHndl = nullptr;
        mImageBOHndl = nullptr;

        // Load the PL kernel
        load_krnl();
    }

    // Non copyable {
    xfcvDataMovers(const xfcvDataMovers&) = delete;
    xfcvDataMovers& operator=(const xfcvDataMovers&) = delete;
    //}

    // Close / free operations tp be done here {
    ~xfcvDataMovers() {
        free_buffer();
        free_metadata_buffer();

        for (auto& r : mPLRHandleArr) {
            xrtRunClose(r);
        }
        for (auto& r : mPLKHandleArr) {
            xrtKernelClose(r);
        }
        if (gpDhdl != nullptr) {
            xrtDeviceClose(gpDhdl);
            gpDhdl = nullptr;
        }
    }
    //}

    void compute_metadata(const cv::Size& img_size);

    // Theese functions will start the data transfer protocol {
    template <DataMoverKind _t = KIND, typename std::enable_if<(_t == TILER)>::type* = nullptr>
    std::array<uint16_t, 2> host2aie_nb(cv::Mat& img, xrtBufferHandle imgHndl = nullptr) {
        assert(sizeof(DATA_TYPE) >= img.elemSize());

        int old_metadata_buffer_size = metadataSize();
        int old_img_buffer_size = imgSize();

        bool bRecompute = false;
        if ((mImageSize[0] != img.rows) || (mImageSize[1] != img.cols)) {
            bRecompute = true;
        }

        mpImage = &img;
        mImageSize = {(uint16_t)img.rows, (uint16_t)img.cols, (uint16_t)img.elemSize()};
        if (bRecompute == true) {
            // Pack metadata
            compute_metadata(img.size());
        }

        int new_metadata_buffer_size = metadataSize();
        int new_img_buffer_size = imgSize();

        if (new_metadata_buffer_size > old_metadata_buffer_size) {
            free_metadata_buffer();
        }

        if ((new_img_buffer_size > old_img_buffer_size) || (imgHndl != nullptr)) {
            free_buffer();
        }

        mbUserHndl = (imgHndl != nullptr);
        if (mbUserHndl) mImageBOHndl = imgHndl;

        // Allocate buffer
        alloc_metadata_buffer();
        alloc_buffer();

        // Copy input data to device buffer
        copy();

        // Set args
        setArgs();

        // Start the kernel
        start();

        std::array<uint16_t, 2> ret = {mTileRows, mTileCols};
        return ret;
    }

    template <DataMoverKind _t = KIND, typename std::enable_if<(_t == TILER)>::type* = nullptr>
    std::array<uint16_t, 2> host2aie_nb(xrtBufferHandle imgHndl, const cv::Size& size) {
        cv::Mat img(size, CV_8UC1); // This image is redundant in case a handle is passed
        return host2aie_nb(img, imgHndl);
    }

    template <DataMoverKind _t = KIND, typename std::enable_if<(_t == STITCHER)>::type* = nullptr>
    void aie2host_nb(cv::Mat& img, std::array<uint16_t, 2> tiles, xrtBufferHandle imgHndl = nullptr) {
        assert(sizeof(DATA_TYPE) >= img.elemSize());

        int old_img_buffer_size = imgSize();

        mpImage = &img;
        mImageSize = {(uint16_t)img.rows, (uint16_t)img.cols, (uint16_t)img.elemSize()};
        mTileRows = tiles[0];
        mTileCols = tiles[1];

        int new_img_buffer_size = imgSize();
        if ((new_img_buffer_size > old_img_buffer_size) || (imgHndl != nullptr)) {
            free_buffer();
        }

        mbUserHndl = (imgHndl != nullptr);
        if (mbUserHndl) mImageBOHndl = imgHndl;

        // Allocate buffer
        alloc_buffer();

        // Set args
        setArgs();

        // Start the kernel
        start();
    }

    template <DataMoverKind _t = KIND, typename std::enable_if<(_t == STITCHER)>::type* = nullptr>
    void aie2host_nb(xrtBufferHandle imgHndl, const cv::Size& size, std::array<uint16_t, 2> tiles) {
        cv::Mat img(size, CV_8UC1); // This image is redundant in case a handle is passed
        aie2host_nb(img, tiles, imgHndl);
    }
    //}

    template <DataMoverKind _t = KIND, typename std::enable_if<(_t == TILER)>::type* = nullptr>
    void wait() {
        for (auto& r : mPLRHandleArr) {
            (void)xrtRunWait(r);
        }
    }

    template <DataMoverKind _t = KIND, typename std::enable_if<(_t == STITCHER)>::type* = nullptr>
    void wait() {
        for (auto& r : mPLRHandleArr) {
            (void)xrtRunWait(r);
        }

        // Copy data from device buffer to host
        copy();
    }
};

/**
 * Regenerates the tile layout for an image of size `img_size` and packs it
 * into mMetaDataVec, 16 words per tile, in the order the loop below emits.
 *
 * Updates mImageSize (rows, cols), mTileRows, mTileCols, mMetaDataList and
 * mMetaDataVec, and prints the resulting tile geometry to stdout.
 *
 * @throws std::runtime_error if the generator produced no tile at all.
 */
template <DataMoverKind KIND,
          typename DATA_TYPE,
          int TILE_HEIGHT_MAX,
          int TILE_WIDTH_MAX,
          int AIE_VECTORIZATION_FACTOR,
          int CORES,
          int PL_AXI_BITWIDTH,
          bool USE_GMIO>
void xfcvDataMovers<KIND,
                    DATA_TYPE,
                    TILE_HEIGHT_MAX,
                    TILE_WIDTH_MAX,
                    AIE_VECTORIZATION_FACTOR,
                    CORES,
                    PL_AXI_BITWIDTH,
                    USE_GMIO>::compute_metadata(const cv::Size& img_size) {
    mMetaDataList.clear();
    mMetaDataVec.clear();
    mImageSize[0] = (uint16_t)img_size.height;
    mImageSize[1] = (uint16_t)img_size.width;
    smartTileTilerGenerateMetaDataWithSpecifiedTileSize(
        {mImageSize[0], mImageSize[1]}, mMetaDataList, mTileRows, mTileCols, {TILE_HEIGHT_MAX, TILE_WIDTH_MAX},
        {mOverlapH, mOverlapH}, {mOverlapV, mOverlapV}, AIE_VECTORIZATION_FACTOR, true);

    if (mMetaDataList.empty()) {
        // Nothing to report or pack; reading element 0 would be undefined.
        // Forget the size so that a retry with the same image recomputes.
        mImageSize[0] = 0;
        mImageSize[1] = 0;
        mTileRows = 0;
        mTileCols = 0;
        throw std::runtime_error("Tile metadata generation produced no tiles");
    }

    char sMesg[2048];
    std::snprintf(sMesg, sizeof(sMesg),
                  "Requested tile size (%d,%d). Computed tile size (%d,%d). Number of tiles (%d,%d)\n",
                  TILE_HEIGHT_MAX, TILE_WIDTH_MAX, (int)mMetaDataList[0].tileHeight(),
                  (int)mMetaDataList[0].tileWidth(), (int)mTileRows, (int)mTileCols);
    std::cout << sMesg << std::endl;

    // 16 words are emitted per tile.
    mMetaDataVec.reserve(mMetaDataList.size() * 16);
    for (auto& metaData : mMetaDataList) {
        mMetaDataVec.emplace_back((int16_t)metaData.tileWidth());
        mMetaDataVec.emplace_back((int16_t)metaData.tileHeight());
        mMetaDataVec.emplace_back((int16_t)metaData.positionH());
        mMetaDataVec.emplace_back((int16_t)metaData.positionV());
        mMetaDataVec.emplace_back((int16_t)metaData.overlapSizeH_left());
        mMetaDataVec.emplace_back((int16_t)metaData.overlapSizeH_right());
        mMetaDataVec.emplace_back((int16_t)metaData.overlapSizeV_top());
        mMetaDataVec.emplace_back((int16_t)metaData.overlapSizeV_bottom());
        mMetaDataVec.emplace_back((int16_t)16); // BIT_WIDTH
        mMetaDataVec.emplace_back((int16_t)0);  // DUP_WIDTH
        mMetaDataVec.emplace_back((int16_t)0);  // DUP_HEIGHT
        // Position and size of the tile once its overlap borders are removed.
        mMetaDataVec.emplace_back((int16_t)(metaData.positionH() + metaData.overlapSizeH_left()));
        mMetaDataVec.emplace_back((int16_t)(metaData.positionV() + metaData.overlapSizeV_top()));
        mMetaDataVec.emplace_back(
            (int16_t)(metaData.tileWidth() - (metaData.overlapSizeH_left() + metaData.overlapSizeH_right())));
        mMetaDataVec.emplace_back(
            (int16_t)(metaData.tileHeight() - (metaData.overlapSizeV_top() + metaData.overlapSizeV_bottom())));
        mMetaDataVec.emplace_back((int16_t)1); // Enable saturation, 1: 8U, 2: 8S
    }
}

template <DataMoverKind KIND,
          typename DATA_TYPE,
          int TILE_HEIGHT_MAX,
          int TILE_WIDTH_MAX,
          int AIE_VECTORIZATION_FACTOR,
          int CORES>
class xfcvDataMovers<KIND, DATA_TYPE, TILE_HEIGHT_MAX, TILE_WIDTH_MAX, AIE_VECTORIZATION_FACTOR, CORES, 0, true> {
    // using DataCopyF_t = std::function<int(DATA_TYPE*, DATA_TYPE*,
    // std::vector<int>&, int, int)>;

   private:
    uint16_t mOverlapH;
    uint16_t mOverlapV;
    uint16_t mTileRows;
    uint16_t mTileCols;
    bool mbUserHndl;

    cv::Mat* mpImage;
    DATA_TYPE* mpImgData;
    std::array<uint16_t, 3> mImageSize; // Rows, Cols, Elem Size

    std::vector<smartTileMetaData> mMetaDataList;

    xrtBufferHandle mImageBOHndl;

    //    DataCopyF_t mTileDataCopy;

    int imgSize() { return (mImageSize[0] * mImageSize[1] * mImageSize[2]); }

    int tileWindowSize() { return ((SMARTTILE_ELEMENTS + (TILE_HEIGHT_MAX * TILE_WIDTH_MAX))); }

    int tileImgSize() { return (sizeof(DATA_TYPE) * tileWindowSize() * (mTileRows * mTileCols)); }

    int bufferSizePerCore() { return (sizeof(DATA_TYPE) * tileWindowSize() * ((mTileRows * mTileCols) / CORES)); }

    // Helper function for Tiler copy {
    template <DataMoverKind _t = KIND, typename std::enable_if<(_t == TILER)>::type* = nullptr>
    void input_copy(uint16_t startInd, uint16_t endInd) {
        assert(mpImgData);

        DATA_TYPE* buffer = (DATA_TYPE*)xrtBOMap(mImageBOHndl);
        int tileSize = tileWindowSize();
        for (int t = startInd; t < endInd; t++) {
            for (int j = 0; j < SMARTTILE_ELEMENTS; j++) buffer[t * tileSize + j] = 0;

            int16_t tileWidth = mMetaDataList[t].tileWidth();
            int16_t tileHeight = mMetaDataList[t].tileHeight();
            int16_t positionH = mMetaDataList[t].positionH();
            int16_t positionV = mMetaDataList[t].positionV();
            buffer[t * tileSize + 0] = (DATA_TYPE)mMetaDataList[t].tileWidth();
            buffer[t * tileSize + 4] = (DATA_TYPE)mMetaDataList[t].tileHeight();
            buffer[t * tileSize + 8] = (DATA_TYPE)mMetaDataList[t].positionH();
            buffer[t * tileSize + 12] = (DATA_TYPE)mMetaDataList[t].positionV();
            buffer[t * tileSize + 16] = (DATA_TYPE)mMetaDataList[t].overlapSizeH_left();
            buffer[t * tileSize + 20] = (DATA_TYPE)mMetaDataList[t].overlapSizeH_right();
            buffer[t * tileSize + 24] = (DATA_TYPE)mMetaDataList[t].overlapSizeV_top();
            buffer[t * tileSize + 28] = (DATA_TYPE)mMetaDataList[t].overlapSizeV_bottom();

            for (int ti = 0; ti < tileHeight; ti++) {
                memcpy(buffer + (t * tileSize + SMARTTILE_ELEMENTS + (ti * tileWidth)),
                       mpImgData + (((positionV + ti) * mImageSize[1]) + positionH), tileWidth * sizeof(DATA_TYPE));
            }
        }
    }
    // }

    // Tiler copy {
    template <DataMoverKind _t = KIND, typename std::enable_if<(_t == TILER)>::type* = nullptr>
    void copy() {
        assert(mpImgData);

        uint16_t numThreads = std::thread::hardware_concurrency();

        std::thread mCopyThreads[numThreads];

        uint16_t tilesPerThread = (mTileRows * mTileCols) / numThreads;
        for (int i = 0; i < numThreads; i++) {
            uint16_t startInd = i * tilesPerThread;
            uint16_t endInd = (i == numThreads - 1) ? (mTileRows * mTileCols) : ((i + 1) * tilesPerThread);

            mCopyThreads[i] = std::thread(&xfcvDataMovers::input_copy, this, startInd, endInd);
        }
        for (int i = 0; i < numThreads; i++) {
            mCopyThreads[i].join();
        }
    }
    //}

    // Helper function for stitcher copy {
    template <DataMoverKind _t = KIND, typename std::enable_if<(_t == STITCHER)>::type* = nullptr>
    void output_copy(uint16_t startInd, uint16_t endInd) {
        assert(mpImgData != nullptr);

        DATA_TYPE* buffer = (DATA_TYPE*)xrtBOMap(mImageBOHndl);

        int tileSize = tileWindowSize();
        for (int t = startInd; t < endInd; t++) {
            int16_t tileWidth = (int16_t)buffer[t * tileSize + 0];
            int16_t tileHeight = (int16_t)buffer[t * tileSize + 4];
            int16_t positionH = (int16_t)buffer[t * tileSize + 8];
            int16_t positionV = (int16_t)buffer[t * tileSize + 12];
            int16_t overlapSizeH_left = (int16_t)buffer[t * tileSize + 16];
            int16_t overlapSizeH_right = (int16_t)buffer[t * tileSize + 20];
            int16_t overlapSizeV_top = (int16_t)buffer[t * tileSize + 24];
            int16_t overlapSizeV_bottom = (int16_t)buffer[t * tileSize + 28];

            int16_t correctedPositionH = positionH + overlapSizeH_left;
            int16_t correctedPositionV = positionV + overlapSizeV_top;
            int16_t correctedTileWidth = TILE_WIDTH_MAX - (overlapSizeH_left + overlapSizeH_right);
            int16_t correctedTileHeight = TILE_HEIGHT_MAX - (overlapSizeV_top + overlapSizeV_bottom);

            for (int ti = 0; ti < correctedTileHeight; ti++) {
                memcpy(mpImgData + (((correctedPositionV + ti) * mImageSize[1]) + correctedPositionH),
                       buffer + ((t * tileSize) + SMARTTILE_ELEMENTS + ((overlapSizeV_top + ti) * TILE_WIDTH_MAX) +
                                 overlapSizeH_left),
                       correctedTileWidth * sizeof(DATA_TYPE));
            }
        }
    }
    //}

    // Stitcher copy {
    template <DataMoverKind _t = KIND, typename std::enable_if<(_t == STITCHER)>::type* = nullptr>
    void copy() {
        assert(mpImgData != nullptr);

        uint16_t numThreads = std::thread::hardware_concurrency();

        std::thread mCopyThreads[numThreads];

        uint16_t tilesPerThread = (mTileRows * mTileCols) / numThreads;
        for (int i = 0; i < numThreads; i++) {
            uint16_t startInd = i * tilesPerThread;
            uint16_t endInd = (i == numThreads - 1) ? (mTileRows * mTileCols) : ((i + 1) * tilesPerThread);
            mCopyThreads[i] = std::thread(&xfcvDataMovers::output_copy, this, startInd, endInd);
        }
        for (int i = 0; i < numThreads; i++) {
            mCopyThreads[i].join();
        }
    }
    //}

    void free_buffer() {
        if (mImageBOHndl != nullptr) {
            xrtBOFree(mImageBOHndl);
        }
        mImageBOHndl = nullptr;
    }

    template <DataMoverKind _t = KIND, typename std::enable_if<(_t == TILER)>::type* = nullptr>
    void alloc_buffer() {
        if (mImageBOHndl == nullptr) {
            assert(tileImgSize() > 0);
            std::cout << "Allocating image device buffer (Tiler), "
                      << " Size : " << tileImgSize() << " bytes" << std::endl;
            //    mImageBOHndl = xrtBOAlloc(gpDhdl, tileImgSize(), XRT_BO_FLAGS_CACHEABLE, 0);
            mImageBOHndl = xrtBOAlloc(gpDhdl, tileImgSize(), 0, 0);
        }
    }

    template <DataMoverKind _t = KIND, typename std::enable_if<(_t == STITCHER)>::type* = nullptr>
    void alloc_buffer() {
        if (mImageBOHndl == nullptr) {
            assert(tileImgSize() > 0);
            std::cout << "Allocating image device buffer (Stitcher), "
                      << " Size : " << tileImgSize() << " bytes" << std::endl;
            mImageBOHndl = xrtBOAlloc(gpDhdl, tileImgSize(), XRT_BO_FLAGS_CACHEABLE, 0);
        }
    }

    void regTilerStitcherCount() {
        if (KIND == TILER)
            ++gnTilerInstCount;
        else
            ++gnStitcherInstCount;
    }

   public:
    template <DataMoverKind _t = KIND, typename std::enable_if<(_t == TILER)>::type* = nullptr>
    void start(std::array<std::string, CORES> portNames) {
        for (int i = 0; i < CORES; i++) {
            xrtBOSync(mImageBOHndl + i * bufferSizePerCore(), XCL_BO_SYNC_BO_TO_DEVICE, bufferSizePerCore(), 0);
            xrtSyncBOAIENB(gpDhdl, mImageBOHndl + i * bufferSizePerCore(), portNames[i].c_str(),
                           XCL_BO_SYNC_BO_GMIO_TO_AIE, bufferSizePerCore(), 0);
        }
    }

    template <DataMoverKind _t = KIND, typename std::enable_if<(_t == TILER)>::type* = nullptr>
    void wait(std::array<std::string, CORES> portNames) {
        for (int i = 0; i < CORES; i++) {
            xrtGMIOWait(gpDhdl, portNames[i].c_str());
        }
    }

    template <DataMoverKind _t = KIND, typename std::enable_if<(_t == STITCHER)>::type* = nullptr>
    void start(std::array<std::string, CORES> portNames) {
        for (int i = 0; i < CORES; i++) {
            xrtSyncBOAIENB(gpDhdl, mImageBOHndl + i * bufferSizePerCore(), portNames[i].c_str(),
                           XCL_BO_SYNC_BO_AIE_TO_GMIO, bufferSizePerCore(), 0);
        }
    }

    template <DataMoverKind _t = KIND, typename std::enable_if<(_t == STITCHER)>::type* = nullptr>
    void wait(std::array<std::string, CORES> portNames) {
        for (int i = 0; i < CORES; i++) {
            xrtGMIOWait(gpDhdl, portNames[i].c_str());
            xrtBOSync(mImageBOHndl + i * bufferSizePerCore(), XCL_BO_SYNC_BO_FROM_DEVICE, bufferSizePerCore(), 0);
        }

        // Copy data from device buffer to host
        copy();

        CtypeToCVMatType<DATA_TYPE> type;
        if (mpImage != nullptr) {
            cv::Mat dst(mImageSize[0], mImageSize[1], type.type, mpImgData);

            // TODO: saturation to be done based on the mat type ???
            if (mpImage->type() == CV_8U) {
                // Saturate the output values to [0,255]
                dst = cv::max(dst, 0);
                dst = cv::min(dst, 255);
            }
            dst.convertTo(*mpImage, mpImage->type());
        }
        mpImage = nullptr;
    }

    // Initialization / device buffer allocation / tile header copy / type
    // conversion to be done in constructor {
    template <DataMoverKind _t = KIND, typename std::enable_if<(_t == TILER)>::type* = nullptr>
    xfcvDataMovers(uint16_t overlapH, uint16_t overlapV) {
        if (gpDhdl == nullptr) {
            throw std::runtime_error("No valid device handle found. Make sure using xF::deviceInit(...) is called.");
        }

        mpImgData = nullptr;
        mImageSize = {0, 0, 0};

        // Initialize overlaps
        mOverlapH = overlapH;
        mOverlapV = overlapV;

        mTileRows = 0;
        mTileCols = 0;
        mbUserHndl = false;

        mImageBOHndl = nullptr;

        // Register the count of tiler/stitcher objects
        regTilerStitcherCount();
    }
    //}

    template <DataMoverKind _t = KIND, typename std::enable_if<(_t == STITCHER)>::type* = nullptr>
    xfcvDataMovers() {
        if (gpDhdl == nullptr) {
            throw std::runtime_error("No valid device handle found. Make sure using xF::deviceInit(...) is called.");
        }

        mpImage = nullptr;
        mpImgData = nullptr;
        mImageSize = {0, 0, 0};

        // Initialize overlaps
        mOverlapH = 0;
        mOverlapV = 0;

        mTileRows = 0;
        mTileCols = 0;
        mbUserHndl = false;

        mImageBOHndl = nullptr;

        // Register the count of tiler/stitcher objects
        regTilerStitcherCount();
    }

    // Non copyable {
    xfcvDataMovers(const xfcvDataMovers&) = delete;
    xfcvDataMovers& operator=(const xfcvDataMovers&) = delete;
    //}

    //    void setTileCopyFn(DataCopyF_t& fn);

    // Close / free operations tp be done here {
    ~xfcvDataMovers() {
        free_buffer();
        if (gpDhdl != nullptr) {
            xrtDeviceClose(gpDhdl);
        }
        gpDhdl = nullptr;
    }
    //}

    void compute_metadata(const cv::Size& img_size);

    // These functions will start the data transfer protocol {
    template <DataMoverKind _t = KIND, typename std::enable_if<(_t == TILER)>::type* = nullptr>
    std::array<uint16_t, 2> host2aie_nb(DATA_TYPE* img_data,
                                        const cv::Size& img_size,
                                        std::array<std::string, CORES> portNames) {
        int old_img_buffer_size = imgSize();

        bool bRecompute = false;
        if ((mImageSize[0] != img_size.height) || (mImageSize[1] != img_size.width)) {
            bRecompute = true;
        }

        mpImgData = (DATA_TYPE*)img_data;
        mImageSize = {(uint16_t)img_size.height, (uint16_t)img_size.width, (uint16_t)sizeof(DATA_TYPE)};

        if (bRecompute == true) {
            // Pack metadata
            compute_metadata(img_size);
        }

        int new_img_buffer_size = imgSize();

        if ((new_img_buffer_size > old_img_buffer_size)) {
            free_buffer();
        }

        // Allocate buffer
        alloc_buffer();

        // Copy input data to device buffer
        copy();

        // Start the data transfers
        start(portNames);

        std::array<uint16_t, 2> ret = {mTileRows, mTileCols};

        return ret;
    }

    template <DataMoverKind _t = KIND, typename std::enable_if<(_t == TILER)>::type* = nullptr>
    std::array<uint16_t, 2> host2aie_nb(cv::Mat& img, std::array<std::string, CORES> portNames) {
        CtypeToCVMatType<DATA_TYPE> cType;

        if (cType.type == img.type()) {
            return host2aie_nb((DATA_TYPE*)img.data, img.size(), portNames);
        } else if (cType.type < img.type()) {
            cv::Mat temp;
            img.convertTo(temp, cType.type);
            return host2aie_nb((DATA_TYPE*)temp.data, img.size(), portNames);
        } else {
            std::vector<DATA_TYPE> imgData;
            imgData.assign(img.data, img.data + img.total());
            return host2aie_nb(imgData.data(), img.size(), portNames);
        }
    }

    template <DataMoverKind _t = KIND, typename std::enable_if<(_t == STITCHER)>::type* = nullptr>
    void aie2host_nb(DATA_TYPE* img_data,
                     const cv::Size& img_size,
                     std::array<uint16_t, 2> tiles,
                     std::array<std::string, CORES> portNames) {
        int old_img_buffer_size = imgSize();

        mpImgData = (DATA_TYPE*)img_data;
        mImageSize = {(uint16_t)img_size.height, (uint16_t)img_size.width, sizeof(DATA_TYPE)};

        mTileRows = tiles[0];
        mTileCols = tiles[1];

        int new_img_buffer_size = imgSize();
        if ((new_img_buffer_size > old_img_buffer_size)) {
            free_buffer();
        }

        // Allocate buffer
        alloc_buffer();

        // Start the kernel
        start(portNames);
    }

    template <DataMoverKind _t = KIND, typename std::enable_if<(_t == STITCHER)>::type* = nullptr>
    void aie2host_nb(cv::Mat& img, std::array<uint16_t, 2> tiles, std::array<std::string, CORES> portNames) {
        mpImage = &img;

        CtypeToCVMatType<DATA_TYPE> cType;

        if (cType.type == img.type()) {
            return aie2host_nb((DATA_TYPE*)img.data, img.size(), tiles, portNames);
        }

        DATA_TYPE* imgData = (DATA_TYPE*)malloc(img.size().height * img.size().width * sizeof(DATA_TYPE));

        aie2host_nb(imgData, img.size(), tiles, portNames);
    }

    template <DataMoverKind _t = KIND, typename std::enable_if<(_t == TILER)>::type* = nullptr>
    std::array<uint16_t, 2> host2aie(cv::Mat& img, std::array<std::string, CORES> portNames) {
        std::array<uint16_t, 2> ret = host2aie_nb(img, portNames);

        wait(portNames);

        return ret;
    }

    template <DataMoverKind _t = KIND, typename std::enable_if<(_t == TILER)>::type* = nullptr>
    std::array<uint16_t, 2> host2aie(DATA_TYPE* img_data,
                                     const cv::Size& img_size,
                                     std::array<std::string, CORES> portNames) {
        std::array<uint16_t, 2> ret = host2aie_nb(img_data, img_size, portNames);

        wait(portNames);

        return ret;
    }

    template <DataMoverKind _t = KIND, typename std::enable_if<(_t == STITCHER)>::type* = nullptr>
    void aie2host(cv::Mat& img, std::array<uint16_t, 2> tiles, std::array<std::string, CORES> portNames) {
        aie2host_nb(img, tiles, portNames);

        wait(portNames);
    }

    template <DataMoverKind _t = KIND, typename std::enable_if<(_t == STITCHER)>::type* = nullptr>
    void aie2host(DATA_TYPE* img_data,
                  const cv::Size& img_size,
                  std::array<uint16_t, 2> tiles,
                  std::array<std::string, CORES> portNames) {
        aie2host_nb(img_data, img_size, tiles, portNames);

        wait(portNames);
    }

    //}
};

/*
template <DataMoverKind KIND,
          typename DATA_TYPE,
          int TILE_HEIGHT_MAX,
          int TILE_WIDTH_MAX,
          int AIE_VECTORIZATION_FACTOR,
          int CORES>
void xfcvDataMovers<KIND, DATA_TYPE, TILE_HEIGHT_MAX, TILE_WIDTH_MAX,
AIE_VECTORIZATION_FACTOR, CORES, 0, true>::
    setTileCopyFn(DataCopyF_t& fn) {}
*/
/**
 * GMIO variant: regenerates the tile layout for an image of size `img_size`.
 *
 * Updates mImageSize (rows, cols), mTileRows, mTileCols and mMetaDataList,
 * and prints the resulting tile geometry to stdout.
 *
 * @throws std::runtime_error if the generator produced no tile at all.
 */
template <DataMoverKind KIND,
          typename DATA_TYPE,
          int TILE_HEIGHT_MAX,
          int TILE_WIDTH_MAX,
          int AIE_VECTORIZATION_FACTOR,
          int CORES>
void xfcvDataMovers<KIND, DATA_TYPE, TILE_HEIGHT_MAX, TILE_WIDTH_MAX, AIE_VECTORIZATION_FACTOR, CORES, 0, true>::
    compute_metadata(const cv::Size& img_size) {
    mMetaDataList.clear();
    mImageSize[0] = (uint16_t)img_size.height;
    mImageSize[1] = (uint16_t)img_size.width;

    smartTileTilerGenerateMetaDataWithSpecifiedTileSize(
        {mImageSize[0], mImageSize[1]}, mMetaDataList, mTileRows, mTileCols, {TILE_HEIGHT_MAX, TILE_WIDTH_MAX},
        {mOverlapH, mOverlapH}, {mOverlapV, mOverlapV}, AIE_VECTORIZATION_FACTOR, true);

    if (mMetaDataList.empty()) {
        // Nothing to report; reading element 0 would be undefined.
        // Forget the size so that a retry with the same image recomputes.
        mImageSize[0] = 0;
        mImageSize[1] = 0;
        mTileRows = 0;
        mTileCols = 0;
        throw std::runtime_error("Tile metadata generation produced no tiles");
    }

    char sMesg[2048];
    std::snprintf(sMesg, sizeof(sMesg),
                  "Requested tile size (%d,%d). Computed tile size (%d,%d). Number of tiles (%d,%d)\n",
                  TILE_HEIGHT_MAX, TILE_WIDTH_MAX, (int)mMetaDataList[0].tileHeight(),
                  (int)mMetaDataList[0].tileWidth(), (int)mTileRows, (int)mTileCols);
    std::cout << sMesg << std::endl;
}

} // xF

#endif