Program Listing for File xf_kalmanfilter.hpp

Return to documentation for file (/tmp/ws/src/vitis_common/include/video/xf_kalmanfilter.hpp)

/*
 * Copyright 2019 Xilinx, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef _XF_KALMANFILTER_HPP_
#define _XF_KALMANFILTER_HPP_

#define DEBUG 0
#include "common/xf_common.hpp"
#include "ap_int.h"

namespace xf {
namespace cv {
/**
 * @brief Dot product of two PROC-element float vectors.
 *
 * The PROC element-wise products are formed first and then summed by a
 * pairwise adder ladder of up to eight levels. Each level adds neighbouring
 * entries of the previous level; an odd leftover entry is forwarded
 * unchanged. The pairing order is fixed by PROC, so the floating-point
 * rounding is the same on every call.
 *
 * @tparam PROC    Number of elements per operand. Must lie in [1, 256]: eight
 *                 halving levels cannot reduce more than 256 values to one.
 * @param dot_in1  First operand (PROC elements).
 * @param dot_in2  Second operand (PROC elements).
 * @return Sum over i of dot_in1[i] * dot_in2[i].
 */
template <int PROC>
float KF_dotProduct(float dot_in1[PROC], float dot_in2[PROC]) {
// clang-format off
    #pragma HLS ARRAY_PARTITION variable=dot_in1 complete dim=1
    #pragma HLS ARRAY_PARTITION variable=dot_in2 complete dim=1
    #pragma HLS inline off
    // clang-format on

    // With more than 256 inputs no level ever shrinks to a single entry and
    // the result selection below would fall through to mul_out[0].
    static_assert(PROC >= 1 && PROC <= 256, "KF_dotProduct: PROC must be in the range [1, 256]");

    enum {
        // Number of entries produced by each level (0 once the sum is complete).
        TA_L1 = (PROC / 2 + (((PROC % 2) != 0) & (PROC != 1))),
        TA_L2 = (TA_L1 / 2 + (((TA_L1 % 2) != 0) & (TA_L1 != 1))),
        TA_L3 = (TA_L2 / 2 + (((TA_L2 % 2) != 0) & (TA_L2 != 1))),
        TA_L4 = (TA_L3 / 2 + (((TA_L3 % 2) != 0) & (TA_L3 != 1))),
        TA_L5 = (TA_L4 / 2 + (((TA_L4 % 2) != 0) & (TA_L4 != 1))),
        TA_L6 = (TA_L5 / 2 + (((TA_L5 % 2) != 0) & (TA_L5 != 1))),
        TA_L7 = (TA_L6 / 2 + (((TA_L6 % 2) != 0) & (TA_L6 != 1))),
        TA_L8 = (TA_L7 / 2 + (((TA_L7 % 2) != 0) & (TA_L7 != 1))),

        // Storage length of each level: never zero, so unused levels do not
        // rely on the zero-length-array compiler extension.
        TA_BUF1 = (TA_L1 > 0) ? int(TA_L1) : 1,
        TA_BUF2 = (TA_L2 > 0) ? int(TA_L2) : 1,
        TA_BUF3 = (TA_L3 > 0) ? int(TA_L3) : 1,
        TA_BUF4 = (TA_L4 > 0) ? int(TA_L4) : 1,
        TA_BUF5 = (TA_L5 > 0) ? int(TA_L5) : 1,
        TA_BUF6 = (TA_L6 > 0) ? int(TA_L6) : 1,
        TA_BUF7 = (TA_L7 > 0) ? int(TA_L7) : 1,
        TA_BUF8 = (TA_L8 > 0) ? int(TA_L8) : 1
    };

    // Element-wise products.
    float mul_out[PROC];
    for (int idx = 0; idx < PROC; idx++) {
// clang-format off
        #pragma HLS unroll
        // clang-format on
        mul_out[idx] = dot_in1[idx] * dot_in2[idx];
    }

    float add1_out[TA_BUF1];
    float add2_out[TA_BUF2];
    float add3_out[TA_BUF3];
    float add4_out[TA_BUF4];
    float add5_out[TA_BUF5];
    float add6_out[TA_BUF6];
    float add7_out[TA_BUF7];
    float add8_out[TA_BUF8];

    // Level 1: pairwise sums of the products.
    if (TA_L1 != 0) {
        for (int idx = 0; idx < TA_L1; idx++) {
// clang-format off
            #pragma HLS unroll
            // clang-format on
            // The last entry of an odd-length input has no partner: forward it.
            if (idx == TA_L1 - 1 && PROC % 2 == 1)
                add1_out[idx] = mul_out[2 * idx];
            else
                add1_out[idx] = mul_out[2 * idx] + mul_out[2 * idx + 1];
        }
    }

    // Levels 2..8 repeat the same pattern on the previous level's output.
    if (TA_L2 != 0) {
        for (int idx = 0; idx < TA_L2; idx++) {
// clang-format off
            #pragma HLS unroll
            // clang-format on
            if (idx == TA_L2 - 1 && TA_L1 % 2 == 1)
                add2_out[idx] = add1_out[2 * idx];
            else
                add2_out[idx] = add1_out[2 * idx] + add1_out[2 * idx + 1];
        }
    }

    if (TA_L3 != 0) {
        for (int idx = 0; idx < TA_L3; idx++) {
// clang-format off
            #pragma HLS unroll
            // clang-format on
            if (idx == TA_L3 - 1 && TA_L2 % 2 == 1)
                add3_out[idx] = add2_out[2 * idx];
            else
                add3_out[idx] = add2_out[2 * idx] + add2_out[2 * idx + 1];
        }
    }

    if (TA_L4 != 0) {
        for (int idx = 0; idx < TA_L4; idx++) {
// clang-format off
            #pragma HLS unroll
            // clang-format on
            if (idx == TA_L4 - 1 && TA_L3 % 2 == 1)
                add4_out[idx] = add3_out[2 * idx];
            else
                add4_out[idx] = add3_out[2 * idx] + add3_out[2 * idx + 1];
        }
    }

    if (TA_L5 != 0) {
        for (int idx = 0; idx < TA_L5; idx++) {
// clang-format off
            #pragma HLS unroll
            // clang-format on
            if (idx == TA_L5 - 1 && TA_L4 % 2 == 1)
                add5_out[idx] = add4_out[2 * idx];
            else
                add5_out[idx] = add4_out[2 * idx] + add4_out[2 * idx + 1];
        }
    }

    if (TA_L6 != 0) {
        for (int idx = 0; idx < TA_L6; idx++) {
// clang-format off
            #pragma HLS unroll
            // clang-format on
            if (idx == TA_L6 - 1 && TA_L5 % 2 == 1)
                add6_out[idx] = add5_out[2 * idx];
            else
                add6_out[idx] = add5_out[2 * idx] + add5_out[2 * idx + 1];
        }
    }

    if (TA_L7 != 0) {
        for (int idx = 0; idx < TA_L7; idx++) {
// clang-format off
            #pragma HLS unroll
            // clang-format on
            if (idx == TA_L7 - 1 && TA_L6 % 2 == 1)
                add7_out[idx] = add6_out[2 * idx];
            else
                add7_out[idx] = add6_out[2 * idx] + add6_out[2 * idx + 1];
        }
    }

    if (TA_L8 != 0) {
        for (int idx = 0; idx < TA_L8; idx++) {
// clang-format off
            #pragma HLS unroll
            // clang-format on
            if (idx == TA_L8 - 1 && TA_L7 % 2 == 1)
                add8_out[idx] = add7_out[2 * idx];
            else
                add8_out[idx] = add7_out[2 * idx] + add7_out[2 * idx + 1];
        }
    }

    // The first level that holds exactly one entry carries the full sum.
    // PROC == 1 has no adder level at all, so the single product is returned.
    float add_out;
    if (TA_L1 == 1)
        add_out = add1_out[0];
    else if (TA_L2 == 1)
        add_out = add2_out[0];
    else if (TA_L3 == 1)
        add_out = add3_out[0];
    else if (TA_L4 == 1)
        add_out = add4_out[0];
    else if (TA_L5 == 1)
        add_out = add5_out[0];
    else if (TA_L6 == 1)
        add_out = add6_out[0];
    else if (TA_L7 == 1)
        add_out = add7_out[0];
    else if (TA_L8 == 1)
        add_out = add8_out[0];
    else
        add_out = mul_out[0];

    return (add_out);
}

/**
 * @brief Sums DEPTH floats with a pairwise adder ladder of up to eight levels.
 *
 * Each level adds neighbouring entries of the previous level; an odd
 * leftover entry is forwarded unchanged. The pairing order depends only on
 * DEPTH, so the floating-point rounding is the same on every call.
 *
 * @tparam DEPTH   Number of inputs. Must lie in [1, 256]: eight halving
 *                 levels cannot reduce more than 256 values to one.
 * @param in1      Values to add (DEPTH elements).
 * @param output   Receives the sum; for DEPTH == 1 this is in1[0].
 */
template <int DEPTH>
void KF_treeAdder(float in1[DEPTH], float* output) {
// clang-format off
    #pragma HLS ARRAY_PARTITION variable=in1 complete dim=1
    #pragma HLS inline
    // clang-format on

    // With more than 256 inputs no level ever shrinks to a single entry and
    // the result selection below would fall through to in1[0].
    static_assert(DEPTH >= 1 && DEPTH <= 256, "KF_treeAdder: DEPTH must be in the range [1, 256]");

    enum {
        // Number of entries produced by each level (0 once the sum is complete).
        TA_L1 = (DEPTH / 2 + (((DEPTH % 2) != 0) & (DEPTH != 1))),
        TA_L2 = (TA_L1 / 2 + (((TA_L1 % 2) != 0) & (TA_L1 != 1))),
        TA_L3 = (TA_L2 / 2 + (((TA_L2 % 2) != 0) & (TA_L2 != 1))),
        TA_L4 = (TA_L3 / 2 + (((TA_L3 % 2) != 0) & (TA_L3 != 1))),
        TA_L5 = (TA_L4 / 2 + (((TA_L4 % 2) != 0) & (TA_L4 != 1))),
        TA_L6 = (TA_L5 / 2 + (((TA_L5 % 2) != 0) & (TA_L5 != 1))),
        TA_L7 = (TA_L6 / 2 + (((TA_L6 % 2) != 0) & (TA_L6 != 1))),
        TA_L8 = (TA_L7 / 2 + (((TA_L7 % 2) != 0) & (TA_L7 != 1))),

        // Storage length of each level: never zero, so unused levels do not
        // rely on the zero-length-array compiler extension.
        TA_BUF1 = (TA_L1 > 0) ? int(TA_L1) : 1,
        TA_BUF2 = (TA_L2 > 0) ? int(TA_L2) : 1,
        TA_BUF3 = (TA_L3 > 0) ? int(TA_L3) : 1,
        TA_BUF4 = (TA_L4 > 0) ? int(TA_L4) : 1,
        TA_BUF5 = (TA_L5 > 0) ? int(TA_L5) : 1,
        TA_BUF6 = (TA_L6 > 0) ? int(TA_L6) : 1,
        TA_BUF7 = (TA_L7 > 0) ? int(TA_L7) : 1,
        TA_BUF8 = (TA_L8 > 0) ? int(TA_L8) : 1
    };

    float add1_out[TA_BUF1];
    float add2_out[TA_BUF2];
    float add3_out[TA_BUF3];
    float add4_out[TA_BUF4];
    float add5_out[TA_BUF5];
    float add6_out[TA_BUF6];
    float add7_out[TA_BUF7];
    float add8_out[TA_BUF8];

    // Level 1: pairwise sums of the inputs.
    if (TA_L1 != 0) {
        for (int idx = 0; idx < TA_L1; idx++) {
// clang-format off
            #pragma HLS unroll
            // clang-format on
            // The last entry of an odd-length input has no partner: forward it.
            if (idx == TA_L1 - 1 && DEPTH % 2 == 1)
                add1_out[idx] = in1[2 * idx];
            else
                add1_out[idx] = in1[2 * idx] + in1[2 * idx + 1];
        }
    }

    // Levels 2..8 repeat the same pattern on the previous level's output.
    if (TA_L2 != 0) {
        for (int idx = 0; idx < TA_L2; idx++) {
// clang-format off
            #pragma HLS unroll
            // clang-format on
            if (idx == TA_L2 - 1 && TA_L1 % 2 == 1)
                add2_out[idx] = add1_out[2 * idx];
            else
                add2_out[idx] = add1_out[2 * idx] + add1_out[2 * idx + 1];
        }
    }

    if (TA_L3 != 0) {
        for (int idx = 0; idx < TA_L3; idx++) {
// clang-format off
            #pragma HLS unroll
            // clang-format on
            if (idx == TA_L3 - 1 && TA_L2 % 2 == 1)
                add3_out[idx] = add2_out[2 * idx];
            else
                add3_out[idx] = add2_out[2 * idx] + add2_out[2 * idx + 1];
        }
    }

    if (TA_L4 != 0) {
        for (int idx = 0; idx < TA_L4; idx++) {
// clang-format off
            #pragma HLS unroll
            // clang-format on
            if (idx == TA_L4 - 1 && TA_L3 % 2 == 1)
                add4_out[idx] = add3_out[2 * idx];
            else
                add4_out[idx] = add3_out[2 * idx] + add3_out[2 * idx + 1];
        }
    }

    if (TA_L5 != 0) {
        for (int idx = 0; idx < TA_L5; idx++) {
// clang-format off
            #pragma HLS unroll
            // clang-format on
            if (idx == TA_L5 - 1 && TA_L4 % 2 == 1)
                add5_out[idx] = add4_out[2 * idx];
            else
                add5_out[idx] = add4_out[2 * idx] + add4_out[2 * idx + 1];
        }
    }

    if (TA_L6 != 0) {
        for (int idx = 0; idx < TA_L6; idx++) {
// clang-format off
            #pragma HLS unroll
            // clang-format on
            if (idx == TA_L6 - 1 && TA_L5 % 2 == 1)
                add6_out[idx] = add5_out[2 * idx];
            else
                add6_out[idx] = add5_out[2 * idx] + add5_out[2 * idx + 1];
        }
    }

    if (TA_L7 != 0) {
        for (int idx = 0; idx < TA_L7; idx++) {
// clang-format off
            #pragma HLS unroll
            // clang-format on
            if (idx == TA_L7 - 1 && TA_L6 % 2 == 1)
                add7_out[idx] = add6_out[2 * idx];
            else
                add7_out[idx] = add6_out[2 * idx] + add6_out[2 * idx + 1];
        }
    }

    if (TA_L8 != 0) {
        for (int idx = 0; idx < TA_L8; idx++) {
// clang-format off
            #pragma HLS unroll
            // clang-format on
            if (idx == TA_L8 - 1 && TA_L7 % 2 == 1)
                add8_out[idx] = add7_out[2 * idx];
            else
                add8_out[idx] = add7_out[2 * idx] + add7_out[2 * idx + 1];
        }
    }

    // The first level that holds exactly one entry carries the full sum.
    // DEPTH == 1 has no adder level at all, so the input is passed through.
    float add_out;
    if (TA_L1 == 1)
        add_out = add1_out[0];
    else if (TA_L2 == 1)
        add_out = add2_out[0];
    else if (TA_L3 == 1)
        add_out = add3_out[0];
    else if (TA_L4 == 1)
        add_out = add4_out[0];
    else if (TA_L5 == 1)
        add_out = add5_out[0];
    else if (TA_L6 == 1)
        add_out = add6_out[0];
    else if (TA_L7 == 1)
        add_out = add7_out[0];
    else if (TA_L8 == 1)
        add_out = add8_out[0];
    else
        add_out = in1[0];

    *output = (add_out);
}

/*
 * Element-wise scaled subtraction over PROC lanes:
 *     out[i] = in1[i] + (-scale) * in2[i]
 * The scalar is negated once up front and every lane then performs a
 * multiply followed by an add.
 */
template <int PROC>
void KF_scaleSub(float in1[PROC], float scale, float in2[PROC], float out[PROC]) {
// clang-format off
    #pragma HLS ARRAY_PARTITION variable=in1 complete dim=1
    #pragma HLS ARRAY_PARTITION variable=in2 complete dim=1
    #pragma HLS ARRAY_PARTITION variable=out complete dim=1
    #pragma HLS inline off
    // clang-format on

    const float minus_scale = -scale;

    for (int lane = 0; lane < PROC; ++lane) {
// clang-format off
        #pragma HLS unroll
        // clang-format on
        out[lane] = in1[lane] + (minus_scale * in2[lane]);
    }
}

/*
 * Multiplies each of the PROC entries of `in` by the scalar `scale` and
 * stores the products in `out`:  out[i] = scale * in[i].
 */
template <int PROC>
void KF_scale(float in[PROC], float scale, float out[PROC]) {
// clang-format off
    #pragma HLS ARRAY_PARTITION variable=in complete dim=1
    #pragma HLS ARRAY_PARTITION variable=out complete dim=1
    #pragma HLS inline off
    // clang-format on
    for (int lane = 0; lane < PROC; ++lane) {
// clang-format off
        #pragma HLS unroll
        // clang-format on
        out[lane] = scale * in[lane];
    }
}

/*
 * Element-wise sum of two PROC-entry vectors:  out[i] = in1[i] + in2[i].
 */
template <int PROC>
void KF_add(float in1[PROC], float in2[PROC], float out[PROC]) {
// clang-format off
    #pragma HLS ARRAY_PARTITION variable=in1 complete dim=1
    #pragma HLS ARRAY_PARTITION variable=in2 complete dim=1
    #pragma HLS ARRAY_PARTITION variable=out complete dim=1
    #pragma HLS inline off
    // clang-format on
    for (int lane = 0; lane < PROC; ++lane) {
// clang-format off
        #pragma HLS unroll
        // clang-format on
        out[lane] = in1[lane] + in2[lane];
    }
}

/*
 * Copies the first N_STATE entries of xu_vector into Xout_mat, one
 * write_float() call per entry, using the entry index as the position.
 */
template <int N_STATE, int TYPE, int NPC>
void KF_X_write(float xu_vector[512], xf::cv::Mat<TYPE, N_STATE, 1, NPC>& Xout_mat) {
// clang-format off
    #pragma HLS inline off
    // clang-format on
    for (int entry = 0; entry < N_STATE; ++entry) {
// clang-format off
        #pragma HLS pipeline
        // clang-format on
        Xout_mat.write_float(entry, xu_vector[entry]);
    }
}

template <int N_STATE, int PROC_MU, int DEPTH_MU, int UMAT_DEPTH, int TYPE, int NPC>
void KF_UD_write(float U_matrix[PROC_MU][UMAT_DEPTH],
                 float D_vector[512],
                 xf::cv::Mat<TYPE, N_STATE, N_STATE, NPC>& Uout_mat,
                 xf::cv::Mat<TYPE, N_STATE, 1, NPC>& Dout_mat) {
// clang-format off
    #pragma HLS inline off
    // clang-format on

    ap_uint<32> counter1 = 0;
    ap_uint<32> counter1_1 = 0; // for dim2
    ap_uint<32> counter2 = 0;   // for dim1
    ap_uint<32> counter3 = 0;   // for dim2
LOOPI_U:
    for (int ptr = 0; ptr < N_STATE * N_STATE; ptr++) {
// clang-format off
        #pragma HLS pipeline
        // clang-format on

        ap_uint<8> dim1 = counter2;
        ap_uint<16> dim2 = counter1_1 + counter3;

        Uout_mat.write_float(ptr, U_matrix[dim1][dim2]);

        if (counter1 == N_STATE - 1) {
            if (counter2 == PROC_MU - 1) {
                counter2 = 0;
                counter3++;
            } else {
                counter2++;
            }

            counter1 = 0;
            counter1_1 = 0;
        } else {
            counter1++;
            counter1_1 += DEPTH_MU;
        }
    }

    for (int ptr = 0; ptr < N_STATE; ptr++) {
// clang-format off
        #pragma HLS pipeline
        // clang-format on
        Dout_mat.write_float(ptr, D_vector[ptr]);
    }
}

/**
 * @brief Processes one scalar measurement: updates the U and D factors and
 *        the state vector in a single pass over the N_STATE columns of U.
 *
 * Column `state` of U is stored as Uin_matrix[i][state * DEPTH_MU + j] for
 * i < PROC_MU, j < DEPTH_MU. For every column the main loop (LOOP2)
 *   - scales D[state] by alpha_prev / alpha_up and stores it in Dout_vector,
 *   - writes Uout column = Uin column - f_dash * kg_vector,
 *   - accumulates kg_vector += g_value * Uin column,
 *   - advances alpha_prev / alpha_up, and
 *   - derives the next f / g pair from the dot product of h_vector with the
 *     column two positions ahead and from Din_vector[state + 2].
 * A second loop adds (kg_vector / alpha_prev) * res to xu_vector and copies
 * Dout_vector[0 .. N_STATE) to Dout_vector[N_STATE .. 2 * N_STATE).
 *
 * When UDX_en is false the D scale factor is forced to 1 and f_dash and the
 * gain are forced to 0, so U, D and xu are effectively copied through.
 *
 * M_MEAS is not referenced inside this function.
 *
 * @tparam EKF_EN  When false, res = z_value - sum(h[state] * xu[state]) is
 *                 accumulated in LOOP2; when true, res is z_value as passed.
 * @tparam URAM_EN Selects the storage pragmas applied to the U arrays.
 *
 * @param Uin_matrix   U factor read by this pass.
 * @param Din_vector   D factor read by this pass. Indexed up to
 *                     state + 2, i.e. beyond the nominal N_STATE extent;
 *                     the caller in this file passes 512-entry buffers.
 * @param Uout_matrix  Receives the updated U factor.
 * @param Dout_vector  Receives the updated D factor; written up to index
 *                     2 * N_STATE - 1 (see the final loop).
 * @param xu_vector    State vector, updated in place.
 * @param h_vector     Measurement row, entry s at [s % PROC_MU][s / PROC_MU].
 * @param r_value      Used as alpha0 (see the table below).
 * @param z_value      Starting value of the residual `res`.
 * @param UDX_en       Enables (true) or bypasses (false) the update.
 */
template <int N_STATE, int M_MEAS, int PROC_MU, int DEPTH_MU, int UMAT_DEPTH, bool URAM_EN, bool EKF_EN>
void MeasUpdate_1x(float Uin_matrix[PROC_MU][UMAT_DEPTH],
                   float Din_vector[N_STATE],
                   float Uout_matrix[PROC_MU][UMAT_DEPTH],
                   float Dout_vector[N_STATE],
                   float xu_vector[512],
                   float h_vector[PROC_MU][DEPTH_MU],
                   float r_value,
                   float z_value,
                   bool UDX_en) {
    // clang-format off
    //** comment for Ubar Dbar
    //    f1= h(1) & g1 = f1*D(1) & a0 = r
    //  ---------------------------
    //                  <a0,a1>       | Dbar(1) = D(1)*a0/a1            <a1,a2>      | Dbar(2) = D(2)*a1/a2         <a2,a3>     |..| Dbar(n) = D(n)*a(n-1)/a(n)
    // f1'=f1/a0        <f1',f2,g2>   | Ubar1= U1-f1'k1 -> f2'=f2/a1    <f2',f3,g3>  | Ubar2= U2-f2'k2 -> f3'=f3/a2 <f3',f4,g4> |..| Ubar(n)= Un-fn'kn -> f(n+1)'=f(n+1)/a(n)
    // k1 = {0,0..0}    <k1,g1>       | k2 = k1 + g1U1                    <k2,g2>    | k3 = k2 + g2U2               <k3,g3>     |..| k(n+1) = k(n) + g(n+1)U(n+1)_sv
    // f2 = U2*h                      | f3 = U3*h                                    | f4 = U4*h                                |..| f(n+2) = mulAcc
    // g2 = f2*D(2)                   | g3 = f3*D(3)                                 | g4 = f4*D(4)                             |..| g(n+2) = f(n+2)*g(n+2)
    // a1 = a0 + f1g1                 | a2 = a1 + f2g2                               | a3 = a2 + f3g3                           |..| a(n+1) = a(n) + f(n+1)g(n+1)
    //##############################################################################################################
    // a0 pass                        | a1 pass                                      | a2 pass                                  |..|
    // a1 compute/pass                | a2 compute/pass                              | a3 compute/pass                          |..|
    //###############################################################################################################
    // a_prev=a0,a_up=a1<a_prev,a_up> | Dbar(1)=D(1)*a_prev/a_up     <a_prev,a_up>   | Dbar(2)=D(2)*a_prev/a_up                 |..| Dbar(n)=D(n)*a_prev/a_up
    // f'=f1/a_prev   <f',f_nex,g_nex>| Ubar1=U1-f'K ->f'=f_nex/a_up <f',f_nex,g_nex>| Ubar2=U2-f'K -> f'=f/a_up                |..| Ubar(n)=U(n)-f'K -> f'=f/a_up
    // k=k1            <K,g>          | K= K + g*U1                  <k,g>           | K= K + g*U2                              |..| K= K + g*U2
    // g=g1                           | /*a1*/a_prev = a_up                          | /*a2*/a_prev = a_up                      |..| /*a(n)*/a_prev = a_up
    //                                | /*a2*/a_up   = a_up +f_nex*g_nex             | /*a3*/a_up   = a_up + f_nex*g_nex        |..| /*a(n+1)*/a_up   = a_up +f_nex*g_nex
    //                                | g = g_nex                                    | g = g_nex                                |..| g = g_nex
    // f=f2                           | f_nex = U3*h                                 | f_nex = f4 = U4*h                        |..| f_nex= f(n+2)
    // g=g2                           | g_nex = g3 = f*D(3)                          | g_nex = g4 = f4*D(4)                     |..| g_nex= g(n+2)=D(n+4)
    //*************
    // clang-format on
    if (URAM_EN == 0) {
// clang-format off
        #pragma HLS ARRAY_PARTITION variable=Uin_matrix complete dim=1
        // clang-format on
    } else {
// clang-format off
        #pragma HLS RESOURCE variable=Uin_matrix core=RAM_S2P_URAM
        #pragma HLS ARRAY_RESHAPE variable=Uin_matrix complete dim=1
        // clang-format on
    }
    if (URAM_EN == 0) {
// clang-format off
        #pragma HLS ARRAY_PARTITION variable=Uout_matrix complete dim=1
        // clang-format on
    } else {
// clang-format off
        #pragma HLS RESOURCE variable=Uout_matrix core=RAM_S2P_URAM
        #pragma HLS ARRAY_RESHAPE variable=Uout_matrix complete dim=1
        // clang-format on
    }
// clang-format off
    #pragma HLS ARRAY_PARTITION variable=h_vector complete dim=0
    #pragma HLS inline off
    // clang-format on

    // Prologue: seed the recurrence with the values for the first column.
    float res = z_value;
    float f1_value = h_vector[0][0];
    float g1_value = f1_value * Din_vector[0];
    float alpha_prev = r_value;                     // alpha0
    float alpha_up = r_value + f1_value * g1_value; // alpha1
    float f_dash_div = f1_value / r_value;          // f1' = f1/alpha0
    float f_dash;
    if (UDX_en == 0)
        f_dash = 0;
    else
        f_dash = f_dash_div;

    // Gain accumulator, same [PROC_MU][DEPTH_MU] layout as one column of U; starts at zero.
    float kg_vector[PROC_MU][DEPTH_MU];
// clang-format off
    #pragma HLS ARRAY_PARTITION variable=kg_vector complete dim=0
    // clang-format on
    for (int i = 0; i < PROC_MU; i++) {
// clang-format off
        #pragma HLS unroll
        // clang-format on
        for (int j = 0; j < DEPTH_MU; j++) {
// clang-format off
            #pragma HLS unroll
            // clang-format on
            kg_vector[i][j] = 0;
        }
    }

    // Second entry of h: where it sits depends on whether there is more than one lane.
    float h_value2;
    if (PROC_MU == 1)
        h_value2 = h_vector[0][1];
    else
        h_value2 = h_vector[1][0];

    // f / g for the second column; only element [0] of that column is multiplied in here.
    float f_nex = h_vector[0][0] * Uin_matrix[0][1 * DEPTH_MU] + h_value2;
    float g_nex = f_nex * Din_vector[1];
    // NOTE(review): fg_nex is not read anywhere below.
    float fg_nex = f_nex * g_nex;
    float g_value = g1_value;

LOOP2:
    for (int state = 0, u_offset = 0; state < N_STATE; state++, u_offset += DEPTH_MU) {
// clang-format off
        #pragma HLS pipeline II=10
        // clang-format on

        if (EKF_EN == 0) {
            //### needed for X update
            float hval = h_vector[state % PROC_MU][state / PROC_MU];
            res -= hval * xu_vector[state];
        }

        //####Dbar update
        float Din0 = Din_vector[state];
        // Read two entries ahead; goes past index N_STATE - 1 on the last two iterations.
        float Din2 = Din_vector[state + 2];

        float alpha_div;
        if (UDX_en == 0)
            alpha_div = 1;
        else
            alpha_div = alpha_prev / alpha_up;

        Dout_vector[state] = Din0 * alpha_div;

        // Read col_j and col_j+2 from U matrix
        // For timing sake & II , Uin_matrix data is loaded in Uin0_col & Uin2_col
        float Uin0_col[PROC_MU][DEPTH_MU];
        float Uin2_col[PROC_MU][DEPTH_MU];
// clang-format off
        #pragma HLS ARRAY_PARTITION variable=Uin0_col complete dim=0
        #pragma HLS ARRAY_PARTITION variable=Uin2_col complete dim=0
        // clang-format on
        for (int i = 0; i < PROC_MU; i++) {
// clang-format off
            #pragma HLS unroll
            // clang-format on
            for (int j = 0; j < DEPTH_MU; j++) {
// clang-format off
                #pragma HLS unroll
                // clang-format on

                Uin0_col[i][j] = Uin_matrix[i][j + u_offset];
                // Outside synthesis the look-ahead read is bounds-checked against
                // UMAT_DEPTH and replaced by 0 when it would run off the array.
#if !__SYNTHESIS__

                if ((j + u_offset + 2 * DEPTH_MU) < UMAT_DEPTH)
                    Uin2_col[i][j] = Uin_matrix[i][j + u_offset + 2 * DEPTH_MU];
                else
                    Uin2_col[i][j] = 0;
#else
                Uin2_col[i][j] = Uin_matrix[i][j + u_offset + 2 * DEPTH_MU];
#endif
            }
        }

        // Snapshot of the gain accumulator as it stands before this column is added.
        float tmp_kg_vector[PROC_MU][DEPTH_MU];
// clang-format off
        #pragma HLS ARRAY_PARTITION variable=tmp_kg_vector complete dim=0
        // clang-format on
        for (int i = 0; i < PROC_MU; i++) {
// clang-format off
            #pragma HLS unroll
            // clang-format on
            for (int j = 0; j < DEPTH_MU; j++) {
// clang-format off
                #pragma HLS unroll
                // clang-format on
                tmp_kg_vector[i][j] = kg_vector[i][j];
            }
        }

        // Updated U column: Uout_col = Uin0_col - f_dash * tmp_kg_vector, PROC_MU lanes at a time.
        float Uout_col[PROC_MU][DEPTH_MU];
// clang-format off
        #pragma HLS ARRAY_PARTITION variable=Uout_col complete dim=0
    // clang-format on
    LOOP5:
        for (ap_uint<8> u_seq = 0, var = 0; u_seq < DEPTH_MU; u_seq++, var += PROC_MU) {
// clang-format off
            #pragma HLS unroll
            // clang-format on
            float u_readchunk[PROC_MU];
            float k_readchunk[PROC_MU];
// clang-format off
            #pragma HLS ARRAY_PARTITION variable=u_readchunk complete dim=1
            #pragma HLS ARRAY_PARTITION variable=k_readchunk complete dim=1
            // clang-format on
            for (ap_uint<8> loadin = 0; loadin < PROC_MU; loadin++) {
// clang-format off
                #pragma HLS unroll
                // clang-format on
                u_readchunk[loadin] = Uin0_col[loadin][u_seq];
                k_readchunk[loadin] = tmp_kg_vector[loadin][u_seq];
            }
            float u_writechunk[PROC_MU];
            KF_scaleSub<PROC_MU>(u_readchunk, f_dash, k_readchunk, u_writechunk);

            for (ap_uint<8> loadin = 0; loadin < PROC_MU; loadin++) {
// clang-format off
                #pragma HLS unroll
                // clang-format on
                Uout_col[loadin][u_seq] = u_writechunk[loadin];
            }

        } // u seq loop

        // Store the updated column at the same offset it was read from.
        for (int i = 0; i < PROC_MU; i++) {
// clang-format off
            #pragma HLS unroll
            // clang-format on
            for (int j = 0; j < DEPTH_MU; j++) {
// clang-format off
                #pragma HLS unroll
                // clang-format on

                Uout_matrix[i][j + u_offset] = Uout_col[i][j];
            }
        }

        //###f_dash calculation
        // Uses f_nex and alpha_up from before they are advanced further down.
        float f_dash_temp = f_nex / alpha_up;

        if (UDX_en == 0)
            f_dash = 0;
        else
            f_dash = f_dash_temp;

        //##Update Kalman gain kg_vector
        // Step 1 (LOOP61): gu_vector = g_value * Uin0_col.
        float gu_vector[PROC_MU][DEPTH_MU];
    LOOP61:
        for (ap_uint<8> k_seq = 0, var = 0; k_seq < DEPTH_MU; k_seq++, var += PROC_MU) {
// clang-format off
            #pragma HLS unroll
            // clang-format on
            float u_readchunk[PROC_MU];
            for (ap_uint<8> loadin = 0; loadin < PROC_MU; loadin++) {
// clang-format off
                #pragma HLS unroll
                // clang-format on
                u_readchunk[loadin] = Uin0_col[loadin][k_seq];
            }

            float gu_writechunk[PROC_MU];
            KF_scale<PROC_MU>(u_readchunk, g_value, gu_writechunk);

            for (ap_uint<8> loadin = 0; loadin < PROC_MU; loadin++) {
// clang-format off
                #pragma HLS unroll
                // clang-format on
                gu_vector[loadin][k_seq] = gu_writechunk[loadin];
            }
        } // k seq loop

        // Step 2 (LOOP62): kg_vector = tmp_kg_vector + gu_vector.
    LOOP62:
        for (ap_uint<8> k_seq = 0, var = 0; k_seq < DEPTH_MU; k_seq++, var += PROC_MU) {
// clang-format off
            #pragma HLS unroll
            // clang-format on
            float k_readchunk[PROC_MU];
            float gu_readchunk[PROC_MU];
            for (ap_uint<8> loadin = 0; loadin < PROC_MU; loadin++) {
// clang-format off
                #pragma HLS unroll
                // clang-format on
                k_readchunk[loadin] = tmp_kg_vector[loadin][k_seq];
                gu_readchunk[loadin] = gu_vector[loadin][k_seq];
            }

            float k_writechunk[PROC_MU];
            KF_add<PROC_MU>(k_readchunk, gu_readchunk, k_writechunk);

            for (ap_uint<8> loadin = 0; loadin < PROC_MU; loadin++) {
// clang-format off
                #pragma HLS unroll
                // clang-format on
                kg_vector[loadin][k_seq] = k_writechunk[loadin];
            }
        } // k seq loop

        //### update alpha
        alpha_prev = alpha_up;
        alpha_up = alpha_up + f_nex * g_nex;

        //### f and g calculation
        g_value = g_nex;
        // One partial dot product per DEPTH_MU slice of the look-ahead column ...
        float dot_out[DEPTH_MU];
// clang-format off
        #pragma HLS ARRAY_PARTITION variable=dot_out complete dim=1
    // clang-format on
    LOOP7:
        for (ap_uint<10> dot_seq = 0, var = 0; dot_seq < DEPTH_MU; dot_seq++, var += PROC_MU) {
// clang-format off
            #pragma HLS unroll
            // clang-format on
            float dot_in1[PROC_MU];
            float dot_in2[PROC_MU];
// clang-format off
            #pragma HLS ARRAY_PARTITION variable=dot_in1 complete dim=1
            #pragma HLS ARRAY_PARTITION variable=dot_in2 complete dim=1
            // clang-format on
            for (ap_uint<10> loadin = 0; loadin < PROC_MU; loadin++) {
// clang-format off
                #pragma HLS unroll
                // clang-format on
                dot_in1[loadin] = Uin2_col[loadin][dot_seq];
                dot_in2[loadin] = h_vector[loadin][dot_seq];
            } // loadin loop

            dot_out[dot_seq] = KF_dotProduct<PROC_MU>(dot_in1, dot_in2);
        } // dot seq loop

        // ... which are then summed to give the f value used two columns ahead.
        float tmp_ta;
        KF_treeAdder<DEPTH_MU>(dot_out, &tmp_ta);
        f_nex = tmp_ta;
        g_nex = tmp_ta * Din2;

    } // state loop

    // State update: xu += (kg / alpha_prev) * res, with the gain forced to 0 when UDX_en is false.
    for (ap_uint<8> x_update = 0; x_update < N_STATE; x_update++) {
// clang-format off
        #pragma HLS pipeline
        // clang-format on
        float kg_temp = kg_vector[x_update % PROC_MU][x_update / PROC_MU] / alpha_prev;
        float kg;

        if (UDX_en == 0)
            kg = 0;
        else
            kg = kg_temp;

        xu_vector[x_update] = xu_vector[x_update] + kg * res;

        // Mirror the new D entries into the second N_STATE-sized half of the buffer.
        Dout_vector[x_update + N_STATE] = Dout_vector[x_update];
    }
}

template <int N_STATE, int PROC_TU, int DEPTH_TU, int TMAT_DEPTH, int UQMAT_DEPTH, bool URAM_EN>
void load_Uq(float T_matrix[PROC_TU][TMAT_DEPTH], float Uq_matrix[UQMAT_DEPTH]) {
    if (URAM_EN == 0) {
// clang-format off
        #pragma HLS ARRAY_PARTITION variable=T_matrix complete dim=1
        #pragma HLS resource variable=T_matrix core=RAM_S2P_BRAM
        // clang-format on
    } else {
// clang-format off
        #pragma HLS RESOURCE variable=T_matrix core=RAM_S2P_URAM
        #pragma HLS ARRAY_RESHAPE variable=T_matrix complete dim=1
        // clang-format on
    }

    if (URAM_EN == 1) {
// clang-format off
        #pragma HLS RESOURCE variable=Uq_matrix core=RAM_S2P_URAM
        // clang-format on
    }
// clang-format off
    #pragma HLS inline off
    // clang-format on

    ap_uint<16> counter_trow = 0;
    ap_uint<32> offset_inc = N_STATE;
LOOPI_UQ:
    for (int ptr = 0; ptr < N_STATE * N_STATE; ptr++) {
// clang-format off
        #pragma HLS pipeline
        // clang-format on

        ap_uint<32> offset = offset_inc + counter_trow;
        ap_uint<8> dim1 = offset % PROC_TU;
        ap_uint<16> dim2 = offset / PROC_TU;

        T_matrix[dim1][dim2] = Uq_matrix[ptr];

        if (counter_trow == N_STATE - 1) {
            counter_trow = 0;
            offset_inc += DEPTH_TU * PROC_TU;
        } else
            counter_trow++;
    }
}

/**
 * @brief Runs the measurement-update passes and optionally writes the
 *        results out.
 *
 * Non-EKF mode (EKF_EN == 0): the M_MEAS entries of y_mat are read into
 * ry_vector[M_MEAS .. 2 * M_MEAS). One MeasUpdate_1x pass is run per
 * measurement, taking r from ry_vector[meas] and z from
 * ry_vector[meas + M_MEAS]. R_mat is not read in this mode.
 *
 * EKF mode (EKF_EN == 1): a single measurement is processed. z is
 * y_mat[0] - hx, r is R_mat[0], and the first N_STATE entries of H_mat are
 * loaded into H_matrix as H_matrix[ptr % PROC_MU][ptr / PROC_MU]. hx is read
 * from u_mat[0] when KF_C != 0 and is 0 otherwise.
 *
 * The passes alternate between (U_matrix, D_vector) and the local
 * (Uint_matrix, Dint_vector) buffers. The pass count is always even: it is
 * 2 in EKF mode, and M_MEAS rounded up to an even number otherwise. The one
 * extra pass, when present, runs with UDX_en == 0, so the final result always
 * ends up back in U_matrix / D_vector.
 *
 * @param U_matrix     U factor; input and final result.
 * @param H_matrix     Measurement matrix storage; row `meas` is read from
 *                     H_matrix[j][meas * DEPTH_MU + i].
 * @param D_vector     D factor; input and final result.
 * @param xu_vector    State vector, updated in place.
 * @param ry_vector    Holds r values at [0 .. M_MEAS) and receives the
 *                     measurements at [M_MEAS .. 2 * M_MEAS) in non-EKF mode.
 * @param u_mat        (only when KF_C != 0) source of hx in EKF mode.
 * @param y_mat        Measurement input.
 * @param R_mat        Source of r in EKF mode.
 * @param H_mat        Source of the measurement row in EKF mode.
 * @param Xout_mat     Receives xu_vector when X_write_en is set.
 * @param Uout_mat     Receives U_matrix when UD_write_en is set.
 * @param Dout_mat     Receives D_vector when UD_write_en is set.
 * @param X_write_en   Enables the state write-out.
 * @param UD_write_en  Enables the U / D write-out.
 */
template <int N_STATE,
          int C_CTRL,
          int M_MEAS,
          int PROC_MU,
          int DEPTH_MU,
          int UMAT_DEPTH,
          int HMAT_DEPTH,
          bool URAM_EN,
          bool EKF_EN,
          int TYPE,
          int NPC>
void MeasUpdate(float U_matrix[PROC_MU][UMAT_DEPTH],
                float H_matrix[PROC_MU][HMAT_DEPTH],
                float D_vector[512],
                float xu_vector[512],
                float ry_vector[512],
#if KF_C != 0
                xf::cv::Mat<TYPE, C_CTRL, 1, NPC>& u_mat,
#endif
                xf::cv::Mat<TYPE, M_MEAS, 1, NPC>& y_mat,
                xf::cv::Mat<TYPE, M_MEAS, 1, NPC>& R_mat,
                xf::cv::Mat<TYPE, M_MEAS, N_STATE, NPC>& H_mat,
                xf::cv::Mat<TYPE, N_STATE, 1, NPC>& Xout_mat,
                xf::cv::Mat<TYPE, N_STATE, N_STATE, NPC>& Uout_mat,
                xf::cv::Mat<TYPE, N_STATE, 1, NPC>& Dout_mat,
                bool X_write_en,
                bool UD_write_en) {
    if (URAM_EN == 0) {
// clang-format off
        #pragma HLS ARRAY_PARTITION variable=U_matrix complete dim=1
        // clang-format on
    } else {
// clang-format off
        #pragma HLS RESOURCE variable=U_matrix core=RAM_S2P_URAM
        #pragma HLS ARRAY_RESHAPE variable=U_matrix complete dim=1
        // clang-format on
    }

    if (URAM_EN == 0) {
// clang-format off
        #pragma HLS ARRAY_PARTITION variable=H_matrix complete dim=1
        // clang-format on
    } else {
// clang-format off
        #pragma HLS RESOURCE variable=H_matrix core=RAM_S2P_URAM
        #pragma HLS ARRAY_RESHAPE variable=H_matrix complete dim=1
        // clang-format on
    }

// clang-format off
    #pragma HLS inline off
    // clang-format on

    // M_MEAS rounded up to the next even number.
    enum { M_MEAS_align2 = (M_MEAS + (M_MEAS % 2)) };

    // Second U / D buffer pair for the ping-pong between passes.
    float Uint_matrix[PROC_MU][UMAT_DEPTH];
    if (URAM_EN == 0) {
// clang-format off
        #pragma HLS ARRAY_PARTITION variable=Uint_matrix complete dim=1
        // clang-format on
    } else {
// clang-format off
        #pragma HLS RESOURCE variable=Uint_matrix core=RAM_S2P_URAM
        #pragma HLS ARRAY_RESHAPE variable=Uint_matrix complete dim=1
        // clang-format on
    }

    float Dint_vector[512];

    // NOTE(review): meas_index is loaded from xu_vector[511] in EKF mode but is
    // not used anywhere below.
    ap_uint<8> meas_index;
    if (EKF_EN == 1)
        meas_index = xu_vector[511];
    else
        meas_index = 0;

    // Zero-initialised so that hx has a defined value in EKF mode when the
    // control input is compiled out (KF_C == 0) and u_mat is never read.
    float hx = 0.0f;
    float Zekf = 0.0f;
    float Rekf = 0.0f;
    //##### Read Y measurements
    if (EKF_EN == 0) {
    LOOP1:
        for (ap_uint<8> ddr_ptr = 0; ddr_ptr < M_MEAS; ddr_ptr++) {
// clang-format off
            #pragma HLS pipeline
            // clang-format on
            ry_vector[ddr_ptr + M_MEAS] = y_mat.read_float(ddr_ptr);
        }
    } else {
        Zekf = y_mat.read_float(0);
#if KF_C != 0
        hx = u_mat.read_float(0);
#endif
        Rekf = R_mat.read_float(0);

    LOOPI_H:
        for (int ptr = 0; ptr < N_STATE; ptr++) {
// clang-format off
            #pragma HLS pipeline
            // clang-format on
            ap_uint<10> dim1 = ptr % PROC_MU;
            ap_uint<16> dim2 = ptr / PROC_MU;

            H_matrix[dim1][dim2] = H_mat.read_float(ptr);
        }
    }

    // Selects which buffer pair is the source of the next pass.
    bool flip = 0;

    ap_uint<8> meas_loop_cnt;
    if (EKF_EN == 1)
        meas_loop_cnt = 2;
    else
        meas_loop_cnt = M_MEAS_align2;

LOOP2:
    for (ap_uint<8> meas = 0; meas < meas_loop_cnt; meas++) {
        // The pass after the last real measurement only copies the data back.
        bool UDX_en;
        if (EKF_EN == 0) {
            if (meas == M_MEAS)
                UDX_en = 0;
            else
                UDX_en = 1;
        } else {
            if (meas == 1)
                UDX_en = 0;
            else
                UDX_en = 1;
        }

        // Row `meas` of H, copied into a fully partitioned local buffer.
        float h_vector[PROC_MU][DEPTH_MU];
// clang-format off
        #pragma HLS ARRAY_PARTITION variable=h_vector complete dim=0
    // clang-format on
    LOOPHM:
        for (ap_uint<8> i = 0; i < DEPTH_MU; i++) {
// clang-format off
            #pragma HLS LOOP_FLATTEN off
            #pragma HLS pipeline
            // clang-format on
            for (ap_uint<8> j = 0; j < PROC_MU; j++) {
// clang-format off
                #pragma HLS unroll
                // clang-format on

                h_vector[j][i] = H_matrix[j][meas * DEPTH_MU + i];
            }
        }
        float r_value; // = ry_vector[meas];

        float z_value;
        if (EKF_EN == 0) {
            z_value = ry_vector[meas + M_MEAS];
            r_value = ry_vector[meas];
        } else {
            z_value = Zekf - hx;
            r_value = Rekf;
        }

        if (flip == 0) {
            MeasUpdate_1x<N_STATE, M_MEAS, PROC_MU, DEPTH_MU, UMAT_DEPTH, URAM_EN, EKF_EN>(
                U_matrix, D_vector, Uint_matrix, Dint_vector, xu_vector, h_vector, r_value, z_value, UDX_en);
            flip = 1;
        } else {
            MeasUpdate_1x<N_STATE, M_MEAS, PROC_MU, DEPTH_MU, UMAT_DEPTH, URAM_EN, EKF_EN>(
                Uint_matrix, Dint_vector, U_matrix, D_vector, xu_vector, h_vector, r_value, z_value, UDX_en);
            flip = 0;
        }
    }

    //###### Write X corrected state vector
    if (X_write_en) KF_X_write<N_STATE, TYPE, NPC>(xu_vector, Xout_mat);

    //###### Write P corrected state vector
    if (UD_write_en)
        KF_UD_write<N_STATE, PROC_MU, DEPTH_MU, UMAT_DEPTH, TYPE, NPC>(U_matrix, D_vector, Uout_mat, Dout_mat);
}

// MeasUpdate_wrapper: non-inlined wrapper around the measurement-update stage.
//
// It calls load_Uq() on T_matrix / Uq_matrix and then MeasUpdate() on the
// filter state (U_matrix, H_matrix, D_vector, xu_vector, ry_vector).
//
// Storage selection for U_matrix, H_matrix and T_matrix depends on URAM_EN:
//   URAM_EN == 0 : first dimension completely partitioned (T_matrix is also
//                  bound to RAM_S2P_BRAM).
//   URAM_EN != 0 : array bound to RAM_S2P_URAM and reshaped along the first
//                  dimension.
//
// u_mat is part of the signature only when the KF_C macro is non-zero.
// X_write_en / UD_write_en are forwarded unchanged to MeasUpdate().
// NOTE(review): load_Uq is defined elsewhere; presumably it stages Uq into
// T_matrix for the following time update - confirm against its definition.
template <int N_STATE,
          int C_CTRL,
          int M_MEAS,
          int PROC_TU,
          int DEPTH_TU,
          int PROC_MU,
          int DEPTH_MU,
          int UMAT_DEPTH,
          int HMAT_DEPTH,
          int TMAT_DEPTH,
          int UQMAT_DEPTH,
          bool URAM_EN,
          bool EKF_EN,
          int TYPE,
          int NPC>
void MeasUpdate_wrapper(float U_matrix[PROC_MU][UMAT_DEPTH],
                        float H_matrix[PROC_MU][HMAT_DEPTH],
                        float D_vector[512],
                        float xu_vector[512],
                        float ry_vector[512],
                        float T_matrix[PROC_TU][TMAT_DEPTH],
                        float Uq_matrix[UQMAT_DEPTH],
#if KF_C != 0
                        xf::cv::Mat<TYPE, C_CTRL, 1, NPC>& u_mat,
#endif
                        xf::cv::Mat<TYPE, M_MEAS, 1, NPC>& y_mat,
                        xf::cv::Mat<TYPE, M_MEAS, 1, NPC>& R_mat,
                        xf::cv::Mat<TYPE, M_MEAS, N_STATE, NPC>& H_mat,
                        xf::cv::Mat<TYPE, N_STATE, 1, NPC>& Xout_mat,
                        xf::cv::Mat<TYPE, N_STATE, N_STATE, NPC>& Uout_mat,
                        xf::cv::Mat<TYPE, N_STATE, 1, NPC>& Dout_mat,
                        bool X_write_en,
                        bool UD_write_en) {
    // Storage binding for U_matrix.
    if (URAM_EN == 0) {
// clang-format off
        #pragma HLS ARRAY_PARTITION variable=U_matrix complete dim=1
        // clang-format on
    } else {
// clang-format off
        #pragma HLS RESOURCE variable=U_matrix core=RAM_S2P_URAM
        #pragma HLS ARRAY_RESHAPE variable=U_matrix complete dim=1
        // clang-format on
    }

    // Storage binding for H_matrix.
    if (URAM_EN == 0) {
// clang-format off
        #pragma HLS ARRAY_PARTITION variable=H_matrix complete dim=1
        // clang-format on
    } else {
// clang-format off
        #pragma HLS RESOURCE variable=H_matrix core=RAM_S2P_URAM
        #pragma HLS ARRAY_RESHAPE variable=H_matrix complete dim=1
        // clang-format on
    }

    // Storage binding for T_matrix.
    if (URAM_EN == 0) {
// clang-format off
        #pragma HLS ARRAY_PARTITION variable=T_matrix complete dim=1
        #pragma HLS resource variable=T_matrix core=RAM_S2P_BRAM
        // clang-format on
    } else {
// clang-format off
        #pragma HLS RESOURCE variable=T_matrix core=RAM_S2P_URAM
        #pragma HLS ARRAY_RESHAPE variable=T_matrix complete dim=1
        // clang-format on
    }

// clang-format off
    #pragma HLS inline off
// clang-format on

// Single-trip labelled loop wrapping the two calls.
// NOTE(review): presumably kept only to give the HLS tool a named region for
// load_Uq + MeasUpdate - confirm before removing.
LOOP1:
    for (int itr1 = 0; itr1 < 1; itr1++) {
        load_Uq<N_STATE, PROC_TU, DEPTH_TU, TMAT_DEPTH, UQMAT_DEPTH, URAM_EN>(T_matrix, Uq_matrix);

        MeasUpdate<N_STATE, C_CTRL, M_MEAS, PROC_MU, DEPTH_MU, UMAT_DEPTH, HMAT_DEPTH, URAM_EN, EKF_EN, TYPE, NPC>(
            U_matrix, H_matrix, D_vector, xu_vector, ry_vector,
#if KF_C != 0
            u_mat,
#endif
            y_mat, R_mat, H_mat, Xout_mat, Uout_mat, Dout_mat, X_write_en, UD_write_en);
    }
}

template <int N_STATE, int U_SIZE, int TYPE, int NPC>
void load_control_input(
#if KF_C != 0
    xf::cv::Mat<TYPE, U_SIZE, 1, NPC>& control_input,
#endif
    float xu_vector[512]) {
    for (ap_uint<8> idx = 0; idx < U_SIZE; idx++) {
// clang-format off
#pragma HLS pipeline
// clang-format on
#if KF_C != 0
        xu_vector[N_STATE + idx] = control_input.read_float(idx);
#endif
    }
}

//########################################################################################//
// For gemv operation, each pass multiplies one column of A by one element of X
// and accumulates the result into the output vector:
//
//     | 0  4  8   12 |       | a |           0a            4b            8c            12d
//     | 1  5  9   13 |       | b |           1a            5b            9c            13d
// A = | 2  6  10  14 |   X = | c |   1st =   2a    2nd +=  6b    3rd +=  10c   4th +=  14d
//     | 3  7  11  15 |       | d |           3a            7b            11c           15d
//
//##########################################################################################//
// for   x' = Ax, x_buffer[256-383] = A_buffer*x_buffer[0-127]
// for x_tu = Bu, x_buffer[0-127]   = B_buffer*x_buffer[256-383]
//##########################################################################################//
template <int N_STATE, int C_CTRL, int PROC_MU, int DEPTH_MU, int DEPTH_MU_CTRL, int ABMAT_DEPTH, int UMAT_DEPTH>
void gemv(float AB_matrix[PROC_MU][ABMAT_DEPTH],
          float xu_vector[512],
          ap_uint<16> matrix_offset,
          ap_uint<10> vector_offset_in,
          ap_uint<10> vector_offset_out,
          ap_uint<8> outer_loop_bound) {
// clang-format off
    #pragma HLS inline off
// clang-format on

// New Gemv design with 1 multipliers and 1 adders
LOOP1:
    for (ap_uint<8> outer_loop = 0; outer_loop < outer_loop_bound; outer_loop++) {
// clang-format off
        #pragma HLS LOOP_TRIPCOUNT min=128 max=128
        // clang-format on
        float input_x = xu_vector[vector_offset_in + outer_loop];

        ap_uint<16> buffer_idx = 0;
        ap_uint<16> idx_inc;
        if (vector_offset_in == 0)
            idx_inc = DEPTH_MU;
        else
            idx_inc = DEPTH_MU_CTRL;

    LOOPF1:
        for (ap_uint<10> inner_loop = 0; inner_loop < N_STATE; inner_loop++) {
// clang-format off
            #pragma HLS loop_flatten off
            #pragma HLS DEPENDENCE variable=xu_vector inter false
            #pragma HLS pipeline
            // clang-format on

            float input_A = AB_matrix[outer_loop % PROC_MU][matrix_offset + buffer_idx + outer_loop / PROC_MU];

            float mul_out = input_A * input_x;

            ap_uint<32> offset;
            if (vector_offset_in != 0 && outer_loop != 0)
                offset = 0;
            else
                offset = 256;

            float intermediate_x = xu_vector[inner_loop + offset];

            float add_input;
            if (vector_offset_in == 0 && outer_loop == 0)
                add_input = 0;
            else
                add_input = intermediate_x;

            float add_in2;

            if (C_CTRL == 0) {
                if (vector_offset_in != 0)
                    add_in2 = 0;
                else
                    add_in2 = mul_out;

            } else {
                add_in2 = mul_out;
            }

            xu_vector[inner_loop + vector_offset_out] = add_input + add_in2;

            buffer_idx += idx_inc;

        } // end proc loop
    }     // end outer loop
}

template <int N_STATE, int C_CTRL, int PROC_MU, int DEPTH_MU, int DEPTH_MU_CTRL, int ABMAT_DEPTH, int UMAT_DEPTH>
void state_predict(float AB_matrix[PROC_MU][ABMAT_DEPTH], float xu_vector[512]) {
// clang-format off
    #pragma HLS inline off
    // clang-format on
    for (ap_uint<2> iteration = 0; iteration < 2; iteration++) {
        ap_uint<16> matrix_offset;
        ap_uint<10> vector_offset_in;
        ap_uint<10> vector_offset_out;
        ap_uint<8> outer_loop_bound;
        if (iteration == 0) {
            matrix_offset = 0;
            vector_offset_in = 0;
            vector_offset_out = 256;
            outer_loop_bound = N_STATE;
        } else {
            matrix_offset = UMAT_DEPTH;
            vector_offset_in = N_STATE;
            vector_offset_out = 0;
            if (C_CTRL == 0)
                outer_loop_bound = 1;
            else
                outer_loop_bound = C_CTRL;
        }

        gemv<N_STATE, C_CTRL, PROC_MU, DEPTH_MU, DEPTH_MU_CTRL, ABMAT_DEPTH, UMAT_DEPTH>(
            AB_matrix, xu_vector, matrix_offset, vector_offset_in, vector_offset_out, outer_loop_bound);
    }
}

// gemm_update: adds one slice of the product of AB_matrix (A part) and
// U_matrix into T_matrix.
//
// For every output row out_row0 (0 .. N_STATE-1) and every output column
// out_col = out_col_start + k (k = 0 .. out_col_cnt-1) a dot product of length
// 2*PROC_MU is formed from chunks 2*iteration and 2*iteration+1 of row
// out_row0 of A and of column out_col of U. The result overwrites the T entry
// when iteration == 0 and is added to it otherwise.
//
// out_col_start : first output column handled by this call.
// out_col_cnt   : number of output columns handled by this call.
// iteration     : index of the chunk pair along the reduction dimension.
//
// When DEPTH_MU is odd the last iteration has no second chunk (pad_en); both
// operands are then zero-padded so that nothing outside the current row of A
// or the current column of U takes part in the result.
template <int N_STATE,
          int PROC_TU,
          int DEPTH_TU,
          int PROC_MU,
          int DEPTH_MU,
          int ABMAT_DEPTH,
          int UMAT_DEPTH,
          int TMAT_DEPTH,
          bool URAM_EN>
void gemm_update(float AB_matrix[PROC_MU][ABMAT_DEPTH],
                 float U_matrix[PROC_MU][UMAT_DEPTH],
                 float T_matrix[PROC_TU][TMAT_DEPTH],
                 ap_uint<10> out_col_start,
                 ap_uint<10> out_col_cnt,
                 ap_uint<10> iteration) {
    if (URAM_EN == 0) {
// clang-format off
        #pragma HLS ARRAY_PARTITION variable=U_matrix complete dim=1
        // clang-format on
    } else {
// clang-format off
        #pragma HLS RESOURCE variable=U_matrix core=RAM_S2P_URAM
        #pragma HLS ARRAY_RESHAPE variable=U_matrix complete dim=1
        // clang-format on
    }

    if (URAM_EN == 0) {
// clang-format off
        #pragma HLS ARRAY_PARTITION variable=AB_matrix complete dim=1
        // clang-format on
    } else {
// clang-format off
        #pragma HLS RESOURCE variable=AB_matrix core=RAM_S2P_URAM
        #pragma HLS ARRAY_RESHAPE variable=AB_matrix complete dim=1
        // clang-format on
    }

    if (URAM_EN == 0) {
// clang-format off
        #pragma HLS ARRAY_PARTITION variable=T_matrix complete dim=1
        #pragma HLS resource variable=T_matrix core=RAM_S2P_BRAM
        // clang-format on
    } else {
// clang-format off
        #pragma HLS RESOURCE variable=T_matrix core=RAM_S2P_URAM
        #pragma HLS ARRAY_RESHAPE variable=T_matrix complete dim=1
        // clang-format on
    }

// clang-format off
    #pragma HLS inline off
// clang-format on
LOOP2:
    for (ap_uint<10> out_row0 = 0; out_row0 < N_STATE; out_row0++) {
    LOOP3:
        for (ap_uint<10> out_col_idx = 0; out_col_idx < out_col_cnt; out_col_idx++) {
// clang-format off
            #pragma HLS loop_flatten
            #pragma HLS LOOP_TRIPCOUNT min=128 max=128
            #pragma HLS DEPENDENCE variable=T_matrix inter false
            #pragma HLS pipeline
            // clang-format on

            ap_uint<8> out_col = out_col_start + out_col_idx;

            // Linear position of T[out_row0][out_col], split into the
            // partitioned (dim1) and depth (dim2) coordinates of T_matrix.
            ap_uint<20> out_index0 = out_row0 * (DEPTH_TU * PROC_TU) + out_col;
            ap_uint<8> dim1_0_Tmatrix = out_index0 % PROC_TU;
            ap_uint<16> dim2_0_Tmatrix = out_index0 / PROC_TU;

            // Two consecutive chunks of row out_row0 of A ...
            ap_uint<16> dim2_0_Amatrix = out_row0 * (DEPTH_MU) + iteration * 2;
            ap_uint<16> dim2_1_Amatrix = dim2_0_Amatrix + 1;

            // ... and the matching two chunks of column out_col of U.
            ap_uint<16> dim2_0_Umatrix = out_col * (DEPTH_MU) + iteration * 2;
            ap_uint<16> dim2_1_Umatrix = dim2_0_Umatrix + 1;

            // True when the second chunk index (2*iteration + 1) is not a
            // valid chunk of the current row/column.
            bool pad_en = (2 * iteration + 2) > DEPTH_MU;

            float input1_dotproduct[PROC_MU * 2];
            float input2_dotproduct[PROC_MU * 2];
            for (ap_uint<8> idx = 0; idx < PROC_MU; idx++) {
// clang-format off
                #pragma HLS unroll
                // clang-format on
                input1_dotproduct[idx] = AB_matrix[idx][dim2_0_Amatrix];
                if (pad_en)
                    input1_dotproduct[idx + PROC_MU] = 0;
                else
                    input1_dotproduct[idx + PROC_MU] = AB_matrix[idx][dim2_1_Amatrix];

                input2_dotproduct[idx] = U_matrix[idx][dim2_0_Umatrix];
                // Pad the U operand under the same condition as the A operand:
                // with pad_en set, dim2_1_Umatrix addresses the chunk after
                // the current column (for the last column this may be past the
                // stored U data), and 0 * NaN/Inf would otherwise yield NaN.
                if (pad_en)
                    input2_dotproduct[idx + PROC_MU] = 0;
                else
                    input2_dotproduct[idx + PROC_MU] = U_matrix[idx][dim2_1_Umatrix];
            }

            float dot_output = KF_dotProduct<2 * PROC_MU>(input1_dotproduct, input2_dotproduct);

            // First iteration initialises the T entry, later ones accumulate.
            float read1 = T_matrix[dim1_0_Tmatrix][dim2_0_Tmatrix];
            float write1;
            if (iteration == 0)
                write1 = dot_output;
            else
                write1 = read1 + dot_output;

            T_matrix[dim1_0_Tmatrix][dim2_0_Tmatrix] = write1;
        }
    }
}

template <int N_STATE,
          int PROC_TU,
          int DEPTH_TU,
          int PROC_MU,
          int DEPTH_MU,
          int ABMAT_DEPTH,
          int UMAT_DEPTH,
          int TMAT_DEPTH,
          bool URAM_EN>
void AU_compute(float AB_matrix[PROC_MU][ABMAT_DEPTH],
                float U_matrix[PROC_MU][UMAT_DEPTH],
                float T_matrix[PROC_TU][TMAT_DEPTH]) {
    if (URAM_EN == 0) {
// clang-format off
        #pragma HLS ARRAY_PARTITION variable=U_matrix complete dim=1
        // clang-format on
    } else {
// clang-format off
        #pragma HLS RESOURCE variable=U_matrix core=RAM_S2P_URAM
        #pragma HLS ARRAY_RESHAPE variable=U_matrix complete dim=1
        // clang-format on
    }

    if (URAM_EN == 0) {
// clang-format off
        #pragma HLS ARRAY_PARTITION variable=AB_matrix complete dim=1
        // clang-format on
    } else {
// clang-format off
        #pragma HLS RESOURCE variable=AB_matrix core=RAM_S2P_URAM
        #pragma HLS ARRAY_RESHAPE variable=AB_matrix complete dim=1
        // clang-format on
    }

    if (URAM_EN == 0) {
// clang-format off
        #pragma HLS ARRAY_PARTITION variable=T_matrix complete dim=1
        #pragma HLS resource variable=T_matrix core=RAM_S2P_BRAM
        // clang-format on
    } else {
// clang-format off
        #pragma HLS RESOURCE variable=T_matrix core=RAM_S2P_URAM
        #pragma HLS ARRAY_RESHAPE variable=T_matrix complete dim=1
        // clang-format on
    }

// clang-format off
    #pragma HLS inline off
    // clang-format on

    enum { GEMM_ITERATION = ((DEPTH_MU / 2) + (DEPTH_MU % 2)) };

LOOP1:
    for (ap_uint<10> iteration = 0, out_col_start = 0, out_col_cnt = N_STATE; iteration < GEMM_ITERATION;
         iteration++, out_col_start += (2 * PROC_MU), out_col_cnt -= (2 * PROC_MU)) {
        gemm_update<N_STATE, PROC_TU, DEPTH_TU, PROC_MU, DEPTH_MU, ABMAT_DEPTH, UMAT_DEPTH, TMAT_DEPTH, URAM_EN>(
            AB_matrix, U_matrix, T_matrix, out_col_start, out_col_cnt, iteration);
    }
}

// In update_T_matrix function...
//
// for j= n..1
//{
//  Dbar[j] = trans(t[j]) * DpDq * t[j]; //Delta[j] = DpDq * t[j]
//
//  for i= 1..j-1
//  {
//      Ubar[i,j] = trans(t[i]) * Delta[j] / Dbar[j]
//      t[i] = t[i] - Ubar[i,j]*t[j]
//  }
//}
//
// The pseudocode above is restructured as shown below
//
// for j= n..1
//{
//  for i= j..1
//  {
//      Delta[j] = DpDq * t[j]
//      Udash = dotproduct(trans(t[i]) ,Delta[j])
//      if(i=j)
//          Dbar[j] = Udash
//      Ubar[i,j] = Udash / Dbar[j]
//      t[i] = t[i] - Ubar[i,j]*t[j]
//  }
//}
// update_T_matrix: handles one column (u_col_num) of the U-D refactorisation
// sketched in the pseudo-code above.
//
// Tj_vector / Deltaj_vector : read only; indexed [lane][depth_num]. The caller
//                             is expected to have filled them for row
//                             u_col_num of T (see load_TjDeltaj_vector).
// T_matrix                  : rows u_col_num .. 0 are rewritten in place.
// U_matrix                  : entries (row, u_col_num) for row = u_col_num .. 0
//                             are written.
// D_vector                  : D_vector[u_col_num] is written.
// u_col_num                 : column being processed.
//
// The row loop runs one extra iteration (u_row_num == -1) because every T row
// is written back one iteration after it has been read.
template <int N_STATE,
          int PROC_TU,
          int DEPTH_TU,
          int PROC_MU,
          int DEPTH_MU,
          int TMAT_DEPTH,
          int DPDQ_DEPTH,
          int UMAT_DEPTH,
          bool URAM_EN>
void update_T_matrix(float Tj_vector[PROC_TU][DPDQ_DEPTH],
                     float Deltaj_vector[PROC_TU][DPDQ_DEPTH],
                     float T_matrix[PROC_TU][TMAT_DEPTH],
                     float U_matrix[PROC_MU][UMAT_DEPTH],
                     float D_vector[512],
                     ap_uint<10> u_col_num) {
    if (URAM_EN == 0) {
// clang-format off
        #pragma HLS ARRAY_PARTITION variable=T_matrix complete dim=1
        #pragma HLS resource variable=T_matrix core=RAM_S2P_BRAM
        // clang-format on
    } else {
// clang-format off
        #pragma HLS RESOURCE variable=T_matrix core=RAM_S2P_URAM
        #pragma HLS ARRAY_RESHAPE variable=T_matrix complete dim=1
        // clang-format on
    }
// clang-format off
    #pragma HLS inline off
    // clang-format on
    // Dot value latched on the first row pass (row == column); used as the
    // divisor for every U entry of this column.
    float Dn_value = 0;
    // One partial dot product per depth step of the current row.
    float dotOutInt_ti_Deltaj[DEPTH_TU];
// clang-format off
    #pragma HLS ARRAY_PARTITION variable=dotOutInt_ti_Deltaj complete dim=1
    // clang-format on
    for (int i = 0; i < DEPTH_TU; i++) {
// clang-format off
        #pragma HLS unroll
        // clang-format on
        dotOutInt_ti_Deltaj[i] = 0;
    }
    // Full dot product of the most recently completed row.
    float Un_dash = 0;
    // U entry of the previously completed row; scale factor for the T update.
    float U_value_in = 0;

    // Two row buffers used alternately: the row read in this iteration goes
    // into one, the row read in the previous iteration is taken from the other.
    float Ti_ping[PROC_TU][DEPTH_TU];
// clang-format off
    #pragma HLS ARRAY_PARTITION variable=Ti_ping complete dim=0
    // clang-format on
    float Ti_pong[PROC_TU][DEPTH_TU];
// clang-format off
    #pragma HLS ARRAY_PARTITION variable=Ti_pong complete dim=0
// clang-format on

//#########################
// During the 1st iteration of LOOPM_1 no row of the T matrix is updated.
// Because this loop runs in ping-pong fashion, the 1st iteration is idle for
// U_value_in and for the T matrix write-back.
// After the 1st iteration, the T matrix row being written is u_row_num+1.

LOOPM_1:
    for (ap_int<16> u_row_num = u_col_num, start = 0; u_row_num >= -1; u_row_num--, start++) {
    LOOPM_2:
        for (ap_uint<10> depth_num = 0; depth_num < DEPTH_TU; depth_num++) {
// clang-format off
            #pragma HLS LOOP_TRIPCOUNT min=128*16 max=128*16
            #pragma HLS DEPENDENCE variable=T_matrix inter false
            #pragma HLS pipeline
            // clang-format on

            // index_num  : depth address of the row being read (u_row_num).
            // index_num2 : depth address of the row being written (u_row_num+1).
            ap_uint<16> index_num;
            ap_uint<16> index_num2;
            index_num = u_row_num * DEPTH_TU + depth_num;
            index_num2 = (u_row_num + 1) * DEPTH_TU + depth_num;

            float Ti_chunk[PROC_TU];
// clang-format off
            #pragma HLS ARRAY_PARTITION variable=Ti_chunk complete dim=1
            // clang-format on

            for (ap_uint<10> idx = 0; idx < PROC_TU; idx++) {
// clang-format off
#pragma HLS unroll
// clang-format on
#if !__SYNTHESIS__
                // Outside synthesis: substitute 0 in the flush iteration
                // (u_row_num == -1) instead of reading T_matrix with the
                // wrapped index.
                float T_mat_read;
                if (u_row_num == -1)
                    T_mat_read = 0;
                else {
                    T_mat_read = T_matrix[idx][index_num];
                }
#else
                float T_mat_read = T_matrix[idx][index_num];
#endif
                Ti_chunk[idx] = T_mat_read;
                // Bit 0 of the iteration counter picks the buffer to fill.
                if (start[0] == 0)
                    Ti_ping[idx][depth_num] = T_mat_read;
                else
                    Ti_pong[idx][depth_num] = T_mat_read;
            } // idx loop

            float Tj_for_delta[PROC_TU];
// clang-format off
            #pragma HLS ARRAY_PARTITION variable=Tj_for_delta complete dim=1
            // clang-format on
            float deltaj_chunk[PROC_TU];
// clang-format off
            #pragma HLS ARRAY_PARTITION variable=deltaj_chunk complete dim=1
            // clang-format on

            for (ap_uint<10> idx = 0; idx < PROC_TU; idx++) {
// clang-format off
                #pragma HLS unroll
                // clang-format on
                Tj_for_delta[idx] = Tj_vector[idx][depth_num];
                deltaj_chunk[idx] = Deltaj_vector[idx][depth_num];
            } // idx loop

            // Partial dot product of the current row chunk with Delta[j].
            float temp_dotout;
            temp_dotout = KF_dotProduct<PROC_TU>(Ti_chunk, deltaj_chunk);

            dotOutInt_ti_Deltaj[depth_num] = temp_dotout;

            // NOTE(review): KF_treeAdder is defined elsewhere; presumably it
            // sums all DEPTH_TU partials into Un_dash_temp - confirm.
            float Un_dash_temp;
            KF_treeAdder<DEPTH_TU>(dotOutInt_ti_Deltaj, &Un_dash_temp);

            // Entries above depth_num still belong to the previous row, so the
            // result is only latched on the last depth step.
            if ((depth_num) == (DEPTH_TU - 1)) Un_dash = Un_dash_temp;

            // First row pass (row == column): this value is Dbar[j].
            if (u_row_num == u_col_num && (depth_num) == (DEPTH_TU - 1)) {
                Dn_value = Un_dash;
                D_vector[u_col_num] = Un_dash;
            }

            float Un_value = Un_dash / Dn_value;

            // Position of U(u_row_num, u_col_num) inside U_matrix.
            ap_uint<8> dim1_Umat = u_row_num % PROC_MU;
            ap_uint<16> dim2_Umat = u_col_num * (DEPTH_MU) + u_row_num / PROC_MU;

            // Written on every depth step; the value stored on the last depth
            // step of the row is the one based on the complete dot product.
            if (u_row_num != -1) {
                U_matrix[dim1_Umat][dim2_Umat] = Un_value;
            }

            // Fetch the row buffered in the previous iteration (the buffer NOT
            // being filled in this iteration).
            float Ti_select[PROC_TU];
            for (ap_uint<10> idx = 0; idx < PROC_TU; idx++) {
                if (start[0] == 1)
                    Ti_select[idx] = Ti_ping[idx][depth_num];
                else
                    Ti_select[idx] = Ti_pong[idx][depth_num];
            }

            float Ti_update_chunk[PROC_TU];

            // NOTE(review): KF_scaleSub is defined elsewhere; presumably
            // Ti_update_chunk = Ti_select - U_value_in * Tj_for_delta, matching
            // "t[i] = t[i] - Ubar[i,j]*t[j]" in the pseudo-code - confirm.
            KF_scaleSub<PROC_TU>(Ti_select, U_value_in, Tj_for_delta, Ti_update_chunk);

            for (ap_uint<10> idx = 0; idx < PROC_TU; idx++) {
// clang-format off
                #pragma HLS unroll
                // clang-format on
                // No write-back in the first iteration: nothing is buffered yet.
                if (u_row_num != u_col_num) {
                    T_matrix[idx][index_num2] = Ti_update_chunk[idx];
                }
            } // idx loop

            // Hand this row's U entry to the next iteration's T update.
            if (depth_num == DEPTH_TU - 1) U_value_in = Un_value;

        } // depth_num
    }     // u_row_num
}

//####Load 1 ROW from T_matrix from Tj_vector. ROW id = u_col_num
template <int N_STATE, int PROC_TU, int DEPTH_TU, int TMAT_DEPTH, int DPDQ_DEPTH, bool URAM_EN>
void load_TjDeltaj_vector(float T_matrix[PROC_TU][TMAT_DEPTH],
                          float Tj_vector[PROC_TU][DPDQ_DEPTH],
                          float Deltaj_vector[PROC_TU][DPDQ_DEPTH],
                          float D_vector[512],
                          ap_uint<10> u_col_num) {
    if (URAM_EN == 0) {
// clang-format off
        #pragma HLS ARRAY_PARTITION variable=T_matrix complete dim=1
        #pragma HLS resource variable=T_matrix core=RAM_S2P_BRAM
        // clang-format on
    } else {
// clang-format off
        #pragma HLS RESOURCE variable=T_matrix core=RAM_S2P_URAM
        #pragma HLS ARRAY_RESHAPE variable=T_matrix complete dim=1
        // clang-format on
    }
// clang-format off
    #pragma HLS ARRAY_PARTITION variable=Tj_vector complete dim=1
    #pragma HLS ARRAY_PARTITION variable=Deltaj_vector complete dim=1
    #pragma HLS inline off
    // clang-format on

    ap_uint<10> dim1_D = N_STATE;
    for (ap_uint<14> idx1 = 0; idx1 < (DEPTH_TU); idx1++) {
        for (ap_uint<8> idx2 = 0; idx2 < PROC_TU; idx2++) {
// clang-format off
            #pragma HLS pipeline
            // clang-format on
            ap_uint<16> dim2 = (idx1 + u_col_num * DEPTH_TU);

            float T_value = T_matrix[idx2][dim2];
            float D_value = D_vector[dim1_D++];

            Deltaj_vector[idx2][idx1] = T_value * D_value;
            Tj_vector[idx2][idx1] = T_value;
        }
    }
}

// UD_compute: rebuilds U_matrix and D_vector from T_matrix, one column at a
// time, from column N_STATE-1 down to column 0.
//
// For each column u_col_num:
//   1. load_TjDeltaj_vector() copies row u_col_num of T_matrix into Tj_vector
//      and its D-scaled copy into Deltaj_vector;
//   2. update_T_matrix() writes D_vector[u_col_num] and column u_col_num of
//      U_matrix, and rewrites rows of T_matrix.
//
// Storage for U_matrix / T_matrix is selected by URAM_EN (partitioned arrays
// when 0, URAM-bound reshaped arrays otherwise).
template <int N_STATE,
          int PROC_TU,
          int DEPTH_TU,
          int PROC_MU,
          int DEPTH_MU,
          int TMAT_DEPTH,
          int UMAT_DEPTH,
          int DPDQ_DEPTH,
          bool URAM_EN>
void UD_compute(float T_matrix[PROC_TU][TMAT_DEPTH], float U_matrix[PROC_MU][UMAT_DEPTH], float D_vector[512]) {
    if (URAM_EN == 0) {
// clang-format off
        #pragma HLS ARRAY_PARTITION variable=U_matrix complete dim=1
        // clang-format on
    } else {
// clang-format off
        #pragma HLS RESOURCE variable=U_matrix core=RAM_S2P_URAM
        #pragma HLS ARRAY_RESHAPE variable=U_matrix complete dim=1
        // clang-format on
    }

    if (URAM_EN == 0) {
// clang-format off
        #pragma HLS ARRAY_PARTITION variable=T_matrix complete dim=1
        #pragma HLS resource variable=T_matrix core=RAM_S2P_BRAM
        // clang-format on
    } else {
// clang-format off
        #pragma HLS RESOURCE variable=T_matrix core=RAM_S2P_URAM
        #pragma HLS ARRAY_RESHAPE variable=T_matrix complete dim=1
        // clang-format on
    }
// clang-format off
    #pragma HLS inline off
    // clang-format on

    // Scratch copies of the current T row (Tj) and of that row scaled by D
    // (Deltaj); refilled for every column.
    float Tj_vector[PROC_TU][DPDQ_DEPTH];
// clang-format off
    #pragma HLS ARRAY_PARTITION variable=Tj_vector complete dim=1
    // clang-format on
    float Deltaj_vector[PROC_TU][DPDQ_DEPTH];
// clang-format off
    #pragma HLS ARRAY_PARTITION variable=Deltaj_vector complete dim=1
    // clang-format on

    // Signed counter so that the loop can terminate after column 0.
    for (ap_int<10> u_col_num = N_STATE - 1; u_col_num >= 0; u_col_num--) {
        load_TjDeltaj_vector<N_STATE, PROC_TU, DEPTH_TU, TMAT_DEPTH, DPDQ_DEPTH, URAM_EN>(
            T_matrix, Tj_vector, Deltaj_vector, D_vector, u_col_num);

        update_T_matrix<N_STATE, PROC_TU, DEPTH_TU, PROC_MU, DEPTH_MU, TMAT_DEPTH, DPDQ_DEPTH, UMAT_DEPTH, URAM_EN>(
            Tj_vector, Deltaj_vector, T_matrix, U_matrix, D_vector, u_col_num);
    }
}

// TimeUpdate: time-update (prediction) stage.
//
// Sequence of calls:
//   1. load_control_input()  - only when the KF_C macro is non-zero and
//                              EKF_EN == 0; copies u_mat into xu_vector.
//   2. AU_compute()          - fills T_matrix from AB_matrix and U_matrix.
//   3. state_predict()       - only when EKF_EN == 0; updates xu_vector.
//   4. UD_compute()          - rebuilds U_matrix / D_vector from T_matrix.
//   5. KF_X_write()          - only when X_write_en is set.
//   6. KF_UD_write()         - only when UD_write_en is set.
//
// u_mat is part of the signature only when the KF_C macro is non-zero.
// NOTE(review): KF_X_write / KF_UD_write are defined elsewhere; presumably
// they copy the state and the U/D factors to Xout_mat, Uout_mat and Dout_mat -
// confirm against their definitions.
template <int N_STATE,
          int M_MEAS,
          int C_CTRL,
          int PROC_TU,
          int DEPTH_TU,
          int PROC_MU,
          int DEPTH_MU,
          int DEPTH_MU_CTRL,
          int UMAT_DEPTH,
          int ABMAT_DEPTH,
          int DPDQ_DEPTH,
          int TMAT_DEPTH,
          bool URAM_EN,
          bool EKF_EN,
          int TYPE,
          int NPC>
void TimeUpdate(float T_matrix[PROC_TU][TMAT_DEPTH],
                float AB_matrix[PROC_MU][ABMAT_DEPTH],
                float xu_vector[512],
                float U_matrix[PROC_MU][UMAT_DEPTH],
                float D_vector[512],
#if KF_C != 0
                xf::cv::Mat<TYPE, C_CTRL, 1, NPC>& u_mat,
#endif
                xf::cv::Mat<TYPE, N_STATE, 1, NPC>& Xout_mat,
                xf::cv::Mat<TYPE, N_STATE, N_STATE, NPC>& Uout_mat,
                xf::cv::Mat<TYPE, N_STATE, 1, NPC>& Dout_mat,
                bool X_write_en,
                bool UD_write_en) {

// clang-format off
    #pragma HLS inline off
// clang-format on

// Single-trip labelled loop grouping the control-input load with AU_compute.
// NOTE(review): presumably kept only as a named region for the HLS tool -
// confirm before removing.
LOOP1:
    for (int itr1 = 0; itr1 < 1; itr1++) {
#if KF_C != 0
        if (EKF_EN == 0) load_control_input<N_STATE, C_CTRL, TYPE, NPC>(u_mat, xu_vector);
#endif
        AU_compute<N_STATE, PROC_TU, DEPTH_TU, PROC_MU, DEPTH_MU, ABMAT_DEPTH, UMAT_DEPTH, TMAT_DEPTH, URAM_EN>(
            AB_matrix, U_matrix, T_matrix);
    }

// Single-trip labelled loop grouping the state prediction with UD_compute.
LOOP2:
    for (int itr1 = 0; itr1 < 1; itr1++) {
        // The linear state prediction is skipped in EKF mode.
        if (EKF_EN == 0)
            state_predict<N_STATE, C_CTRL, PROC_MU, DEPTH_MU, DEPTH_MU_CTRL, ABMAT_DEPTH, UMAT_DEPTH>(AB_matrix,
                                                                                                      xu_vector);

        UD_compute<N_STATE, PROC_TU, DEPTH_TU, PROC_MU, DEPTH_MU, TMAT_DEPTH, UMAT_DEPTH, DPDQ_DEPTH, URAM_EN>(
            T_matrix, U_matrix, D_vector);
    }

    // Optional write-out of the predicted state vector.
    if (X_write_en) KF_X_write<N_STATE, TYPE, NPC>(xu_vector, Xout_mat);

    // Optional write-out of the predicted U and D factors.
    if (UD_write_en)
        KF_UD_write<N_STATE, PROC_MU, DEPTH_MU, UMAT_DEPTH, TYPE, NPC>(U_matrix, D_vector, Uout_mat, Dout_mat);
}

template <int N_STATE,
          int M_MEAS,
          int C_CTRL,
          int PROC_TU,
          int DEPTH_TU,
          int PROC_MU,
          int DEPTH_MU,
          int DEPTH_MU_CTRL,
          int UMAT_DEPTH,
          int HMAT_DEPTH,
          int ABMAT_DEPTH,
          int DPDQ_DEPTH,
          int TMAT_DEPTH,
          int UQMAT_DEPTH,
          bool URAM_EN,
          bool EKF_EN,
          int TYPE,
          int NPC>
void initialization(xf::cv::Mat<TYPE, N_STATE, N_STATE, NPC>& A_mat,
#if KF_C != 0
                    xf::cv::Mat<TYPE, N_STATE, C_CTRL, NPC>& B_mat,
#endif
                    xf::cv::Mat<TYPE, N_STATE, N_STATE, NPC>& Uq_mat,
                    xf::cv::Mat<TYPE, N_STATE, 1, NPC>& Dq_mat,
                    xf::cv::Mat<TYPE, M_MEAS, N_STATE, NPC>& H_mat,
                    xf::cv::Mat<TYPE, N_STATE, 1, NPC>& X0_mat,
                    xf::cv::Mat<TYPE, N_STATE, N_STATE, NPC>& U0_mat,
                    xf::cv::Mat<TYPE, N_STATE, 1, NPC>& D0_mat,
                    xf::cv::Mat<TYPE, M_MEAS, 1, NPC>& R_mat,
                    float H_matrix[PROC_MU][HMAT_DEPTH],
                    float U_matrix[PROC_MU][UMAT_DEPTH],
                    float xu_vector[512],
                    float ry_vector[512],
                    float D_vector[512],
                    float AB_matrix[PROC_MU][ABMAT_DEPTH],
                    float T_matrix[PROC_TU][TMAT_DEPTH],
                    float Uq_matrix[UQMAT_DEPTH],
                    bool read_opt_flag) {
    if (URAM_EN == 0) {
// clang-format off
        #pragma HLS ARRAY_PARTITION variable=H_matrix complete dim=1
        // clang-format on
    } else {
// clang-format off
        #pragma HLS RESOURCE variable=H_matrix core=RAM_S2P_URAM
        #pragma HLS ARRAY_RESHAPE variable=H_matrix complete dim=1
        // clang-format on
    }

    if (URAM_EN == 0) {
// clang-format off
        #pragma HLS ARRAY_PARTITION variable=U_matrix complete dim=1
        // clang-format on
    } else {
// clang-format off
        #pragma HLS RESOURCE variable=U_matrix core=RAM_S2P_URAM
        #pragma HLS ARRAY_RESHAPE variable=U_matrix complete dim=1
        // clang-format on
    }

    if (URAM_EN == 0) {
// clang-format off
        #pragma HLS ARRAY_PARTITION variable=AB_matrix complete dim=1
        // clang-format on
    } else {
// clang-format off
        #pragma HLS RESOURCE variable=AB_matrix core=RAM_S2P_URAM
        #pragma HLS ARRAY_RESHAPE variable=AB_matrix complete dim=1
        // clang-format on
    }

    if (URAM_EN == 0) {
// clang-format off
        #pragma HLS ARRAY_PARTITION variable=T_matrix complete dim=1
        #pragma HLS resource variable=T_matrix core=RAM_S2P_BRAM
        // clang-format on
    } else {
// clang-format off
        #pragma HLS RESOURCE variable=T_matrix core=RAM_S2P_URAM
        #pragma HLS ARRAY_RESHAPE variable=T_matrix complete dim=1
        // clang-format on
    }

    if (URAM_EN == 1) {
// clang-format off
        #pragma HLS RESOURCE variable=Uq_matrix core=RAM_S2P_URAM
        // clang-format on
    }

// clang-format off
    #pragma HLS inline off
    // clang-format on

    int U0_loop_cnt;
    if (EKF_EN == 1 && read_opt_flag == 1)
        U0_loop_cnt = 0;
    else
        U0_loop_cnt = N_STATE * N_STATE;

    ap_uint<32> counter1 = 0;
    ap_uint<32> counter1_1 = 0; // for dim2
    ap_uint<32> counter2 = 0;   // for dim1
    ap_uint<32> counter3 = 0;   // for dim2
LOOPI_U:
    for (int ptr = 0; ptr < U0_loop_cnt; ptr++) {
// clang-format off
        #pragma HLS pipeline
        // clang-format on

        ap_uint<8> dim1 = counter2;
        ap_uint<16> dim2 = counter1_1 + counter3;

        U_matrix[dim1][dim2] = U0_mat.read_float(ptr);

        if (counter1 == N_STATE - 1) {
            if (counter2 == PROC_MU - 1) {
                counter2 = 0;
                counter3++;
            } else {
                counter2++;
            }

            counter1 = 0;
            counter1_1 = 0;
        } else {
            counter1++;
            counter1_1 += DEPTH_MU;
        }
    }

LOOPHZ:
    for (int ptr_zero = 0, dim2 = (DEPTH_MU - 1); ptr_zero < M_MEAS; ptr_zero++, dim2 += DEPTH_MU) {
// clang-format off
        #pragma HLS pipeline
        // clang-format on

        for (int dim1 = 0; dim1 < PROC_MU; dim1++) {
// clang-format off
            #pragma HLS unroll
            // clang-format on

            H_matrix[dim1][dim2] = 0;
        }
    }

    if (EKF_EN == 0) {
        ap_uint<32> offset_incH = 0;
        ap_uint<32> counter_Hrow = 0;

    LOOPI_H:
        for (int ptr = 0; ptr < M_MEAS * N_STATE; ptr++) {
// clang-format off
            #pragma HLS pipeline
            // clang-format on
            ap_uint<32> offset = offset_incH + counter_Hrow;
            ap_uint<10> dim1 = offset % PROC_MU;
            ap_uint<16> dim2 = offset / PROC_MU;

            H_matrix[dim1][dim2] = H_mat.read_float(ptr);

            if (counter_Hrow == N_STATE - 1) {
                counter_Hrow = 0;
                offset_incH += DEPTH_MU * PROC_MU;
            } else
                counter_Hrow++;
        }
    }

    //******************************Load R ****************************//
    int R_loop_cnt;
    if (EKF_EN == 1 && read_opt_flag == 1)
        R_loop_cnt = 0;
    else
        R_loop_cnt = M_MEAS;

LOOPI_R:
    for (int ptr = 0; ptr < R_loop_cnt; ptr++) {
// clang-format off
        #pragma HLS pipeline
        // clang-format on

        ry_vector[ptr] = R_mat.read_float(ptr);
    }

//******************************Load X0 ****************************//
LOOPI_X:
    for (int ptr = 0; ptr < N_STATE; ptr++) {
// clang-format off
        #pragma HLS pipeline
        // clang-format on

        xu_vector[ptr] = X0_mat.read_float(ptr);
    }

    //******************************Load D0 ****************************//
    int D0_loop_cnt;
    if (EKF_EN == 1 && read_opt_flag == 1)
        D0_loop_cnt = 0;
    else
        D0_loop_cnt = N_STATE;

LOOPI_D:
    for (int ptr = 0; ptr < D0_loop_cnt; ptr++) {
// clang-format off
        #pragma HLS pipeline
        // clang-format on

        D_vector[ptr] = D0_mat.read_float(ptr);
    }

LOOPI_T1:
    for (int ptr = 0; ptr < D0_loop_cnt; ptr++) {
// clang-format off
        #pragma HLS pipeline
        // clang-format on

        D_vector[ptr + N_STATE] = D_vector[ptr];
    }

    //******************************Load A <Row major>****************************//

    ap_uint<16> dim2 = (DEPTH_MU - 1);
LOOPAZ:
    for (int ptr_zero = 0; ptr_zero < 2 * N_STATE; ptr_zero++) {
// clang-format off
        #pragma HLS pipeline
        // clang-format on

        for (int dim1 = 0; dim1 < PROC_MU; dim1++) {
// clang-format off
            #pragma HLS unroll
            // clang-format on

            AB_matrix[dim1][dim2] = 0;
        }
        if (ptr_zero < (N_STATE - 1))
            dim2 += DEPTH_MU;
        else
            dim2 += DEPTH_MU_CTRL;
    }

    ap_uint<32> offset_incA = 0;
    ap_uint<32> counter_Arow = 0;
LOOPI_A:
    for (int ptr = 0; ptr < N_STATE * N_STATE; ptr++) {
// clang-format off
        #pragma HLS pipeline
        // clang-format on

        ap_uint<32> offset = offset_incA + counter_Arow;
        ap_uint<8> dim1 = offset % PROC_MU;
        ap_uint<16> dim2 = offset / PROC_MU;

        AB_matrix[dim1][dim2] = A_mat.read_float(ptr);

        if (counter_Arow == N_STATE - 1) {
            counter_Arow = 0;
            offset_incA += DEPTH_MU * PROC_MU;
        } else
            counter_Arow++;
    }

    ap_uint<32> offset_incB = 0;
    ap_uint<32> counter_Brow = 0;

    int B_loop_cnt;
    if (EKF_EN == 1)
        B_loop_cnt = 0;
    else
        B_loop_cnt = N_STATE * C_CTRL;

LOOPI_B:
    for (int ptr = 0; ptr < B_loop_cnt; ptr++) {
// clang-format off
        #pragma HLS pipeline
        // clang-format on

        ap_uint<32> offset = offset_incB + counter_Brow;
        ap_uint<8> dim1 = offset % PROC_MU;
        ap_uint<16> dim2 = offset / PROC_MU;
#if KF_C != 0
        AB_matrix[dim1][dim2 + (DEPTH_MU * N_STATE)] = B_mat.read_float(ptr);
#endif
        if (counter_Brow == C_CTRL - 1) {
            counter_Brow = 0;
            offset_incB += DEPTH_MU_CTRL * PROC_MU;
        } else
            counter_Brow++;
    }

    //******************************Load Dq only digonal elements****************************//
    int Dq_loop_cnt;
    if (EKF_EN == 1 && read_opt_flag == 1)
        Dq_loop_cnt = 0;
    else
        Dq_loop_cnt = N_STATE;

LOOPI_T2:
    for (int ptr = 0; ptr < Dq_loop_cnt; ptr++) {
// clang-format off
        #pragma HLS pipeline
        // clang-format on

        D_vector[ptr + 2 * N_STATE] = Dq_mat.read_float(ptr);
    }
    //******************************Load Uq <Row major> ****************************//
    int Uq_loop_cnt;
    if (EKF_EN == 1 && read_opt_flag == 1)
        Uq_loop_cnt = 0;
    else
        Uq_loop_cnt = N_STATE * N_STATE;

    ap_uint<16> counter_trow = 0;
    ap_uint<32> offset_inc = N_STATE;
LOOPI_UQ:
    for (int ptr = 0; ptr < Uq_loop_cnt; ptr++) {
// clang-format off
        #pragma HLS pipeline
        // clang-format on

        ap_uint<32> offset = offset_inc + counter_trow;
        ap_uint<8> dim1 = offset % PROC_TU;
        ap_uint<16> dim2 = offset / PROC_TU;

        float Uq_value = Uq_mat.read_float(ptr);

        T_matrix[dim1][dim2] = Uq_value;
        Uq_matrix[ptr] = Uq_value;

        if (counter_trow == N_STATE - 1) {
            counter_trow = 0;
            offset_inc += DEPTH_TU * PROC_TU;
        } else
            counter_trow++;
    }
}

/**
 * Kalman filter core: owns the persistent working buffers and runs the stages
 * selected by `flag`.
 *
 * Template parameters:
 *   N_STATE, M_MEAS, C_CTRL  dimensions used to size the matrices and buffers below.
 *   PROC_TU, PROC_MU         first-dimension size of T_matrix and of the
 *                            H/U/AB matrices respectively.
 *   URAM_EN                  selects which HLS pragma set is applied to the buffers.
 *   EKF_EN                   enables the xu_vector[511] bookkeeping in this function
 *                            and is forwarded to every stage.
 *   TYPE, NPC                xf::cv::Mat type / pixels-per-clock parameters.
 *
 * `flag` bit usage in this function:
 *   bit 0  call initialization()
 *   bit 1  call TimeUpdate()
 *   bit 2  call MeasUpdate_wrapper()
 *   bit 3, bit 4  forwarded to TimeUpdate()
 *   bit 5, bit 6  forwarded to MeasUpdate_wrapper()
 *   bit 7  forwarded to initialization()
 * The meaning of the forwarded bits is defined by the callee.
 * NOTE(review): bit 7 presumably is initialization()'s read_opt_flag -- confirm
 * against its signature.
 *
 * All working buffers are function-local `static`, so their contents persist
 * between calls; a call that sets only bit 1 and/or bit 2 operates on whatever
 * earlier calls left in them.
 */
template <int N_STATE, int M_MEAS, int C_CTRL, int PROC_TU, int PROC_MU, bool URAM_EN, bool EKF_EN, int TYPE, int NPC>
void KalmanFilter_def(xf::cv::Mat<TYPE, N_STATE, N_STATE, NPC>& A_mat,
#if KF_C != 0
                      xf::cv::Mat<TYPE, N_STATE, C_CTRL, NPC>& B_mat,
#endif
                      xf::cv::Mat<TYPE, N_STATE, N_STATE, NPC>& Uq_mat,
                      xf::cv::Mat<TYPE, N_STATE, 1, NPC>& Dq_mat,
                      xf::cv::Mat<TYPE, M_MEAS, N_STATE, NPC>& H_mat,
                      xf::cv::Mat<TYPE, N_STATE, 1, NPC>& X0_mat,
                      xf::cv::Mat<TYPE, N_STATE, N_STATE, NPC>& U0_mat,
                      xf::cv::Mat<TYPE, N_STATE, 1, NPC>& D0_mat,
                      xf::cv::Mat<TYPE, M_MEAS, 1, NPC>& R_mat,
#if KF_C != 0
                      xf::cv::Mat<TYPE, C_CTRL, 1, NPC>& u_mat,
#endif
                      xf::cv::Mat<TYPE, M_MEAS, 1, NPC>& y_mat,
                      xf::cv::Mat<TYPE, N_STATE, 1, NPC>& Xout_mat,
                      xf::cv::Mat<TYPE, N_STATE, N_STATE, NPC>& Uout_mat,
                      xf::cv::Mat<TYPE, N_STATE, 1, NPC>& Dout_mat,
                      unsigned char flag) {
// clang-format off
    #pragma HLS inline off
    // clang-format on

    // Compile-time buffer geometry. The three DEPTH_* values are ceiling
    // divisions: (a / b) + ((a % b) != 0).
    enum {
        // ceil(2 * N_STATE / PROC_TU)
        DEPTH_TU = ((2 * N_STATE) / PROC_TU + (((2 * N_STATE) % PROC_TU) != 0)),
        // ceil(N_STATE / PROC_MU)
        DEPTH_MU = (N_STATE / PROC_MU + ((N_STATE % PROC_MU) != 0)),
        // ceil(C_CTRL / PROC_MU); evaluates to 0 when C_CTRL == 0
        DEPTH_MU_CTRL = (C_CTRL / PROC_MU + ((C_CTRL % PROC_MU) != 0)),
        // Second-dimension sizes of the 2-D buffers declared below.
        UMAT_DEPTH = (DEPTH_MU * N_STATE),
        HMAT_DEPTH = (DEPTH_MU * M_MEAS),
        // A part (DEPTH_MU per row) followed by B part (DEPTH_MU_CTRL per row).
        ABMAT_DEPTH = ((DEPTH_MU * N_STATE) + (DEPTH_MU_CTRL * N_STATE)),
        DPDQ_DEPTH = DEPTH_TU,
        TMAT_DEPTH = (DEPTH_TU * N_STATE),
        // Flat N_STATE x N_STATE storage for Uq_matrix.
        UQMAT_DEPTH = (N_STATE * N_STATE)
    };

    // For each 2-D buffer below, URAM_EN chooses the pragma set:
    //   URAM_EN == 0: dimension 1 is completely partitioned.
    //   otherwise:    the array is bound to the RAM_S2P_URAM core and
    //                 dimension 1 is completely reshaped.
    static float H_matrix[PROC_MU][HMAT_DEPTH];
    if (URAM_EN == 0) {
// clang-format off
        #pragma HLS ARRAY_PARTITION variable=H_matrix complete dim=1
        // clang-format on
    } else {
// clang-format off
        #pragma HLS RESOURCE variable=H_matrix core=RAM_S2P_URAM
        #pragma HLS ARRAY_RESHAPE variable=H_matrix complete dim=1
        // clang-format on
    }

    static float U_matrix[PROC_MU][UMAT_DEPTH];
    if (URAM_EN == 0) {
// clang-format off
        #pragma HLS ARRAY_PARTITION variable=U_matrix complete dim=1
        // clang-format on
    } else {
// clang-format off
        #pragma HLS RESOURCE variable=U_matrix core=RAM_S2P_URAM
        #pragma HLS ARRAY_RESHAPE variable=U_matrix complete dim=1
        // clang-format on
    }

    // Fixed 512-entry vectors, independent of the template dimensions.
    // xu_vector[511] is additionally used as a bookkeeping slot when
    // EKF_EN == 1 (see the two EKF_EN blocks further down).
    static float xu_vector[512];
    static float ry_vector[512];
    static float D_vector[512];

    static float AB_matrix[PROC_MU][ABMAT_DEPTH];
    if (URAM_EN == 0) {
// clang-format off
        #pragma HLS ARRAY_PARTITION variable=AB_matrix complete dim=1
        // clang-format on
    } else {
// clang-format off
        #pragma HLS RESOURCE variable=AB_matrix core=RAM_S2P_URAM
        #pragma HLS ARRAY_RESHAPE variable=AB_matrix complete dim=1
        // clang-format on
    }

    // Unlike the other 2-D buffers, the non-URAM branch for T_matrix also
    // binds the array to the RAM_S2P_BRAM core.
    static float T_matrix[PROC_TU][TMAT_DEPTH];
    if (URAM_EN == 0) {
// clang-format off
        #pragma HLS ARRAY_PARTITION variable=T_matrix complete dim=1
        #pragma HLS resource variable=T_matrix core=RAM_S2P_BRAM
        // clang-format on
    } else {
// clang-format off
        #pragma HLS RESOURCE variable=T_matrix core=RAM_S2P_URAM
        #pragma HLS ARRAY_RESHAPE variable=T_matrix complete dim=1
        // clang-format on
    }

    // 1-D buffer: only a core binding is applied, and only when URAM_EN == 1.
    static float Uq_matrix[UQMAT_DEPTH];
    if (URAM_EN == 1) {
// clang-format off
        #pragma HLS RESOURCE variable=Uq_matrix core=RAM_S2P_URAM
        // clang-format on
    }

    // Copy into ap_uint so individual control bits can be indexed.
    ap_uint<8> flag_reg = flag;

    // EKF only: clear the bookkeeping slot whenever an initialization is requested.
    // NOTE(review): slot 511 looks like a count of measurement updates since
    // the last initialization -- confirm against its readers.
    if (EKF_EN == 1) {
        if (flag_reg[0] == 1) xu_vector[511] = 0;
    }

    // Stage 1 (bit 0): load the input matrices into the persistent buffers.
    if (flag_reg[0])
        initialization<N_STATE, M_MEAS, C_CTRL, PROC_TU, DEPTH_TU, PROC_MU, DEPTH_MU, DEPTH_MU_CTRL, UMAT_DEPTH,
                       HMAT_DEPTH, ABMAT_DEPTH, DPDQ_DEPTH, TMAT_DEPTH, UQMAT_DEPTH, URAM_EN, EKF_EN, TYPE, NPC>(
            A_mat,
#if KF_C != 0
            B_mat,
#endif
            Uq_mat, Dq_mat, H_mat, X0_mat, U0_mat, D0_mat, R_mat, H_matrix, U_matrix, xu_vector, ry_vector, D_vector,
            AB_matrix, T_matrix, Uq_matrix, flag_reg[7]);

    // Stage 2 (bit 1): time update; bits 3 and 4 are passed through as options.
    if (flag_reg[1])
        TimeUpdate<N_STATE, M_MEAS, C_CTRL, PROC_TU, DEPTH_TU, PROC_MU, DEPTH_MU, DEPTH_MU_CTRL, UMAT_DEPTH,
                   ABMAT_DEPTH, DPDQ_DEPTH, TMAT_DEPTH, URAM_EN, EKF_EN, TYPE, NPC>(
            T_matrix, AB_matrix, xu_vector, U_matrix, D_vector,
#if KF_C != 0
            u_mat,
#endif
            Xout_mat, Uout_mat, Dout_mat, flag_reg[3], flag_reg[4]);

    // Stage 3 (bit 2): measurement update; bits 5 and 6 are passed through as
    // options. Note the template argument order here is <N_STATE, C_CTRL, M_MEAS, ...>.
    if (flag_reg[2])
        MeasUpdate_wrapper<N_STATE, C_CTRL, M_MEAS, PROC_TU, DEPTH_TU, PROC_MU, DEPTH_MU, UMAT_DEPTH, HMAT_DEPTH,
                           TMAT_DEPTH, UQMAT_DEPTH, URAM_EN, EKF_EN, TYPE, NPC>(
            U_matrix, H_matrix, D_vector, xu_vector, ry_vector, T_matrix, Uq_matrix,
#if KF_C != 0
            u_mat,
#endif
            y_mat, R_mat, H_mat, Xout_mat, Uout_mat, Dout_mat, flag_reg[5], flag_reg[6]);

    // EKF only: bump the bookkeeping slot after every call that requested a
    // measurement update.
    if (EKF_EN == 1) {
        if (flag_reg[2] == 1) xu_vector[511]++;
    }
}

#if KF_C != 0
#endif
#if KF_C != 0
#endif

/**
 * @brief Public Kalman filter entry point: validates the configuration and
 *        forwards every argument unchanged to KalmanFilter_def().
 *
 * Constraints that depend only on template parameters are enforced with
 * static_assert, so an unsupported configuration is rejected when the template
 * is instantiated and the check cannot be compiled out by NDEBUG. The
 * rows/cols members of the Mat arguments are run-time values and are therefore
 * still checked with assert().
 *
 * @tparam N_STATE   Number of state variables, 1 to 128.
 * @tparam M_MEAS    Number of measurements, 1 to 128.
 * @tparam C_CTRL    Number of control inputs, 0 to 128.
 * @tparam MTU       Passed to KalmanFilter_def as PROC_TU, 1 to N_STATE.
 * @tparam MMU       Passed to KalmanFilter_def as PROC_MU, 1 to N_STATE.
 * @tparam USE_URAM  Passed to KalmanFilter_def as URAM_EN.
 * @tparam EKF_EN    Passed to KalmanFilter_def as EKF_EN.
 * @tparam TYPE      Mat element type; must be XF_32FC1.
 * @tparam NPC       Pixels per clock; must be XF_NPPC1.
 *
 * @param A_mat     N_STATE x N_STATE input.
 * @param B_mat     N_STATE x C_CTRL input (only present when KF_C != 0).
 * @param Uq_mat    N_STATE x N_STATE input.
 * @param Dq_mat    N_STATE x 1 input.
 * @param H_mat     M_MEAS x N_STATE input.
 * @param X0_mat    N_STATE x 1 input.
 * @param U0_mat    N_STATE x N_STATE input.
 * @param D0_mat    N_STATE x 1 input.
 * @param R_mat     M_MEAS x 1 input.
 * @param u_mat     C_CTRL x 1 input (only present when KF_C != 0).
 * @param y_mat     M_MEAS x 1 input.
 * @param Xout_mat  N_STATE x 1 output.
 * @param Uout_mat  N_STATE x N_STATE output.
 * @param Dout_mat  N_STATE x 1 output.
 * @param flag      Control bits, forwarded unchanged to KalmanFilter_def().
 */
template <int N_STATE,
          int M_MEAS,
          int C_CTRL,
          int MTU,
          int MMU,
          bool USE_URAM = 0,
          bool EKF_EN = 0,
          int TYPE,
          int NPC = 1>
void KalmanFilter(xf::cv::Mat<TYPE, N_STATE, N_STATE, NPC>& A_mat,
#if KF_C != 0
                  xf::cv::Mat<TYPE, N_STATE, C_CTRL, NPC>& B_mat,
#endif
                  xf::cv::Mat<TYPE, N_STATE, N_STATE, NPC>& Uq_mat,
                  xf::cv::Mat<TYPE, N_STATE, 1, NPC>& Dq_mat,
                  xf::cv::Mat<TYPE, M_MEAS, N_STATE, NPC>& H_mat,
                  xf::cv::Mat<TYPE, N_STATE, 1, NPC>& X0_mat,
                  xf::cv::Mat<TYPE, N_STATE, N_STATE, NPC>& U0_mat,
                  xf::cv::Mat<TYPE, N_STATE, 1, NPC>& D0_mat,
                  xf::cv::Mat<TYPE, M_MEAS, 1, NPC>& R_mat,
#if KF_C != 0
                  xf::cv::Mat<TYPE, C_CTRL, 1, NPC>& u_mat,
#endif
                  xf::cv::Mat<TYPE, M_MEAS, 1, NPC>& y_mat,
                  xf::cv::Mat<TYPE, N_STATE, 1, NPC>& Xout_mat,
                  xf::cv::Mat<TYPE, N_STATE, N_STATE, NPC>& Uout_mat,
                  xf::cv::Mat<TYPE, N_STATE, 1, NPC>& Dout_mat,
                  unsigned char flag) {
    // Compile-time configuration checks. KalmanFilter_def sizes its working
    // vectors at a fixed 512 entries, so the dimension limits must hold even
    // in builds where assert() is disabled.
    static_assert((N_STATE > 0 && N_STATE <= 128), "For N_STATE, possible options are 1 to 128");
    static_assert((M_MEAS > 0 && M_MEAS <= 128), "For M_MEAS, possible options are 1 to 128");
    static_assert((C_CTRL >= 0 && C_CTRL <= 128), "For C_CTRL, possible options are 0 to 128");
    static_assert((MTU > 0 && MTU <= N_STATE), "For MTU, possible options are 1 to N_STATE");
    static_assert((MMU > 0 && MMU <= N_STATE), "For MMU, possible options are 1 to N_STATE");
    static_assert((TYPE == XF_32FC1), "TYPE must be XF_32FC1");
    static_assert((NPC == XF_NPPC1), "NPC must be XF_NPPC1");

    // Run-time shape checks: the configured rows/cols of each Mat must match
    // the template dimensions.
    assert(((A_mat.rows == N_STATE) && (A_mat.cols == N_STATE)) && "A matrix dimension must be N_STATE x N_STATE");
#if KF_C != 0
    assert(((B_mat.rows == N_STATE) && (B_mat.cols == C_CTRL)) && "B matrix dimension must be N_STATE x C_CTRL");
#endif
    assert(((Uq_mat.rows == N_STATE) && (Uq_mat.cols == N_STATE)) && "Uq matrix dimension must be N_STATE x N_STATE");
    assert(((Dq_mat.rows == N_STATE) && (Dq_mat.cols == 1)) && "Dq matrix dimension must be N_STATE x 1");
    assert(((H_mat.rows == M_MEAS) && (H_mat.cols == N_STATE)) && "H matrix dimension must be M_MEAS x N_STATE");
    assert(((X0_mat.rows == N_STATE) && (X0_mat.cols == 1)) && "X0 matrix dimension must be N_STATE x 1");
    assert(((U0_mat.rows == N_STATE) && (U0_mat.cols == N_STATE)) && "U0 matrix dimension must be N_STATE x N_STATE");
    assert(((D0_mat.rows == N_STATE) && (D0_mat.cols == 1)) && "D0 matrix dimension must be N_STATE x 1");
    assert(((R_mat.rows == M_MEAS) && (R_mat.cols == 1)) && "R matrix dimension must be M_MEAS x 1");
#if KF_C != 0
    assert(((u_mat.rows == C_CTRL) && (u_mat.cols == 1)) && "u matrix dimension must be C_CTRL x 1");
#endif
    assert(((y_mat.rows == M_MEAS) && (y_mat.cols == 1)) && "y matrix dimension must be M_MEAS x 1");
    assert(((Xout_mat.rows == N_STATE) && (Xout_mat.cols == 1)) && "Xout matrix dimension must be N_STATE x 1");
    assert(((Uout_mat.rows == N_STATE) && (Uout_mat.cols == N_STATE)) &&
           "Uout matrix dimension must be N_STATE x N_STATE");
    assert(((Dout_mat.rows == N_STATE) && (Dout_mat.cols == 1)) && "Dout matrix dimension must be N_STATE x 1");

    KalmanFilter_def<N_STATE, M_MEAS, C_CTRL, MTU, MMU, USE_URAM, EKF_EN, TYPE, NPC>(
        A_mat,
#if KF_C != 0
        B_mat,
#endif
        Uq_mat, Dq_mat, H_mat, X0_mat, U0_mat, D0_mat, R_mat,
#if KF_C != 0
        u_mat,
#endif
        y_mat, Xout_mat, Uout_mat, Dout_mat, flag);
}
} // namespace cv
} // namespace xf
#endif //_XF_KALMANFILTER_HPP_