10 #ifndef EIGEN_SPARSELU_GEMM_KERNEL_H    11 #define EIGEN_SPARSELU_GEMM_KERNEL_H    24 template<
typename Scalar>
    36     RK = NumberOfRegisters>=16 ? 4 : 2, 
    37     BM = 4096/
sizeof(Scalar),           
    40   Index d_end = (d/RK)*RK;    
    41   Index n_end = (n/RN)*RN;    
    47   for(
Index i=0; i<i0; ++i)
    49     for(
Index j=0; j<n; ++j)
    51       Scalar c = C[i+j*ldc];
    52       for(
Index k=0; k<d; ++k)
    53         c += B[k+j*ldb] * A[i+k*lda];
    58   for(
Index ib=i0; ib<m; ib+=BM)
    60     Index actual_b = std::min<Index>(BM, m-ib);                 
    61     Index actual_b_end1 = (actual_b/SM)*SM;                   
    62     Index actual_b_end2 = (actual_b/PacketSize)*PacketSize;   
    65     for(
Index j=0; j<n_end; j+=RN)
    67       const Scalar* Bc0 = B+(j+0)*ldb;
    68       const Scalar* Bc1 = B+(j+1)*ldb;
    70       for(
Index k=0; k<d_end; k+=RK)
    74         Packet b00, b10, b20, b30, b01, b11, b21, b31;
    75                   { b00 = pset1<Packet>(Bc0[0]); }
    76                   { b10 = pset1<Packet>(Bc0[1]); }
    77         if(RK==4) { b20 = pset1<Packet>(Bc0[2]); }
    78         if(RK==4) { b30 = pset1<Packet>(Bc0[3]); }
    79                   { b01 = pset1<Packet>(Bc1[0]); }
    80                   { b11 = pset1<Packet>(Bc1[1]); }
    81         if(RK==4) { b21 = pset1<Packet>(Bc1[2]); }
    82         if(RK==4) { b31 = pset1<Packet>(Bc1[3]); }
    84         Packet a0, a1, a2, a3, c0, c1, t0, t1;
    86         const Scalar* A0 = A+ib+(k+0)*lda;
    87         const Scalar* A1 = A+ib+(k+1)*lda;
    88         const Scalar* A2 = A+ib+(k+2)*lda;
    89         const Scalar* A3 = A+ib+(k+3)*lda;
    91         Scalar* C0 = C+ib+(j+0)*ldc;
    92         Scalar* C1 = C+ib+(j+1)*ldc;
    94                   a0 = pload<Packet>(A0);
    95                   a1 = pload<Packet>(A1);
    98           a2 = pload<Packet>(A2);
    99           a3 = pload<Packet>(A3);
   107 #define KMADD(c, a, b, tmp) {tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp);}   109                      c0 = pload<Packet>(C0+i+(I)*PacketSize);    \   110                      c1 = pload<Packet>(C1+i+(I)*PacketSize);    \   111                      KMADD(c0, a0, b00, t0)                      \   112                      KMADD(c1, a0, b01, t1)                      \   113                      a0 = pload<Packet>(A0+i+(I+1)*PacketSize);  \   114                      KMADD(c0, a1, b10, t0)                      \   115                      KMADD(c1, a1, b11, t1)                      \   116                      a1 = pload<Packet>(A1+i+(I+1)*PacketSize);  \   117           if(RK==4){ KMADD(c0, a2, b20, t0)                     }\   118           if(RK==4){ KMADD(c1, a2, b21, t1)                     }\   119           if(RK==4){ a2 = pload<Packet>(A2+i+(I+1)*PacketSize); }\   120           if(RK==4){ KMADD(c0, a3, b30, t0)                     }\   121           if(RK==4){ KMADD(c1, a3, b31, t1)                     }\   122           if(RK==4){ a3 = pload<Packet>(A3+i+(I+1)*PacketSize); }\   123                      pstore(C0+i+(I)*PacketSize, c0);            \   124                      pstore(C1+i+(I)*PacketSize, c1)   127         for(
Index i=0; i<actual_b_end1; i+=PacketSize*8)
   132           if(RK==4) 
prefetch((A2+i+(5)*PacketSize));
   133           if(RK==4) 
prefetch((A3+i+(5)*PacketSize));
   145         for(
Index i=actual_b_end1; i<actual_b_end2; i+=PacketSize)
   151         for(
Index i=actual_b_end2; i<actual_b; ++i)
   155             C0[i] += A0[i]*Bc0[0]+A1[i]*Bc0[1]+A2[i]*Bc0[2]+A3[i]*Bc0[3];
   156             C1[i] += A0[i]*Bc1[0]+A1[i]*Bc1[1]+A2[i]*Bc1[2]+A3[i]*Bc1[3];
   160             C0[i] += A0[i]*Bc0[0]+A1[i]*Bc0[1];
   161             C1[i] += A0[i]*Bc1[0]+A1[i]*Bc1[1];
   172       const Scalar* Bc0 = B+(n-1)*ldb;
   174       for(
Index k=0; k<d_end; k+=RK)
   178         Packet b00, b10, b20, b30;
   179                   b00 = pset1<Packet>(Bc0[0]);
   180                   b10 = pset1<Packet>(Bc0[1]);
   181         if(RK==4) b20 = pset1<Packet>(Bc0[2]);
   182         if(RK==4) b30 = pset1<Packet>(Bc0[3]);
   184         Packet a0, a1, a2, a3, c0, t0;
   186         const Scalar* A0 = A+ib+(k+0)*lda;
   187         const Scalar* A1 = A+ib+(k+1)*lda;
   188         const Scalar* A2 = A+ib+(k+2)*lda;
   189         const Scalar* A3 = A+ib+(k+3)*lda;
   191         Scalar* C0 = C+ib+(n_end)*ldc;
   193                   a0 = pload<Packet>(A0);
   194                   a1 = pload<Packet>(A1);
   197           a2 = pload<Packet>(A2);
   198           a3 = pload<Packet>(A3);
   207                    c0 = pload<Packet>(C0+i+(I)*PacketSize);     \   208                    KMADD(c0, a0, b00, t0)                       \   209                    a0 = pload<Packet>(A0+i+(I+1)*PacketSize);   \   210                    KMADD(c0, a1, b10, t0)                       \   211                    a1 = pload<Packet>(A1+i+(I+1)*PacketSize);   \   212         if(RK==4){ KMADD(c0, a2, b20, t0)                      }\   213         if(RK==4){ a2 = pload<Packet>(A2+i+(I+1)*PacketSize);  }\   214         if(RK==4){ KMADD(c0, a3, b30, t0)                      }\   215         if(RK==4){ a3 = pload<Packet>(A3+i+(I+1)*PacketSize);  }\   216                    pstore(C0+i+(I)*PacketSize, c0);   219         for(
Index i=0; i<actual_b_end1; i+=PacketSize*8)
   232         for(
Index i=actual_b_end1; i<actual_b_end2; i+=PacketSize)
   237         for(
Index i=actual_b_end2; i<actual_b; ++i)
   240             C0[i] += A0[i]*Bc0[0]+A1[i]*Bc0[1]+A2[i]*Bc0[2]+A3[i]*Bc0[3];
   242             C0[i] += A0[i]*Bc0[0]+A1[i]*Bc0[1];
   254       for(
Index j=0; j<n; ++j)
   257           Alignment = PacketSize>1 ? 
Aligned : 0
   261         if(rd==1)       MapVector(C+j*ldc+ib,actual_b) += B[0+d_end+j*ldb] * ConstMapVector(A+(d_end+0)*lda+ib, actual_b);
   263         else if(rd==2)  MapVector(C+j*ldc+ib,actual_b) += B[0+d_end+j*ldb] * ConstMapVector(A+(d_end+0)*lda+ib, actual_b)
   264                                                         + B[1+d_end+j*ldb] * ConstMapVector(A+(d_end+1)*lda+ib, actual_b);
   266         else            MapVector(C+j*ldc+ib,actual_b) += B[0+d_end+j*ldb] * ConstMapVector(A+(d_end+0)*lda+ib, actual_b)
   267                                                         + B[1+d_end+j*ldb] * ConstMapVector(A+(d_end+1)*lda+ib, actual_b)
   268                                                         + B[2+d_end+j*ldb] * ConstMapVector(A+(d_end+2)*lda+ib, actual_b);
   280 #endif // EIGEN_SPARSELU_GEMM_KERNEL_H EIGEN_DONT_INLINE void sparselu_gemm(Index m, Index n, Index d, const Scalar *A, Index lda, const Scalar *B, Index ldb, Scalar *C, Index ldc)
A matrix or vector expression mapping an existing array of data. 
#define EIGEN_ASM_COMMENT(X)
#define EIGEN_DONT_INLINE
static Index first_default_aligned(const DenseBase< Derived > &m)
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API. 
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
#define eigen_internal_assert(x)
EIGEN_DEVICE_FUNC void prefetch(const Scalar *addr)