10 #ifndef EIGEN_SPARSELU_GEMM_KERNEL_H    11 #define EIGEN_SPARSELU_GEMM_KERNEL_H    24 template<
typename Scalar,
typename Index>
    26 void sparselu_gemm(Index m, Index n, Index d, 
const Scalar* A, Index lda, 
const Scalar* B, Index ldb, Scalar* C, Index ldc)
    36     RK = NumberOfRegisters>=16 ? 4 : 2, 
    37     BM = 4096/
sizeof(Scalar),           
    40   Index d_end = (d/RK)*RK;    
    41   Index n_end = (n/RN)*RN;    
    47   for(Index i=0; i<i0; ++i)
    49     for(Index j=0; j<n; ++j)
    51       Scalar c = C[i+j*ldc];
    52       for(Index k=0; k<d; ++k)
    53         c += B[k+j*ldb] * A[i+k*lda];
    58   for(Index ib=i0; ib<m; ib+=BM)
    60     Index actual_b = std::min<Index>(BM, m-ib);                 
    61     Index actual_b_end1 = (actual_b/SM)*SM;                   
    62     Index actual_b_end2 = (actual_b/PacketSize)*PacketSize;   
    65     for(Index j=0; j<n_end; j+=RN)
    67       const Scalar* Bc0 = B+(j+0)*ldb;
    68       const Scalar* Bc1 = B+(j+1)*ldb;
    70       for(Index k=0; k<d_end; k+=RK)
    74         Packet b00, b10, b20, b30, b01, b11, b21, b31;
    75                   b00 = pset1<Packet>(Bc0[0]);
    76                   b10 = pset1<Packet>(Bc0[1]);
    77         if(RK==4) b20 = pset1<Packet>(Bc0[2]);
    78         if(RK==4) b30 = pset1<Packet>(Bc0[3]);
    79                   b01 = pset1<Packet>(Bc1[0]);
    80                   b11 = pset1<Packet>(Bc1[1]);
    81         if(RK==4) b21 = pset1<Packet>(Bc1[2]);
    82         if(RK==4) b31 = pset1<Packet>(Bc1[3]);
    84         Packet a0, a1, a2, a3, c0, c1, t0, t1;
    86         const Scalar* A0 = A+ib+(k+0)*lda;
    87         const Scalar* A1 = A+ib+(k+1)*lda;
    88         const Scalar* A2 = A+ib+(k+2)*lda;
    89         const Scalar* A3 = A+ib+(k+3)*lda;
    91         Scalar* C0 = C+ib+(j+0)*ldc;
    92         Scalar* C1 = C+ib+(j+1)*ldc;
    94                   a0 = pload<Packet>(A0);
    95                   a1 = pload<Packet>(A1);
    98           a2 = pload<Packet>(A2);
    99           a3 = pload<Packet>(A3);
   107 #define KMADD(c, a, b, tmp) {tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp);}   109                     c0 = pload<Packet>(C0+i+(I)*PacketSize);   \   110                     c1 = pload<Packet>(C1+i+(I)*PacketSize);   \   111                     KMADD(c0, a0, b00, t0)      \   112                     KMADD(c1, a0, b01, t1)      \   113                     a0 = pload<Packet>(A0+i+(I+1)*PacketSize); \   114                     KMADD(c0, a1, b10, t0)      \   115                     KMADD(c1, a1, b11, t1)       \   116                     a1 = pload<Packet>(A1+i+(I+1)*PacketSize); \   117           if(RK==4) KMADD(c0, a2, b20, t0)       \   118           if(RK==4) KMADD(c1, a2, b21, t1)       \   119           if(RK==4) a2 = pload<Packet>(A2+i+(I+1)*PacketSize); \   120           if(RK==4) KMADD(c0, a3, b30, t0)       \   121           if(RK==4) KMADD(c1, a3, b31, t1)       \   122           if(RK==4) a3 = pload<Packet>(A3+i+(I+1)*PacketSize); \   123                     pstore(C0+i+(I)*PacketSize, c0);           \   124                     pstore(C1+i+(I)*PacketSize, c1)   127         for(Index i=0; i<actual_b_end1; i+=PacketSize*8)
   132           if(RK==4) 
prefetch((A2+i+(5)*PacketSize));
   133           if(RK==4) 
prefetch((A3+i+(5)*PacketSize));
   144         for(Index i=actual_b_end1; i<actual_b_end2; i+=PacketSize)
   150         for(Index i=actual_b_end2; i<actual_b; ++i)
   154             C0[i] += A0[i]*Bc0[0]+A1[i]*Bc0[1]+A2[i]*Bc0[2]+A3[i]*Bc0[3];
   155             C1[i] += A0[i]*Bc1[0]+A1[i]*Bc1[1]+A2[i]*Bc1[2]+A3[i]*Bc1[3];
   159             C0[i] += A0[i]*Bc0[0]+A1[i]*Bc0[1];
   160             C1[i] += A0[i]*Bc1[0]+A1[i]*Bc1[1];
   171       const Scalar* Bc0 = B+(n-1)*ldb;
   173       for(Index k=0; k<d_end; k+=RK)
   177         Packet b00, b10, b20, b30;
   178                   b00 = pset1<Packet>(Bc0[0]);
   179                   b10 = pset1<Packet>(Bc0[1]);
   180         if(RK==4) b20 = pset1<Packet>(Bc0[2]);
   181         if(RK==4) b30 = pset1<Packet>(Bc0[3]);
   183         Packet a0, a1, a2, a3, c0, t0;
   185         const Scalar* A0 = A+ib+(k+0)*lda;
   186         const Scalar* A1 = A+ib+(k+1)*lda;
   187         const Scalar* A2 = A+ib+(k+2)*lda;
   188         const Scalar* A3 = A+ib+(k+3)*lda;
   190         Scalar* C0 = C+ib+(n_end)*ldc;
   192                   a0 = pload<Packet>(A0);
   193                   a1 = pload<Packet>(A1);
   196           a2 = pload<Packet>(A2);
   197           a3 = pload<Packet>(A3);
   206                   c0 = pload<Packet>(C0+i+(I)*PacketSize);   \   207                   KMADD(c0, a0, b00, t0)       \   208                   a0 = pload<Packet>(A0+i+(I+1)*PacketSize); \   209                   KMADD(c0, a1, b10, t0)       \   210                   a1 = pload<Packet>(A1+i+(I+1)*PacketSize); \   211         if(RK==4) KMADD(c0, a2, b20, t0)       \   212         if(RK==4) a2 = pload<Packet>(A2+i+(I+1)*PacketSize); \   213         if(RK==4) KMADD(c0, a3, b30, t0)       \   214         if(RK==4) a3 = pload<Packet>(A3+i+(I+1)*PacketSize); \   215                   pstore(C0+i+(I)*PacketSize, c0);   218         for(Index i=0; i<actual_b_end1; i+=PacketSize*8)
   231         for(Index i=actual_b_end1; i<actual_b_end2; i+=PacketSize)
   236         for(Index i=actual_b_end2; i<actual_b; ++i)
   239             C0[i] += A0[i]*Bc0[0]+A1[i]*Bc0[1]+A2[i]*Bc0[2]+A3[i]*Bc0[3];
   241             C0[i] += A0[i]*Bc0[0]+A1[i]*Bc0[1];
   253       for(Index j=0; j<n; ++j)
   256           Alignment = PacketSize>1 ? 
Aligned : 0
   260         if(rd==1)       MapVector(C+j*ldc+ib,actual_b) += B[0+d_end+j*ldb] * ConstMapVector(A+(d_end+0)*lda+ib, actual_b);
   262         else if(rd==2)  MapVector(C+j*ldc+ib,actual_b) += B[0+d_end+j*ldb] * ConstMapVector(A+(d_end+0)*lda+ib, actual_b)
   263                                                         + B[1+d_end+j*ldb] * ConstMapVector(A+(d_end+1)*lda+ib, actual_b);
   265         else            MapVector(C+j*ldc+ib,actual_b) += B[0+d_end+j*ldb] * ConstMapVector(A+(d_end+0)*lda+ib, actual_b)
   266                                                         + B[1+d_end+j*ldb] * ConstMapVector(A+(d_end+1)*lda+ib, actual_b)
   267                                                         + B[2+d_end+j*ldb] * ConstMapVector(A+(d_end+2)*lda+ib, actual_b);
   279 #endif // EIGEN_SPARSELU_GEMM_KERNEL_H #define EIGEN_ASM_COMMENT(X)
A matrix or vector expression mapping an existing array of data. 
#define eigen_internal_assert(x)
void prefetch(const Scalar *addr)
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
EIGEN_DONT_INLINE void sparselu_gemm(Index m, Index n, Index d, const Scalar *A, Index lda, const Scalar *B, Index ldb, Scalar *C, Index ldc)
#define EIGEN_DONT_INLINE
static Derived::Index first_aligned(const Derived &m)