32 # define M_PI 3.141592653589793238462643383279502884196    45 static inline void idct_1D_u32(int32_t *in, 
int instride, int32_t *out, 
int outstride)
    47     for (
int x = 0; x < 8; x++)
    55         int32_t c181 = c * 181;
    56         out[0*outstride] += c181;
    57         out[1*outstride] += c181;
    58         out[2*outstride] += c181;
    59         out[3*outstride] += c181;
    60         out[4*outstride] += c181;
    61         out[5*outstride] += c181;
    62         out[6*outstride] += c181;
    63         out[7*outstride] += c181;
    69         int32_t c251 = c * 251;
    70         int32_t c212 = c * 212;
    71         int32_t c142 = c * 142;
    73         out[0*outstride] += c251;
    74         out[1*outstride] += c212;
    75         out[2*outstride] += c142;
    76         out[3*outstride] += c49;
    77         out[4*outstride] -= c49;
    78         out[5*outstride] -= c142;
    79         out[6*outstride] -= c212;
    80         out[7*outstride] -= c251;
    88         out[0*outstride] += c236;
    89         out[1*outstride] += c97;
    90         out[2*outstride] -= c97;
    91         out[3*outstride] -= c236;
    92         out[4*outstride] -= c236;
    93         out[5*outstride] -= c97;
    94         out[6*outstride] += c97;
    95         out[7*outstride] += c236;
   101         int32_t c212 = c*212;
   103         int32_t c251 = c*251;
   104         int32_t c142 = c*142;
   105         out[0*outstride] += c212;
   106         out[1*outstride] -= c49;
   107         out[2*outstride] -= c251;
   108         out[3*outstride] -= c142;
   109         out[4*outstride] += c142;
   110         out[5*outstride] += c251;
   111         out[6*outstride] += c49;
   112         out[7*outstride] -= c212;
   118         int32_t c181 = c*181;
   119         out[0*outstride] += c181;
   120         out[1*outstride] -= c181;
   121         out[2*outstride] -= c181;
   122         out[3*outstride] += c181;
   123         out[4*outstride] += c181;
   124         out[5*outstride] -= c181;
   125         out[6*outstride] -= c181;
   126         out[7*outstride] += c181;
   132         int32_t c142 = c*142;
   133         int32_t c251 = c*251;
   135         int32_t c212 = c*212;
   136         out[0*outstride] += c142;
   137         out[1*outstride] -= c251;
   138         out[2*outstride] += c49;
   139         out[3*outstride] += c212;
   140         out[4*outstride] -= c212;
   141         out[5*outstride] -= c49;
   142         out[6*outstride] += c251;
   143         out[7*outstride] -= c142;
   150         int32_t c236 = c*236;
   151         out[0*outstride] += c97;
   152         out[1*outstride] -= c236;
   153         out[2*outstride] += c236;
   154         out[3*outstride] -= c97;
   155         out[4*outstride] -= c97;
   156         out[5*outstride] += c236;
   157         out[6*outstride] -= c236;
   158         out[7*outstride] += c97;
   165         int32_t c142 = c*142;
   166         int32_t c212 = c*212;
   167         int32_t c251 = c*251;
   168         out[0*outstride] += c49;
   169         out[1*outstride] -= c142;
   170         out[2*outstride] += c212;
   171         out[3*outstride] -= c251;
   172         out[4*outstride] += c251;
   173         out[5*outstride] -= c212;
   174         out[6*outstride] += c142;
   175         out[7*outstride] -= c49;
   184     for (
int y = 0; y < 8; y++)
   190     for (
int x = 0; x < 8; x++)
   194     for (
int y = 0; y < 8; y++) {
   195         for (
int x = 0; x < 8; x++) {
   207             const int32_t offset = (128 << 18) + (1 << 17);
   208             int32_t v = (tmp2[i] + offset) >> 18;
   215             out[y*outstride + x] = v;
   222 static inline void idct_1D_double(
double *in, 
int instride, 
double *out, 
int outstride)
   224     for (
int x = 0; x < 8; x++)
   225         out[x*outstride] = 0;
   228     double Cu = 1/sqrt(2);
   230     for (
int u = 0; u < 8; u++, Cu = 1) {
   232         double coeff = in[u*instride];
   236         for (
int x = 0; x < 8; x++)
   237             out[x*outstride] += Cu*cos((2*x+1)*u*
M_PI/16) * coeff;
   243     double din[64], dout[64];
   244     for (
int i = 0; i < 64; i++)
   250     for (
int y = 0; y < 8; y++)
   254     for (
int x = 0; x < 8; x++)
   258     for (
int y = 0; y < 8; y++) {
   259         for (
int x = 0; x < 8; x++) {
   262             dout[i] = (dout[i] / 4) + 128;
   269             out[y*outstride + x] = dout[i];
   275 static inline unsigned char njClip(
const int x) {
   276     return (x < 0) ? 0 : ((x > 0xFF) ? 0xFF : (
unsigned char) x);
   287     int x0, x1, x2, x3, x4, x5, x6, x7, x8;
   288     if (!((x1 = blk[4] << 11)
   296         blk[0] = blk[1] = blk[2] = blk[3] = blk[4] = blk[5] = blk[6] = blk[7] = blk[0] << 3;
   299     x0 = (blk[0] << 11) + 128;
   301     x4 = x8 + (
W1 - 
W7) * x4;
   302     x5 = x8 - (
W1 + 
W7) * x5;
   304     x6 = x8 - (
W3 - 
W5) * x6;
   305     x7 = x8 - (
W3 + 
W5) * x7;
   309     x2 = x1 - (
W2 + 
W6) * x2;
   310     x3 = x1 + (
W2 - 
W6) * x3;
   319     x2 = (181 * (x4 + x5) + 128) >> 8;
   320     x4 = (181 * (x4 - x5) + 128) >> 8;
   321     blk[0] = (x7 + x1) >> 8;
   322     blk[1] = (x3 + x2) >> 8;
   323     blk[2] = (x0 + x4) >> 8;
   324     blk[3] = (x8 + x6) >> 8;
   325     blk[4] = (x8 - x6) >> 8;
   326     blk[5] = (x0 - x4) >> 8;
   327     blk[6] = (x3 - x2) >> 8;
   328     blk[7] = (x7 - x1) >> 8;
   331 static inline void njColIDCT(
const int* blk, 
unsigned char *out, 
int stride) {
   332     int x0, x1, x2, x3, x4, x5, x6, x7, x8;
   333     if (!((x1 = blk[8*4] << 8)
   341         x1 = 
njClip(((blk[0] + 32) >> 6) + 128);
   342         for (x0 = 8;  x0;  --x0) {
   343             *out = (
unsigned char) x1;
   348     x0 = (blk[0] << 8) + 8192;
   349     x8 = 
W7 * (x4 + x5) + 4;
   350     x4 = (x8 + (
W1 - 
W7) * x4) >> 3;
   351     x5 = (x8 - (
W1 + 
W7) * x5) >> 3;
   352     x8 = 
W3 * (x6 + x7) + 4;
   353     x6 = (x8 - (
W3 - 
W5) * x6) >> 3;
   354     x7 = (x8 - (
W3 + 
W5) * x7) >> 3;
   357     x1 = 
W6 * (x3 + x2) + 4;
   358     x2 = (x1 - (
W2 + 
W6) * x2) >> 3;
   359     x3 = (x1 + (
W2 - 
W6) * x3) >> 3;
   368     x2 = (181 * (x4 + x5) + 128) >> 8;
   369     x4 = (181 * (x4 - x5) + 128) >> 8;
   370     *out = 
njClip(((x7 + x1) >> 14) + 128);  out += stride;
   371     *out = 
njClip(((x3 + x2) >> 14) + 128);  out += stride;
   372     *out = 
njClip(((x0 + x4) >> 14) + 128);  out += stride;
   373     *out = 
njClip(((x8 + x6) >> 14) + 128);  out += stride;
   374     *out = 
njClip(((x8 - x6) >> 14) + 128);  out += stride;
   375     *out = 
njClip(((x0 - x4) >> 14) + 128);  out += stride;
   376     *out = 
njClip(((x3 - x2) >> 14) + 128);  out += stride;
   377     *out = 
njClip(((x7 - x1) >> 14) + 128);
   384     for (coef = 0;  coef < 64;  coef += 8)
   386     for (coef = 0;  coef < 8;  ++coef)
   387         njColIDCT(&in[coef], &out[coef], outstride);
 
static void idct_1D_u32(int32_t *in, int instride, int32_t *out, int outstride)
static void njRowIDCT(int *blk)
void pjpeg_idct_2D_double(int32_t in[64], uint8_t *out, uint32_t outstride)
static unsigned char njClip(const int x)
static void njColIDCT(const int *blk, unsigned char *out, int stride)
static void idct_1D_double(double *in, int instride, double *out, int outstride)
void pjpeg_idct_2D_nanojpeg(int32_t in[64], uint8_t *out, uint32_t outstride)
void pjpeg_idct_2D_u32(int32_t in[64], uint8_t *out, uint32_t outstride)