sick_scan_xd: toojpeg.cpp Source File

Go to the documentation of this file.
 // //////////////////////////////////////////////////////////
 // toojpeg.cpp
 // written by Stephan Brumme, 2018-2019
 // see https://create.stephan-brumme.com/toojpeg/
 //
  
 #include "toojpeg.h"
  
 // - the "official" specifications: https://www.w3.org/Graphics/JPEG/itu-t81.pdf and https://www.w3.org/Graphics/JPEG/jfif3.pdf
 // - Wikipedia has a short description of the JFIF/JPEG file format: https://en.wikipedia.org/wiki/JPEG_File_Interchange_Format
 // - the popular STB Image library includes Jon's JPEG encoder as well: https://github.com/nothings/stb/blob/master/stb_image_write.h
 // - the most readable JPEG book (from a developer's perspective) is Miano's "Compressed Image File Formats" (1999, ISBN 0-201-60443-4),
 //   used copies are really cheap nowadays and include a CD with C++ sources as well (plus great format descriptions of GIF & PNG)
 // - much more detailled is Mitchell/Pennebaker's "JPEG: Still Image Data Compression Standard" (1993, ISBN 0-442-01272-1)
 //   which contains the official JPEG standard, too - fun fact: I bought a signed copy in a second-hand store without noticing
  
 namespace // anonymous namespace to hide local functions / constants / etc.
 {
 // ////////////////////////////////////////
 // data types
 using uint8_t  = unsigned char;
 using uint16_t = unsigned short;
 using  int16_t =          short;
 using  int32_t =          int; // at least four bytes
  
 // ////////////////////////////////////////
 // constants
  
 // quantization tables from JPEG Standard, Annex K
 const uint8_t DefaultQuantLuminance[8*8] =
     { 16, 11, 10, 16, 24, 40, 51, 61, // there are a few experts proposing slightly more efficient values,
       12, 12, 14, 19, 26, 58, 60, 55, // e.g. https://www.imagemagick.org/discourse-server/viewtopic.php?t=20333
       14, 13, 16, 24, 40, 57, 69, 56, // btw: Google's Guetzli project optimizes the quantization tables per image
       14, 17, 22, 29, 51, 87, 80, 62,
       18, 22, 37, 56, 68,109,103, 77,
       24, 35, 55, 64, 81,104,113, 92,
       49, 64, 78, 87,103,121,120,101,
       72, 92, 95, 98,112,100,103, 99 };
 const uint8_t DefaultQuantChrominance[8*8] =
     { 17, 18, 24, 47, 99, 99, 99, 99,
       18, 21, 26, 66, 99, 99, 99, 99,
       24, 26, 56, 99, 99, 99, 99, 99,
       47, 66, 99, 99, 99, 99, 99, 99,
       99, 99, 99, 99, 99, 99, 99, 99,
       99, 99, 99, 99, 99, 99, 99, 99,
       99, 99, 99, 99, 99, 99, 99, 99,
       99, 99, 99, 99, 99, 99, 99, 99 };
  
 // 8x8 blocks are processed in zig-zag order
 // most encoders use a zig-zag "forward" table, I switched to its inverse for performance reasons
 // note: ZigZagInv[ZigZag[i]] = i
 const uint8_t ZigZagInv[8*8] =
     {  0, 1, 8,16, 9, 2, 3,10,   // ZigZag[] =  0, 1, 5, 6,14,15,27,28,
       17,24,32,25,18,11, 4, 5,   //             2, 4, 7,13,16,26,29,42,
       12,19,26,33,40,48,41,34,   //             3, 8,12,17,25,30,41,43,
       27,20,13, 6, 7,14,21,28,   //             9,11,18,24,31,40,44,53,
       35,42,49,56,57,50,43,36,   //            10,19,23,32,39,45,52,54,
       29,22,15,23,30,37,44,51,   //            20,22,33,38,46,51,55,60,
       58,59,52,45,38,31,39,46,   //            21,34,37,47,50,56,59,61,
       53,60,61,54,47,55,62,63 }; //            35,36,48,49,57,58,62,63
  
 // static Huffman code tables from JPEG standard Annex K
 // - CodesPerBitsize tables define how many Huffman codes will have a certain bitsize (plus 1 because there nothing with zero bits),
 //   e.g. DcLuminanceCodesPerBitsize[2] = 5 because there are 5 Huffman codes being 2+1=3 bits long
 // - Values tables are a list of values ordered by their Huffman code bitsize,
 //   e.g. AcLuminanceValues => Huffman(0x01,0x02 and 0x03) will have 2 bits, Huffman(0x00) will have 3 bits, Huffman(0x04,0x11 and 0x05) will have 4 bits, ...
  
 // Huffman definitions for first DC/AC tables (luminance / Y channel)
 const uint8_t DcLuminanceCodesPerBitsize[16]   = { 0,1,5,1,1,1,1,1,1,0,0,0,0,0,0,0 };   // sum = 12
 const uint8_t DcLuminanceValues         [12]   = { 0,1,2,3,4,5,6,7,8,9,10,11 };         // => 12 codes
 const uint8_t AcLuminanceCodesPerBitsize[16]   = { 0,2,1,3,3,2,4,3,5,5,4,4,0,0,1,125 }; // sum = 162
 const uint8_t AcLuminanceValues        [162]   =                                        // => 162 codes
     { 0x01,0x02,0x03,0x00,0x04,0x11,0x05,0x12,0x21,0x31,0x41,0x06,0x13,0x51,0x61,0x07,0x22,0x71,0x14,0x32,0x81,0x91,0xA1,0x08, // 16*10+2 symbols because
       0x23,0x42,0xB1,0xC1,0x15,0x52,0xD1,0xF0,0x24,0x33,0x62,0x72,0x82,0x09,0x0A,0x16,0x17,0x18,0x19,0x1A,0x25,0x26,0x27,0x28, // upper 4 bits can be 0..F
       0x29,0x2A,0x34,0x35,0x36,0x37,0x38,0x39,0x3A,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x53,0x54,0x55,0x56,0x57,0x58,0x59, // while lower 4 bits can be 1..A
       0x5A,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0x83,0x84,0x85,0x86,0x87,0x88,0x89, // plus two special codes 0x00 and 0xF0
       0x8A,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9A,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,0xA8,0xA9,0xAA,0xB2,0xB3,0xB4,0xB5,0xB6, // order of these symbols was determined empirically by JPEG committee
       0xB7,0xB8,0xB9,0xBA,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,0xC8,0xC9,0xCA,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7,0xD8,0xD9,0xDA,0xE1,0xE2,
       0xE3,0xE4,0xE5,0xE6,0xE7,0xE8,0xE9,0xEA,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,0xF8,0xF9,0xFA };
 // Huffman definitions for second DC/AC tables (chrominance / Cb and Cr channels)
 const uint8_t DcChrominanceCodesPerBitsize[16] = { 0,3,1,1,1,1,1,1,1,1,1,0,0,0,0,0 };   // sum = 12
 const uint8_t DcChrominanceValues         [12] = { 0,1,2,3,4,5,6,7,8,9,10,11 };         // => 12 codes (identical to DcLuminanceValues)
 const uint8_t AcChrominanceCodesPerBitsize[16] = { 0,2,1,2,4,4,3,4,7,5,4,4,0,1,2,119 }; // sum = 162
 const uint8_t AcChrominanceValues        [162] =                                        // => 162 codes
     { 0x00,0x01,0x02,0x03,0x11,0x04,0x05,0x21,0x31,0x06,0x12,0x41,0x51,0x07,0x61,0x71,0x13,0x22,0x32,0x81,0x08,0x14,0x42,0x91, // same number of symbol, just different order
       0xA1,0xB1,0xC1,0x09,0x23,0x33,0x52,0xF0,0x15,0x62,0x72,0xD1,0x0A,0x16,0x24,0x34,0xE1,0x25,0xF1,0x17,0x18,0x19,0x1A,0x26, // (which is more efficient for AC coding)
       0x27,0x28,0x29,0x2A,0x35,0x36,0x37,0x38,0x39,0x3A,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x53,0x54,0x55,0x56,0x57,0x58,
       0x59,0x5A,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0x82,0x83,0x84,0x85,0x86,0x87,
       0x88,0x89,0x8A,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9A,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,0xA8,0xA9,0xAA,0xB2,0xB3,0xB4,
       0xB5,0xB6,0xB7,0xB8,0xB9,0xBA,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,0xC8,0xC9,0xCA,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7,0xD8,0xD9,0xDA,
       0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,0xE8,0xE9,0xEA,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,0xF8,0xF9,0xFA };
 const int16_t CodeWordLimit = 2048; // +/-2^11, maximum value after DCT
  
 // ////////////////////////////////////////
 // structs
  
 // represent a single Huffman code
 struct BitCode
 {
   BitCode() = default; // undefined state, must be initialized at a later time
   BitCode(uint16_t code_, uint8_t numBits_)
   : code(code_), numBits(numBits_) {}
   uint16_t code;       // JPEG's Huffman codes are limited to 16 bits
   uint8_t  numBits;    // number of valid bits
 };
  
 // wrapper for bit output operations
 struct BitWriter
 {
   // user-supplied callback that writes/stores one byte
   TooJpeg::WRITE_ONE_BYTE output;
   // initialize writer
   explicit BitWriter(TooJpeg::WRITE_ONE_BYTE output_) : output(output_) {}
  
   // store the most recently encoded bits that are not written yet
   struct BitBuffer
   {
     int32_t data    = 0; // actually only at most 24 bits are used
     uint8_t numBits = 0; // number of valid bits (the right-most bits)
   } buffer;
  
   // write Huffman bits stored in BitCode, keep excess bits in BitBuffer
   BitWriter& operator<<(const BitCode& data)
   {
     // append the new bits to those bits leftover from previous call(s)
     buffer.numBits += data.numBits;
     buffer.data   <<= data.numBits;
     buffer.data    |= data.code;
  
     // write all "full" bytes
     while (buffer.numBits >= 8)
     {
       // extract highest 8 bits
       buffer.numBits -= 8;
       auto oneByte = uint8_t(buffer.data >> buffer.numBits);
       output(oneByte);
  
       if (oneByte == 0xFF) // 0xFF has a special meaning for JPEGs (it's a block marker)
         output(0);         // therefore pad a zero to indicate "nope, this one ain't a marker, it's just a coincidence"
  
       // note: I don't clear those written bits, therefore buffer.bits may contain garbage in the high bits
       //       if you really want to "clean up" (e.g. for debugging purposes) then uncomment the following line
       //buffer.bits &= (1 << buffer.numBits) - 1;
     }
     return *this;
   }
  
   // write all non-yet-written bits, fill gaps with 1s (that's a strange JPEG thing)
   void flush()
   {
     // at most seven set bits needed to "fill" the last byte: 0x7F = binary 0111 1111
     *this << BitCode(0x7F, 7); // I should set buffer.numBits = 0 but since there are no single bits written after flush() I can safely ignore it
   }
  
   // NOTE: all the following BitWriter functions IGNORE the BitBuffer and write straight to output !
   // write a single byte
   BitWriter& operator<<(uint8_t oneByte)
   {
     output(oneByte);
     return *this;
   }
  
   // write an array of bytes
   template <typename T, int Size>
   BitWriter& operator<<(T (&manyBytes)[Size])
   {
     for (auto c : manyBytes)
       output(c);
     return *this;
   }
  
   // start a new JFIF block
   void addMarker(uint8_t id, uint16_t length)
   {
     output(0xFF); output(id);     // ID, always preceded by 0xFF
     output(uint8_t(length >> 8)); // length of the block (big-endian, includes the 2 length bytes as well)
     output(uint8_t(length & 0xFF));
   }
 };
  
 // ////////////////////////////////////////
 // functions / templates
  
 // same as std::min()
 template <typename Number>
 Number minimum(Number value, Number maximum)
 {
   return value <= maximum ? value : maximum;
 }
  
 // restrict a value to the interval [minimum, maximum]
 template <typename Number, typename Limit>
 Number clamp(Number value, Limit minValue, Limit maxValue)
 {
   if (value <= minValue) return minValue; // never smaller than the minimum
   if (value >= maxValue) return maxValue; // never bigger  than the maximum
   return value;                           // value was inside interval, keep it
 }
  
 // convert from RGB to YCbCr, constants are similar to ITU-R, see https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion
 float rgb2y (float r, float g, float b) { return +0.299f   * r +0.587f   * g +0.114f   * b; }
 float rgb2cb(float r, float g, float b) { return -0.16874f * r -0.33126f * g +0.5f     * b; }
 float rgb2cr(float r, float g, float b) { return +0.5f     * r -0.41869f * g -0.08131f * b; }
 float rgb2y (int r, int g, int b) { return rgb2y ((float)r, (float)g, (float)b); }
 float rgb2cb(int r, int g, int b) { return rgb2cb((float)r, (float)g, (float)b); }
 float rgb2cr(int r, int g, int b) { return rgb2cr((float)r, (float)g, (float)b); }
  
 // forward DCT computation "in one dimension" (fast AAN algorithm by Arai, Agui and Nakajima: "A fast DCT-SQ scheme for images")
 void DCT(float block[8*8], uint8_t stride) // stride must be 1 (=horizontal) or 8 (=vertical)
 {
   const auto SqrtHalfSqrt = 1.306562965f; //    sqrt((2 + sqrt(2)) / 2) = cos(pi * 1 / 8) * sqrt(2)
   const auto InvSqrt      = 0.707106781f; // 1 / sqrt(2)                = cos(pi * 2 / 8)
   const auto HalfSqrtSqrt = 0.382683432f; //     sqrt(2 - sqrt(2)) / 2  = cos(pi * 3 / 8)
   const auto InvSqrtSqrt  = 0.541196100f; // 1 / sqrt(2 - sqrt(2))      = cos(pi * 3 / 8) * sqrt(2)
  
   // modify in-place
   auto& block0 = block[0         ];
   auto& block1 = block[1 * stride];
   auto& block2 = block[2 * stride];
   auto& block3 = block[3 * stride];
   auto& block4 = block[4 * stride];
   auto& block5 = block[5 * stride];
   auto& block6 = block[6 * stride];
   auto& block7 = block[7 * stride];
  
   // based on https://dev.w3.org/Amaya/libjpeg/jfdctflt.c , the original variable names can be found in my comments
   auto add07 = block0 + block7; auto sub07 = block0 - block7; // tmp0, tmp7
   auto add16 = block1 + block6; auto sub16 = block1 - block6; // tmp1, tmp6
   auto add25 = block2 + block5; auto sub25 = block2 - block5; // tmp2, tmp5
   auto add34 = block3 + block4; auto sub34 = block3 - block4; // tmp3, tmp4
  
   auto add0347 = add07 + add34; auto sub07_34 = add07 - add34; // tmp10, tmp13 ("even part" / "phase 2")
   auto add1256 = add16 + add25; auto sub16_25 = add16 - add25; // tmp11, tmp12
  
   block0 = add0347 + add1256; block4 = add0347 - add1256; // "phase 3"
  
   auto z1 = (sub16_25 + sub07_34) * InvSqrt; // all temporary z-variables kept their original names
   block2 = sub07_34 + z1; block6 = sub07_34 - z1; // "phase 5"
  
   auto sub23_45 = sub25 + sub34; // tmp10 ("odd part" / "phase 2")
   auto sub12_56 = sub16 + sub25; // tmp11
   auto sub01_67 = sub16 + sub07; // tmp12
  
   auto z5 = (sub23_45 - sub01_67) * HalfSqrtSqrt;
   auto z2 = sub23_45 * InvSqrtSqrt  + z5;
   auto z3 = sub12_56 * InvSqrt;
   auto z4 = sub01_67 * SqrtHalfSqrt + z5;
   auto z6 = sub07 + z3; // z11 ("phase 5")
   auto z7 = sub07 - z3; // z13
   block1 = z6 + z4; block7 = z6 - z4; // "phase 6"
   block5 = z7 + z2; block3 = z7 - z2;
 }
  
 // run DCT, quantize and write Huffman bit codes
 int16_t encodeBlock(BitWriter& writer, float block[8][8], const float scaled[8*8], int16_t lastDC,
                     const BitCode huffmanDC[256], const BitCode huffmanAC[256], const BitCode* codewords)
 {
   // "linearize" the 8x8 block, treat it as a flat array of 64 floats
   auto block64 = (float*) block;
  
   // DCT: rows
   for (auto offset = 0; offset < 8; offset++)
     DCT(block64 + offset*8, 1);
   // DCT: columns
   for (auto offset = 0; offset < 8; offset++)
     DCT(block64 + offset*1, 8);
  
   // scale
   for (auto i = 0; i < 8*8; i++)
     block64[i] *= scaled[i];
  
   // encode DC (the first coefficient is the "average color" of the 8x8 block)
   auto DC = int(block64[0] + (block64[0] >= 0 ? +0.5f : -0.5f)); // C++11's nearbyint() achieves a similar effect
  
   // quantize and zigzag the other 63 coefficients
   auto posNonZero = 0; // find last coefficient which is not zero (because trailing zeros are encoded differently)
   int16_t quantized[8*8];
   for (auto i = 1; i < 8*8; i++) // start at 1 because block64[0]=DC was already processed
   {
     auto value = block64[ZigZagInv[i]];
     // round to nearest integer
     quantized[i] = int(value + (value >= 0 ? +0.5f : -0.5f)); // C++11's nearbyint() achieves a similar effect
     // remember offset of last non-zero coefficient
     if (quantized[i] != 0)
       posNonZero = i;
   }
  
   // same "average color" as previous block ?
   auto diff = DC - lastDC;
   if (diff == 0)
     writer << huffmanDC[0x00];   // yes, write a special short symbol
   else
   {
     auto bits = codewords[diff]; // nope, encode the difference to previous block's average color
     writer << huffmanDC[bits.numBits] << bits;
   }
  
   // encode ACs (quantized[1..63])
   auto offset = 0; // upper 4 bits count the number of consecutive zeros
   for (auto i = 1; i <= posNonZero; i++) // quantized[0] was already written, skip all trailing zeros, too
   {
     // zeros are encoded in a special way
     while (quantized[i] == 0) // found another zero ?
     {
       offset    += 0x10; // add 1 to the upper 4 bits
       // split into blocks of at most 16 consecutive zeros
       if (offset > 0xF0) // remember, the counter is in the upper 4 bits, 0xF = 15
       {
         writer << huffmanAC[0xF0]; // 0xF0 is a special code for "16 zeros"
         offset = 0;
       }
       i++;
     }
  
     auto encoded = codewords[quantized[i]];
     // combine number of zeros with the number of bits of the next non-zero value
     writer << huffmanAC[offset + encoded.numBits] << encoded; // and the value itself
     offset = 0;
   }
  
   // send end-of-block code (0x00), only needed if there are trailing zeros
   if (posNonZero < 8*8 - 1) // = 63
     writer << huffmanAC[0x00];
  
   return DC;
 }
  
 // Jon's code includes the pre-generated Huffman codes
 // I don't like these "magic constants" and compute them on my own :-)
 void generateHuffmanTable(const uint8_t numCodes[16], const uint8_t* values, BitCode result[256])
 {
   // process all bitsizes 1 thru 16, no JPEG Huffman code is allowed to exceed 16 bits
   auto huffmanCode = 0;
   for (auto numBits = 1; numBits <= 16; numBits++)
   {
     // ... and each code of these bitsizes
     for (auto i = 0; i < numCodes[numBits - 1]; i++) // note: numCodes array starts at zero, but smallest bitsize is 1
       result[*values++] = BitCode(huffmanCode++, numBits);
  
     // next Huffman code needs to be one bit wider
     huffmanCode <<= 1;
   }
 }
  
 } // end of anonymous namespace
  
 // -------------------- externally visible code --------------------
  
 namespace TooJpeg
 {
 // the only exported function ...
 bool writeJpeg(WRITE_ONE_BYTE output, const void* pixels_, unsigned short width, unsigned short height,
                bool isRGB, unsigned char quality_, bool downsample, const char* comment)
 {
   // reject invalid pointers
   if (output == nullptr || pixels_ == nullptr)
     return false;
   // check image format
   if (width == 0 || height == 0)
     return false;
  
   // number of components
   const auto numComponents = isRGB ? 3 : 1;
   // note: if there is just one component (=grayscale), then only luminance needs to be stored in the file
   //       thus everything related to chrominance need not to be written to the JPEG
   //       I still compute a few things, like quantization tables to avoid a complete code mess
  
   // grayscale images can't be downsampled (because there are no Cb + Cr channels)
   if (!isRGB)
     downsample = false;
  
   // wrapper for all output operations
   BitWriter bitWriter(output);
  
   // ////////////////////////////////////////
   // JFIF headers
   const uint8_t HeaderJfif[2+2+16] =
       { 0xFF,0xD8,         // SOI marker (start of image)
         0xFF,0xE0,         // JFIF APP0 tag
         0,16,              // length: 16 bytes (14 bytes payload + 2 bytes for this length field)
         'J','F','I','F',0, // JFIF identifier, zero-terminated
         1,1,               // JFIF version 1.1
         0,                 // no density units specified
         0,1,0,1,           // density: 1 pixel "per pixel" horizontally and vertically
         0,0 };             // no thumbnail (size 0 x 0)
   bitWriter << HeaderJfif;
  
   // ////////////////////////////////////////
   // comment (optional)
   if (comment != nullptr)
   {
     // look for zero terminator
     auto length = 0; // = strlen(comment);
     while (comment[length] != 0)
       length++;
  
     // write COM marker
     bitWriter.addMarker(0xFE, 2+length); // block size is number of bytes (without zero terminator) + 2 bytes for this length field
     // ... and write the comment itself
     for (auto i = 0; i < length; i++)
       bitWriter << comment[i];
   }
  
   // ////////////////////////////////////////
   // adjust quantization tables to desired quality
  
   // quality level must be in 1 ... 100
   auto quality = clamp<uint16_t>(quality_, 1, 100);
   // convert to an internal JPEG quality factor, formula taken from libjpeg
   quality = quality < 50 ? 5000 / quality : 200 - quality * 2;
  
   uint8_t quantLuminance  [8*8];
   uint8_t quantChrominance[8*8];
   for (auto i = 0; i < 8*8; i++)
   {
     int luminance   = (DefaultQuantLuminance  [ZigZagInv[i]] * quality + 50) / 100;
     int chrominance = (DefaultQuantChrominance[ZigZagInv[i]] * quality + 50) / 100;
  
     // clamp to 1..255
     quantLuminance  [i] = clamp(luminance,   1, 255);
     quantChrominance[i] = clamp(chrominance, 1, 255);
   }
  
   // write quantization tables
   bitWriter.addMarker(0xDB, 2 + (isRGB ? 2 : 1) * (1 + 8*8)); // length: 65 bytes per table + 2 bytes for this length field
                                                               // each table has 64 entries and is preceded by an ID byte
  
   bitWriter   << 0x00 << quantLuminance;   // first  quantization table
   if (isRGB)
     bitWriter << 0x01 << quantChrominance; // second quantization table, only relevant for color images
  
   // ////////////////////////////////////////
   // write image infos (SOF0 - start of frame)
   bitWriter.addMarker(0xC0, 2+6+3*numComponents); // length: 6 bytes general info + 3 per channel + 2 bytes for this length field
  
   // 8 bits per channel
   bitWriter << 0x08
   // image dimensions (big-endian)
             << (height >> 8) << (height & 0xFF)
             << (width  >> 8) << (width  & 0xFF);
  
   // sampling and quantization tables for each component
   bitWriter << numComponents;       // 1 component (grayscale, Y only) or 3 components (Y,Cb,Cr)
   for (auto id = 1; id <= numComponents; id++)
     bitWriter <<  id                // component ID (Y=1, Cb=2, Cr=3)
     // bitmasks for sampling: highest 4 bits: horizontal, lowest 4 bits: vertical
               << (id == 1 && downsample ? 0x22 : 0x11) // 0x11 is default YCbCr 4:4:4 and 0x22 stands for YCbCr 4:2:0
               << (id == 1 ? 0 : 1); // use quantization table 0 for Y, table 1 for Cb and Cr
  
   // ////////////////////////////////////////
   // Huffman tables
   // DHT marker - define Huffman tables
   bitWriter.addMarker(0xC4, isRGB ? (2+208+208) : (2+208));
                             // 2 bytes for the length field, store chrominance only if needed
                             //   1+16+12  for the DC luminance
                             //   1+16+162 for the AC luminance   (208 = 1+16+12 + 1+16+162)
                             //   1+16+12  for the DC chrominance
                             //   1+16+162 for the AC chrominance (208 = 1+16+12 + 1+16+162, same as above)
  
   // store luminance's DC+AC Huffman table definitions
   bitWriter << 0x00 // highest 4 bits: 0 => DC, lowest 4 bits: 0 => Y (baseline)
             << DcLuminanceCodesPerBitsize
             << DcLuminanceValues;
   bitWriter << 0x10 // highest 4 bits: 1 => AC, lowest 4 bits: 0 => Y (baseline)
             << AcLuminanceCodesPerBitsize
             << AcLuminanceValues;
  
   // compute actual Huffman code tables (see Jon's code for precalculated tables)
   BitCode huffmanLuminanceDC[256];
   BitCode huffmanLuminanceAC[256];
   generateHuffmanTable(DcLuminanceCodesPerBitsize, DcLuminanceValues, huffmanLuminanceDC);
   generateHuffmanTable(AcLuminanceCodesPerBitsize, AcLuminanceValues, huffmanLuminanceAC);
  
   // chrominance is only relevant for color images
   BitCode huffmanChrominanceDC[256];
   BitCode huffmanChrominanceAC[256];
   if (isRGB)
   {
     // store luminance's DC+AC Huffman table definitions
     bitWriter << 0x01 // highest 4 bits: 0 => DC, lowest 4 bits: 1 => Cr,Cb (baseline)
               << DcChrominanceCodesPerBitsize
               << DcChrominanceValues;
     bitWriter << 0x11 // highest 4 bits: 1 => AC, lowest 4 bits: 1 => Cr,Cb (baseline)
               << AcChrominanceCodesPerBitsize
               << AcChrominanceValues;
  
     // compute actual Huffman code tables (see Jon's code for precalculated tables)
     generateHuffmanTable(DcChrominanceCodesPerBitsize, DcChrominanceValues, huffmanChrominanceDC);
     generateHuffmanTable(AcChrominanceCodesPerBitsize, AcChrominanceValues, huffmanChrominanceAC);
   }
  
   // ////////////////////////////////////////
   // start of scan (there is only a single scan for baseline JPEGs)
   bitWriter.addMarker(0xDA, 2+1+2*numComponents+3); // 2 bytes for the length field, 1 byte for number of components,
                                                     // then 2 bytes for each component and 3 bytes for spectral selection
  
   // assign Huffman tables to each component
   bitWriter << numComponents;
   for (auto id = 1; id <= numComponents; id++)
     // highest 4 bits: DC Huffman table, lowest 4 bits: AC Huffman table
     bitWriter << id << (id == 1 ? 0x00 : 0x11); // Y: tables 0 for DC and AC; Cb + Cr: tables 1 for DC and AC
  
   // constant values for our baseline JPEGs (which have a single sequential scan)
   static const uint8_t Spectral[3] = { 0, 63, 0 }; // spectral selection: must be from 0 to 63; successive approximation must be 0
   bitWriter << Spectral;
  
   // ////////////////////////////////////////
   // adjust quantization tables with AAN scaling factors to simplify DCT
   float scaledLuminance  [8*8];
   float scaledChrominance[8*8];
   for (auto i = 0; i < 8*8; i++)
   {
     auto row    = ZigZagInv[i] / 8; // same as ZigZagInv[i] >> 3
     auto column = ZigZagInv[i] % 8; // same as ZigZagInv[i] &  7
  
     // scaling constants for AAN DCT algorithm: AanScaleFactors[0] = 1, AanScaleFactors[k=1..7] = cos(k*PI/16) * sqrt(2)
     static const float AanScaleFactors[8] = { 1, 1.387039845f, 1.306562965f, 1.175875602f, 1, 0.785694958f, 0.541196100f, 0.275899379f };
     auto factor = 1 / (AanScaleFactors[row] * AanScaleFactors[column] * 8);
     scaledLuminance  [ZigZagInv[i]] = factor / quantLuminance  [i];
     scaledChrominance[ZigZagInv[i]] = factor / quantChrominance[i];
     // if you really want JPEGs that are bitwise identical to Jon Olick's code then you need slightly different formulas (note: sqrt(8) = 2.828427125f)
     //static const float aasf[] = { 1.0f * 2.828427125f, 1.387039845f * 2.828427125f, 1.306562965f * 2.828427125f, 1.175875602f * 2.828427125f, 1.0f * 2.828427125f, 0.785694958f * 2.828427125f, 0.541196100f * 2.828427125f, 0.275899379f * 2.828427125f }; // line 240 of jo_jpeg.cpp
     //scaledLuminance  [ZigZagInv[i]] = 1 / (quantLuminance  [i] * aasf[row] * aasf[column]); // lines 266-267 of jo_jpeg.cpp
     //scaledChrominance[ZigZagInv[i]] = 1 / (quantChrominance[i] * aasf[row] * aasf[column]);
   }
  
   // ////////////////////////////////////////
   // precompute JPEG codewords for quantized DCT
   BitCode  codewordsArray[2 * CodeWordLimit];          // note: quantized[i] is found at codewordsArray[quantized[i] + CodeWordLimit]
   BitCode* codewords = &codewordsArray[CodeWordLimit]; // allow negative indices, so quantized[i] is at codewords[quantized[i]]
   uint8_t numBits = 1; // each codeword has at least one bit (value == 0 is undefined)
   int32_t mask    = 1; // mask is always 2^numBits - 1, initial value 2^1-1 = 2-1 = 1
   for (int16_t value = 1; value < CodeWordLimit; value++)
   {
     // numBits = position of highest set bit (ignoring the sign)
     // mask    = (2^numBits) - 1
     if (value > mask) // one more bit ?
     {
       numBits++;
       mask = (mask << 1) | 1; // append a set bit
     }
     codewords[-value] = BitCode(mask - value, numBits); // note that I use a negative index => codewords[-value] = codewordsArray[CodeWordLimit  value]
     codewords[+value] = BitCode(       value, numBits);
   }
  
   // just convert image data from void*
   auto pixels = (const uint8_t*)pixels_;
  
   // the next two variables are frequently used when checking for image borders
   const auto maxWidth  = width  - 1; // "last row"
   const auto maxHeight = height - 1; // "bottom line"
  
   // process MCUs (minimum codes units) => image is subdivided into a grid of 8x8 or 16x16 tiles
   const auto sampling = downsample ? 2 : 1; // 1x1 or 2x2 sampling
   const auto mcuSize  = 8 * sampling;
  
   // average color of the previous MCU
   int16_t lastYDC = 0, lastCbDC = 0, lastCrDC = 0;
   // convert from RGB to YCbCr
   float Y[8][8], Cb[8][8], Cr[8][8];
  
   for (auto mcuY = 0; mcuY < height; mcuY += mcuSize) // each step is either 8 or 16 (=mcuSize)
     for (auto mcuX = 0; mcuX < width; mcuX += mcuSize)
     {
       // YCbCr 4:4:4 format: each MCU is a 8x8 block - the same applies to grayscale images, too
       // YCbCr 4:2:0 format: each MCU represents a 16x16 block, stored as 4x 8x8 Y-blocks plus 1x 8x8 Cb and 1x 8x8 Cr block)
       for (auto blockY = 0; blockY < mcuSize; blockY += 8) // iterate once (YCbCr444 and grayscale) or twice (YCbCr420)
         for (auto blockX = 0; blockX < mcuSize; blockX += 8)
         {
           // now we finally have an 8x8 block ...
           for (auto deltaY = 0; deltaY < 8; deltaY++)
           {
             auto column = minimum(mcuX + blockX         , maxWidth); // must not exceed image borders, replicate last row/column if needed
             auto row    = minimum(mcuY + blockY + deltaY, maxHeight);
             for (auto deltaX = 0; deltaX < 8; deltaX++)
             {
               // find actual pixel position within the current image
               auto pixelPos = row * int(width) + column; // the cast ensures that we don't run into multiplication overflows
               if (column < maxWidth)
                 column++;
  
               // grayscale images have solely a Y channel which can be easily derived from the input pixel by shifting it by 128
               if (!isRGB)
               {
                 Y[deltaY][deltaX] = pixels[pixelPos] - 128.f;
                 continue;
               }
  
               // RGB: 3 bytes per pixel (whereas grayscale images have only 1 byte per pixel)
               auto r = pixels[3 * pixelPos    ];
               auto g = pixels[3 * pixelPos + 1];
               auto b = pixels[3 * pixelPos + 2];
  
               Y   [deltaY][deltaX] = rgb2y (r, g, b) - 128; // again, the JPEG standard requires Y to be shifted by 128
               // YCbCr444 is easy - the more complex YCbCr420 has to be computed about 20 lines below in a second pass
               if (!downsample)
               {
                 Cb[deltaY][deltaX] = rgb2cb(r, g, b); // standard RGB-to-YCbCr conversion
                 Cr[deltaY][deltaX] = rgb2cr(r, g, b);
               }
             }
           }
  
         // encode Y channel
         lastYDC = encodeBlock(bitWriter, Y, scaledLuminance, lastYDC, huffmanLuminanceDC, huffmanLuminanceAC, codewords);
         // Cb and Cr are encoded about 50 lines below
       }
  
       // grayscale images don't need any Cb and Cr information
       if (!isRGB)
         continue;
  
       // ////////////////////////////////////////
       // the following lines are only relevant for YCbCr420:
       // average/downsample chrominance of four pixels while respecting the image borders
       if (downsample)
         for (short deltaY = 7; downsample && deltaY >= 0; deltaY--) // iterating loop in reverse increases cache read efficiency
         {
           auto row      = minimum(mcuY + 2*deltaY, maxHeight); // each deltaX/Y step covers a 2x2 area
           auto column   =         mcuX;                        // column is updated inside next loop
           auto pixelPos = (row * int(width) + column) * 3;     // numComponents = 3
  
           // deltas (in bytes) to next row / column, must not exceed image borders
           auto rowStep    = (row    < maxHeight) ? 3 * int(width) : 0; // always numComponents*width except for bottom    line
           auto columnStep = (column < maxWidth ) ? 3              : 0; // always numComponents       except for rightmost pixel
  
           for (short deltaX = 0; deltaX < 8; deltaX++)
           {
             // let's add all four samples (2x2 area)
             auto right     = pixelPos + columnStep;
             auto down      = pixelPos +              rowStep;
             auto downRight = pixelPos + columnStep + rowStep;
  
             // note: cast from 8 bits to >8 bits to avoid overflows when adding
             auto r = short(pixels[pixelPos    ]) + pixels[right    ] + pixels[down    ] + pixels[downRight    ];
             auto g = short(pixels[pixelPos + 1]) + pixels[right + 1] + pixels[down + 1] + pixels[downRight + 1];
             auto b = short(pixels[pixelPos + 2]) + pixels[right + 2] + pixels[down + 2] + pixels[downRight + 2];
  
             // convert to Cb and Cr
             Cb[deltaY][deltaX] = rgb2cb(r, g, b) / 4; // I still have to divide r,g,b by 4 to get their average values
             Cr[deltaY][deltaX] = rgb2cr(r, g, b) / 4; // it's a bit faster if done AFTER CbCr conversion
  
             // step forward to next 2x2 area
             pixelPos += 2*3; // 2 pixels => 6 bytes (2*numComponents)
             column   += 2;
  
             // reached right border ?
             if (column >= maxWidth)
             {
               columnStep = 0;
               pixelPos = ((row + 1) * int(width) - 1) * 3; // same as (row * width + maxWidth) * numComponents => current's row last pixel
             }
           }
         } // end of YCbCr420 code for Cb and Cr
  
       // encode Cb and Cr
       lastCbDC = encodeBlock(bitWriter, Cb, scaledChrominance, lastCbDC, huffmanChrominanceDC, huffmanChrominanceAC, codewords);
       lastCrDC = encodeBlock(bitWriter, Cr, scaledChrominance, lastCrDC, huffmanChrominanceDC, huffmanChrominanceAC, codewords);
     }
  
   bitWriter.flush(); // now image is completely encoded, write any bits still left in the buffer
  
   // ///////////////////////////
   // EOI marker
   bitWriter << 0xFF << 0xD9; // this marker has no length, therefore I can't use addMarker()
   return true;
 } // writeJpeg()
 } // namespace TooJpeg