openhrp3: IceRevisitedRadix.cpp Source File

Go to the documentation of this file.
 
 
 
 
 /*
 To do:
         - add an offset parameter between two input values (avoid some data recopy sometimes)
         - unroll ? asm ?
         - 11 bits trick & 3 passes as Michael did
         - prefetch stuff the day I have a P3
         - make a version with 16-bits indices ?
 */
 
 // Precompiled Header
 #include "Stdafx.h"
 
 using namespace IceCore;
 
 #define INVALIDATE_RANKS        mCurrentSize|=0x80000000
 #define VALIDATE_RANKS          mCurrentSize&=0x7fffffff
 #define CURRENT_SIZE            (mCurrentSize&0x7fffffff)
 #define INVALID_RANKS           (mCurrentSize&0x80000000)
 
 #define CHECK_RESIZE(n)                                                                                                                                         \
         if(n!=mPreviousSize)                                                                                                                                    \
         {                                                                                                                                                                               \
                                 if(n>mCurrentSize)      Resize(n);                                                                                              \
                 else                                            ResetRanks();                                                                                   \
                 mPreviousSize = n;                                                                                                                                      \
         }
 
 #define CREATE_HISTOGRAMS(type, buffer)                                                                                                         \
         /* Clear counters/histograms */                                                                                                                 \
         ZeroMemory(mHistogram, 256*4*sizeof(udword));                                                                                   \
                                                                                                                                                                                         \
         /* Prepare to count */                                                                                                                                  \
         ubyte* p = (ubyte*)input;                                                                                                                               \
         ubyte* pe = &p[nb*4];                                                                                                                                   \
         udword* h0= &mHistogram[0];             /* Histogram for first pass (LSB)       */                                      \
         udword* h1= &mHistogram[256];   /* Histogram for second pass            */                                      \
         udword* h2= &mHistogram[512];   /* Histogram for third pass                     */                                      \
         udword* h3= &mHistogram[768];   /* Histogram for last pass (MSB)        */                                      \
                                                                                                                                                                                         \
         bool AlreadySorted = true;      /* Optimism... */                                                                                       \
                                                                                                                                                                                         \
         if(INVALID_RANKS)                                                                                                                                               \
         {                                                                                                                                                                               \
                 /* Prepare for temporal coherence */                                                                                            \
                 type* Running = (type*)buffer;                                                                                                          \
                 type PrevVal = *Running;                                                                                                                        \
                                                                                                                                                                                         \
                 while(p!=pe)                                                                                                                                            \
                 {                                                                                                                                                                       \
                         /* Read input buffer in previous sorted order */                                                                \
                         type Val = *Running++;                                                                                                                  \
                         /* Check whether already sorted or not */                                                                               \
                         if(Val<PrevVal) { AlreadySorted = false; break; } /* Early out */                               \
                         /* Update for next iteration */                                                                                                 \
                         PrevVal = Val;                                                                                                                                  \
                                                                                                                                                                                         \
                         /* Create histograms */                                                                                                                 \
                         h0[*p++]++;     h1[*p++]++;     h2[*p++]++;     h3[*p++]++;                                                                     \
                 }                                                                                                                                                                       \
                                                                                                                                                                                         \
                 /* If all input values are already sorted, we just have to return and leave the */      \
                 /* previous list unchanged. That way the routine may take advantage of temporal */      \
                 /* coherence, for example when used to sort transparent faces.                                  */      \
                 if(AlreadySorted)                                                                                                                                       \
                 {                                                                                                                                                                       \
                         mNbHits++;                                                                                                                                              \
                         for(udword i=0;i<nb;i++)        mRanks[i] = i;                                                                          \
                         return *this;                                                                                                                                   \
                 }                                                                                                                                                                       \
         }                                                                                                                                                                               \
         else                                                                                                                                                                    \
         {                                                                                                                                                                               \
                 /* Prepare for temporal coherence */                                                                                            \
                 udword* Indices = mRanks;                                                                                                                       \
                 type PrevVal = (type)buffer[*Indices];                                                                                          \
                                                                                                                                                                                         \
                 while(p!=pe)                                                                                                                                            \
                 {                                                                                                                                                                       \
                         /* Read input buffer in previous sorted order */                                                                \
                         type Val = (type)buffer[*Indices++];                                                                                    \
                         /* Check whether already sorted or not */                                                                               \
                         if(Val<PrevVal) { AlreadySorted = false; break; } /* Early out */                               \
                         /* Update for next iteration */                                                                                                 \
                         PrevVal = Val;                                                                                                                                  \
                                                                                                                                                                                         \
                         /* Create histograms */                                                                                                                 \
                         h0[*p++]++;     h1[*p++]++;     h2[*p++]++;     h3[*p++]++;                                                                     \
                 }                                                                                                                                                                       \
                                                                                                                                                                                         \
                 /* If all input values are already sorted, we just have to return and leave the */      \
                 /* previous list unchanged. That way the routine may take advantage of temporal */      \
                 /* coherence, for example when used to sort transparent faces.                                  */      \
                 if(AlreadySorted)       { mNbHits++; return *this;      }                                                                       \
         }                                                                                                                                                                               \
                                                                                                                                                                                         \
         /* Else there has been an early out and we must finish computing the histograms */              \
         while(p!=pe)                                                                                                                                                    \
         {                                                                                                                                                                               \
                 /* Create histograms without the previous overhead */                                                           \
                 h0[*p++]++;     h1[*p++]++;     h2[*p++]++;     h3[*p++]++;                                                                             \
         }
 
 #define CHECK_PASS_VALIDITY(pass)                                                                                                                       \
         /* Shortcut to current counters */                                                                                                              \
         udword* CurCount = &mHistogram[pass<<8];                                                                                                \
                                                                                                                                                                                         \
         /* Reset flag. The sorting pass is supposed to be performed. (default) */                               \
         bool PerformPass = true;                                                                                                                                \
                                                                                                                                                                                         \
         /* Check pass validity */                                                                                                                               \
                                                                                                                                                                                         \
         /* If all values have the same byte, sorting is useless. */                                                             \
         /* It may happen when sorting bytes or words instead of dwords. */                                              \
         /* This routine actually sorts words faster than dwords, and bytes */                                   \
         /* faster than words. Standard running time (O(4*n))is reduced to O(2*n) */                             \
         /* for words and O(n) for bytes. Running time for floats depends on actual values... */ \
                                                                                                                                                                                         \
         /* Get first byte */                                                                                                                                    \
         ubyte UniqueVal = *(((ubyte*)input)+pass);                                                                                              \
                                                                                                                                                                                         \
         /* Check that byte's counter */                                                                                                                 \
         if(CurCount[UniqueVal]==nb)     PerformPass=false;
 
 
 RadixSort::RadixSort() : mCurrentSize(0), mRanks(null), mRanks2(null), mTotalCalls(0), mNbHits(0)
 {
 #ifndef RADIX_LOCAL_RAM
         // Allocate input-independent ram
         mHistogram      = new udword[256*4];
         mOffset         = new udword[256];
 #endif
         // Initialize indices
         INVALIDATE_RANKS;
 }
 
 
 RadixSort::~RadixSort()
 {
         // Release everything
 #ifndef RADIX_LOCAL_RAM
         DELETEARRAY(mOffset);
         DELETEARRAY(mHistogram);
 #endif
         DELETEARRAY(mRanks2);
         DELETEARRAY(mRanks);
 }
 
 
 bool RadixSort::Resize(udword nb)
 {
         // Free previously used ram
         DELETEARRAY(mRanks2);
         DELETEARRAY(mRanks);
 
         // Get some fresh one
         mRanks  = new udword[nb];       CHECKALLOC(mRanks);
         mRanks2 = new udword[nb];       CHECKALLOC(mRanks2);
 
         return true;
 }
 
 inline_ void RadixSort::CheckResize(udword nb)
 {
         udword CurSize = CURRENT_SIZE;
         if(nb!=CurSize)
         {
                 if(nb>CurSize)  Resize(nb);
                 mCurrentSize = nb;
                 INVALIDATE_RANKS;
         }
 }
 
 
 RadixSort& RadixSort::Sort(const udword* input, udword nb, RadixHint hint)
 {
         // Checkings
         if(!input || !nb || nb&0x80000000)      return *this;
 
         // Stats
         mTotalCalls++;
 
         // Resize lists if needed
         CheckResize(nb);
 
 #ifdef RADIX_LOCAL_RAM
         // Allocate histograms & offsets on the stack
         udword mHistogram[256*4];
 //      udword mOffset[256];
         udword* mLink[256];
 #endif
 
         // Create histograms (counters). Counters for all passes are created in one run.
         // Pros:        read input buffer once instead of four times
         // Cons:        mHistogram is 4Kb instead of 1Kb
         // We must take care of signed/unsigned values for temporal coherence.... I just
         // have 2 code paths even if just a single opcode changes. Self-modifying code, someone?
         if(hint==RADIX_UNSIGNED)        { CREATE_HISTOGRAMS(udword, input);     }
         else                                            { CREATE_HISTOGRAMS(sdword, input);     }
 
         // Compute #negative values involved if needed
         udword NbNegativeValues = 0;
         if(hint==RADIX_SIGNED)
         {
                 // An efficient way to compute the number of negatives values we'll have to deal with is simply to sum the 128
                 // last values of the last histogram. Last histogram because that's the one for the Most Significant Byte,
                 // responsible for the sign. 128 last values because the 128 first ones are related to positive numbers.
                 udword* h3= &mHistogram[768];
                 for(udword i=128;i<256;i++)     NbNegativeValues += h3[i];      // 768 for last histogram, 128 for negative part
         }
 
         // Radix sort, j is the pass number (0=LSB, 3=MSB)
         for(udword j=0;j<4;j++)
         {
                 CHECK_PASS_VALIDITY(j);
 
                 // Sometimes the fourth (negative) pass is skipped because all numbers are negative and the MSB is 0xFF (for example). This is
                 // not a problem, numbers are correctly sorted anyway.
                 if(PerformPass)
                 {
                         // Should we care about negative values?
                         if(j!=3 || hint==RADIX_UNSIGNED)
                         {
                                 // Here we deal with positive values only
 
                                 // Create offsets
 //                              mOffset[0] = 0;
 //                              for(udword i=1;i<256;i++)               mOffset[i] = mOffset[i-1] + CurCount[i-1];
                                 mLink[0] = mRanks2;
                                 for(udword i=1;i<256;i++)               mLink[i] = mLink[i-1] + CurCount[i-1];
                         }
                         else
                         {
                                 // This is a special case to correctly handle negative integers. They're sorted in the right order but at the wrong place.
 
                                 // Create biased offsets, in order for negative numbers to be sorted as well
 //                              mOffset[0] = NbNegativeValues;                                                                                          // First positive number takes place after the negative ones
                                 mLink[0] = &mRanks2[NbNegativeValues];                                                                          // First positive number takes place after the negative ones
 //                              for(udword i=1;i<128;i++)               mOffset[i] = mOffset[i-1] + CurCount[i-1];      // 1 to 128 for positive numbers
                                 for(udword i=1;i<128;i++)               mLink[i] = mLink[i-1] + CurCount[i-1];          // 1 to 128 for positive numbers
 
                                 // Fixing the wrong place for negative values
 //                              mOffset[128] = 0;
                                 mLink[128] = mRanks2;
 //                              for(i=129;i<256;i++)                    mOffset[i] = mOffset[i-1] + CurCount[i-1];
                                 for(udword i=129;i<256;i++)             mLink[i] = mLink[i-1] + CurCount[i-1];
                         }
 
                         // Perform Radix Sort
                         ubyte* InputBytes       = (ubyte*)input;
                         InputBytes += j;
                         if(INVALID_RANKS)
                         {
 //                              for(udword i=0;i<nb;i++)        mRanks2[mOffset[InputBytes[i<<2]]++] = i;
                                 for(udword i=0;i<nb;i++)        *mLink[InputBytes[i<<2]]++ = i;
                                 VALIDATE_RANKS;
                         }
                         else
                         {
                                 udword* Indices         = mRanks;
                                 udword* IndicesEnd      = &mRanks[nb];
                                 while(Indices!=IndicesEnd)
                                 {
                                         udword id = *Indices++;
 //                                      mRanks2[mOffset[InputBytes[id<<2]]++] = id;
                                         *mLink[InputBytes[id<<2]]++ = id;
                                 }
                         }
 
                         // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
                         udword* Tmp     = mRanks;       mRanks = mRanks2; mRanks2 = Tmp;
                 }
         }
         return *this;
 }
 
 
 RadixSort& RadixSort::Sort(const float* input2, udword nb)
 {
         // Checkings
         if(!input2 || !nb || nb&0x80000000)     return *this;
 
         // Stats
         mTotalCalls++;
 
         udword* input = (udword*)input2;
 
         // Resize lists if needed
         CheckResize(nb);
 
 #ifdef RADIX_LOCAL_RAM
         // Allocate histograms & offsets on the stack
         udword mHistogram[256*4];
 //      udword mOffset[256];
         udword* mLink[256];
 #endif
 
         // Create histograms (counters). Counters for all passes are created in one run.
         // Pros:        read input buffer once instead of four times
         // Cons:        mHistogram is 4Kb instead of 1Kb
         // Floating-point values are always supposed to be signed values, so there's only one code path there.
         // Please note the floating point comparison needed for temporal coherence! Although the resulting asm code
         // is dreadful, this is surprisingly not such a performance hit - well, I suppose that's a big one on first
         // generation Pentiums....We can't make comparison on integer representations because, as Chris said, it just
         // wouldn't work with mixed positive/negative values....
         { CREATE_HISTOGRAMS(float, input2); }
 
         // Compute #negative values involved if needed
         udword NbNegativeValues = 0;
         // An efficient way to compute the number of negatives values we'll have to deal with is simply to sum the 128
         // last values of the last histogram. Last histogram because that's the one for the Most Significant Byte,
         // responsible for the sign. 128 last values because the 128 first ones are related to positive numbers.
         udword* h3= &mHistogram[768];
         for(udword i=128;i<256;i++)     NbNegativeValues += h3[i];      // 768 for last histogram, 128 for negative part
 
         // Radix sort, j is the pass number (0=LSB, 3=MSB)
         for(udword j=0;j<4;j++)
         {
                 // Should we care about negative values?
                 if(j!=3)
                 {
                         // Here we deal with positive values only
                         CHECK_PASS_VALIDITY(j);
 
                         if(PerformPass)
                         {
                                 // Create offsets
 //                              mOffset[0] = 0;
                                 mLink[0] = mRanks2;
 //                              for(udword i=1;i<256;i++)               mOffset[i] = mOffset[i-1] + CurCount[i-1];
                                 for(udword i=1;i<256;i++)               mLink[i] = mLink[i-1] + CurCount[i-1];
 
                                 // Perform Radix Sort
                                 ubyte* InputBytes = (ubyte*)input;
                                 InputBytes += j;
                                 if(INVALID_RANKS)
                                 {
 //                                      for(i=0;i<nb;i++)       mRanks2[mOffset[InputBytes[i<<2]]++] = i;
                                         for(udword i=0;i<nb;i++)        *mLink[InputBytes[i<<2]]++ = i;
                                         VALIDATE_RANKS;
                                 }
                                 else
                                 {
                                         udword* Indices         = mRanks;
                                         udword* IndicesEnd      = &mRanks[nb];
                                         while(Indices!=IndicesEnd)
                                         {
                                                 udword id = *Indices++;
 //                                              mRanks2[mOffset[InputBytes[id<<2]]++] = id;
                                                 *mLink[InputBytes[id<<2]]++ = id;
                                         }
                                 }
 
                                 // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
                                 udword* Tmp     = mRanks;       mRanks = mRanks2; mRanks2 = Tmp;
                         }
                 }
                 else
                 {
                         // This is a special case to correctly handle negative values
                         CHECK_PASS_VALIDITY(j);
 
                         if(PerformPass)
                         {
                                 // Create biased offsets, in order for negative numbers to be sorted as well
 //                              mOffset[0] = NbNegativeValues;                                                                                          // First positive number takes place after the negative ones
                                 mLink[0] = &mRanks2[NbNegativeValues];                                                                          // First positive number takes place after the negative ones
 //                              for(udword i=1;i<128;i++)               mOffset[i] = mOffset[i-1] + CurCount[i-1];      // 1 to 128 for positive numbers
                                 for(udword i=1;i<128;i++)               mLink[i] = mLink[i-1] + CurCount[i-1];          // 1 to 128 for positive numbers
 
                                 // We must reverse the sorting order for negative numbers!
 //                              mOffset[255] = 0;
                                 mLink[255] = mRanks2;
 //                              for(i=0;i<127;i++)              mOffset[254-i] = mOffset[255-i] + CurCount[255-i];      // Fixing the wrong order for negative values
                                 for(udword i=0;i<127;i++)       mLink[254-i] = mLink[255-i] + CurCount[255-i];          // Fixing the wrong order for negative values
 //                              for(i=128;i<256;i++)    mOffset[i] += CurCount[i];                                                      // Fixing the wrong place for negative values
                                 for(udword i=128;i<256;i++)     mLink[i] += CurCount[i];                                                        // Fixing the wrong place for negative values
 
                                 // Perform Radix Sort
                                 if(INVALID_RANKS)
                                 {
                                         for(udword i=0;i<nb;i++)
                                         {
                                                 udword Radix = input[i]>>24;                                                    // Radix byte, same as above. AND is useless here (udword).
                                                 // ### cmp to be killed. Not good. Later.
 //                                              if(Radix<128)           mRanks2[mOffset[Radix]++] = i;          // Number is positive, same as above
 //                                              else                            mRanks2[--mOffset[Radix]] = i;          // Number is negative, flip the sorting order
                                                 if(Radix<128)           *mLink[Radix]++ = i;            // Number is positive, same as above
                                                 else                            *(--mLink[Radix]) = i;          // Number is negative, flip the sorting order
                                         }
                                         VALIDATE_RANKS;
                                 }
                                 else
                                 {
                                         for(udword i=0;i<nb;i++)
                                         {
                                                 udword Radix = input[mRanks[i]]>>24;                                                    // Radix byte, same as above. AND is useless here (udword).
                                                 // ### cmp to be killed. Not good. Later.
 //                                              if(Radix<128)           mRanks2[mOffset[Radix]++] = mRanks[i];          // Number is positive, same as above
 //                                              else                            mRanks2[--mOffset[Radix]] = mRanks[i];          // Number is negative, flip the sorting order
                                                 if(Radix<128)           *mLink[Radix]++ = mRanks[i];            // Number is positive, same as above
                                                 else                            *(--mLink[Radix]) = mRanks[i];          // Number is negative, flip the sorting order
                                         }
                                 }
                                 // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
                                 udword* Tmp     = mRanks;       mRanks = mRanks2; mRanks2 = Tmp;
                         }
                         else
                         {
                                 // The pass is useless, yet we still have to reverse the order of current list if all values are negative.
                                 if(UniqueVal>=128)
                                 {
                                         if(INVALID_RANKS)
                                         {
                                                 // ###Possible?
                                                 for(udword i=0;i<nb;i++)        mRanks2[i] = nb-i-1;
                                                 VALIDATE_RANKS;
                                         }
                                         else
                                         {
                                                 for(udword i=0;i<nb;i++)        mRanks2[i] = mRanks[nb-i-1];
                                         }
 
                                         // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
                                         udword* Tmp     = mRanks;       mRanks = mRanks2; mRanks2 = Tmp;
                                 }
                         }
                 }
         }
         return *this;
 }
 
 
 udword RadixSort::GetUsedRam() const
 {
         udword UsedRam = sizeof(RadixSort);
 #ifndef RADIX_LOCAL_RAM
         UsedRam += 256*4*sizeof(udword);                        // Histograms
         UsedRam += 256*sizeof(udword);                          // Offsets
 #endif
         UsedRam += 2*CURRENT_SIZE*sizeof(udword);       // 2 lists of indices
         return UsedRam;
 }