IceRevisitedRadix.cpp
Go to the documentation of this file.
1 
8 
11 
39 
41 /*
42 To do:
43  - add an offset parameter between two input values (avoid some data recopy sometimes)
44  - unroll ? asm ?
45  - 11 bits trick & 3 passes as Michael did
46  - prefetch stuff the day I have a P3
47  - make a version with 16-bits indices ?
48 */
49 
51 // Precompiled Header
52 #include "Stdafx.h"
53 
54 using namespace IceCore;
55 
56 #define INVALIDATE_RANKS mCurrentSize|=0x80000000
57 #define VALIDATE_RANKS mCurrentSize&=0x7fffffff
58 #define CURRENT_SIZE (mCurrentSize&0x7fffffff)
59 #define INVALID_RANKS (mCurrentSize&0x80000000)
60 
61 #define CHECK_RESIZE(n) \
62  if(n!=mPreviousSize) \
63  { \
64  if(n>mCurrentSize) Resize(n); \
65  else ResetRanks(); \
66  mPreviousSize = n; \
67  }
68 
69 #define CREATE_HISTOGRAMS(type, buffer) \
70  /* Clear counters/histograms */ \
71  ZeroMemory(mHistogram, 256*4*sizeof(udword)); \
72  \
73  /* Prepare to count */ \
74  ubyte* p = (ubyte*)input; \
75  ubyte* pe = &p[nb*4]; \
76  udword* h0= &mHistogram[0]; /* Histogram for first pass (LSB) */ \
77  udword* h1= &mHistogram[256]; /* Histogram for second pass */ \
78  udword* h2= &mHistogram[512]; /* Histogram for third pass */ \
79  udword* h3= &mHistogram[768]; /* Histogram for last pass (MSB) */ \
80  \
81  bool AlreadySorted = true; /* Optimism... */ \
82  \
83  if(INVALID_RANKS) \
84  { \
85  /* Prepare for temporal coherence */ \
86  type* Running = (type*)buffer; \
87  type PrevVal = *Running; \
88  \
89  while(p!=pe) \
90  { \
91  /* Read input buffer in previous sorted order */ \
92  type Val = *Running++; \
93  /* Check whether already sorted or not */ \
94  if(Val<PrevVal) { AlreadySorted = false; break; } /* Early out */ \
95  /* Update for next iteration */ \
96  PrevVal = Val; \
97  \
98  /* Create histograms */ \
99  h0[*p++]++; h1[*p++]++; h2[*p++]++; h3[*p++]++; \
100  } \
101  \
102  /* If all input values are already sorted, we just have to return and leave the */ \
103  /* previous list unchanged. That way the routine may take advantage of temporal */ \
104  /* coherence, for example when used to sort transparent faces. */ \
105  if(AlreadySorted) \
106  { \
107  mNbHits++; \
108  for(udword i=0;i<nb;i++) mRanks[i] = i; \
109  return *this; \
110  } \
111  } \
112  else \
113  { \
114  /* Prepare for temporal coherence */ \
115  udword* Indices = mRanks; \
116  type PrevVal = (type)buffer[*Indices]; \
117  \
118  while(p!=pe) \
119  { \
120  /* Read input buffer in previous sorted order */ \
121  type Val = (type)buffer[*Indices++]; \
122  /* Check whether already sorted or not */ \
123  if(Val<PrevVal) { AlreadySorted = false; break; } /* Early out */ \
124  /* Update for next iteration */ \
125  PrevVal = Val; \
126  \
127  /* Create histograms */ \
128  h0[*p++]++; h1[*p++]++; h2[*p++]++; h3[*p++]++; \
129  } \
130  \
131  /* If all input values are already sorted, we just have to return and leave the */ \
132  /* previous list unchanged. That way the routine may take advantage of temporal */ \
133  /* coherence, for example when used to sort transparent faces. */ \
134  if(AlreadySorted) { mNbHits++; return *this; } \
135  } \
136  \
137  /* Else there has been an early out and we must finish computing the histograms */ \
138  while(p!=pe) \
139  { \
140  /* Create histograms without the previous overhead */ \
141  h0[*p++]++; h1[*p++]++; h2[*p++]++; h3[*p++]++; \
142  }
143 
144 #define CHECK_PASS_VALIDITY(pass) \
145  /* Shortcut to current counters */ \
146  udword* CurCount = &mHistogram[pass<<8]; \
147  \
148  /* Reset flag. The sorting pass is supposed to be performed. (default) */ \
149  bool PerformPass = true; \
150  \
151  /* Check pass validity */ \
152  \
153  /* If all values have the same byte, sorting is useless. */ \
154  /* It may happen when sorting bytes or words instead of dwords. */ \
155  /* This routine actually sorts words faster than dwords, and bytes */ \
156  /* faster than words. Standard running time (O(4*n))is reduced to O(2*n) */ \
157  /* for words and O(n) for bytes. Running time for floats depends on actual values... */ \
158  \
159  /* Get first byte */ \
160  ubyte UniqueVal = *(((ubyte*)input)+pass); \
161  \
162  /* Check that byte's counter */ \
163  if(CurCount[UniqueVal]==nb) PerformPass=false;
164 
166 
169 RadixSort::RadixSort() : mCurrentSize(0), mRanks(null), mRanks2(null), mTotalCalls(0), mNbHits(0)
171 {
172 #ifndef RADIX_LOCAL_RAM
173  // Allocate input-independent ram
174  mHistogram = new udword[256*4];
175  mOffset = new udword[256];
176 #endif
177  // Initialize indices
179 }
180 
182 
187 {
188  // Release everything
189 #ifndef RADIX_LOCAL_RAM
190  DELETEARRAY(mOffset);
191  DELETEARRAY(mHistogram);
192 #endif
195 }
196 
198 
203 bool RadixSort::Resize(udword nb)
205 {
206  // Free previously used ram
209 
210  // Get some fresh one
211  mRanks = new udword[nb]; CHECKALLOC(mRanks);
212  mRanks2 = new udword[nb]; CHECKALLOC(mRanks2);
213 
214  return true;
215 }
216 
218 {
219  udword CurSize = CURRENT_SIZE;
220  if(nb!=CurSize)
221  {
222  if(nb>CurSize) Resize(nb);
223  mCurrentSize = nb;
225  }
226 }
227 
229 
237 RadixSort& RadixSort::Sort(const udword* input, udword nb, RadixHint hint)
239 {
240  // Checkings
241  if(!input || !nb || nb&0x80000000) return *this;
242 
243  // Stats
244  mTotalCalls++;
245 
246  // Resize lists if needed
247  CheckResize(nb);
248 
249 #ifdef RADIX_LOCAL_RAM
250  // Allocate histograms & offsets on the stack
251  udword mHistogram[256*4];
252 // udword mOffset[256];
253  udword* mLink[256];
254 #endif
255 
256  // Create histograms (counters). Counters for all passes are created in one run.
257  // Pros: read input buffer once instead of four times
258  // Cons: mHistogram is 4Kb instead of 1Kb
259  // We must take care of signed/unsigned values for temporal coherence.... I just
260  // have 2 code paths even if just a single opcode changes. Self-modifying code, someone?
261  if(hint==RADIX_UNSIGNED) { CREATE_HISTOGRAMS(udword, input); }
262  else { CREATE_HISTOGRAMS(sdword, input); }
263 
264  // Compute #negative values involved if needed
265  udword NbNegativeValues = 0;
266  if(hint==RADIX_SIGNED)
267  {
268  // An efficient way to compute the number of negatives values we'll have to deal with is simply to sum the 128
269  // last values of the last histogram. Last histogram because that's the one for the Most Significant Byte,
270  // responsible for the sign. 128 last values because the 128 first ones are related to positive numbers.
271  udword* h3= &mHistogram[768];
272  for(udword i=128;i<256;i++) NbNegativeValues += h3[i]; // 768 for last histogram, 128 for negative part
273  }
274 
275  // Radix sort, j is the pass number (0=LSB, 3=MSB)
276  for(udword j=0;j<4;j++)
277  {
279 
280  // Sometimes the fourth (negative) pass is skipped because all numbers are negative and the MSB is 0xFF (for example). This is
281  // not a problem, numbers are correctly sorted anyway.
282  if(PerformPass)
283  {
284  // Should we care about negative values?
285  if(j!=3 || hint==RADIX_UNSIGNED)
286  {
287  // Here we deal with positive values only
288 
289  // Create offsets
290 // mOffset[0] = 0;
291 // for(udword i=1;i<256;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1];
292  mLink[0] = mRanks2;
293  for(udword i=1;i<256;i++) mLink[i] = mLink[i-1] + CurCount[i-1];
294  }
295  else
296  {
297  // This is a special case to correctly handle negative integers. They're sorted in the right order but at the wrong place.
298 
299  // Create biased offsets, in order for negative numbers to be sorted as well
300 // mOffset[0] = NbNegativeValues; // First positive number takes place after the negative ones
301  mLink[0] = &mRanks2[NbNegativeValues]; // First positive number takes place after the negative ones
302 // for(udword i=1;i<128;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1]; // 1 to 128 for positive numbers
303  for(udword i=1;i<128;i++) mLink[i] = mLink[i-1] + CurCount[i-1]; // 1 to 128 for positive numbers
304 
305  // Fixing the wrong place for negative values
306 // mOffset[128] = 0;
307  mLink[128] = mRanks2;
308 // for(i=129;i<256;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1];
309  for(udword i=129;i<256;i++) mLink[i] = mLink[i-1] + CurCount[i-1];
310  }
311 
312  // Perform Radix Sort
313  ubyte* InputBytes = (ubyte*)input;
314  InputBytes += j;
315  if(INVALID_RANKS)
316  {
317 // for(udword i=0;i<nb;i++) mRanks2[mOffset[InputBytes[i<<2]]++] = i;
318  for(udword i=0;i<nb;i++) *mLink[InputBytes[i<<2]]++ = i;
320  }
321  else
322  {
323  udword* Indices = mRanks;
324  udword* IndicesEnd = &mRanks[nb];
325  while(Indices!=IndicesEnd)
326  {
327  udword id = *Indices++;
328 // mRanks2[mOffset[InputBytes[id<<2]]++] = id;
329  *mLink[InputBytes[id<<2]]++ = id;
330  }
331  }
332 
333  // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
334  udword* Tmp = mRanks; mRanks = mRanks2; mRanks2 = Tmp;
335  }
336  }
337  return *this;
338 }
339 
341 
349 RadixSort& RadixSort::Sort(const float* input2, udword nb)
351 {
352  // Checkings
353  if(!input2 || !nb || nb&0x80000000) return *this;
354 
355  // Stats
356  mTotalCalls++;
357 
358  udword* input = (udword*)input2;
359 
360  // Resize lists if needed
361  CheckResize(nb);
362 
363 #ifdef RADIX_LOCAL_RAM
364  // Allocate histograms & offsets on the stack
365  udword mHistogram[256*4];
366 // udword mOffset[256];
367  udword* mLink[256];
368 #endif
369 
370  // Create histograms (counters). Counters for all passes are created in one run.
371  // Pros: read input buffer once instead of four times
372  // Cons: mHistogram is 4Kb instead of 1Kb
373  // Floating-point values are always supposed to be signed values, so there's only one code path there.
374  // Please note the floating point comparison needed for temporal coherence! Although the resulting asm code
375  // is dreadful, this is surprisingly not such a performance hit - well, I suppose that's a big one on first
376  // generation Pentiums....We can't make comparison on integer representations because, as Chris said, it just
377  // wouldn't work with mixed positive/negative values....
378  { CREATE_HISTOGRAMS(float, input2); }
379 
380  // Compute #negative values involved if needed
381  udword NbNegativeValues = 0;
382  // An efficient way to compute the number of negatives values we'll have to deal with is simply to sum the 128
383  // last values of the last histogram. Last histogram because that's the one for the Most Significant Byte,
384  // responsible for the sign. 128 last values because the 128 first ones are related to positive numbers.
385  udword* h3= &mHistogram[768];
386  for(udword i=128;i<256;i++) NbNegativeValues += h3[i]; // 768 for last histogram, 128 for negative part
387 
388  // Radix sort, j is the pass number (0=LSB, 3=MSB)
389  for(udword j=0;j<4;j++)
390  {
391  // Should we care about negative values?
392  if(j!=3)
393  {
394  // Here we deal with positive values only
396 
397  if(PerformPass)
398  {
399  // Create offsets
400 // mOffset[0] = 0;
401  mLink[0] = mRanks2;
402 // for(udword i=1;i<256;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1];
403  for(udword i=1;i<256;i++) mLink[i] = mLink[i-1] + CurCount[i-1];
404 
405  // Perform Radix Sort
406  ubyte* InputBytes = (ubyte*)input;
407  InputBytes += j;
408  if(INVALID_RANKS)
409  {
410 // for(i=0;i<nb;i++) mRanks2[mOffset[InputBytes[i<<2]]++] = i;
411  for(udword i=0;i<nb;i++) *mLink[InputBytes[i<<2]]++ = i;
413  }
414  else
415  {
416  udword* Indices = mRanks;
417  udword* IndicesEnd = &mRanks[nb];
418  while(Indices!=IndicesEnd)
419  {
420  udword id = *Indices++;
421 // mRanks2[mOffset[InputBytes[id<<2]]++] = id;
422  *mLink[InputBytes[id<<2]]++ = id;
423  }
424  }
425 
426  // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
427  udword* Tmp = mRanks; mRanks = mRanks2; mRanks2 = Tmp;
428  }
429  }
430  else
431  {
432  // This is a special case to correctly handle negative values
434 
435  if(PerformPass)
436  {
437  // Create biased offsets, in order for negative numbers to be sorted as well
438 // mOffset[0] = NbNegativeValues; // First positive number takes place after the negative ones
439  mLink[0] = &mRanks2[NbNegativeValues]; // First positive number takes place after the negative ones
440 // for(udword i=1;i<128;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1]; // 1 to 128 for positive numbers
441  for(udword i=1;i<128;i++) mLink[i] = mLink[i-1] + CurCount[i-1]; // 1 to 128 for positive numbers
442 
443  // We must reverse the sorting order for negative numbers!
444 // mOffset[255] = 0;
445  mLink[255] = mRanks2;
446 // for(i=0;i<127;i++) mOffset[254-i] = mOffset[255-i] + CurCount[255-i]; // Fixing the wrong order for negative values
447  for(udword i=0;i<127;i++) mLink[254-i] = mLink[255-i] + CurCount[255-i]; // Fixing the wrong order for negative values
448 // for(i=128;i<256;i++) mOffset[i] += CurCount[i]; // Fixing the wrong place for negative values
449  for(udword i=128;i<256;i++) mLink[i] += CurCount[i]; // Fixing the wrong place for negative values
450 
451  // Perform Radix Sort
452  if(INVALID_RANKS)
453  {
454  for(udword i=0;i<nb;i++)
455  {
456  udword Radix = input[i]>>24; // Radix byte, same as above. AND is useless here (udword).
457  // ### cmp to be killed. Not good. Later.
458 // if(Radix<128) mRanks2[mOffset[Radix]++] = i; // Number is positive, same as above
459 // else mRanks2[--mOffset[Radix]] = i; // Number is negative, flip the sorting order
460  if(Radix<128) *mLink[Radix]++ = i; // Number is positive, same as above
461  else *(--mLink[Radix]) = i; // Number is negative, flip the sorting order
462  }
464  }
465  else
466  {
467  for(udword i=0;i<nb;i++)
468  {
469  udword Radix = input[mRanks[i]]>>24; // Radix byte, same as above. AND is useless here (udword).
470  // ### cmp to be killed. Not good. Later.
471 // if(Radix<128) mRanks2[mOffset[Radix]++] = mRanks[i]; // Number is positive, same as above
472 // else mRanks2[--mOffset[Radix]] = mRanks[i]; // Number is negative, flip the sorting order
473  if(Radix<128) *mLink[Radix]++ = mRanks[i]; // Number is positive, same as above
474  else *(--mLink[Radix]) = mRanks[i]; // Number is negative, flip the sorting order
475  }
476  }
477  // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
478  udword* Tmp = mRanks; mRanks = mRanks2; mRanks2 = Tmp;
479  }
480  else
481  {
482  // The pass is useless, yet we still have to reverse the order of current list if all values are negative.
483  if(UniqueVal>=128)
484  {
485  if(INVALID_RANKS)
486  {
487  // ###Possible?
488  for(udword i=0;i<nb;i++) mRanks2[i] = nb-i-1;
490  }
491  else
492  {
493  for(udword i=0;i<nb;i++) mRanks2[i] = mRanks[nb-i-1];
494  }
495 
496  // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
497  udword* Tmp = mRanks; mRanks = mRanks2; mRanks2 = Tmp;
498  }
499  }
500  }
501  }
502  return *this;
503 }
504 
506 
512 {
513  udword UsedRam = sizeof(RadixSort);
514 #ifndef RADIX_LOCAL_RAM
515  UsedRam += 256*4*sizeof(udword); // Histograms
516  UsedRam += 256*sizeof(udword); // Offsets
517 #endif
518  UsedRam += 2*CURRENT_SIZE*sizeof(udword); // 2 lists of indices
519  return UsedRam;
520 }
void CheckResize(udword nb)
udword GetUsedRam() const
#define INVALID_RANKS
signed int sdword
sizeof(sdword) must be 4
Definition: IceTypes.h:64
#define null
our own NULL pointer
Definition: IceTypes.h:57
udword mCurrentSize
Current size of the indices list.
Definition: OPC_IceHook.h:55
#define DELETEARRAY(x)
Deletes an array.
#define inline_
#define CREATE_HISTOGRAMS(type, buffer)
Input values are unsigned.
Definition: OPC_IceHook.h:22
png_uint_32 i
Definition: png.h:2735
udword * mRanks
Two lists, swapped each pass.
Definition: OPC_IceHook.h:56
unsigned int udword
sizeof(udword) must be 4
Definition: IceTypes.h:65
#define VALIDATE_RANKS
#define CHECK_PASS_VALIDITY(pass)
def j(str, encoding="cp932")
#define CHECKALLOC(x)
unsigned char ubyte
sizeof(ubyte) must be 1
Definition: IceTypes.h:61
RadixSort & Sort(const udword *input, udword nb, RadixHint hint=RADIX_SIGNED)
#define CURRENT_SIZE
bool Resize(udword nb)
udword mTotalCalls
Total number of calls to the sort routine.
Definition: OPC_IceHook.h:59
Input values are signed.
Definition: OPC_IceHook.h:21
#define INVALIDATE_RANKS


openhrp3
Author(s): AIST, General Robotix Inc., Nakamura Lab of Dept. of Mechano Informatics at University of Tokyo
autogenerated on Sat Apr 13 2019 02:14:22