xref: /freebsd/contrib/bzip2/blocksort.c (revision 0572ccaa4543b0abef8ef81e384c1d04de9f3da1)
1 
2 /*-------------------------------------------------------------*/
3 /*--- Block sorting machinery                               ---*/
4 /*---                                           blocksort.c ---*/
5 /*-------------------------------------------------------------*/
6 
7 /* ------------------------------------------------------------------
8    This file is part of bzip2/libbzip2, a program and library for
9    lossless, block-sorting data compression.
10 
11    bzip2/libbzip2 version 1.0.6 of 6 September 2010
12    Copyright (C) 1996-2010 Julian Seward <jseward@bzip.org>
13 
14    Please read the WARNING, DISCLAIMER and PATENTS sections in the
15    README file.
16 
17    This program is released under the terms of the license contained
18    in the file LICENSE.
19    ------------------------------------------------------------------ */
20 
21 
22 #include "bzlib_private.h"
23 
24 /*---------------------------------------------*/
25 /*--- Fallback O(N log(N)^2) sorting        ---*/
26 /*--- algorithm, for repetitive blocks      ---*/
27 /*---------------------------------------------*/
28 
29 /*---------------------------------------------*/
30 static
31 __inline__
32 void fallbackSimpleSort ( UInt32* fmap,
33                           UInt32* eclass,
34                           Int32   lo,
35                           Int32   hi )
36 {
37    Int32 i, j, tmp;
38    UInt32 ec_tmp;
39 
40    if (lo == hi) return;
41 
42    if (hi - lo > 3) {
43       for ( i = hi-4; i >= lo; i-- ) {
44          tmp = fmap[i];
45          ec_tmp = eclass[tmp];
46          for ( j = i+4; j <= hi && ec_tmp > eclass[fmap[j]]; j += 4 )
47             fmap[j-4] = fmap[j];
48          fmap[j-4] = tmp;
49       }
50    }
51 
52    for ( i = hi-1; i >= lo; i-- ) {
53       tmp = fmap[i];
54       ec_tmp = eclass[tmp];
55       for ( j = i+1; j <= hi && ec_tmp > eclass[fmap[j]]; j++ )
56          fmap[j-1] = fmap[j];
57       fmap[j-1] = tmp;
58    }
59 }
60 
61 
62 /*---------------------------------------------*/
63 #define fswap(zz1, zz2) \
64    { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; }
65 
66 #define fvswap(zzp1, zzp2, zzn)       \
67 {                                     \
68    Int32 yyp1 = (zzp1);               \
69    Int32 yyp2 = (zzp2);               \
70    Int32 yyn  = (zzn);                \
71    while (yyn > 0) {                  \
72       fswap(fmap[yyp1], fmap[yyp2]);  \
73       yyp1++; yyp2++; yyn--;          \
74    }                                  \
75 }
76 
77 
78 #define fmin(a,b) ((a) < (b)) ? (a) : (b)
79 
80 #define fpush(lz,hz) { stackLo[sp] = lz; \
81                        stackHi[sp] = hz; \
82                        sp++; }
83 
84 #define fpop(lz,hz) { sp--;              \
85                       lz = stackLo[sp];  \
86                       hz = stackHi[sp]; }
87 
88 #define FALLBACK_QSORT_SMALL_THRESH 10
89 #define FALLBACK_QSORT_STACK_SIZE   100
90 
91 
92 static
93 void fallbackQSort3 ( UInt32* fmap,
94                       UInt32* eclass,
95                       Int32   loSt,
96                       Int32   hiSt )
97 {
98    Int32 unLo, unHi, ltLo, gtHi, n, m;
99    Int32 sp, lo, hi;
100    UInt32 med, r, r3;
101    Int32 stackLo[FALLBACK_QSORT_STACK_SIZE];
102    Int32 stackHi[FALLBACK_QSORT_STACK_SIZE];
103 
104    r = 0;
105 
106    sp = 0;
107    fpush ( loSt, hiSt );
108 
109    while (sp > 0) {
110 
111       AssertH ( sp < FALLBACK_QSORT_STACK_SIZE - 1, 1004 );
112 
113       fpop ( lo, hi );
114       if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) {
115          fallbackSimpleSort ( fmap, eclass, lo, hi );
116          continue;
117       }
118 
119       /* Random partitioning.  Median of 3 sometimes fails to
120          avoid bad cases.  Median of 9 seems to help but
121          looks rather expensive.  This too seems to work but
122          is cheaper.  Guidance for the magic constants
123          7621 and 32768 is taken from Sedgewick's algorithms
124          book, chapter 35.
125       */
126       r = ((r * 7621) + 1) % 32768;
127       r3 = r % 3;
128       if (r3 == 0) med = eclass[fmap[lo]]; else
129       if (r3 == 1) med = eclass[fmap[(lo+hi)>>1]]; else
130                    med = eclass[fmap[hi]];
131 
132       unLo = ltLo = lo;
133       unHi = gtHi = hi;
134 
135       while (1) {
136          while (1) {
137             if (unLo > unHi) break;
138             n = (Int32)eclass[fmap[unLo]] - (Int32)med;
139             if (n == 0) {
140                fswap(fmap[unLo], fmap[ltLo]);
141                ltLo++; unLo++;
142                continue;
143             };
144             if (n > 0) break;
145             unLo++;
146          }
147          while (1) {
148             if (unLo > unHi) break;
149             n = (Int32)eclass[fmap[unHi]] - (Int32)med;
150             if (n == 0) {
151                fswap(fmap[unHi], fmap[gtHi]);
152                gtHi--; unHi--;
153                continue;
154             };
155             if (n < 0) break;
156             unHi--;
157          }
158          if (unLo > unHi) break;
159          fswap(fmap[unLo], fmap[unHi]); unLo++; unHi--;
160       }
161 
162       AssertD ( unHi == unLo-1, "fallbackQSort3(2)" );
163 
164       if (gtHi < ltLo) continue;
165 
166       n = fmin(ltLo-lo, unLo-ltLo); fvswap(lo, unLo-n, n);
167       m = fmin(hi-gtHi, gtHi-unHi); fvswap(unLo, hi-m+1, m);
168 
169       n = lo + unLo - ltLo - 1;
170       m = hi - (gtHi - unHi) + 1;
171 
172       if (n - lo > hi - m) {
173          fpush ( lo, n );
174          fpush ( m, hi );
175       } else {
176          fpush ( m, hi );
177          fpush ( lo, n );
178       }
179    }
180 }
181 
182 #undef fmin
183 #undef fpush
184 #undef fpop
185 #undef fswap
186 #undef fvswap
187 #undef FALLBACK_QSORT_SMALL_THRESH
188 #undef FALLBACK_QSORT_STACK_SIZE
189 
190 
191 /*---------------------------------------------*/
192 /* Pre:
193       nblock > 0
194       eclass exists for [0 .. nblock-1]
195       ((UChar*)eclass) [0 .. nblock-1] holds block
196       ptr exists for [0 .. nblock-1]
197 
198    Post:
199       ((UChar*)eclass) [0 .. nblock-1] holds block
200       All other areas of eclass destroyed
201       fmap [0 .. nblock-1] holds sorted order
202       bhtab [ 0 .. 2+(nblock/32) ] destroyed
203 */
204 
205 #define       SET_BH(zz)  bhtab[(zz) >> 5] |= (1 << ((zz) & 31))
206 #define     CLEAR_BH(zz)  bhtab[(zz) >> 5] &= ~(1 << ((zz) & 31))
207 #define     ISSET_BH(zz)  (bhtab[(zz) >> 5] & (1 << ((zz) & 31)))
208 #define      WORD_BH(zz)  bhtab[(zz) >> 5]
209 #define UNALIGNED_BH(zz)  ((zz) & 0x01f)
210 
211 static
212 void fallbackSort ( UInt32* fmap,
213                     UInt32* eclass,
214                     UInt32* bhtab,
215                     Int32   nblock,
216                     Int32   verb )
217 {
218    Int32 ftab[257];
219    Int32 ftabCopy[256];
220    Int32 H, i, j, k, l, r, cc, cc1;
221    Int32 nNotDone;
222    Int32 nBhtab;
223    UChar* eclass8 = (UChar*)eclass;
224 
225    /*--
226       Initial 1-char radix sort to generate
227       initial fmap and initial BH bits.
228    --*/
229    if (verb >= 4)
230       VPrintf0 ( "        bucket sorting ...\n" );
231    for (i = 0; i < 257;    i++) ftab[i] = 0;
232    for (i = 0; i < nblock; i++) ftab[eclass8[i]]++;
233    for (i = 0; i < 256;    i++) ftabCopy[i] = ftab[i];
234    for (i = 1; i < 257;    i++) ftab[i] += ftab[i-1];
235 
236    for (i = 0; i < nblock; i++) {
237       j = eclass8[i];
238       k = ftab[j] - 1;
239       ftab[j] = k;
240       fmap[k] = i;
241    }
242 
243    nBhtab = 2 + (nblock / 32);
244    for (i = 0; i < nBhtab; i++) bhtab[i] = 0;
245    for (i = 0; i < 256; i++) SET_BH(ftab[i]);
246 
247    /*--
248       Inductively refine the buckets.  Kind-of an
249       "exponential radix sort" (!), inspired by the
250       Manber-Myers suffix array construction algorithm.
251    --*/
252 
253    /*-- set sentinel bits for block-end detection --*/
254    for (i = 0; i < 32; i++) {
255       SET_BH(nblock + 2*i);
256       CLEAR_BH(nblock + 2*i + 1);
257    }
258 
259    /*-- the log(N) loop --*/
260    H = 1;
261    while (1) {
262 
263       if (verb >= 4)
264          VPrintf1 ( "        depth %6d has ", H );
265 
266       j = 0;
267       for (i = 0; i < nblock; i++) {
268          if (ISSET_BH(i)) j = i;
269          k = fmap[i] - H; if (k < 0) k += nblock;
270          eclass[k] = j;
271       }
272 
273       nNotDone = 0;
274       r = -1;
275       while (1) {
276 
277 	 /*-- find the next non-singleton bucket --*/
278          k = r + 1;
279          while (ISSET_BH(k) && UNALIGNED_BH(k)) k++;
280          if (ISSET_BH(k)) {
281             while (WORD_BH(k) == 0xffffffff) k += 32;
282             while (ISSET_BH(k)) k++;
283          }
284          l = k - 1;
285          if (l >= nblock) break;
286          while (!ISSET_BH(k) && UNALIGNED_BH(k)) k++;
287          if (!ISSET_BH(k)) {
288             while (WORD_BH(k) == 0x00000000) k += 32;
289             while (!ISSET_BH(k)) k++;
290          }
291          r = k - 1;
292          if (r >= nblock) break;
293 
294          /*-- now [l, r] bracket current bucket --*/
295          if (r > l) {
296             nNotDone += (r - l + 1);
297             fallbackQSort3 ( fmap, eclass, l, r );
298 
299             /*-- scan bucket and generate header bits-- */
300             cc = -1;
301             for (i = l; i <= r; i++) {
302                cc1 = eclass[fmap[i]];
303                if (cc != cc1) { SET_BH(i); cc = cc1; };
304             }
305          }
306       }
307 
308       if (verb >= 4)
309          VPrintf1 ( "%6d unresolved strings\n", nNotDone );
310 
311       H *= 2;
312       if (H > nblock || nNotDone == 0) break;
313    }
314 
315    /*--
316       Reconstruct the original block in
317       eclass8 [0 .. nblock-1], since the
318       previous phase destroyed it.
319    --*/
320    if (verb >= 4)
321       VPrintf0 ( "        reconstructing block ...\n" );
322    j = 0;
323    for (i = 0; i < nblock; i++) {
324       while (ftabCopy[j] == 0) j++;
325       ftabCopy[j]--;
326       eclass8[fmap[i]] = (UChar)j;
327    }
328    AssertH ( j < 256, 1005 );
329 }
330 
331 #undef       SET_BH
332 #undef     CLEAR_BH
333 #undef     ISSET_BH
334 #undef      WORD_BH
335 #undef UNALIGNED_BH
336 
337 
338 /*---------------------------------------------*/
339 /*--- The main, O(N^2 log(N)) sorting       ---*/
340 /*--- algorithm.  Faster for "normal"       ---*/
341 /*--- non-repetitive blocks.                ---*/
342 /*---------------------------------------------*/
343 
344 /*---------------------------------------------*/
345 static
346 __inline__
347 Bool mainGtU ( UInt32  i1,
348                UInt32  i2,
349                UChar*  block,
350                UInt16* quadrant,
351                UInt32  nblock,
352                Int32*  budget )
353 {
354    Int32  k;
355    UChar  c1, c2;
356    UInt16 s1, s2;
357 
358    AssertD ( i1 != i2, "mainGtU" );
359    /* 1 */
360    c1 = block[i1]; c2 = block[i2];
361    if (c1 != c2) return (c1 > c2);
362    i1++; i2++;
363    /* 2 */
364    c1 = block[i1]; c2 = block[i2];
365    if (c1 != c2) return (c1 > c2);
366    i1++; i2++;
367    /* 3 */
368    c1 = block[i1]; c2 = block[i2];
369    if (c1 != c2) return (c1 > c2);
370    i1++; i2++;
371    /* 4 */
372    c1 = block[i1]; c2 = block[i2];
373    if (c1 != c2) return (c1 > c2);
374    i1++; i2++;
375    /* 5 */
376    c1 = block[i1]; c2 = block[i2];
377    if (c1 != c2) return (c1 > c2);
378    i1++; i2++;
379    /* 6 */
380    c1 = block[i1]; c2 = block[i2];
381    if (c1 != c2) return (c1 > c2);
382    i1++; i2++;
383    /* 7 */
384    c1 = block[i1]; c2 = block[i2];
385    if (c1 != c2) return (c1 > c2);
386    i1++; i2++;
387    /* 8 */
388    c1 = block[i1]; c2 = block[i2];
389    if (c1 != c2) return (c1 > c2);
390    i1++; i2++;
391    /* 9 */
392    c1 = block[i1]; c2 = block[i2];
393    if (c1 != c2) return (c1 > c2);
394    i1++; i2++;
395    /* 10 */
396    c1 = block[i1]; c2 = block[i2];
397    if (c1 != c2) return (c1 > c2);
398    i1++; i2++;
399    /* 11 */
400    c1 = block[i1]; c2 = block[i2];
401    if (c1 != c2) return (c1 > c2);
402    i1++; i2++;
403    /* 12 */
404    c1 = block[i1]; c2 = block[i2];
405    if (c1 != c2) return (c1 > c2);
406    i1++; i2++;
407 
408    k = nblock + 8;
409 
410    do {
411       /* 1 */
412       c1 = block[i1]; c2 = block[i2];
413       if (c1 != c2) return (c1 > c2);
414       s1 = quadrant[i1]; s2 = quadrant[i2];
415       if (s1 != s2) return (s1 > s2);
416       i1++; i2++;
417       /* 2 */
418       c1 = block[i1]; c2 = block[i2];
419       if (c1 != c2) return (c1 > c2);
420       s1 = quadrant[i1]; s2 = quadrant[i2];
421       if (s1 != s2) return (s1 > s2);
422       i1++; i2++;
423       /* 3 */
424       c1 = block[i1]; c2 = block[i2];
425       if (c1 != c2) return (c1 > c2);
426       s1 = quadrant[i1]; s2 = quadrant[i2];
427       if (s1 != s2) return (s1 > s2);
428       i1++; i2++;
429       /* 4 */
430       c1 = block[i1]; c2 = block[i2];
431       if (c1 != c2) return (c1 > c2);
432       s1 = quadrant[i1]; s2 = quadrant[i2];
433       if (s1 != s2) return (s1 > s2);
434       i1++; i2++;
435       /* 5 */
436       c1 = block[i1]; c2 = block[i2];
437       if (c1 != c2) return (c1 > c2);
438       s1 = quadrant[i1]; s2 = quadrant[i2];
439       if (s1 != s2) return (s1 > s2);
440       i1++; i2++;
441       /* 6 */
442       c1 = block[i1]; c2 = block[i2];
443       if (c1 != c2) return (c1 > c2);
444       s1 = quadrant[i1]; s2 = quadrant[i2];
445       if (s1 != s2) return (s1 > s2);
446       i1++; i2++;
447       /* 7 */
448       c1 = block[i1]; c2 = block[i2];
449       if (c1 != c2) return (c1 > c2);
450       s1 = quadrant[i1]; s2 = quadrant[i2];
451       if (s1 != s2) return (s1 > s2);
452       i1++; i2++;
453       /* 8 */
454       c1 = block[i1]; c2 = block[i2];
455       if (c1 != c2) return (c1 > c2);
456       s1 = quadrant[i1]; s2 = quadrant[i2];
457       if (s1 != s2) return (s1 > s2);
458       i1++; i2++;
459 
460       if (i1 >= nblock) i1 -= nblock;
461       if (i2 >= nblock) i2 -= nblock;
462 
463       k -= 8;
464       (*budget)--;
465    }
466       while (k >= 0);
467 
468    return False;
469 }
470 
471 
472 /*---------------------------------------------*/
473 /*--
474    Knuth's increments seem to work better
475    than Incerpi-Sedgewick here.  Possibly
476    because the number of elems to sort is
477    usually small, typically <= 20.
478 --*/
479 static
480 Int32 incs[14] = { 1, 4, 13, 40, 121, 364, 1093, 3280,
481                    9841, 29524, 88573, 265720,
482                    797161, 2391484 };
483 
484 static
485 void mainSimpleSort ( UInt32* ptr,
486                       UChar*  block,
487                       UInt16* quadrant,
488                       Int32   nblock,
489                       Int32   lo,
490                       Int32   hi,
491                       Int32   d,
492                       Int32*  budget )
493 {
494    Int32 i, j, h, bigN, hp;
495    UInt32 v;
496 
497    bigN = hi - lo + 1;
498    if (bigN < 2) return;
499 
500    hp = 0;
501    while (incs[hp] < bigN) hp++;
502    hp--;
503 
504    for (; hp >= 0; hp--) {
505       h = incs[hp];
506 
507       i = lo + h;
508       while (True) {
509 
510          /*-- copy 1 --*/
511          if (i > hi) break;
512          v = ptr[i];
513          j = i;
514          while ( mainGtU (
515                     ptr[j-h]+d, v+d, block, quadrant, nblock, budget
516                  ) ) {
517             ptr[j] = ptr[j-h];
518             j = j - h;
519             if (j <= (lo + h - 1)) break;
520          }
521          ptr[j] = v;
522          i++;
523 
524          /*-- copy 2 --*/
525          if (i > hi) break;
526          v = ptr[i];
527          j = i;
528          while ( mainGtU (
529                     ptr[j-h]+d, v+d, block, quadrant, nblock, budget
530                  ) ) {
531             ptr[j] = ptr[j-h];
532             j = j - h;
533             if (j <= (lo + h - 1)) break;
534          }
535          ptr[j] = v;
536          i++;
537 
538          /*-- copy 3 --*/
539          if (i > hi) break;
540          v = ptr[i];
541          j = i;
542          while ( mainGtU (
543                     ptr[j-h]+d, v+d, block, quadrant, nblock, budget
544                  ) ) {
545             ptr[j] = ptr[j-h];
546             j = j - h;
547             if (j <= (lo + h - 1)) break;
548          }
549          ptr[j] = v;
550          i++;
551 
552          if (*budget < 0) return;
553       }
554    }
555 }
556 
557 
558 /*---------------------------------------------*/
559 /*--
560    The following is an implementation of
561    an elegant 3-way quicksort for strings,
562    described in a paper "Fast Algorithms for
563    Sorting and Searching Strings", by Robert
564    Sedgewick and Jon L. Bentley.
565 --*/
566 
567 #define mswap(zz1, zz2) \
568    { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; }
569 
570 #define mvswap(zzp1, zzp2, zzn)       \
571 {                                     \
572    Int32 yyp1 = (zzp1);               \
573    Int32 yyp2 = (zzp2);               \
574    Int32 yyn  = (zzn);                \
575    while (yyn > 0) {                  \
576       mswap(ptr[yyp1], ptr[yyp2]);    \
577       yyp1++; yyp2++; yyn--;          \
578    }                                  \
579 }
580 
581 static
582 __inline__
583 UChar mmed3 ( UChar a, UChar b, UChar c )
584 {
585    UChar t;
586    if (a > b) { t = a; a = b; b = t; };
587    if (b > c) {
588       b = c;
589       if (a > b) b = a;
590    }
591    return b;
592 }
593 
594 #define mmin(a,b) ((a) < (b)) ? (a) : (b)
595 
596 #define mpush(lz,hz,dz) { stackLo[sp] = lz; \
597                           stackHi[sp] = hz; \
598                           stackD [sp] = dz; \
599                           sp++; }
600 
601 #define mpop(lz,hz,dz) { sp--;             \
602                          lz = stackLo[sp]; \
603                          hz = stackHi[sp]; \
604                          dz = stackD [sp]; }
605 
606 
607 #define mnextsize(az) (nextHi[az]-nextLo[az])
608 
609 #define mnextswap(az,bz)                                        \
610    { Int32 tz;                                                  \
611      tz = nextLo[az]; nextLo[az] = nextLo[bz]; nextLo[bz] = tz; \
612      tz = nextHi[az]; nextHi[az] = nextHi[bz]; nextHi[bz] = tz; \
613      tz = nextD [az]; nextD [az] = nextD [bz]; nextD [bz] = tz; }
614 
615 
616 #define MAIN_QSORT_SMALL_THRESH 20
617 #define MAIN_QSORT_DEPTH_THRESH (BZ_N_RADIX + BZ_N_QSORT)
618 #define MAIN_QSORT_STACK_SIZE 100
619 
620 static
621 void mainQSort3 ( UInt32* ptr,
622                   UChar*  block,
623                   UInt16* quadrant,
624                   Int32   nblock,
625                   Int32   loSt,
626                   Int32   hiSt,
627                   Int32   dSt,
628                   Int32*  budget )
629 {
630    Int32 unLo, unHi, ltLo, gtHi, n, m, med;
631    Int32 sp, lo, hi, d;
632 
633    Int32 stackLo[MAIN_QSORT_STACK_SIZE];
634    Int32 stackHi[MAIN_QSORT_STACK_SIZE];
635    Int32 stackD [MAIN_QSORT_STACK_SIZE];
636 
637    Int32 nextLo[3];
638    Int32 nextHi[3];
639    Int32 nextD [3];
640 
641    sp = 0;
642    mpush ( loSt, hiSt, dSt );
643 
644    while (sp > 0) {
645 
646       AssertH ( sp < MAIN_QSORT_STACK_SIZE - 2, 1001 );
647 
648       mpop ( lo, hi, d );
649       if (hi - lo < MAIN_QSORT_SMALL_THRESH ||
650           d > MAIN_QSORT_DEPTH_THRESH) {
651          mainSimpleSort ( ptr, block, quadrant, nblock, lo, hi, d, budget );
652          if (*budget < 0) return;
653          continue;
654       }
655 
656       med = (Int32)
657             mmed3 ( block[ptr[ lo         ]+d],
658                     block[ptr[ hi         ]+d],
659                     block[ptr[ (lo+hi)>>1 ]+d] );
660 
661       unLo = ltLo = lo;
662       unHi = gtHi = hi;
663 
664       while (True) {
665          while (True) {
666             if (unLo > unHi) break;
667             n = ((Int32)block[ptr[unLo]+d]) - med;
668             if (n == 0) {
669                mswap(ptr[unLo], ptr[ltLo]);
670                ltLo++; unLo++; continue;
671             };
672             if (n >  0) break;
673             unLo++;
674          }
675          while (True) {
676             if (unLo > unHi) break;
677             n = ((Int32)block[ptr[unHi]+d]) - med;
678             if (n == 0) {
679                mswap(ptr[unHi], ptr[gtHi]);
680                gtHi--; unHi--; continue;
681             };
682             if (n <  0) break;
683             unHi--;
684          }
685          if (unLo > unHi) break;
686          mswap(ptr[unLo], ptr[unHi]); unLo++; unHi--;
687       }
688 
689       AssertD ( unHi == unLo-1, "mainQSort3(2)" );
690 
691       if (gtHi < ltLo) {
692          mpush(lo, hi, d+1 );
693          continue;
694       }
695 
696       n = mmin(ltLo-lo, unLo-ltLo); mvswap(lo, unLo-n, n);
697       m = mmin(hi-gtHi, gtHi-unHi); mvswap(unLo, hi-m+1, m);
698 
699       n = lo + unLo - ltLo - 1;
700       m = hi - (gtHi - unHi) + 1;
701 
702       nextLo[0] = lo;  nextHi[0] = n;   nextD[0] = d;
703       nextLo[1] = m;   nextHi[1] = hi;  nextD[1] = d;
704       nextLo[2] = n+1; nextHi[2] = m-1; nextD[2] = d+1;
705 
706       if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
707       if (mnextsize(1) < mnextsize(2)) mnextswap(1,2);
708       if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
709 
710       AssertD (mnextsize(0) >= mnextsize(1), "mainQSort3(8)" );
711       AssertD (mnextsize(1) >= mnextsize(2), "mainQSort3(9)" );
712 
713       mpush (nextLo[0], nextHi[0], nextD[0]);
714       mpush (nextLo[1], nextHi[1], nextD[1]);
715       mpush (nextLo[2], nextHi[2], nextD[2]);
716    }
717 }
718 
719 #undef mswap
720 #undef mvswap
721 #undef mpush
722 #undef mpop
723 #undef mmin
724 #undef mnextsize
725 #undef mnextswap
726 #undef MAIN_QSORT_SMALL_THRESH
727 #undef MAIN_QSORT_DEPTH_THRESH
728 #undef MAIN_QSORT_STACK_SIZE
729 
730 
731 /*---------------------------------------------*/
732 /* Pre:
733       nblock > N_OVERSHOOT
734       block32 exists for [0 .. nblock-1 +N_OVERSHOOT]
735       ((UChar*)block32) [0 .. nblock-1] holds block
736       ptr exists for [0 .. nblock-1]
737 
738    Post:
739       ((UChar*)block32) [0 .. nblock-1] holds block
740       All other areas of block32 destroyed
741       ftab [0 .. 65536 ] destroyed
742       ptr [0 .. nblock-1] holds sorted order
743       if (*budget < 0), sorting was abandoned
744 */
745 
746 #define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8])
747 #define SETMASK (1 << 21)
748 #define CLEARMASK (~(SETMASK))
749 
750 static
751 void mainSort ( UInt32* ptr,
752                 UChar*  block,
753                 UInt16* quadrant,
754                 UInt32* ftab,
755                 Int32   nblock,
756                 Int32   verb,
757                 Int32*  budget )
758 {
759    Int32  i, j, k, ss, sb;
760    Int32  runningOrder[256];
761    Bool   bigDone[256];
762    Int32  copyStart[256];
763    Int32  copyEnd  [256];
764    UChar  c1;
765    Int32  numQSorted;
766    UInt16 s;
767    if (verb >= 4) VPrintf0 ( "        main sort initialise ...\n" );
768 
769    /*-- set up the 2-byte frequency table --*/
770    for (i = 65536; i >= 0; i--) ftab[i] = 0;
771 
772    j = block[0] << 8;
773    i = nblock-1;
774    for (; i >= 3; i -= 4) {
775       quadrant[i] = 0;
776       j = (j >> 8) | ( ((UInt16)block[i]) << 8);
777       ftab[j]++;
778       quadrant[i-1] = 0;
779       j = (j >> 8) | ( ((UInt16)block[i-1]) << 8);
780       ftab[j]++;
781       quadrant[i-2] = 0;
782       j = (j >> 8) | ( ((UInt16)block[i-2]) << 8);
783       ftab[j]++;
784       quadrant[i-3] = 0;
785       j = (j >> 8) | ( ((UInt16)block[i-3]) << 8);
786       ftab[j]++;
787    }
788    for (; i >= 0; i--) {
789       quadrant[i] = 0;
790       j = (j >> 8) | ( ((UInt16)block[i]) << 8);
791       ftab[j]++;
792    }
793 
794    /*-- (emphasises close relationship of block & quadrant) --*/
795    for (i = 0; i < BZ_N_OVERSHOOT; i++) {
796       block   [nblock+i] = block[i];
797       quadrant[nblock+i] = 0;
798    }
799 
800    if (verb >= 4) VPrintf0 ( "        bucket sorting ...\n" );
801 
802    /*-- Complete the initial radix sort --*/
803    for (i = 1; i <= 65536; i++) ftab[i] += ftab[i-1];
804 
805    s = block[0] << 8;
806    i = nblock-1;
807    for (; i >= 3; i -= 4) {
808       s = (s >> 8) | (block[i] << 8);
809       j = ftab[s] -1;
810       ftab[s] = j;
811       ptr[j] = i;
812       s = (s >> 8) | (block[i-1] << 8);
813       j = ftab[s] -1;
814       ftab[s] = j;
815       ptr[j] = i-1;
816       s = (s >> 8) | (block[i-2] << 8);
817       j = ftab[s] -1;
818       ftab[s] = j;
819       ptr[j] = i-2;
820       s = (s >> 8) | (block[i-3] << 8);
821       j = ftab[s] -1;
822       ftab[s] = j;
823       ptr[j] = i-3;
824    }
825    for (; i >= 0; i--) {
826       s = (s >> 8) | (block[i] << 8);
827       j = ftab[s] -1;
828       ftab[s] = j;
829       ptr[j] = i;
830    }
831 
832    /*--
833       Now ftab contains the first loc of every small bucket.
834       Calculate the running order, from smallest to largest
835       big bucket.
836    --*/
837    for (i = 0; i <= 255; i++) {
838       bigDone     [i] = False;
839       runningOrder[i] = i;
840    }
841 
842    {
843       Int32 vv;
844       Int32 h = 1;
845       do h = 3 * h + 1; while (h <= 256);
846       do {
847          h = h / 3;
848          for (i = h; i <= 255; i++) {
849             vv = runningOrder[i];
850             j = i;
851             while ( BIGFREQ(runningOrder[j-h]) > BIGFREQ(vv) ) {
852                runningOrder[j] = runningOrder[j-h];
853                j = j - h;
854                if (j <= (h - 1)) goto zero;
855             }
856             zero:
857             runningOrder[j] = vv;
858          }
859       } while (h != 1);
860    }
861 
862    /*--
863       The main sorting loop.
864    --*/
865 
866    numQSorted = 0;
867 
868    for (i = 0; i <= 255; i++) {
869 
870       /*--
871          Process big buckets, starting with the least full.
872          Basically this is a 3-step process in which we call
873          mainQSort3 to sort the small buckets [ss, j], but
874          also make a big effort to avoid the calls if we can.
875       --*/
876       ss = runningOrder[i];
877 
878       /*--
879          Step 1:
880          Complete the big bucket [ss] by quicksorting
881          any unsorted small buckets [ss, j], for j != ss.
882          Hopefully previous pointer-scanning phases have already
883          completed many of the small buckets [ss, j], so
884          we don't have to sort them at all.
885       --*/
886       for (j = 0; j <= 255; j++) {
887          if (j != ss) {
888             sb = (ss << 8) + j;
889             if ( ! (ftab[sb] & SETMASK) ) {
890                Int32 lo = ftab[sb]   & CLEARMASK;
891                Int32 hi = (ftab[sb+1] & CLEARMASK) - 1;
892                if (hi > lo) {
893                   if (verb >= 4)
894                      VPrintf4 ( "        qsort [0x%x, 0x%x]   "
895                                 "done %d   this %d\n",
896                                 ss, j, numQSorted, hi - lo + 1 );
897                   mainQSort3 (
898                      ptr, block, quadrant, nblock,
899                      lo, hi, BZ_N_RADIX, budget
900                   );
901                   numQSorted += (hi - lo + 1);
902                   if (*budget < 0) return;
903                }
904             }
905             ftab[sb] |= SETMASK;
906          }
907       }
908 
909       AssertH ( !bigDone[ss], 1006 );
910 
911       /*--
912          Step 2:
913          Now scan this big bucket [ss] so as to synthesise the
914          sorted order for small buckets [t, ss] for all t,
915          including, magically, the bucket [ss,ss] too.
916          This will avoid doing Real Work in subsequent Step 1's.
917       --*/
918       {
919          for (j = 0; j <= 255; j++) {
920             copyStart[j] =  ftab[(j << 8) + ss]     & CLEARMASK;
921             copyEnd  [j] = (ftab[(j << 8) + ss + 1] & CLEARMASK) - 1;
922          }
923          for (j = ftab[ss << 8] & CLEARMASK; j < copyStart[ss]; j++) {
924             k = ptr[j]-1; if (k < 0) k += nblock;
925             c1 = block[k];
926             if (!bigDone[c1])
927                ptr[ copyStart[c1]++ ] = k;
928          }
929          for (j = (ftab[(ss+1) << 8] & CLEARMASK) - 1; j > copyEnd[ss]; j--) {
930             k = ptr[j]-1; if (k < 0) k += nblock;
931             c1 = block[k];
932             if (!bigDone[c1])
933                ptr[ copyEnd[c1]-- ] = k;
934          }
935       }
936 
937       AssertH ( (copyStart[ss]-1 == copyEnd[ss])
938                 ||
939                 /* Extremely rare case missing in bzip2-1.0.0 and 1.0.1.
940                    Necessity for this case is demonstrated by compressing
941                    a sequence of approximately 48.5 million of character
942                    251; 1.0.0/1.0.1 will then die here. */
943                 (copyStart[ss] == 0 && copyEnd[ss] == nblock-1),
944                 1007 )
945 
946       for (j = 0; j <= 255; j++) ftab[(j << 8) + ss] |= SETMASK;
947 
948       /*--
949          Step 3:
950          The [ss] big bucket is now done.  Record this fact,
951          and update the quadrant descriptors.  Remember to
952          update quadrants in the overshoot area too, if
953          necessary.  The "if (i < 255)" test merely skips
954          this updating for the last bucket processed, since
955          updating for the last bucket is pointless.
956 
957          The quadrant array provides a way to incrementally
958          cache sort orderings, as they appear, so as to
959          make subsequent comparisons in fullGtU() complete
960          faster.  For repetitive blocks this makes a big
961          difference (but not big enough to be able to avoid
962          the fallback sorting mechanism, exponential radix sort).
963 
964          The precise meaning is: at all times:
965 
966             for 0 <= i < nblock and 0 <= j <= nblock
967 
968             if block[i] != block[j],
969 
970                then the relative values of quadrant[i] and
971                     quadrant[j] are meaningless.
972 
973                else {
974                   if quadrant[i] < quadrant[j]
975                      then the string starting at i lexicographically
976                      precedes the string starting at j
977 
978                   else if quadrant[i] > quadrant[j]
979                      then the string starting at j lexicographically
980                      precedes the string starting at i
981 
982                   else
983                      the relative ordering of the strings starting
984                      at i and j has not yet been determined.
985                }
986       --*/
987       bigDone[ss] = True;
988 
989       if (i < 255) {
990          Int32 bbStart  = ftab[ss << 8] & CLEARMASK;
991          Int32 bbSize   = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart;
992          Int32 shifts   = 0;
993 
994          while ((bbSize >> shifts) > 65534) shifts++;
995 
996          for (j = bbSize-1; j >= 0; j--) {
997             Int32 a2update     = ptr[bbStart + j];
998             UInt16 qVal        = (UInt16)(j >> shifts);
999             quadrant[a2update] = qVal;
1000             if (a2update < BZ_N_OVERSHOOT)
1001                quadrant[a2update + nblock] = qVal;
1002          }
1003          AssertH ( ((bbSize-1) >> shifts) <= 65535, 1002 );
1004       }
1005 
1006    }
1007 
1008    if (verb >= 4)
1009       VPrintf3 ( "        %d pointers, %d sorted, %d scanned\n",
1010                  nblock, numQSorted, nblock - numQSorted );
1011 }
1012 
1013 #undef BIGFREQ
1014 #undef SETMASK
1015 #undef CLEARMASK
1016 
1017 
1018 /*---------------------------------------------*/
1019 /* Pre:
1020       nblock > 0
1021       arr2 exists for [0 .. nblock-1 +N_OVERSHOOT]
1022       ((UChar*)arr2)  [0 .. nblock-1] holds block
1023       arr1 exists for [0 .. nblock-1]
1024 
1025    Post:
1026       ((UChar*)arr2) [0 .. nblock-1] holds block
1027       All other areas of block destroyed
1028       ftab [ 0 .. 65536 ] destroyed
1029       arr1 [0 .. nblock-1] holds sorted order
1030 */
1031 void BZ2_blockSort ( EState* s )
1032 {
1033    UInt32* ptr    = s->ptr;
1034    UChar*  block  = s->block;
1035    UInt32* ftab   = s->ftab;
1036    Int32   nblock = s->nblock;
1037    Int32   verb   = s->verbosity;
1038    Int32   wfact  = s->workFactor;
1039    UInt16* quadrant;
1040    Int32   budget;
1041    Int32   budgetInit;
1042    Int32   i;
1043 
1044    if (nblock < 10000) {
1045       fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb );
1046    } else {
1047       /* Calculate the location for quadrant, remembering to get
1048          the alignment right.  Assumes that &(block[0]) is at least
1049          2-byte aligned -- this should be ok since block is really
1050          the first section of arr2.
1051       */
1052       i = nblock+BZ_N_OVERSHOOT;
1053       if (i & 1) i++;
1054       quadrant = (UInt16*)(&(block[i]));
1055 
1056       /* (wfact-1) / 3 puts the default-factor-30
1057          transition point at very roughly the same place as
1058          with v0.1 and v0.9.0.
1059          Not that it particularly matters any more, since the
1060          resulting compressed stream is now the same regardless
1061          of whether or not we use the main sort or fallback sort.
1062       */
1063       if (wfact < 1  ) wfact = 1;
1064       if (wfact > 100) wfact = 100;
1065       budgetInit = nblock * ((wfact-1) / 3);
1066       budget = budgetInit;
1067 
1068       mainSort ( ptr, block, quadrant, ftab, nblock, verb, &budget );
1069       if (verb >= 3)
1070          VPrintf3 ( "      %d work, %d block, ratio %5.2f\n",
1071                     budgetInit - budget,
1072                     nblock,
1073                     (float)(budgetInit - budget) /
1074                     (float)(nblock==0 ? 1 : nblock) );
1075       if (budget < 0) {
1076          if (verb >= 2)
1077             VPrintf0 ( "    too repetitive; using fallback"
1078                        " sorting algorithm\n" );
1079          fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb );
1080       }
1081    }
1082 
1083    s->origPtr = -1;
1084    for (i = 0; i < s->nblock; i++)
1085       if (ptr[i] == 0)
1086          { s->origPtr = i; break; };
1087 
1088    AssertH( s->origPtr != -1, 1003 );
1089 }
1090 
1091 
1092 /*-------------------------------------------------------------*/
1093 /*--- end                                       blocksort.c ---*/
1094 /*-------------------------------------------------------------*/
1095