1*c08cbc64SXin LI /* 2*c08cbc64SXin LI * utils.c for libdivsufsort 3*c08cbc64SXin LI * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved. 4*c08cbc64SXin LI * 5*c08cbc64SXin LI * Permission is hereby granted, free of charge, to any person 6*c08cbc64SXin LI * obtaining a copy of this software and associated documentation 7*c08cbc64SXin LI * files (the "Software"), to deal in the Software without 8*c08cbc64SXin LI * restriction, including without limitation the rights to use, 9*c08cbc64SXin LI * copy, modify, merge, publish, distribute, sublicense, and/or sell 10*c08cbc64SXin LI * copies of the Software, and to permit persons to whom the 11*c08cbc64SXin LI * Software is furnished to do so, subject to the following 12*c08cbc64SXin LI * conditions: 13*c08cbc64SXin LI * 14*c08cbc64SXin LI * The above copyright notice and this permission notice shall be 15*c08cbc64SXin LI * included in all copies or substantial portions of the Software. 16*c08cbc64SXin LI * 17*c08cbc64SXin LI * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18*c08cbc64SXin LI * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 19*c08cbc64SXin LI * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 20*c08cbc64SXin LI * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 21*c08cbc64SXin LI * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 22*c08cbc64SXin LI * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 23*c08cbc64SXin LI * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 24*c08cbc64SXin LI * OTHER DEALINGS IN THE SOFTWARE. 25*c08cbc64SXin LI */ 26*c08cbc64SXin LI 27*c08cbc64SXin LI #include "divsufsort_private.h" 28*c08cbc64SXin LI 29*c08cbc64SXin LI 30*c08cbc64SXin LI /*- Private Function -*/ 31*c08cbc64SXin LI 32*c08cbc64SXin LI /* Binary search for inverse bwt. */ 33*c08cbc64SXin LI static 34*c08cbc64SXin LI saidx_t 35*c08cbc64SXin LI binarysearch_lower(const saidx_t *A, saidx_t size, saidx_t value) { 36*c08cbc64SXin LI saidx_t half, i; 37*c08cbc64SXin LI for(i = 0, half = size >> 1; 38*c08cbc64SXin LI 0 < size; 39*c08cbc64SXin LI size = half, half >>= 1) { 40*c08cbc64SXin LI if(A[i + half] < value) { 41*c08cbc64SXin LI i += half + 1; 42*c08cbc64SXin LI half -= (size & 1) ^ 1; 43*c08cbc64SXin LI } 44*c08cbc64SXin LI } 45*c08cbc64SXin LI return i; 46*c08cbc64SXin LI } 47*c08cbc64SXin LI 48*c08cbc64SXin LI 49*c08cbc64SXin LI /*- Functions -*/ 50*c08cbc64SXin LI 51*c08cbc64SXin LI /* Burrows-Wheeler transform. */ 52*c08cbc64SXin LI saint_t 53*c08cbc64SXin LI bw_transform(const sauchar_t *T, sauchar_t *U, saidx_t *SA, 54*c08cbc64SXin LI saidx_t n, saidx_t *idx) { 55*c08cbc64SXin LI saidx_t *A, i, j, p, t; 56*c08cbc64SXin LI saint_t c; 57*c08cbc64SXin LI 58*c08cbc64SXin LI /* Check arguments. */ 59*c08cbc64SXin LI if((T == NULL) || (U == NULL) || (n < 0) || (idx == NULL)) { return -1; } 60*c08cbc64SXin LI if(n <= 1) { 61*c08cbc64SXin LI if(n == 1) { U[0] = T[0]; } 62*c08cbc64SXin LI *idx = n; 63*c08cbc64SXin LI return 0; 64*c08cbc64SXin LI } 65*c08cbc64SXin LI 66*c08cbc64SXin LI if((A = SA) == NULL) { 67*c08cbc64SXin LI i = divbwt(T, U, NULL, n); 68*c08cbc64SXin LI if(0 <= i) { *idx = i; i = 0; } 69*c08cbc64SXin LI return (saint_t)i; 70*c08cbc64SXin LI } 71*c08cbc64SXin LI 72*c08cbc64SXin LI /* BW transform. */ 73*c08cbc64SXin LI if(T == U) { 74*c08cbc64SXin LI t = n; 75*c08cbc64SXin LI for(i = 0, j = 0; i < n; ++i) { 76*c08cbc64SXin LI p = t - 1; 77*c08cbc64SXin LI t = A[i]; 78*c08cbc64SXin LI if(0 <= p) { 79*c08cbc64SXin LI c = T[j]; 80*c08cbc64SXin LI U[j] = (j <= p) ? T[p] : (sauchar_t)A[p]; 81*c08cbc64SXin LI A[j] = c; 82*c08cbc64SXin LI j++; 83*c08cbc64SXin LI } else { 84*c08cbc64SXin LI *idx = i; 85*c08cbc64SXin LI } 86*c08cbc64SXin LI } 87*c08cbc64SXin LI p = t - 1; 88*c08cbc64SXin LI if(0 <= p) { 89*c08cbc64SXin LI c = T[j]; 90*c08cbc64SXin LI U[j] = (j <= p) ? T[p] : (sauchar_t)A[p]; 91*c08cbc64SXin LI A[j] = c; 92*c08cbc64SXin LI } else { 93*c08cbc64SXin LI *idx = i; 94*c08cbc64SXin LI } 95*c08cbc64SXin LI } else { 96*c08cbc64SXin LI U[0] = T[n - 1]; 97*c08cbc64SXin LI for(i = 0; A[i] != 0; ++i) { U[i + 1] = T[A[i] - 1]; } 98*c08cbc64SXin LI *idx = i + 1; 99*c08cbc64SXin LI for(++i; i < n; ++i) { U[i] = T[A[i] - 1]; } 100*c08cbc64SXin LI } 101*c08cbc64SXin LI 102*c08cbc64SXin LI if(SA == NULL) { 103*c08cbc64SXin LI /* Deallocate memory. */ 104*c08cbc64SXin LI free(A); 105*c08cbc64SXin LI } 106*c08cbc64SXin LI 107*c08cbc64SXin LI return 0; 108*c08cbc64SXin LI } 109*c08cbc64SXin LI 110*c08cbc64SXin LI /* Inverse Burrows-Wheeler transform. */ 111*c08cbc64SXin LI saint_t 112*c08cbc64SXin LI inverse_bw_transform(const sauchar_t *T, sauchar_t *U, saidx_t *A, 113*c08cbc64SXin LI saidx_t n, saidx_t idx) { 114*c08cbc64SXin LI saidx_t C[ALPHABET_SIZE]; 115*c08cbc64SXin LI sauchar_t D[ALPHABET_SIZE]; 116*c08cbc64SXin LI saidx_t *B; 117*c08cbc64SXin LI saidx_t i, p; 118*c08cbc64SXin LI saint_t c, d; 119*c08cbc64SXin LI 120*c08cbc64SXin LI /* Check arguments. */ 121*c08cbc64SXin LI if((T == NULL) || (U == NULL) || (n < 0) || (idx < 0) || 122*c08cbc64SXin LI (n < idx) || ((0 < n) && (idx == 0))) { 123*c08cbc64SXin LI return -1; 124*c08cbc64SXin LI } 125*c08cbc64SXin LI if(n <= 1) { return 0; } 126*c08cbc64SXin LI 127*c08cbc64SXin LI if((B = A) == NULL) { 128*c08cbc64SXin LI /* Allocate n*sizeof(saidx_t) bytes of memory. */ 129*c08cbc64SXin LI if((B = (saidx_t *)malloc((size_t)n * sizeof(saidx_t))) == NULL) { return -2; } 130*c08cbc64SXin LI } 131*c08cbc64SXin LI 132*c08cbc64SXin LI /* Inverse BW transform. */ 133*c08cbc64SXin LI for(c = 0; c < ALPHABET_SIZE; ++c) { C[c] = 0; } 134*c08cbc64SXin LI for(i = 0; i < n; ++i) { ++C[T[i]]; } 135*c08cbc64SXin LI for(c = 0, d = 0, i = 0; c < ALPHABET_SIZE; ++c) { 136*c08cbc64SXin LI p = C[c]; 137*c08cbc64SXin LI if(0 < p) { 138*c08cbc64SXin LI C[c] = i; 139*c08cbc64SXin LI D[d++] = (sauchar_t)c; 140*c08cbc64SXin LI i += p; 141*c08cbc64SXin LI } 142*c08cbc64SXin LI } 143*c08cbc64SXin LI for(i = 0; i < idx; ++i) { B[C[T[i]]++] = i; } 144*c08cbc64SXin LI for( ; i < n; ++i) { B[C[T[i]]++] = i + 1; } 145*c08cbc64SXin LI for(c = 0; c < d; ++c) { C[c] = C[D[c]]; } 146*c08cbc64SXin LI for(i = 0, p = idx; i < n; ++i) { 147*c08cbc64SXin LI U[i] = D[binarysearch_lower(C, d, p)]; 148*c08cbc64SXin LI p = B[p - 1]; 149*c08cbc64SXin LI } 150*c08cbc64SXin LI 151*c08cbc64SXin LI if(A == NULL) { 152*c08cbc64SXin LI /* Deallocate memory. */ 153*c08cbc64SXin LI free(B); 154*c08cbc64SXin LI } 155*c08cbc64SXin LI 156*c08cbc64SXin LI return 0; 157*c08cbc64SXin LI } 158*c08cbc64SXin LI 159*c08cbc64SXin LI /* Checks the suffix array SA of the string T. */ 160*c08cbc64SXin LI saint_t 161*c08cbc64SXin LI sufcheck(const sauchar_t *T, const saidx_t *SA, 162*c08cbc64SXin LI saidx_t n, saint_t verbose) { 163*c08cbc64SXin LI saidx_t C[ALPHABET_SIZE]; 164*c08cbc64SXin LI saidx_t i, p, q, t; 165*c08cbc64SXin LI saint_t c; 166*c08cbc64SXin LI 167*c08cbc64SXin LI if(verbose) { fprintf(stderr, "sufcheck: "); } 168*c08cbc64SXin LI 169*c08cbc64SXin LI /* Check arguments. */ 170*c08cbc64SXin LI if((T == NULL) || (SA == NULL) || (n < 0)) { 171*c08cbc64SXin LI if(verbose) { fprintf(stderr, "Invalid arguments.\n"); } 172*c08cbc64SXin LI return -1; 173*c08cbc64SXin LI } 174*c08cbc64SXin LI if(n == 0) { 175*c08cbc64SXin LI if(verbose) { fprintf(stderr, "Done.\n"); } 176*c08cbc64SXin LI return 0; 177*c08cbc64SXin LI } 178*c08cbc64SXin LI 179*c08cbc64SXin LI /* check range: [0..n-1] */ 180*c08cbc64SXin LI for(i = 0; i < n; ++i) { 181*c08cbc64SXin LI if((SA[i] < 0) || (n <= SA[i])) { 182*c08cbc64SXin LI if(verbose) { 183*c08cbc64SXin LI fprintf(stderr, "Out of the range [0,%" PRIdSAIDX_T "].\n" 184*c08cbc64SXin LI " SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T "\n", 185*c08cbc64SXin LI n - 1, i, SA[i]); 186*c08cbc64SXin LI } 187*c08cbc64SXin LI return -2; 188*c08cbc64SXin LI } 189*c08cbc64SXin LI } 190*c08cbc64SXin LI 191*c08cbc64SXin LI /* check first characters. */ 192*c08cbc64SXin LI for(i = 1; i < n; ++i) { 193*c08cbc64SXin LI if(T[SA[i - 1]] > T[SA[i]]) { 194*c08cbc64SXin LI if(verbose) { 195*c08cbc64SXin LI fprintf(stderr, "Suffixes in wrong order.\n" 196*c08cbc64SXin LI " T[SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T "]=%d" 197*c08cbc64SXin LI " > T[SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T "]=%d\n", 198*c08cbc64SXin LI i - 1, SA[i - 1], T[SA[i - 1]], i, SA[i], T[SA[i]]); 199*c08cbc64SXin LI } 200*c08cbc64SXin LI return -3; 201*c08cbc64SXin LI } 202*c08cbc64SXin LI } 203*c08cbc64SXin LI 204*c08cbc64SXin LI /* check suffixes. */ 205*c08cbc64SXin LI for(i = 0; i < ALPHABET_SIZE; ++i) { C[i] = 0; } 206*c08cbc64SXin LI for(i = 0; i < n; ++i) { ++C[T[i]]; } 207*c08cbc64SXin LI for(i = 0, p = 0; i < ALPHABET_SIZE; ++i) { 208*c08cbc64SXin LI t = C[i]; 209*c08cbc64SXin LI C[i] = p; 210*c08cbc64SXin LI p += t; 211*c08cbc64SXin LI } 212*c08cbc64SXin LI 213*c08cbc64SXin LI q = C[T[n - 1]]; 214*c08cbc64SXin LI C[T[n - 1]] += 1; 215*c08cbc64SXin LI for(i = 0; i < n; ++i) { 216*c08cbc64SXin LI p = SA[i]; 217*c08cbc64SXin LI if(0 < p) { 218*c08cbc64SXin LI c = T[--p]; 219*c08cbc64SXin LI t = C[c]; 220*c08cbc64SXin LI } else { 221*c08cbc64SXin LI c = T[p = n - 1]; 222*c08cbc64SXin LI t = q; 223*c08cbc64SXin LI } 224*c08cbc64SXin LI if((t < 0) || (p != SA[t])) { 225*c08cbc64SXin LI if(verbose) { 226*c08cbc64SXin LI fprintf(stderr, "Suffix in wrong position.\n" 227*c08cbc64SXin LI " SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T " or\n" 228*c08cbc64SXin LI " SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T "\n", 229*c08cbc64SXin LI t, (0 <= t) ? SA[t] : -1, i, SA[i]); 230*c08cbc64SXin LI } 231*c08cbc64SXin LI return -4; 232*c08cbc64SXin LI } 233*c08cbc64SXin LI if(t != q) { 234*c08cbc64SXin LI ++C[c]; 235*c08cbc64SXin LI if((n <= C[c]) || (T[SA[C[c]]] != c)) { C[c] = -1; } 236*c08cbc64SXin LI } 237*c08cbc64SXin LI } 238*c08cbc64SXin LI 239*c08cbc64SXin LI if(1 <= verbose) { fprintf(stderr, "Done.\n"); } 240*c08cbc64SXin LI return 0; 241*c08cbc64SXin LI } 242*c08cbc64SXin LI 243*c08cbc64SXin LI 244*c08cbc64SXin LI static 245*c08cbc64SXin LI int 246*c08cbc64SXin LI _compare(const sauchar_t *T, saidx_t Tsize, 247*c08cbc64SXin LI const sauchar_t *P, saidx_t Psize, 248*c08cbc64SXin LI saidx_t suf, saidx_t *match) { 249*c08cbc64SXin LI saidx_t i, j; 250*c08cbc64SXin LI saint_t r; 251*c08cbc64SXin LI for(i = suf + *match, j = *match, r = 0; 252*c08cbc64SXin LI (i < Tsize) && (j < Psize) && ((r = T[i] - P[j]) == 0); ++i, ++j) { } 253*c08cbc64SXin LI *match = j; 254*c08cbc64SXin LI return (r == 0) ? -(j != Psize) : r; 255*c08cbc64SXin LI } 256*c08cbc64SXin LI 257*c08cbc64SXin LI /* Search for the pattern P in the string T. */ 258*c08cbc64SXin LI saidx_t 259*c08cbc64SXin LI sa_search(const sauchar_t *T, saidx_t Tsize, 260*c08cbc64SXin LI const sauchar_t *P, saidx_t Psize, 261*c08cbc64SXin LI const saidx_t *SA, saidx_t SAsize, 262*c08cbc64SXin LI saidx_t *idx) { 263*c08cbc64SXin LI saidx_t size, lsize, rsize, half; 264*c08cbc64SXin LI saidx_t match, lmatch, rmatch; 265*c08cbc64SXin LI saidx_t llmatch, lrmatch, rlmatch, rrmatch; 266*c08cbc64SXin LI saidx_t i, j, k; 267*c08cbc64SXin LI saint_t r; 268*c08cbc64SXin LI 269*c08cbc64SXin LI if(idx != NULL) { *idx = -1; } 270*c08cbc64SXin LI if((T == NULL) || (P == NULL) || (SA == NULL) || 271*c08cbc64SXin LI (Tsize < 0) || (Psize < 0) || (SAsize < 0)) { return -1; } 272*c08cbc64SXin LI if((Tsize == 0) || (SAsize == 0)) { return 0; } 273*c08cbc64SXin LI if(Psize == 0) { if(idx != NULL) { *idx = 0; } return SAsize; } 274*c08cbc64SXin LI 275*c08cbc64SXin LI for(i = j = k = 0, lmatch = rmatch = 0, size = SAsize, half = size >> 1; 276*c08cbc64SXin LI 0 < size; 277*c08cbc64SXin LI size = half, half >>= 1) { 278*c08cbc64SXin LI match = MIN(lmatch, rmatch); 279*c08cbc64SXin LI r = _compare(T, Tsize, P, Psize, SA[i + half], &match); 280*c08cbc64SXin LI if(r < 0) { 281*c08cbc64SXin LI i += half + 1; 282*c08cbc64SXin LI half -= (size & 1) ^ 1; 283*c08cbc64SXin LI lmatch = match; 284*c08cbc64SXin LI } else if(r > 0) { 285*c08cbc64SXin LI rmatch = match; 286*c08cbc64SXin LI } else { 287*c08cbc64SXin LI lsize = half, j = i, rsize = size - half - 1, k = i + half + 1; 288*c08cbc64SXin LI 289*c08cbc64SXin LI /* left part */ 290*c08cbc64SXin LI for(llmatch = lmatch, lrmatch = match, half = lsize >> 1; 291*c08cbc64SXin LI 0 < lsize; 292*c08cbc64SXin LI lsize = half, half >>= 1) { 293*c08cbc64SXin LI lmatch = MIN(llmatch, lrmatch); 294*c08cbc64SXin LI r = _compare(T, Tsize, P, Psize, SA[j + half], &lmatch); 295*c08cbc64SXin LI if(r < 0) { 296*c08cbc64SXin LI j += half + 1; 297*c08cbc64SXin LI half -= (lsize & 1) ^ 1; 298*c08cbc64SXin LI llmatch = lmatch; 299*c08cbc64SXin LI } else { 300*c08cbc64SXin LI lrmatch = lmatch; 301*c08cbc64SXin LI } 302*c08cbc64SXin LI } 303*c08cbc64SXin LI 304*c08cbc64SXin LI /* right part */ 305*c08cbc64SXin LI for(rlmatch = match, rrmatch = rmatch, half = rsize >> 1; 306*c08cbc64SXin LI 0 < rsize; 307*c08cbc64SXin LI rsize = half, half >>= 1) { 308*c08cbc64SXin LI rmatch = MIN(rlmatch, rrmatch); 309*c08cbc64SXin LI r = _compare(T, Tsize, P, Psize, SA[k + half], &rmatch); 310*c08cbc64SXin LI if(r <= 0) { 311*c08cbc64SXin LI k += half + 1; 312*c08cbc64SXin LI half -= (rsize & 1) ^ 1; 313*c08cbc64SXin LI rlmatch = rmatch; 314*c08cbc64SXin LI } else { 315*c08cbc64SXin LI rrmatch = rmatch; 316*c08cbc64SXin LI } 317*c08cbc64SXin LI } 318*c08cbc64SXin LI 319*c08cbc64SXin LI break; 320*c08cbc64SXin LI } 321*c08cbc64SXin LI } 322*c08cbc64SXin LI 323*c08cbc64SXin LI if(idx != NULL) { *idx = (0 < (k - j)) ? j : i; } 324*c08cbc64SXin LI return k - j; 325*c08cbc64SXin LI } 326*c08cbc64SXin LI 327*c08cbc64SXin LI /* Search for the character c in the string T. */ 328*c08cbc64SXin LI saidx_t 329*c08cbc64SXin LI sa_simplesearch(const sauchar_t *T, saidx_t Tsize, 330*c08cbc64SXin LI const saidx_t *SA, saidx_t SAsize, 331*c08cbc64SXin LI saint_t c, saidx_t *idx) { 332*c08cbc64SXin LI saidx_t size, lsize, rsize, half; 333*c08cbc64SXin LI saidx_t i, j, k, p; 334*c08cbc64SXin LI saint_t r; 335*c08cbc64SXin LI 336*c08cbc64SXin LI if(idx != NULL) { *idx = -1; } 337*c08cbc64SXin LI if((T == NULL) || (SA == NULL) || (Tsize < 0) || (SAsize < 0)) { return -1; } 338*c08cbc64SXin LI if((Tsize == 0) || (SAsize == 0)) { return 0; } 339*c08cbc64SXin LI 340*c08cbc64SXin LI for(i = j = k = 0, size = SAsize, half = size >> 1; 341*c08cbc64SXin LI 0 < size; 342*c08cbc64SXin LI size = half, half >>= 1) { 343*c08cbc64SXin LI p = SA[i + half]; 344*c08cbc64SXin LI r = (p < Tsize) ? T[p] - c : -1; 345*c08cbc64SXin LI if(r < 0) { 346*c08cbc64SXin LI i += half + 1; 347*c08cbc64SXin LI half -= (size & 1) ^ 1; 348*c08cbc64SXin LI } else if(r == 0) { 349*c08cbc64SXin LI lsize = half, j = i, rsize = size - half - 1, k = i + half + 1; 350*c08cbc64SXin LI 351*c08cbc64SXin LI /* left part */ 352*c08cbc64SXin LI for(half = lsize >> 1; 353*c08cbc64SXin LI 0 < lsize; 354*c08cbc64SXin LI lsize = half, half >>= 1) { 355*c08cbc64SXin LI p = SA[j + half]; 356*c08cbc64SXin LI r = (p < Tsize) ? T[p] - c : -1; 357*c08cbc64SXin LI if(r < 0) { 358*c08cbc64SXin LI j += half + 1; 359*c08cbc64SXin LI half -= (lsize & 1) ^ 1; 360*c08cbc64SXin LI } 361*c08cbc64SXin LI } 362*c08cbc64SXin LI 363*c08cbc64SXin LI /* right part */ 364*c08cbc64SXin LI for(half = rsize >> 1; 365*c08cbc64SXin LI 0 < rsize; 366*c08cbc64SXin LI rsize = half, half >>= 1) { 367*c08cbc64SXin LI p = SA[k + half]; 368*c08cbc64SXin LI r = (p < Tsize) ? T[p] - c : -1; 369*c08cbc64SXin LI if(r <= 0) { 370*c08cbc64SXin LI k += half + 1; 371*c08cbc64SXin LI half -= (rsize & 1) ^ 1; 372*c08cbc64SXin LI } 373*c08cbc64SXin LI } 374*c08cbc64SXin LI 375*c08cbc64SXin LI break; 376*c08cbc64SXin LI } 377*c08cbc64SXin LI } 378*c08cbc64SXin LI 379*c08cbc64SXin LI if(idx != NULL) { *idx = (0 < (k - j)) ? j : i; } 380*c08cbc64SXin LI return k - j; 381*c08cbc64SXin LI } 382