1 /* 2 * utils.c for libdivsufsort 3 * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved. 4 * 5 * Permission is hereby granted, free of charge, to any person 6 * obtaining a copy of this software and associated documentation 7 * files (the "Software"), to deal in the Software without 8 * restriction, including without limitation the rights to use, 9 * copy, modify, merge, publish, distribute, sublicense, and/or sell 10 * copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following 12 * conditions: 13 * 14 * The above copyright notice and this permission notice shall be 15 * included in all copies or substantial portions of the Software. 16 * 17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 19 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 20 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 21 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 22 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 23 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 24 * OTHER DEALINGS IN THE SOFTWARE. 25 */ 26 27 #include "divsufsort_private.h" 28 29 30 /*- Private Function -*/ 31 32 /* Binary search for inverse bwt. */ 33 static 34 saidx_t 35 binarysearch_lower(const saidx_t *A, saidx_t size, saidx_t value) { 36 saidx_t half, i; 37 for(i = 0, half = size >> 1; 38 0 < size; 39 size = half, half >>= 1) { 40 if(A[i + half] < value) { 41 i += half + 1; 42 half -= (size & 1) ^ 1; 43 } 44 } 45 return i; 46 } 47 48 49 /*- Functions -*/ 50 51 /* Burrows-Wheeler transform. */ 52 saint_t 53 bw_transform(const sauchar_t *T, sauchar_t *U, saidx_t *SA, 54 saidx_t n, saidx_t *idx) { 55 saidx_t *A, i, j, p, t; 56 saint_t c; 57 58 /* Check arguments. */ 59 if((T == NULL) || (U == NULL) || (n < 0) || (idx == NULL)) { return -1; } 60 if(n <= 1) { 61 if(n == 1) { U[0] = T[0]; } 62 *idx = n; 63 return 0; 64 } 65 66 if((A = SA) == NULL) { 67 i = divbwt(T, U, NULL, n); 68 if(0 <= i) { *idx = i; i = 0; } 69 return (saint_t)i; 70 } 71 72 /* BW transform. */ 73 if(T == U) { 74 t = n; 75 for(i = 0, j = 0; i < n; ++i) { 76 p = t - 1; 77 t = A[i]; 78 if(0 <= p) { 79 c = T[j]; 80 U[j] = (j <= p) ? T[p] : (sauchar_t)A[p]; 81 A[j] = c; 82 j++; 83 } else { 84 *idx = i; 85 } 86 } 87 p = t - 1; 88 if(0 <= p) { 89 c = T[j]; 90 U[j] = (j <= p) ? T[p] : (sauchar_t)A[p]; 91 A[j] = c; 92 } else { 93 *idx = i; 94 } 95 } else { 96 U[0] = T[n - 1]; 97 for(i = 0; A[i] != 0; ++i) { U[i + 1] = T[A[i] - 1]; } 98 *idx = i + 1; 99 for(++i; i < n; ++i) { U[i] = T[A[i] - 1]; } 100 } 101 102 if(SA == NULL) { 103 /* Deallocate memory. */ 104 free(A); 105 } 106 107 return 0; 108 } 109 110 /* Inverse Burrows-Wheeler transform. */ 111 saint_t 112 inverse_bw_transform(const sauchar_t *T, sauchar_t *U, saidx_t *A, 113 saidx_t n, saidx_t idx) { 114 saidx_t C[ALPHABET_SIZE]; 115 sauchar_t D[ALPHABET_SIZE]; 116 saidx_t *B; 117 saidx_t i, p; 118 saint_t c, d; 119 120 /* Check arguments. */ 121 if((T == NULL) || (U == NULL) || (n < 0) || (idx < 0) || 122 (n < idx) || ((0 < n) && (idx == 0))) { 123 return -1; 124 } 125 if(n <= 1) { return 0; } 126 127 if((B = A) == NULL) { 128 /* Allocate n*sizeof(saidx_t) bytes of memory. */ 129 if((B = (saidx_t *)malloc((size_t)n * sizeof(saidx_t))) == NULL) { return -2; } 130 } 131 132 /* Inverse BW transform. */ 133 for(c = 0; c < ALPHABET_SIZE; ++c) { C[c] = 0; } 134 for(i = 0; i < n; ++i) { ++C[T[i]]; } 135 for(c = 0, d = 0, i = 0; c < ALPHABET_SIZE; ++c) { 136 p = C[c]; 137 if(0 < p) { 138 C[c] = i; 139 D[d++] = (sauchar_t)c; 140 i += p; 141 } 142 } 143 for(i = 0; i < idx; ++i) { B[C[T[i]]++] = i; } 144 for( ; i < n; ++i) { B[C[T[i]]++] = i + 1; } 145 for(c = 0; c < d; ++c) { C[c] = C[D[c]]; } 146 for(i = 0, p = idx; i < n; ++i) { 147 U[i] = D[binarysearch_lower(C, d, p)]; 148 p = B[p - 1]; 149 } 150 151 if(A == NULL) { 152 /* Deallocate memory. */ 153 free(B); 154 } 155 156 return 0; 157 } 158 159 /* Checks the suffix array SA of the string T. */ 160 saint_t 161 sufcheck(const sauchar_t *T, const saidx_t *SA, 162 saidx_t n, saint_t verbose) { 163 saidx_t C[ALPHABET_SIZE]; 164 saidx_t i, p, q, t; 165 saint_t c; 166 167 if(verbose) { fprintf(stderr, "sufcheck: "); } 168 169 /* Check arguments. */ 170 if((T == NULL) || (SA == NULL) || (n < 0)) { 171 if(verbose) { fprintf(stderr, "Invalid arguments.\n"); } 172 return -1; 173 } 174 if(n == 0) { 175 if(verbose) { fprintf(stderr, "Done.\n"); } 176 return 0; 177 } 178 179 /* check range: [0..n-1] */ 180 for(i = 0; i < n; ++i) { 181 if((SA[i] < 0) || (n <= SA[i])) { 182 if(verbose) { 183 fprintf(stderr, "Out of the range [0,%" PRIdSAIDX_T "].\n" 184 " SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T "\n", 185 n - 1, i, SA[i]); 186 } 187 return -2; 188 } 189 } 190 191 /* check first characters. */ 192 for(i = 1; i < n; ++i) { 193 if(T[SA[i - 1]] > T[SA[i]]) { 194 if(verbose) { 195 fprintf(stderr, "Suffixes in wrong order.\n" 196 " T[SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T "]=%d" 197 " > T[SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T "]=%d\n", 198 i - 1, SA[i - 1], T[SA[i - 1]], i, SA[i], T[SA[i]]); 199 } 200 return -3; 201 } 202 } 203 204 /* check suffixes. */ 205 for(i = 0; i < ALPHABET_SIZE; ++i) { C[i] = 0; } 206 for(i = 0; i < n; ++i) { ++C[T[i]]; } 207 for(i = 0, p = 0; i < ALPHABET_SIZE; ++i) { 208 t = C[i]; 209 C[i] = p; 210 p += t; 211 } 212 213 q = C[T[n - 1]]; 214 C[T[n - 1]] += 1; 215 for(i = 0; i < n; ++i) { 216 p = SA[i]; 217 if(0 < p) { 218 c = T[--p]; 219 t = C[c]; 220 } else { 221 c = T[p = n - 1]; 222 t = q; 223 } 224 if((t < 0) || (p != SA[t])) { 225 if(verbose) { 226 fprintf(stderr, "Suffix in wrong position.\n" 227 " SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T " or\n" 228 " SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T "\n", 229 t, (0 <= t) ? SA[t] : -1, i, SA[i]); 230 } 231 return -4; 232 } 233 if(t != q) { 234 ++C[c]; 235 if((n <= C[c]) || (T[SA[C[c]]] != c)) { C[c] = -1; } 236 } 237 } 238 239 if(1 <= verbose) { fprintf(stderr, "Done.\n"); } 240 return 0; 241 } 242 243 244 static 245 int 246 _compare(const sauchar_t *T, saidx_t Tsize, 247 const sauchar_t *P, saidx_t Psize, 248 saidx_t suf, saidx_t *match) { 249 saidx_t i, j; 250 saint_t r; 251 for(i = suf + *match, j = *match, r = 0; 252 (i < Tsize) && (j < Psize) && ((r = T[i] - P[j]) == 0); ++i, ++j) { } 253 *match = j; 254 return (r == 0) ? -(j != Psize) : r; 255 } 256 257 /* Search for the pattern P in the string T. */ 258 saidx_t 259 sa_search(const sauchar_t *T, saidx_t Tsize, 260 const sauchar_t *P, saidx_t Psize, 261 const saidx_t *SA, saidx_t SAsize, 262 saidx_t *idx) { 263 saidx_t size, lsize, rsize, half; 264 saidx_t match, lmatch, rmatch; 265 saidx_t llmatch, lrmatch, rlmatch, rrmatch; 266 saidx_t i, j, k; 267 saint_t r; 268 269 if(idx != NULL) { *idx = -1; } 270 if((T == NULL) || (P == NULL) || (SA == NULL) || 271 (Tsize < 0) || (Psize < 0) || (SAsize < 0)) { return -1; } 272 if((Tsize == 0) || (SAsize == 0)) { return 0; } 273 if(Psize == 0) { if(idx != NULL) { *idx = 0; } return SAsize; } 274 275 for(i = j = k = 0, lmatch = rmatch = 0, size = SAsize, half = size >> 1; 276 0 < size; 277 size = half, half >>= 1) { 278 match = MIN(lmatch, rmatch); 279 r = _compare(T, Tsize, P, Psize, SA[i + half], &match); 280 if(r < 0) { 281 i += half + 1; 282 half -= (size & 1) ^ 1; 283 lmatch = match; 284 } else if(r > 0) { 285 rmatch = match; 286 } else { 287 lsize = half, j = i, rsize = size - half - 1, k = i + half + 1; 288 289 /* left part */ 290 for(llmatch = lmatch, lrmatch = match, half = lsize >> 1; 291 0 < lsize; 292 lsize = half, half >>= 1) { 293 lmatch = MIN(llmatch, lrmatch); 294 r = _compare(T, Tsize, P, Psize, SA[j + half], &lmatch); 295 if(r < 0) { 296 j += half + 1; 297 half -= (lsize & 1) ^ 1; 298 llmatch = lmatch; 299 } else { 300 lrmatch = lmatch; 301 } 302 } 303 304 /* right part */ 305 for(rlmatch = match, rrmatch = rmatch, half = rsize >> 1; 306 0 < rsize; 307 rsize = half, half >>= 1) { 308 rmatch = MIN(rlmatch, rrmatch); 309 r = _compare(T, Tsize, P, Psize, SA[k + half], &rmatch); 310 if(r <= 0) { 311 k += half + 1; 312 half -= (rsize & 1) ^ 1; 313 rlmatch = rmatch; 314 } else { 315 rrmatch = rmatch; 316 } 317 } 318 319 break; 320 } 321 } 322 323 if(idx != NULL) { *idx = (0 < (k - j)) ? j : i; } 324 return k - j; 325 } 326 327 /* Search for the character c in the string T. */ 328 saidx_t 329 sa_simplesearch(const sauchar_t *T, saidx_t Tsize, 330 const saidx_t *SA, saidx_t SAsize, 331 saint_t c, saidx_t *idx) { 332 saidx_t size, lsize, rsize, half; 333 saidx_t i, j, k, p; 334 saint_t r; 335 336 if(idx != NULL) { *idx = -1; } 337 if((T == NULL) || (SA == NULL) || (Tsize < 0) || (SAsize < 0)) { return -1; } 338 if((Tsize == 0) || (SAsize == 0)) { return 0; } 339 340 for(i = j = k = 0, size = SAsize, half = size >> 1; 341 0 < size; 342 size = half, half >>= 1) { 343 p = SA[i + half]; 344 r = (p < Tsize) ? T[p] - c : -1; 345 if(r < 0) { 346 i += half + 1; 347 half -= (size & 1) ^ 1; 348 } else if(r == 0) { 349 lsize = half, j = i, rsize = size - half - 1, k = i + half + 1; 350 351 /* left part */ 352 for(half = lsize >> 1; 353 0 < lsize; 354 lsize = half, half >>= 1) { 355 p = SA[j + half]; 356 r = (p < Tsize) ? T[p] - c : -1; 357 if(r < 0) { 358 j += half + 1; 359 half -= (lsize & 1) ^ 1; 360 } 361 } 362 363 /* right part */ 364 for(half = rsize >> 1; 365 0 < rsize; 366 rsize = half, half >>= 1) { 367 p = SA[k + half]; 368 r = (p < Tsize) ? T[p] - c : -1; 369 if(r <= 0) { 370 k += half + 1; 371 half -= (rsize & 1) ^ 1; 372 } 373 } 374 375 break; 376 } 377 } 378 379 if(idx != NULL) { *idx = (0 < (k - j)) ? j : i; } 380 return k - j; 381 } 382