xref: /freebsd/contrib/libdivsufsort/lib/utils.c (revision a90b9d0159070121c221b966469c3e36d912bf82)
1 /*
2  * utils.c for libdivsufsort
3  * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
4  *
5  * Permission is hereby granted, free of charge, to any person
6  * obtaining a copy of this software and associated documentation
7  * files (the "Software"), to deal in the Software without
8  * restriction, including without limitation the rights to use,
9  * copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following
12  * conditions:
13  *
14  * The above copyright notice and this permission notice shall be
15  * included in all copies or substantial portions of the Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
19  * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
21  * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
22  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
24  * OTHER DEALINGS IN THE SOFTWARE.
25  */
26 
27 #include "divsufsort_private.h"
28 
29 
30 /*- Private Function -*/
31 
32 /* Binary search for inverse bwt. */
33 static
34 saidx_t
35 binarysearch_lower(const saidx_t *A, saidx_t size, saidx_t value) {
36   saidx_t half, i;
37   for(i = 0, half = size >> 1;
38       0 < size;
39       size = half, half >>= 1) {
40     if(A[i + half] < value) {
41       i += half + 1;
42       half -= (size & 1) ^ 1;
43     }
44   }
45   return i;
46 }
47 
48 
49 /*- Functions -*/
50 
51 /* Burrows-Wheeler transform. */
52 saint_t
53 bw_transform(const sauchar_t *T, sauchar_t *U, saidx_t *SA,
54              saidx_t n, saidx_t *idx) {
55   saidx_t *A, i, j, p, t;
56   saint_t c;
57 
58   /* Check arguments. */
59   if((T == NULL) || (U == NULL) || (n < 0) || (idx == NULL)) { return -1; }
60   if(n <= 1) {
61     if(n == 1) { U[0] = T[0]; }
62     *idx = n;
63     return 0;
64   }
65 
66   if((A = SA) == NULL) {
67     i = divbwt(T, U, NULL, n);
68     if(0 <= i) { *idx = i; i = 0; }
69     return (saint_t)i;
70   }
71 
72   /* BW transform. */
73   if(T == U) {
74     t = n;
75     for(i = 0, j = 0; i < n; ++i) {
76       p = t - 1;
77       t = A[i];
78       if(0 <= p) {
79         c = T[j];
80         U[j] = (j <= p) ? T[p] : (sauchar_t)A[p];
81         A[j] = c;
82         j++;
83       } else {
84         *idx = i;
85       }
86     }
87     p = t - 1;
88     if(0 <= p) {
89       c = T[j];
90       U[j] = (j <= p) ? T[p] : (sauchar_t)A[p];
91       A[j] = c;
92     } else {
93       *idx = i;
94     }
95   } else {
96     U[0] = T[n - 1];
97     for(i = 0; A[i] != 0; ++i) { U[i + 1] = T[A[i] - 1]; }
98     *idx = i + 1;
99     for(++i; i < n; ++i) { U[i] = T[A[i] - 1]; }
100   }
101 
102   if(SA == NULL) {
103     /* Deallocate memory. */
104     free(A);
105   }
106 
107   return 0;
108 }
109 
110 /* Inverse Burrows-Wheeler transform. */
111 saint_t
112 inverse_bw_transform(const sauchar_t *T, sauchar_t *U, saidx_t *A,
113                      saidx_t n, saidx_t idx) {
114   saidx_t C[ALPHABET_SIZE];
115   sauchar_t D[ALPHABET_SIZE];
116   saidx_t *B;
117   saidx_t i, p;
118   saint_t c, d;
119 
120   /* Check arguments. */
121   if((T == NULL) || (U == NULL) || (n < 0) || (idx < 0) ||
122      (n < idx) || ((0 < n) && (idx == 0))) {
123     return -1;
124   }
125   if(n <= 1) { return 0; }
126 
127   if((B = A) == NULL) {
128     /* Allocate n*sizeof(saidx_t) bytes of memory. */
129     if((B = (saidx_t *)malloc((size_t)n * sizeof(saidx_t))) == NULL) { return -2; }
130   }
131 
132   /* Inverse BW transform. */
133   for(c = 0; c < ALPHABET_SIZE; ++c) { C[c] = 0; }
134   for(i = 0; i < n; ++i) { ++C[T[i]]; }
135   for(c = 0, d = 0, i = 0; c < ALPHABET_SIZE; ++c) {
136     p = C[c];
137     if(0 < p) {
138       C[c] = i;
139       D[d++] = (sauchar_t)c;
140       i += p;
141     }
142   }
143   for(i = 0; i < idx; ++i) { B[C[T[i]]++] = i; }
144   for( ; i < n; ++i)       { B[C[T[i]]++] = i + 1; }
145   for(c = 0; c < d; ++c) { C[c] = C[D[c]]; }
146   for(i = 0, p = idx; i < n; ++i) {
147     U[i] = D[binarysearch_lower(C, d, p)];
148     p = B[p - 1];
149   }
150 
151   if(A == NULL) {
152     /* Deallocate memory. */
153     free(B);
154   }
155 
156   return 0;
157 }
158 
159 /* Checks the suffix array SA of the string T. */
160 saint_t
161 sufcheck(const sauchar_t *T, const saidx_t *SA,
162          saidx_t n, saint_t verbose) {
163   saidx_t C[ALPHABET_SIZE];
164   saidx_t i, p, q, t;
165   saint_t c;
166 
167   if(verbose) { fprintf(stderr, "sufcheck: "); }
168 
169   /* Check arguments. */
170   if((T == NULL) || (SA == NULL) || (n < 0)) {
171     if(verbose) { fprintf(stderr, "Invalid arguments.\n"); }
172     return -1;
173   }
174   if(n == 0) {
175     if(verbose) { fprintf(stderr, "Done.\n"); }
176     return 0;
177   }
178 
179   /* check range: [0..n-1] */
180   for(i = 0; i < n; ++i) {
181     if((SA[i] < 0) || (n <= SA[i])) {
182       if(verbose) {
183         fprintf(stderr, "Out of the range [0,%" PRIdSAIDX_T "].\n"
184                         "  SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T "\n",
185                         n - 1, i, SA[i]);
186       }
187       return -2;
188     }
189   }
190 
191   /* check first characters. */
192   for(i = 1; i < n; ++i) {
193     if(T[SA[i - 1]] > T[SA[i]]) {
194       if(verbose) {
195         fprintf(stderr, "Suffixes in wrong order.\n"
196                         "  T[SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T "]=%d"
197                         " > T[SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T "]=%d\n",
198                         i - 1, SA[i - 1], T[SA[i - 1]], i, SA[i], T[SA[i]]);
199       }
200       return -3;
201     }
202   }
203 
204   /* check suffixes. */
205   for(i = 0; i < ALPHABET_SIZE; ++i) { C[i] = 0; }
206   for(i = 0; i < n; ++i) { ++C[T[i]]; }
207   for(i = 0, p = 0; i < ALPHABET_SIZE; ++i) {
208     t = C[i];
209     C[i] = p;
210     p += t;
211   }
212 
213   q = C[T[n - 1]];
214   C[T[n - 1]] += 1;
215   for(i = 0; i < n; ++i) {
216     p = SA[i];
217     if(0 < p) {
218       c = T[--p];
219       t = C[c];
220     } else {
221       c = T[p = n - 1];
222       t = q;
223     }
224     if((t < 0) || (p != SA[t])) {
225       if(verbose) {
226         fprintf(stderr, "Suffix in wrong position.\n"
227                         "  SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T " or\n"
228                         "  SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T "\n",
229                         t, (0 <= t) ? SA[t] : -1, i, SA[i]);
230       }
231       return -4;
232     }
233     if(t != q) {
234       ++C[c];
235       if((n <= C[c]) || (T[SA[C[c]]] != c)) { C[c] = -1; }
236     }
237   }
238 
239   if(1 <= verbose) { fprintf(stderr, "Done.\n"); }
240   return 0;
241 }
242 
243 
244 static
245 int
246 _compare(const sauchar_t *T, saidx_t Tsize,
247          const sauchar_t *P, saidx_t Psize,
248          saidx_t suf, saidx_t *match) {
249   saidx_t i, j;
250   saint_t r;
251   for(i = suf + *match, j = *match, r = 0;
252       (i < Tsize) && (j < Psize) && ((r = T[i] - P[j]) == 0); ++i, ++j) { }
253   *match = j;
254   return (r == 0) ? -(j != Psize) : r;
255 }
256 
257 /* Search for the pattern P in the string T. */
258 saidx_t
259 sa_search(const sauchar_t *T, saidx_t Tsize,
260           const sauchar_t *P, saidx_t Psize,
261           const saidx_t *SA, saidx_t SAsize,
262           saidx_t *idx) {
263   saidx_t size, lsize, rsize, half;
264   saidx_t match, lmatch, rmatch;
265   saidx_t llmatch, lrmatch, rlmatch, rrmatch;
266   saidx_t i, j, k;
267   saint_t r;
268 
269   if(idx != NULL) { *idx = -1; }
270   if((T == NULL) || (P == NULL) || (SA == NULL) ||
271      (Tsize < 0) || (Psize < 0) || (SAsize < 0)) { return -1; }
272   if((Tsize == 0) || (SAsize == 0)) { return 0; }
273   if(Psize == 0) { if(idx != NULL) { *idx = 0; } return SAsize; }
274 
275   for(i = j = k = 0, lmatch = rmatch = 0, size = SAsize, half = size >> 1;
276       0 < size;
277       size = half, half >>= 1) {
278     match = MIN(lmatch, rmatch);
279     r = _compare(T, Tsize, P, Psize, SA[i + half], &match);
280     if(r < 0) {
281       i += half + 1;
282       half -= (size & 1) ^ 1;
283       lmatch = match;
284     } else if(r > 0) {
285       rmatch = match;
286     } else {
287       lsize = half, j = i, rsize = size - half - 1, k = i + half + 1;
288 
289       /* left part */
290       for(llmatch = lmatch, lrmatch = match, half = lsize >> 1;
291           0 < lsize;
292           lsize = half, half >>= 1) {
293         lmatch = MIN(llmatch, lrmatch);
294         r = _compare(T, Tsize, P, Psize, SA[j + half], &lmatch);
295         if(r < 0) {
296           j += half + 1;
297           half -= (lsize & 1) ^ 1;
298           llmatch = lmatch;
299         } else {
300           lrmatch = lmatch;
301         }
302       }
303 
304       /* right part */
305       for(rlmatch = match, rrmatch = rmatch, half = rsize >> 1;
306           0 < rsize;
307           rsize = half, half >>= 1) {
308         rmatch = MIN(rlmatch, rrmatch);
309         r = _compare(T, Tsize, P, Psize, SA[k + half], &rmatch);
310         if(r <= 0) {
311           k += half + 1;
312           half -= (rsize & 1) ^ 1;
313           rlmatch = rmatch;
314         } else {
315           rrmatch = rmatch;
316         }
317       }
318 
319       break;
320     }
321   }
322 
323   if(idx != NULL) { *idx = (0 < (k - j)) ? j : i; }
324   return k - j;
325 }
326 
327 /* Search for the character c in the string T. */
328 saidx_t
329 sa_simplesearch(const sauchar_t *T, saidx_t Tsize,
330                 const saidx_t *SA, saidx_t SAsize,
331                 saint_t c, saidx_t *idx) {
332   saidx_t size, lsize, rsize, half;
333   saidx_t i, j, k, p;
334   saint_t r;
335 
336   if(idx != NULL) { *idx = -1; }
337   if((T == NULL) || (SA == NULL) || (Tsize < 0) || (SAsize < 0)) { return -1; }
338   if((Tsize == 0) || (SAsize == 0)) { return 0; }
339 
340   for(i = j = k = 0, size = SAsize, half = size >> 1;
341       0 < size;
342       size = half, half >>= 1) {
343     p = SA[i + half];
344     r = (p < Tsize) ? T[p] - c : -1;
345     if(r < 0) {
346       i += half + 1;
347       half -= (size & 1) ^ 1;
348     } else if(r == 0) {
349       lsize = half, j = i, rsize = size - half - 1, k = i + half + 1;
350 
351       /* left part */
352       for(half = lsize >> 1;
353           0 < lsize;
354           lsize = half, half >>= 1) {
355         p = SA[j + half];
356         r = (p < Tsize) ? T[p] - c : -1;
357         if(r < 0) {
358           j += half + 1;
359           half -= (lsize & 1) ^ 1;
360         }
361       }
362 
363       /* right part */
364       for(half = rsize >> 1;
365           0 < rsize;
366           rsize = half, half >>= 1) {
367         p = SA[k + half];
368         r = (p < Tsize) ? T[p] - c : -1;
369         if(r <= 0) {
370           k += half + 1;
371           half -= (rsize & 1) ^ 1;
372         }
373       }
374 
375       break;
376     }
377   }
378 
379   if(idx != NULL) { *idx = (0 < (k - j)) ? j : i; }
380   return k - j;
381 }
382