/*
 * utils.c for libdivsufsort
 * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
26*30b9430bSXin Li
27*30b9430bSXin Li #include "divsufsort_private.h"
28*30b9430bSXin Li
29*30b9430bSXin Li
/*- Private Function -*/
31*30b9430bSXin Li
32*30b9430bSXin Li /* Binary search for inverse bwt. */
33*30b9430bSXin Li static
34*30b9430bSXin Li saidx_t
binarysearch_lower(const saidx_t * A,saidx_t size,saidx_t value)35*30b9430bSXin Li binarysearch_lower(const saidx_t *A, saidx_t size, saidx_t value) {
36*30b9430bSXin Li saidx_t half, i;
37*30b9430bSXin Li for(i = 0, half = size >> 1;
38*30b9430bSXin Li 0 < size;
39*30b9430bSXin Li size = half, half >>= 1) {
40*30b9430bSXin Li if(A[i + half] < value) {
41*30b9430bSXin Li i += half + 1;
42*30b9430bSXin Li half -= (size & 1) ^ 1;
43*30b9430bSXin Li }
44*30b9430bSXin Li }
45*30b9430bSXin Li return i;
46*30b9430bSXin Li }
47*30b9430bSXin Li
48*30b9430bSXin Li
/*- Functions -*/
50*30b9430bSXin Li
51*30b9430bSXin Li /* Burrows-Wheeler transform. */
52*30b9430bSXin Li saint_t
bw_transform(const sauchar_t * T,sauchar_t * U,saidx_t * SA,saidx_t n,saidx_t * idx)53*30b9430bSXin Li bw_transform(const sauchar_t *T, sauchar_t *U, saidx_t *SA,
54*30b9430bSXin Li saidx_t n, saidx_t *idx) {
55*30b9430bSXin Li saidx_t *A, i, j, p, t;
56*30b9430bSXin Li saint_t c;
57*30b9430bSXin Li
58*30b9430bSXin Li /* Check arguments. */
59*30b9430bSXin Li if((T == NULL) || (U == NULL) || (n < 0) || (idx == NULL)) { return -1; }
60*30b9430bSXin Li if(n <= 1) {
61*30b9430bSXin Li if(n == 1) { U[0] = T[0]; }
62*30b9430bSXin Li *idx = n;
63*30b9430bSXin Li return 0;
64*30b9430bSXin Li }
65*30b9430bSXin Li
66*30b9430bSXin Li if((A = SA) == NULL) {
67*30b9430bSXin Li i = divbwt(T, U, NULL, n);
68*30b9430bSXin Li if(0 <= i) { *idx = i; i = 0; }
69*30b9430bSXin Li return (saint_t)i;
70*30b9430bSXin Li }
71*30b9430bSXin Li
72*30b9430bSXin Li /* BW transform. */
73*30b9430bSXin Li if(T == U) {
74*30b9430bSXin Li t = n;
75*30b9430bSXin Li for(i = 0, j = 0; i < n; ++i) {
76*30b9430bSXin Li p = t - 1;
77*30b9430bSXin Li t = A[i];
78*30b9430bSXin Li if(0 <= p) {
79*30b9430bSXin Li c = T[j];
80*30b9430bSXin Li U[j] = (j <= p) ? T[p] : (sauchar_t)A[p];
81*30b9430bSXin Li A[j] = c;
82*30b9430bSXin Li j++;
83*30b9430bSXin Li } else {
84*30b9430bSXin Li *idx = i;
85*30b9430bSXin Li }
86*30b9430bSXin Li }
87*30b9430bSXin Li p = t - 1;
88*30b9430bSXin Li if(0 <= p) {
89*30b9430bSXin Li c = T[j];
90*30b9430bSXin Li U[j] = (j <= p) ? T[p] : (sauchar_t)A[p];
91*30b9430bSXin Li A[j] = c;
92*30b9430bSXin Li } else {
93*30b9430bSXin Li *idx = i;
94*30b9430bSXin Li }
95*30b9430bSXin Li } else {
96*30b9430bSXin Li U[0] = T[n - 1];
97*30b9430bSXin Li for(i = 0; A[i] != 0; ++i) { U[i + 1] = T[A[i] - 1]; }
98*30b9430bSXin Li *idx = i + 1;
99*30b9430bSXin Li for(++i; i < n; ++i) { U[i] = T[A[i] - 1]; }
100*30b9430bSXin Li }
101*30b9430bSXin Li
102*30b9430bSXin Li if(SA == NULL) {
103*30b9430bSXin Li /* Deallocate memory. */
104*30b9430bSXin Li free(A);
105*30b9430bSXin Li }
106*30b9430bSXin Li
107*30b9430bSXin Li return 0;
108*30b9430bSXin Li }
109*30b9430bSXin Li
110*30b9430bSXin Li /* Inverse Burrows-Wheeler transform. */
111*30b9430bSXin Li saint_t
inverse_bw_transform(const sauchar_t * T,sauchar_t * U,saidx_t * A,saidx_t n,saidx_t idx)112*30b9430bSXin Li inverse_bw_transform(const sauchar_t *T, sauchar_t *U, saidx_t *A,
113*30b9430bSXin Li saidx_t n, saidx_t idx) {
114*30b9430bSXin Li saidx_t C[ALPHABET_SIZE];
115*30b9430bSXin Li sauchar_t D[ALPHABET_SIZE];
116*30b9430bSXin Li saidx_t *B;
117*30b9430bSXin Li saidx_t i, p;
118*30b9430bSXin Li saint_t c, d;
119*30b9430bSXin Li
120*30b9430bSXin Li /* Check arguments. */
121*30b9430bSXin Li if((T == NULL) || (U == NULL) || (n < 0) || (idx < 0) ||
122*30b9430bSXin Li (n < idx) || ((0 < n) && (idx == 0))) {
123*30b9430bSXin Li return -1;
124*30b9430bSXin Li }
125*30b9430bSXin Li if(n <= 1) { return 0; }
126*30b9430bSXin Li
127*30b9430bSXin Li if((B = A) == NULL) {
128*30b9430bSXin Li /* Allocate n*sizeof(saidx_t) bytes of memory. */
129*30b9430bSXin Li if((B = (saidx_t *)malloc((size_t)n * sizeof(saidx_t))) == NULL) { return -2; }
130*30b9430bSXin Li }
131*30b9430bSXin Li
132*30b9430bSXin Li /* Inverse BW transform. */
133*30b9430bSXin Li for(c = 0; c < ALPHABET_SIZE; ++c) { C[c] = 0; }
134*30b9430bSXin Li for(i = 0; i < n; ++i) { ++C[T[i]]; }
135*30b9430bSXin Li for(c = 0, d = 0, i = 0; c < ALPHABET_SIZE; ++c) {
136*30b9430bSXin Li p = C[c];
137*30b9430bSXin Li if(0 < p) {
138*30b9430bSXin Li C[c] = i;
139*30b9430bSXin Li D[d++] = (sauchar_t)c;
140*30b9430bSXin Li i += p;
141*30b9430bSXin Li }
142*30b9430bSXin Li }
143*30b9430bSXin Li for(i = 0; i < idx; ++i) { B[C[T[i]]++] = i; }
144*30b9430bSXin Li for( ; i < n; ++i) { B[C[T[i]]++] = i + 1; }
145*30b9430bSXin Li for(c = 0; c < d; ++c) { C[c] = C[D[c]]; }
146*30b9430bSXin Li for(i = 0, p = idx; i < n; ++i) {
147*30b9430bSXin Li U[i] = D[binarysearch_lower(C, d, p)];
148*30b9430bSXin Li p = B[p - 1];
149*30b9430bSXin Li }
150*30b9430bSXin Li
151*30b9430bSXin Li if(A == NULL) {
152*30b9430bSXin Li /* Deallocate memory. */
153*30b9430bSXin Li free(B);
154*30b9430bSXin Li }
155*30b9430bSXin Li
156*30b9430bSXin Li return 0;
157*30b9430bSXin Li }
158*30b9430bSXin Li
159*30b9430bSXin Li /* Checks the suffix array SA of the string T. */
160*30b9430bSXin Li saint_t
sufcheck(const sauchar_t * T,const saidx_t * SA,saidx_t n,saint_t verbose)161*30b9430bSXin Li sufcheck(const sauchar_t *T, const saidx_t *SA,
162*30b9430bSXin Li saidx_t n, saint_t verbose) {
163*30b9430bSXin Li saidx_t C[ALPHABET_SIZE];
164*30b9430bSXin Li saidx_t i, p, q, t;
165*30b9430bSXin Li saint_t c;
166*30b9430bSXin Li
167*30b9430bSXin Li if(verbose) { fprintf(stderr, "sufcheck: "); }
168*30b9430bSXin Li
169*30b9430bSXin Li /* Check arguments. */
170*30b9430bSXin Li if((T == NULL) || (SA == NULL) || (n < 0)) {
171*30b9430bSXin Li if(verbose) { fprintf(stderr, "Invalid arguments.\n"); }
172*30b9430bSXin Li return -1;
173*30b9430bSXin Li }
174*30b9430bSXin Li if(n == 0) {
175*30b9430bSXin Li if(verbose) { fprintf(stderr, "Done.\n"); }
176*30b9430bSXin Li return 0;
177*30b9430bSXin Li }
178*30b9430bSXin Li
179*30b9430bSXin Li /* check range: [0..n-1] */
180*30b9430bSXin Li for(i = 0; i < n; ++i) {
181*30b9430bSXin Li if((SA[i] < 0) || (n <= SA[i])) {
182*30b9430bSXin Li if(verbose) {
183*30b9430bSXin Li fprintf(stderr, "Out of the range [0,%" PRIdSAIDX_T "].\n"
184*30b9430bSXin Li " SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T "\n",
185*30b9430bSXin Li n - 1, i, SA[i]);
186*30b9430bSXin Li }
187*30b9430bSXin Li return -2;
188*30b9430bSXin Li }
189*30b9430bSXin Li }
190*30b9430bSXin Li
191*30b9430bSXin Li /* check first characters. */
192*30b9430bSXin Li for(i = 1; i < n; ++i) {
193*30b9430bSXin Li if(T[SA[i - 1]] > T[SA[i]]) {
194*30b9430bSXin Li if(verbose) {
195*30b9430bSXin Li fprintf(stderr, "Suffixes in wrong order.\n"
196*30b9430bSXin Li " T[SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T "]=%d"
197*30b9430bSXin Li " > T[SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T "]=%d\n",
198*30b9430bSXin Li i - 1, SA[i - 1], T[SA[i - 1]], i, SA[i], T[SA[i]]);
199*30b9430bSXin Li }
200*30b9430bSXin Li return -3;
201*30b9430bSXin Li }
202*30b9430bSXin Li }
203*30b9430bSXin Li
204*30b9430bSXin Li /* check suffixes. */
205*30b9430bSXin Li for(i = 0; i < ALPHABET_SIZE; ++i) { C[i] = 0; }
206*30b9430bSXin Li for(i = 0; i < n; ++i) { ++C[T[i]]; }
207*30b9430bSXin Li for(i = 0, p = 0; i < ALPHABET_SIZE; ++i) {
208*30b9430bSXin Li t = C[i];
209*30b9430bSXin Li C[i] = p;
210*30b9430bSXin Li p += t;
211*30b9430bSXin Li }
212*30b9430bSXin Li
213*30b9430bSXin Li q = C[T[n - 1]];
214*30b9430bSXin Li C[T[n - 1]] += 1;
215*30b9430bSXin Li for(i = 0; i < n; ++i) {
216*30b9430bSXin Li p = SA[i];
217*30b9430bSXin Li if(0 < p) {
218*30b9430bSXin Li c = T[--p];
219*30b9430bSXin Li t = C[c];
220*30b9430bSXin Li } else {
221*30b9430bSXin Li c = T[p = n - 1];
222*30b9430bSXin Li t = q;
223*30b9430bSXin Li }
224*30b9430bSXin Li if((t < 0) || (p != SA[t])) {
225*30b9430bSXin Li if(verbose) {
226*30b9430bSXin Li fprintf(stderr, "Suffix in wrong position.\n"
227*30b9430bSXin Li " SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T " or\n"
228*30b9430bSXin Li " SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T "\n",
229*30b9430bSXin Li t, (0 <= t) ? SA[t] : -1, i, SA[i]);
230*30b9430bSXin Li }
231*30b9430bSXin Li return -4;
232*30b9430bSXin Li }
233*30b9430bSXin Li if(t != q) {
234*30b9430bSXin Li ++C[c];
235*30b9430bSXin Li if((n <= C[c]) || (T[SA[C[c]]] != c)) { C[c] = -1; }
236*30b9430bSXin Li }
237*30b9430bSXin Li }
238*30b9430bSXin Li
239*30b9430bSXin Li if(1 <= verbose) { fprintf(stderr, "Done.\n"); }
240*30b9430bSXin Li return 0;
241*30b9430bSXin Li }
242*30b9430bSXin Li
243*30b9430bSXin Li
244*30b9430bSXin Li static
245*30b9430bSXin Li int
_compare(const sauchar_t * T,saidx_t Tsize,const sauchar_t * P,saidx_t Psize,saidx_t suf,saidx_t * match)246*30b9430bSXin Li _compare(const sauchar_t *T, saidx_t Tsize,
247*30b9430bSXin Li const sauchar_t *P, saidx_t Psize,
248*30b9430bSXin Li saidx_t suf, saidx_t *match) {
249*30b9430bSXin Li saidx_t i, j;
250*30b9430bSXin Li saint_t r;
251*30b9430bSXin Li for(i = suf + *match, j = *match, r = 0;
252*30b9430bSXin Li (i < Tsize) && (j < Psize) && ((r = T[i] - P[j]) == 0); ++i, ++j) { }
253*30b9430bSXin Li *match = j;
254*30b9430bSXin Li return (r == 0) ? -(j != Psize) : r;
255*30b9430bSXin Li }
256*30b9430bSXin Li
257*30b9430bSXin Li /* Search for the pattern P in the string T. */
258*30b9430bSXin Li saidx_t
sa_search(const sauchar_t * T,saidx_t Tsize,const sauchar_t * P,saidx_t Psize,const saidx_t * SA,saidx_t SAsize,saidx_t * idx)259*30b9430bSXin Li sa_search(const sauchar_t *T, saidx_t Tsize,
260*30b9430bSXin Li const sauchar_t *P, saidx_t Psize,
261*30b9430bSXin Li const saidx_t *SA, saidx_t SAsize,
262*30b9430bSXin Li saidx_t *idx) {
263*30b9430bSXin Li saidx_t size, lsize, rsize, half;
264*30b9430bSXin Li saidx_t match, lmatch, rmatch;
265*30b9430bSXin Li saidx_t llmatch, lrmatch, rlmatch, rrmatch;
266*30b9430bSXin Li saidx_t i, j, k;
267*30b9430bSXin Li saint_t r;
268*30b9430bSXin Li
269*30b9430bSXin Li if(idx != NULL) { *idx = -1; }
270*30b9430bSXin Li if((T == NULL) || (P == NULL) || (SA == NULL) ||
271*30b9430bSXin Li (Tsize < 0) || (Psize < 0) || (SAsize < 0)) { return -1; }
272*30b9430bSXin Li if((Tsize == 0) || (SAsize == 0)) { return 0; }
273*30b9430bSXin Li if(Psize == 0) { if(idx != NULL) { *idx = 0; } return SAsize; }
274*30b9430bSXin Li
275*30b9430bSXin Li for(i = j = k = 0, lmatch = rmatch = 0, size = SAsize, half = size >> 1;
276*30b9430bSXin Li 0 < size;
277*30b9430bSXin Li size = half, half >>= 1) {
278*30b9430bSXin Li match = MIN(lmatch, rmatch);
279*30b9430bSXin Li r = _compare(T, Tsize, P, Psize, SA[i + half], &match);
280*30b9430bSXin Li if(r < 0) {
281*30b9430bSXin Li i += half + 1;
282*30b9430bSXin Li half -= (size & 1) ^ 1;
283*30b9430bSXin Li lmatch = match;
284*30b9430bSXin Li } else if(r > 0) {
285*30b9430bSXin Li rmatch = match;
286*30b9430bSXin Li } else {
287*30b9430bSXin Li lsize = half, j = i, rsize = size - half - 1, k = i + half + 1;
288*30b9430bSXin Li
289*30b9430bSXin Li /* left part */
290*30b9430bSXin Li for(llmatch = lmatch, lrmatch = match, half = lsize >> 1;
291*30b9430bSXin Li 0 < lsize;
292*30b9430bSXin Li lsize = half, half >>= 1) {
293*30b9430bSXin Li lmatch = MIN(llmatch, lrmatch);
294*30b9430bSXin Li r = _compare(T, Tsize, P, Psize, SA[j + half], &lmatch);
295*30b9430bSXin Li if(r < 0) {
296*30b9430bSXin Li j += half + 1;
297*30b9430bSXin Li half -= (lsize & 1) ^ 1;
298*30b9430bSXin Li llmatch = lmatch;
299*30b9430bSXin Li } else {
300*30b9430bSXin Li lrmatch = lmatch;
301*30b9430bSXin Li }
302*30b9430bSXin Li }
303*30b9430bSXin Li
304*30b9430bSXin Li /* right part */
305*30b9430bSXin Li for(rlmatch = match, rrmatch = rmatch, half = rsize >> 1;
306*30b9430bSXin Li 0 < rsize;
307*30b9430bSXin Li rsize = half, half >>= 1) {
308*30b9430bSXin Li rmatch = MIN(rlmatch, rrmatch);
309*30b9430bSXin Li r = _compare(T, Tsize, P, Psize, SA[k + half], &rmatch);
310*30b9430bSXin Li if(r <= 0) {
311*30b9430bSXin Li k += half + 1;
312*30b9430bSXin Li half -= (rsize & 1) ^ 1;
313*30b9430bSXin Li rlmatch = rmatch;
314*30b9430bSXin Li } else {
315*30b9430bSXin Li rrmatch = rmatch;
316*30b9430bSXin Li }
317*30b9430bSXin Li }
318*30b9430bSXin Li
319*30b9430bSXin Li break;
320*30b9430bSXin Li }
321*30b9430bSXin Li }
322*30b9430bSXin Li
323*30b9430bSXin Li if(idx != NULL) { *idx = (0 < (k - j)) ? j : i; }
324*30b9430bSXin Li return k - j;
325*30b9430bSXin Li }
326*30b9430bSXin Li
327*30b9430bSXin Li /* Search for the character c in the string T. */
328*30b9430bSXin Li saidx_t
sa_simplesearch(const sauchar_t * T,saidx_t Tsize,const saidx_t * SA,saidx_t SAsize,saint_t c,saidx_t * idx)329*30b9430bSXin Li sa_simplesearch(const sauchar_t *T, saidx_t Tsize,
330*30b9430bSXin Li const saidx_t *SA, saidx_t SAsize,
331*30b9430bSXin Li saint_t c, saidx_t *idx) {
332*30b9430bSXin Li saidx_t size, lsize, rsize, half;
333*30b9430bSXin Li saidx_t i, j, k, p;
334*30b9430bSXin Li saint_t r;
335*30b9430bSXin Li
336*30b9430bSXin Li if(idx != NULL) { *idx = -1; }
337*30b9430bSXin Li if((T == NULL) || (SA == NULL) || (Tsize < 0) || (SAsize < 0)) { return -1; }
338*30b9430bSXin Li if((Tsize == 0) || (SAsize == 0)) { return 0; }
339*30b9430bSXin Li
340*30b9430bSXin Li for(i = j = k = 0, size = SAsize, half = size >> 1;
341*30b9430bSXin Li 0 < size;
342*30b9430bSXin Li size = half, half >>= 1) {
343*30b9430bSXin Li p = SA[i + half];
344*30b9430bSXin Li r = (p < Tsize) ? T[p] - c : -1;
345*30b9430bSXin Li if(r < 0) {
346*30b9430bSXin Li i += half + 1;
347*30b9430bSXin Li half -= (size & 1) ^ 1;
348*30b9430bSXin Li } else if(r == 0) {
349*30b9430bSXin Li lsize = half, j = i, rsize = size - half - 1, k = i + half + 1;
350*30b9430bSXin Li
351*30b9430bSXin Li /* left part */
352*30b9430bSXin Li for(half = lsize >> 1;
353*30b9430bSXin Li 0 < lsize;
354*30b9430bSXin Li lsize = half, half >>= 1) {
355*30b9430bSXin Li p = SA[j + half];
356*30b9430bSXin Li r = (p < Tsize) ? T[p] - c : -1;
357*30b9430bSXin Li if(r < 0) {
358*30b9430bSXin Li j += half + 1;
359*30b9430bSXin Li half -= (lsize & 1) ^ 1;
360*30b9430bSXin Li }
361*30b9430bSXin Li }
362*30b9430bSXin Li
363*30b9430bSXin Li /* right part */
364*30b9430bSXin Li for(half = rsize >> 1;
365*30b9430bSXin Li 0 < rsize;
366*30b9430bSXin Li rsize = half, half >>= 1) {
367*30b9430bSXin Li p = SA[k + half];
368*30b9430bSXin Li r = (p < Tsize) ? T[p] - c : -1;
369*30b9430bSXin Li if(r <= 0) {
370*30b9430bSXin Li k += half + 1;
371*30b9430bSXin Li half -= (rsize & 1) ^ 1;
372*30b9430bSXin Li }
373*30b9430bSXin Li }
374*30b9430bSXin Li
375*30b9430bSXin Li break;
376*30b9430bSXin Li }
377*30b9430bSXin Li }
378*30b9430bSXin Li
379*30b9430bSXin Li if(idx != NULL) { *idx = (0 < (k - j)) ? j : i; }
380*30b9430bSXin Li return k - j;
381*30b9430bSXin Li }
382