xref: /freebsd/contrib/flex/src/ccl.c (revision b077aed33b7b6aefca7b17ddb250cf521f938613)
1 /* ccl - routines for character classes */
2 
3 /*  Copyright (c) 1990 The Regents of the University of California. */
4 /*  All rights reserved. */
5 
6 /*  This code is derived from software contributed to Berkeley by */
7 /*  Vern Paxson. */
8 
9 /*  The United States Government has rights in this work pursuant */
10 /*  to contract no. DE-AC03-76SF00098 between the United States */
11  /*  Department of Energy and the University of California. */
12 
13 /*  This file is part of flex. */
14 
15 /*  Redistribution and use in source and binary forms, with or without */
16 /*  modification, are permitted provided that the following conditions */
17 /*  are met: */
18 
19 /*  1. Redistributions of source code must retain the above copyright */
20 /*     notice, this list of conditions and the following disclaimer. */
21 /*  2. Redistributions in binary form must reproduce the above copyright */
22 /*     notice, this list of conditions and the following disclaimer in the */
23 /*     documentation and/or other materials provided with the distribution. */
24 
25 /*  Neither the name of the University nor the names of its contributors */
26 /*  may be used to endorse or promote products derived from this software */
27 /*  without specific prior written permission. */
28 
29 /*  THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR */
30 /*  IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED */
31 /*  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR */
32 /*  PURPOSE. */
33 
34 #include "flexdef.h"
35 
36 /* return true if the chr is in the ccl. Takes negation into account. */
37 static bool
38 ccl_contains (const int cclp, const int ch)
39 {
40 	int     ind, len, i;
41 
42 	len = ccllen[cclp];
43 	ind = cclmap[cclp];
44 
45 	for (i = 0; i < len; ++i)
46 		if (ccltbl[ind + i] == ch)
47 			return !cclng[cclp];
48 
49     return cclng[cclp];
50 }
51 
52 
53 /* ccladd - add a single character to a ccl */
54 
55 void ccladd (int cclp, int ch)
56 {
57 	int     ind, len, newpos, i;
58 
59 	check_char (ch);
60 
61 	len = ccllen[cclp];
62 	ind = cclmap[cclp];
63 
64 	/* check to see if the character is already in the ccl */
65 
66 	for (i = 0; i < len; ++i)
67 		if (ccltbl[ind + i] == ch)
68 			return;
69 
70 	/* mark newlines */
71 	if (ch == nlch)
72 		ccl_has_nl[cclp] = true;
73 
74 	newpos = ind + len;
75 
76 	if (newpos >= current_max_ccl_tbl_size) {
77 		current_max_ccl_tbl_size += MAX_CCL_TBL_SIZE_INCREMENT;
78 
79 		++num_reallocs;
80 
81 		ccltbl = reallocate_Character_array (ccltbl,
82 						     current_max_ccl_tbl_size);
83 	}
84 
85 	ccllen[cclp] = len + 1;
86 	ccltbl[newpos] = (unsigned char) ch;
87 }
88 
89 /* dump_cclp - same thing as list_character_set, but for cclps.  */
90 
91 static void    dump_cclp (FILE* file, int cclp)
92 {
93 	int i;
94 
95 	putc ('[', file);
96 
97 	for (i = 0; i < csize; ++i) {
98 		if (ccl_contains(cclp, i)){
99 			int start_char = i;
100 
101 			putc (' ', file);
102 
103 			fputs (readable_form (i), file);
104 
105 			while (++i < csize && ccl_contains(cclp,i)) ;
106 
107 			if (i - 1 > start_char)
108 				/* this was a run */
109 				fprintf (file, "-%s",
110 					 readable_form (i - 1));
111 
112 			putc (' ', file);
113 		}
114 	}
115 
116 	putc (']', file);
117 }
118 
119 
120 
121 /* ccl_set_diff - create a new ccl as the set difference of the two given ccls. */
122 int
123 ccl_set_diff (int a, int b)
124 {
125     int  d, ch;
126 
127     /* create new class  */
128     d = cclinit();
129 
130     /* In order to handle negation, we spin through all possible chars,
131      * addding each char in a that is not in b.
132      * (This could be O(n^2), but n is small and bounded.)
133      */
134 	for ( ch = 0; ch < csize; ++ch )
135         if (ccl_contains (a, ch) && !ccl_contains(b, ch))
136             ccladd (d, ch);
137 
138     /* debug */
139     if (0){
140         fprintf(stderr, "ccl_set_diff (");
141             fprintf(stderr, "\n    ");
142             dump_cclp (stderr, a);
143             fprintf(stderr, "\n    ");
144             dump_cclp (stderr, b);
145             fprintf(stderr, "\n    ");
146             dump_cclp (stderr, d);
147         fprintf(stderr, "\n)\n");
148     }
149     return d;
150 }
151 
152 /* ccl_set_union - create a new ccl as the set union of the two given ccls. */
153 int
154 ccl_set_union (int a, int b)
155 {
156     int  d, i;
157 
158     /* create new class  */
159     d = cclinit();
160 
161     /* Add all of a */
162     for (i = 0; i < ccllen[a]; ++i)
163 		ccladd (d, ccltbl[cclmap[a] + i]);
164 
165     /* Add all of b */
166     for (i = 0; i < ccllen[b]; ++i)
167 		ccladd (d, ccltbl[cclmap[b] + i]);
168 
169     /* debug */
170     if (0){
171         fprintf(stderr, "ccl_set_union (%d + %d = %d", a, b, d);
172             fprintf(stderr, "\n    ");
173             dump_cclp (stderr, a);
174             fprintf(stderr, "\n    ");
175             dump_cclp (stderr, b);
176             fprintf(stderr, "\n    ");
177             dump_cclp (stderr, d);
178         fprintf(stderr, "\n)\n");
179     }
180     return d;
181 }
182 
183 
184 /* cclinit - return an empty ccl */
185 
186 int     cclinit (void)
187 {
188 	if (++lastccl >= current_maxccls) {
189 		current_maxccls += MAX_CCLS_INCREMENT;
190 
191 		++num_reallocs;
192 
193 		cclmap =
194 			reallocate_integer_array (cclmap, current_maxccls);
195 		ccllen =
196 			reallocate_integer_array (ccllen, current_maxccls);
197 		cclng = reallocate_integer_array (cclng, current_maxccls);
198 		ccl_has_nl =
199 			reallocate_bool_array (ccl_has_nl,
200 					       current_maxccls);
201 	}
202 
203 	if (lastccl == 1)
204 		/* we're making the first ccl */
205 		cclmap[lastccl] = 0;
206 
207 	else
208 		/* The new pointer is just past the end of the last ccl.
209 		 * Since the cclmap points to the \first/ character of a
210 		 * ccl, adding the length of the ccl to the cclmap pointer
211 		 * will produce a cursor to the first free space.
212 		 */
213 		cclmap[lastccl] =
214 			cclmap[lastccl - 1] + ccllen[lastccl - 1];
215 
216 	ccllen[lastccl] = 0;
217 	cclng[lastccl] = 0;	/* ccl's start out life un-negated */
218 	ccl_has_nl[lastccl] = false;
219 
220 	return lastccl;
221 }
222 
223 
224 /* cclnegate - negate the given ccl */
225 
226 void    cclnegate (int cclp)
227 {
228 	cclng[cclp] = 1;
229 	ccl_has_nl[cclp] = !ccl_has_nl[cclp];
230 }
231 
232 
233 /* list_character_set - list the members of a set of characters in CCL form
234  *
235  * Writes to the given file a character-class representation of those
236  * characters present in the given CCL.  A character is present if it
237  * has a non-zero value in the cset array.
238  */
239 
240 void    list_character_set (FILE *file, int cset[])
241 {
242 	int i;
243 
244 	putc ('[', file);
245 
246 	for (i = 0; i < csize; ++i) {
247 		if (cset[i]) {
248 			int start_char = i;
249 
250 			putc (' ', file);
251 
252 			fputs (readable_form (i), file);
253 
254 			while (++i < csize && cset[i]) ;
255 
256 			if (i - 1 > start_char)
257 				/* this was a run */
258 				fprintf (file, "-%s",
259 					 readable_form (i - 1));
260 
261 			putc (' ', file);
262 		}
263 	}
264 
265 	putc (']', file);
266 }
267 
268 /** Determines if the range [c1-c2] is unambiguous in a case-insensitive
269  * scanner.  Specifically, if a lowercase or uppercase character, x, is in the
270  * range [c1-c2], then we require that UPPERCASE(x) and LOWERCASE(x) must also
271  * be in the range. If not, then this range is ambiguous, and the function
272  * returns false.  For example, [@-_] spans [a-z] but not [A-Z].  Beware that
273  * [a-z] will be labeled ambiguous because it does not include [A-Z].
274  *
275  * @param c1 the lower end of the range
276  * @param c2 the upper end of the range
277  * @return true if [c1-c2] is not ambiguous for a caseless scanner.
278  */
279 bool range_covers_case (int c1, int c2)
280 {
281 	int     i, o;
282 
283 	for (i = c1; i <= c2; i++) {
284 		if (has_case (i)) {
285 			o = reverse_case (i);
286 			if (o < c1 || c2 < o)
287 				return false;
288 		}
289 	}
290 	return true;
291 }
292 
293 /** Reverse the case of a character, if possible.
294  * @return c if case-reversal does not apply.
295  */
296 int reverse_case (int c)
297 {
298 	return isupper (c) ? tolower (c) : (islower (c) ? toupper (c) : c);
299 }
300 
301 /** Return true if c is uppercase or lowercase. */
302 bool has_case (int c)
303 {
304 	return (isupper (c) || islower (c)) ? true : false;
305 }
306