1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1991, 1993 5 * The Regents of the University of California. All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 3. Neither the name of the University nor the names of its contributors 16 * may be used to endorse or promote products derived from this software 17 * without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 30 */ 31 32 33 34 #include <sys/types.h> 35 36 #include <ctype.h> 37 #include <err.h> 38 #include <errno.h> 39 #include <stddef.h> 40 #include <stdio.h> 41 #include <stdlib.h> 42 #include <string.h> 43 #include <wchar.h> 44 #include <wctype.h> 45 46 #include "extern.h" 47 48 static int backslash(STR *, int *); 49 static int bracket(STR *); 50 static void genclass(STR *); 51 static void genequiv(STR *); 52 static int genrange(STR *, int); 53 static void genseq(STR *); 54 55 wint_t 56 next(STR *s) 57 { 58 int is_octal; 59 wint_t ch; 60 wchar_t wch; 61 size_t clen; 62 63 switch (s->state) { 64 case EOS: 65 return (0); 66 case INFINITE: 67 return (1); 68 case NORMAL: 69 switch (*s->str) { 70 case '\0': 71 s->state = EOS; 72 return (0); 73 case '\\': 74 s->lastch = backslash(s, &is_octal); 75 break; 76 case '[': 77 if (bracket(s)) 78 return (next(s)); 79 /* FALLTHROUGH */ 80 default: 81 clen = mbrtowc(&wch, s->str, MB_LEN_MAX, NULL); 82 if (clen == (size_t)-1 || clen == (size_t)-2 || 83 clen == 0) 84 errc(1, EILSEQ, NULL); 85 is_octal = 0; 86 s->lastch = wch; 87 s->str += clen; 88 break; 89 } 90 91 /* We can start a range at any time. */ 92 if (s->str[0] == '-' && genrange(s, is_octal)) 93 return (next(s)); 94 return (1); 95 case RANGE: 96 if (s->cnt-- == 0) { 97 s->state = NORMAL; 98 return (next(s)); 99 } 100 ++s->lastch; 101 return (1); 102 case SEQUENCE: 103 if (s->cnt-- == 0) { 104 s->state = NORMAL; 105 return (next(s)); 106 } 107 return (1); 108 case CCLASS: 109 case CCLASS_UPPER: 110 case CCLASS_LOWER: 111 s->cnt++; 112 ch = nextwctype(s->lastch, s->cclass); 113 if (ch == -1) { 114 s->state = NORMAL; 115 return (next(s)); 116 } 117 s->lastch = ch; 118 return (1); 119 case SET: 120 if ((ch = s->set[s->cnt++]) == OOBCH) { 121 s->state = NORMAL; 122 return (next(s)); 123 } 124 s->lastch = ch; 125 return (1); 126 default: 127 return (0); 128 } 129 /* NOTREACHED */ 130 } 131 132 static int 133 bracket(STR *s) 134 { 135 char *p; 136 137 switch (s->str[1]) { 138 case ':': /* "[:class:]" */ 139 if ((p = strchr(s->str + 2, ']')) == NULL) 140 return (0); 141 if (*(p - 1) != ':' || p - s->str < 4) 142 goto repeat; 143 *(p - 1) = '\0'; 144 s->str += 2; 145 genclass(s); 146 s->str = p + 1; 147 return (1); 148 case '=': /* "[=equiv=]" */ 149 if (s->str[2] == '\0' || (p = strchr(s->str + 3, ']')) == NULL) 150 return (0); 151 if (*(p - 1) != '=' || p - s->str < 4) 152 goto repeat; 153 s->str += 2; 154 genequiv(s); 155 return (1); 156 default: /* "[\###*n]" or "[#*n]" */ 157 repeat: 158 if ((p = strpbrk(s->str + 2, "*]")) == NULL) 159 return (0); 160 if (p[0] != '*' || strchr(p, ']') == NULL) 161 return (0); 162 s->str += 1; 163 genseq(s); 164 return (1); 165 } 166 /* NOTREACHED */ 167 } 168 169 static void 170 genclass(STR *s) 171 { 172 173 if ((s->cclass = wctype(s->str)) == 0) 174 errx(1, "unknown class %s", s->str); 175 s->cnt = 0; 176 s->lastch = -1; /* incremented before check in next() */ 177 if (strcmp(s->str, "upper") == 0) 178 s->state = CCLASS_UPPER; 179 else if (strcmp(s->str, "lower") == 0) 180 s->state = CCLASS_LOWER; 181 else 182 s->state = CCLASS; 183 } 184 185 static void 186 genequiv(STR *s) 187 { 188 int i, p, pri; 189 char src[2], dst[3]; 190 size_t clen; 191 wchar_t wc; 192 193 if (*s->str == '\\') { 194 s->equiv[0] = backslash(s, NULL); 195 if (*s->str != '=') 196 errx(1, "misplaced equivalence equals sign"); 197 s->str += 2; 198 } else { 199 clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL); 200 if (clen == (size_t)-1 || clen == (size_t)-2 || clen == 0) 201 errc(1, EILSEQ, NULL); 202 s->equiv[0] = wc; 203 if (s->str[clen] != '=') 204 errx(1, "misplaced equivalence equals sign"); 205 s->str += clen + 2; 206 } 207 208 /* 209 * Calculate the set of all characters in the same equivalence class 210 * as the specified character (they will have the same primary 211 * collation weights). 212 * XXX Knows too much about how strxfrm() is implemented. Assumes 213 * it fills the string with primary collation weight bytes. Only one- 214 * to-one mappings are supported. 215 * XXX Equivalence classes not supported in multibyte locales. 216 */ 217 src[0] = (char)s->equiv[0]; 218 src[1] = '\0'; 219 if (MB_CUR_MAX == 1 && strxfrm(dst, src, sizeof(dst)) == 1) { 220 pri = (unsigned char)*dst; 221 for (p = 1, i = 1; i < NCHARS_SB; i++) { 222 *src = i; 223 if (strxfrm(dst, src, sizeof(dst)) == 1 && pri && 224 pri == (unsigned char)*dst) 225 s->equiv[p++] = i; 226 } 227 s->equiv[p] = OOBCH; 228 } 229 230 s->cnt = 0; 231 s->state = SET; 232 s->set = s->equiv; 233 } 234 235 static int 236 genrange(STR *s, int was_octal) 237 { 238 int stopval, octal; 239 char *savestart; 240 int n, cnt, *p; 241 size_t clen; 242 wchar_t wc; 243 244 octal = 0; 245 savestart = s->str; 246 if (*++s->str == '\\') 247 stopval = backslash(s, &octal); 248 else { 249 clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL); 250 if (clen == (size_t)-1 || clen == (size_t)-2) 251 errc(1, EILSEQ, NULL); 252 stopval = wc; 253 s->str += clen; 254 } 255 /* 256 * XXX Characters are not ordered according to collating sequence in 257 * multibyte locales. 258 */ 259 if (octal || was_octal || MB_CUR_MAX > 1) { 260 if (stopval < s->lastch) { 261 s->str = savestart; 262 return (0); 263 } 264 s->cnt = stopval - s->lastch + 1; 265 s->state = RANGE; 266 --s->lastch; 267 return (1); 268 } 269 if (charcoll((const void *)&stopval, (const void *)&(s->lastch)) < 0) { 270 s->str = savestart; 271 return (0); 272 } 273 if ((s->set = p = malloc((NCHARS_SB + 1) * sizeof(int))) == NULL) 274 err(1, "genrange() malloc"); 275 for (cnt = 0; cnt < NCHARS_SB; cnt++) 276 if (charcoll((const void *)&cnt, (const void *)&(s->lastch)) >= 0 && 277 charcoll((const void *)&cnt, (const void *)&stopval) <= 0) 278 *p++ = cnt; 279 *p = OOBCH; 280 n = p - s->set; 281 282 s->cnt = 0; 283 s->state = SET; 284 if (n > 1) 285 mergesort(s->set, n, sizeof(*(s->set)), charcoll); 286 return (1); 287 } 288 289 static void 290 genseq(STR *s) 291 { 292 char *ep; 293 wchar_t wc; 294 size_t clen; 295 296 if (s->which == STRING1) 297 errx(1, "sequences only valid in string2"); 298 299 if (*s->str == '\\') 300 s->lastch = backslash(s, NULL); 301 else { 302 clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL); 303 if (clen == (size_t)-1 || clen == (size_t)-2) 304 errc(1, EILSEQ, NULL); 305 s->lastch = wc; 306 s->str += clen; 307 } 308 if (*s->str != '*') 309 errx(1, "misplaced sequence asterisk"); 310 311 switch (*++s->str) { 312 case '\\': 313 s->cnt = backslash(s, NULL); 314 break; 315 case ']': 316 s->cnt = 0; 317 ++s->str; 318 break; 319 default: 320 if (isdigit((u_char)*s->str)) { 321 s->cnt = strtol(s->str, &ep, 0); 322 if (*ep == ']') { 323 s->str = ep + 1; 324 break; 325 } 326 } 327 errx(1, "illegal sequence count"); 328 /* NOTREACHED */ 329 } 330 331 s->state = s->cnt ? SEQUENCE : INFINITE; 332 } 333 334 /* 335 * Translate \??? into a character. Up to 3 octal digits, if no digits either 336 * an escape code or a literal character. 337 */ 338 static int 339 backslash(STR *s, int *is_octal) 340 { 341 int ch, cnt, val; 342 343 if (is_octal != NULL) 344 *is_octal = 0; 345 for (cnt = val = 0;;) { 346 ch = (u_char)*++s->str; 347 if (!isdigit(ch) || ch > '7') 348 break; 349 val = val * 8 + ch - '0'; 350 if (++cnt == 3) { 351 ++s->str; 352 break; 353 } 354 } 355 if (cnt) { 356 if (is_octal != NULL) 357 *is_octal = 1; 358 return (val); 359 } 360 if (ch != '\0') 361 ++s->str; 362 switch (ch) { 363 case 'a': /* escape characters */ 364 return ('\7'); 365 case 'b': 366 return ('\b'); 367 case 'f': 368 return ('\f'); 369 case 'n': 370 return ('\n'); 371 case 'r': 372 return ('\r'); 373 case 't': 374 return ('\t'); 375 case 'v': 376 return ('\13'); 377 case '\0': /* \" -> \ */ 378 s->state = EOS; 379 return ('\\'); 380 default: /* \x" -> x */ 381 return (ch); 382 } 383 } 384