1 /*- 2 * Copyright (c) 1991, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 4. Neither the name of the University nor the names of its contributors 14 * may be used to endorse or promote products derived from this software 15 * without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30 #include <sys/cdefs.h> 31 32 __FBSDID("$FreeBSD$"); 33 34 #ifndef lint 35 static const char sccsid[] = "@(#)str.c 8.2 (Berkeley) 4/28/95"; 36 #endif 37 38 #include <sys/types.h> 39 40 #include <ctype.h> 41 #include <err.h> 42 #include <errno.h> 43 #include <stddef.h> 44 #include <stdio.h> 45 #include <stdlib.h> 46 #include <string.h> 47 #include <wchar.h> 48 #include <wctype.h> 49 50 #include "extern.h" 51 52 static int backslash(STR *, int *); 53 static int bracket(STR *); 54 static void genclass(STR *); 55 static void genequiv(STR *); 56 static int genrange(STR *, int); 57 static void genseq(STR *); 58 59 wint_t 60 next(STR *s) 61 { 62 int is_octal; 63 wint_t ch; 64 wchar_t wch; 65 size_t clen; 66 67 switch (s->state) { 68 case EOS: 69 return (0); 70 case INFINITE: 71 return (1); 72 case NORMAL: 73 switch (*s->str) { 74 case '\0': 75 s->state = EOS; 76 return (0); 77 case '\\': 78 s->lastch = backslash(s, &is_octal); 79 break; 80 case '[': 81 if (bracket(s)) 82 return (next(s)); 83 /* FALLTHROUGH */ 84 default: 85 clen = mbrtowc(&wch, s->str, MB_LEN_MAX, NULL); 86 if (clen == (size_t)-1 || clen == (size_t)-2 || 87 clen == 0) 88 errc(1, EILSEQ, NULL); 89 is_octal = 0; 90 s->lastch = wch; 91 s->str += clen; 92 break; 93 } 94 95 /* We can start a range at any time. */ 96 if (s->str[0] == '-' && genrange(s, is_octal)) 97 return (next(s)); 98 return (1); 99 case RANGE: 100 if (s->cnt-- == 0) { 101 s->state = NORMAL; 102 return (next(s)); 103 } 104 ++s->lastch; 105 return (1); 106 case SEQUENCE: 107 if (s->cnt-- == 0) { 108 s->state = NORMAL; 109 return (next(s)); 110 } 111 return (1); 112 case CCLASS: 113 case CCLASS_UPPER: 114 case CCLASS_LOWER: 115 s->cnt++; 116 ch = nextwctype(s->lastch, s->cclass); 117 if (ch == -1) { 118 s->state = NORMAL; 119 return (next(s)); 120 } 121 s->lastch = ch; 122 return (1); 123 case SET: 124 if ((ch = s->set[s->cnt++]) == OOBCH) { 125 s->state = NORMAL; 126 return (next(s)); 127 } 128 s->lastch = ch; 129 return (1); 130 default: 131 return (0); 132 } 133 /* NOTREACHED */ 134 } 135 136 static int 137 bracket(STR *s) 138 { 139 char *p; 140 141 switch (s->str[1]) { 142 case ':': /* "[:class:]" */ 143 if ((p = strchr(s->str + 2, ']')) == NULL) 144 return (0); 145 if (*(p - 1) != ':' || p - s->str < 4) 146 goto repeat; 147 *(p - 1) = '\0'; 148 s->str += 2; 149 genclass(s); 150 s->str = p + 1; 151 return (1); 152 case '=': /* "[=equiv=]" */ 153 if (s->str[2] == '\0' || (p = strchr(s->str + 3, ']')) == NULL) 154 return (0); 155 if (*(p - 1) != '=' || p - s->str < 4) 156 goto repeat; 157 s->str += 2; 158 genequiv(s); 159 return (1); 160 default: /* "[\###*n]" or "[#*n]" */ 161 repeat: 162 if ((p = strpbrk(s->str + 2, "*]")) == NULL) 163 return (0); 164 if (p[0] != '*' || strchr(p, ']') == NULL) 165 return (0); 166 s->str += 1; 167 genseq(s); 168 return (1); 169 } 170 /* NOTREACHED */ 171 } 172 173 static void 174 genclass(STR *s) 175 { 176 177 if ((s->cclass = wctype(s->str)) == 0) 178 errx(1, "unknown class %s", s->str); 179 s->cnt = 0; 180 s->lastch = -1; /* incremented before check in next() */ 181 if (strcmp(s->str, "upper") == 0) 182 s->state = CCLASS_UPPER; 183 else if (strcmp(s->str, "lower") == 0) 184 s->state = CCLASS_LOWER; 185 else 186 s->state = CCLASS; 187 } 188 189 static void 190 genequiv(STR *s) 191 { 192 int i, p, pri; 193 char src[2], dst[3]; 194 size_t clen; 195 wchar_t wc; 196 197 if (*s->str == '\\') { 198 s->equiv[0] = backslash(s, NULL); 199 if (*s->str != '=') 200 errx(1, "misplaced equivalence equals sign"); 201 s->str += 2; 202 } else { 203 clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL); 204 if (clen == (size_t)-1 || clen == (size_t)-2 || clen == 0) 205 errc(1, EILSEQ, NULL); 206 s->equiv[0] = wc; 207 if (s->str[clen] != '=') 208 errx(1, "misplaced equivalence equals sign"); 209 s->str += clen + 2; 210 } 211 212 /* 213 * Calculate the set of all characters in the same equivalence class 214 * as the specified character (they will have the same primary 215 * collation weights). 216 * XXX Knows too much about how strxfrm() is implemented. Assumes 217 * it fills the string with primary collation weight bytes. Only one- 218 * to-one mappings are supported. 219 * XXX Equivalence classes not supported in multibyte locales. 220 */ 221 src[0] = (char)s->equiv[0]; 222 src[1] = '\0'; 223 if (MB_CUR_MAX == 1 && strxfrm(dst, src, sizeof(dst)) == 1) { 224 pri = (unsigned char)*dst; 225 for (p = 1, i = 1; i < NCHARS_SB; i++) { 226 *src = i; 227 if (strxfrm(dst, src, sizeof(dst)) == 1 && pri && 228 pri == (unsigned char)*dst) 229 s->equiv[p++] = i; 230 } 231 s->equiv[p] = OOBCH; 232 } 233 234 s->cnt = 0; 235 s->state = SET; 236 s->set = s->equiv; 237 } 238 239 static int 240 genrange(STR *s, int was_octal) 241 { 242 int stopval, octal; 243 char *savestart; 244 int n, cnt, *p; 245 size_t clen; 246 wchar_t wc; 247 248 octal = 0; 249 savestart = s->str; 250 if (*++s->str == '\\') 251 stopval = backslash(s, &octal); 252 else { 253 clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL); 254 if (clen == (size_t)-1 || clen == (size_t)-2) 255 errc(1, EILSEQ, NULL); 256 stopval = wc; 257 s->str += clen; 258 } 259 /* 260 * XXX Characters are not ordered according to collating sequence in 261 * multibyte locales. 262 */ 263 if (octal || was_octal || MB_CUR_MAX > 1) { 264 if (stopval < s->lastch) { 265 s->str = savestart; 266 return (0); 267 } 268 s->cnt = stopval - s->lastch + 1; 269 s->state = RANGE; 270 --s->lastch; 271 return (1); 272 } 273 if (charcoll((const void *)&stopval, (const void *)&(s->lastch)) < 0) { 274 s->str = savestart; 275 return (0); 276 } 277 if ((s->set = p = malloc((NCHARS_SB + 1) * sizeof(int))) == NULL) 278 err(1, "genrange() malloc"); 279 for (cnt = 0; cnt < NCHARS_SB; cnt++) 280 if (charcoll((const void *)&cnt, (const void *)&(s->lastch)) >= 0 && 281 charcoll((const void *)&cnt, (const void *)&stopval) <= 0) 282 *p++ = cnt; 283 *p = OOBCH; 284 n = p - s->set; 285 286 s->cnt = 0; 287 s->state = SET; 288 if (n > 1) 289 mergesort(s->set, n, sizeof(*(s->set)), charcoll); 290 return (1); 291 } 292 293 static void 294 genseq(STR *s) 295 { 296 char *ep; 297 wchar_t wc; 298 size_t clen; 299 300 if (s->which == STRING1) 301 errx(1, "sequences only valid in string2"); 302 303 if (*s->str == '\\') 304 s->lastch = backslash(s, NULL); 305 else { 306 clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL); 307 if (clen == (size_t)-1 || clen == (size_t)-2) 308 errc(1, EILSEQ, NULL); 309 s->lastch = wc; 310 s->str += clen; 311 } 312 if (*s->str != '*') 313 errx(1, "misplaced sequence asterisk"); 314 315 switch (*++s->str) { 316 case '\\': 317 s->cnt = backslash(s, NULL); 318 break; 319 case ']': 320 s->cnt = 0; 321 ++s->str; 322 break; 323 default: 324 if (isdigit((u_char)*s->str)) { 325 s->cnt = strtol(s->str, &ep, 0); 326 if (*ep == ']') { 327 s->str = ep + 1; 328 break; 329 } 330 } 331 errx(1, "illegal sequence count"); 332 /* NOTREACHED */ 333 } 334 335 s->state = s->cnt ? SEQUENCE : INFINITE; 336 } 337 338 /* 339 * Translate \??? into a character. Up to 3 octal digits, if no digits either 340 * an escape code or a literal character. 341 */ 342 static int 343 backslash(STR *s, int *is_octal) 344 { 345 int ch, cnt, val; 346 347 if (is_octal != NULL) 348 *is_octal = 0; 349 for (cnt = val = 0;;) { 350 ch = (u_char)*++s->str; 351 if (!isdigit(ch) || ch > '7') 352 break; 353 val = val * 8 + ch - '0'; 354 if (++cnt == 3) { 355 ++s->str; 356 break; 357 } 358 } 359 if (cnt) { 360 if (is_octal != NULL) 361 *is_octal = 1; 362 return (val); 363 } 364 if (ch != '\0') 365 ++s->str; 366 switch (ch) { 367 case 'a': /* escape characters */ 368 return ('\7'); 369 case 'b': 370 return ('\b'); 371 case 'f': 372 return ('\f'); 373 case 'n': 374 return ('\n'); 375 case 'r': 376 return ('\r'); 377 case 't': 378 return ('\t'); 379 case 'v': 380 return ('\13'); 381 case '\0': /* \" -> \ */ 382 s->state = EOS; 383 return ('\\'); 384 default: /* \x" -> x */ 385 return (ch); 386 } 387 } 388