1 /*- 2 * Copyright (c) 1991, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 4. Neither the name of the University nor the names of its contributors 14 * may be used to endorse or promote products derived from this software 15 * without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30 #include <sys/cdefs.h> 31 32 __FBSDID("$FreeBSD$"); 33 34 #ifndef lint 35 static const char sccsid[] = "@(#)str.c 8.2 (Berkeley) 4/28/95"; 36 #endif 37 38 #include <sys/types.h> 39 40 #include <ctype.h> 41 #include <err.h> 42 #include <errno.h> 43 #include <stddef.h> 44 #include <stdio.h> 45 #include <stdlib.h> 46 #include <string.h> 47 #include <wchar.h> 48 #include <wctype.h> 49 50 #include "extern.h" 51 52 static int backslash(STR *, int *); 53 static int bracket(STR *); 54 static void genclass(STR *); 55 static void genequiv(STR *); 56 static int genrange(STR *, int); 57 static void genseq(STR *); 58 59 wint_t 60 next(s) 61 STR *s; 62 { 63 int is_octal; 64 wint_t ch; 65 wchar_t wch; 66 size_t clen; 67 68 switch (s->state) { 69 case EOS: 70 return (0); 71 case INFINITE: 72 return (1); 73 case NORMAL: 74 switch (*s->str) { 75 case '\0': 76 s->state = EOS; 77 return (0); 78 case '\\': 79 s->lastch = backslash(s, &is_octal); 80 break; 81 case '[': 82 if (bracket(s)) 83 return (next(s)); 84 /* FALLTHROUGH */ 85 default: 86 clen = mbrtowc(&wch, s->str, MB_LEN_MAX, NULL); 87 if (clen == (size_t)-1 || clen == (size_t)-2 || 88 clen == 0) 89 errc(1, EILSEQ, NULL); 90 is_octal = 0; 91 s->lastch = wch; 92 s->str += clen; 93 break; 94 } 95 96 /* We can start a range at any time. */ 97 if (s->str[0] == '-' && genrange(s, is_octal)) 98 return (next(s)); 99 return (1); 100 case RANGE: 101 if (s->cnt-- == 0) { 102 s->state = NORMAL; 103 return (next(s)); 104 } 105 ++s->lastch; 106 return (1); 107 case SEQUENCE: 108 if (s->cnt-- == 0) { 109 s->state = NORMAL; 110 return (next(s)); 111 } 112 return (1); 113 case CCLASS: 114 case CCLASS_UPPER: 115 case CCLASS_LOWER: 116 s->cnt++; 117 ch = nextwctype(s->lastch, s->cclass); 118 if (ch == -1) { 119 s->state = NORMAL; 120 return (next(s)); 121 } 122 s->lastch = ch; 123 return (1); 124 case SET: 125 if ((ch = s->set[s->cnt++]) == OOBCH) { 126 s->state = NORMAL; 127 return (next(s)); 128 } 129 s->lastch = ch; 130 return (1); 131 default: 132 return (0); 133 } 134 /* NOTREACHED */ 135 } 136 137 static int 138 bracket(s) 139 STR *s; 140 { 141 char *p; 142 143 switch (s->str[1]) { 144 case ':': /* "[:class:]" */ 145 if ((p = strchr(s->str + 2, ']')) == NULL) 146 return (0); 147 if (*(p - 1) != ':' || p - s->str < 4) 148 goto repeat; 149 *(p - 1) = '\0'; 150 s->str += 2; 151 genclass(s); 152 s->str = p + 1; 153 return (1); 154 case '=': /* "[=equiv=]" */ 155 if (s->str[2] == '\0' || (p = strchr(s->str + 3, ']')) == NULL) 156 return (0); 157 if (*(p - 1) != '=' || p - s->str < 4) 158 goto repeat; 159 s->str += 2; 160 genequiv(s); 161 return (1); 162 default: /* "[\###*n]" or "[#*n]" */ 163 repeat: 164 if ((p = strpbrk(s->str + 2, "*]")) == NULL) 165 return (0); 166 if (p[0] != '*' || index(p, ']') == NULL) 167 return (0); 168 s->str += 1; 169 genseq(s); 170 return (1); 171 } 172 /* NOTREACHED */ 173 } 174 175 static void 176 genclass(s) 177 STR *s; 178 { 179 180 if ((s->cclass = wctype(s->str)) == 0) 181 errx(1, "unknown class %s", s->str); 182 s->cnt = 0; 183 s->lastch = -1; /* incremented before check in next() */ 184 if (strcmp(s->str, "upper") == 0) 185 s->state = CCLASS_UPPER; 186 else if (strcmp(s->str, "lower") == 0) 187 s->state = CCLASS_LOWER; 188 else 189 s->state = CCLASS; 190 } 191 192 static void 193 genequiv(s) 194 STR *s; 195 { 196 int i, p, pri; 197 char src[2], dst[3]; 198 size_t clen; 199 wchar_t wc; 200 201 if (*s->str == '\\') { 202 s->equiv[0] = backslash(s, NULL); 203 if (*s->str != '=') 204 errx(1, "misplaced equivalence equals sign"); 205 s->str += 2; 206 } else { 207 clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL); 208 if (clen == (size_t)-1 || clen == (size_t)-2 || clen == 0) 209 errc(1, EILSEQ, NULL); 210 s->equiv[0] = wc; 211 if (s->str[clen] != '=') 212 errx(1, "misplaced equivalence equals sign"); 213 s->str += clen + 2; 214 } 215 216 /* 217 * Calculate the set of all characters in the same equivalence class 218 * as the specified character (they will have the same primary 219 * collation weights). 220 * XXX Knows too much about how strxfrm() is implemented. Assumes 221 * it fills the string with primary collation weight bytes. Only one- 222 * to-one mappings are supported. 223 * XXX Equivalence classes not supported in multibyte locales. 224 */ 225 src[0] = (char)s->equiv[0]; 226 src[1] = '\0'; 227 if (MB_CUR_MAX == 1 && strxfrm(dst, src, sizeof(dst)) == 1) { 228 pri = (unsigned char)*dst; 229 for (p = 1, i = 1; i < NCHARS_SB; i++) { 230 *src = i; 231 if (strxfrm(dst, src, sizeof(dst)) == 1 && pri && 232 pri == (unsigned char)*dst) 233 s->equiv[p++] = i; 234 } 235 s->equiv[p] = OOBCH; 236 } 237 238 s->cnt = 0; 239 s->state = SET; 240 s->set = s->equiv; 241 } 242 243 static int 244 genrange(STR *s, int was_octal) 245 { 246 int stopval, octal; 247 char *savestart; 248 int n, cnt, *p; 249 size_t clen; 250 wchar_t wc; 251 252 octal = 0; 253 savestart = s->str; 254 if (*++s->str == '\\') 255 stopval = backslash(s, &octal); 256 else { 257 clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL); 258 if (clen == (size_t)-1 || clen == (size_t)-2) 259 errc(1, EILSEQ, NULL); 260 stopval = wc; 261 s->str += clen; 262 } 263 /* 264 * XXX Characters are not ordered according to collating sequence in 265 * multibyte locales. 266 */ 267 if (octal || was_octal || MB_CUR_MAX > 1) { 268 if (stopval < s->lastch) { 269 s->str = savestart; 270 return (0); 271 } 272 s->cnt = stopval - s->lastch + 1; 273 s->state = RANGE; 274 --s->lastch; 275 return (1); 276 } 277 if (charcoll((const void *)&stopval, (const void *)&(s->lastch)) < 0) { 278 s->str = savestart; 279 return (0); 280 } 281 if ((s->set = p = malloc((NCHARS_SB + 1) * sizeof(int))) == NULL) 282 err(1, "genrange() malloc"); 283 for (cnt = 0; cnt < NCHARS_SB; cnt++) 284 if (charcoll((const void *)&cnt, (const void *)&(s->lastch)) >= 0 && 285 charcoll((const void *)&cnt, (const void *)&stopval) <= 0) 286 *p++ = cnt; 287 *p = OOBCH; 288 n = p - s->set; 289 290 s->cnt = 0; 291 s->state = SET; 292 if (n > 1) 293 mergesort(s->set, n, sizeof(*(s->set)), charcoll); 294 return (1); 295 } 296 297 static void 298 genseq(s) 299 STR *s; 300 { 301 char *ep; 302 wchar_t wc; 303 size_t clen; 304 305 if (s->which == STRING1) 306 errx(1, "sequences only valid in string2"); 307 308 if (*s->str == '\\') 309 s->lastch = backslash(s, NULL); 310 else { 311 clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL); 312 if (clen == (size_t)-1 || clen == (size_t)-2) 313 errc(1, EILSEQ, NULL); 314 s->lastch = wc; 315 s->str += clen; 316 } 317 if (*s->str != '*') 318 errx(1, "misplaced sequence asterisk"); 319 320 switch (*++s->str) { 321 case '\\': 322 s->cnt = backslash(s, NULL); 323 break; 324 case ']': 325 s->cnt = 0; 326 ++s->str; 327 break; 328 default: 329 if (isdigit((u_char)*s->str)) { 330 s->cnt = strtol(s->str, &ep, 0); 331 if (*ep == ']') { 332 s->str = ep + 1; 333 break; 334 } 335 } 336 errx(1, "illegal sequence count"); 337 /* NOTREACHED */ 338 } 339 340 s->state = s->cnt ? SEQUENCE : INFINITE; 341 } 342 343 /* 344 * Translate \??? into a character. Up to 3 octal digits, if no digits either 345 * an escape code or a literal character. 346 */ 347 static int 348 backslash(STR *s, int *is_octal) 349 { 350 int ch, cnt, val; 351 352 if (is_octal != NULL) 353 *is_octal = 0; 354 for (cnt = val = 0;;) { 355 ch = (u_char)*++s->str; 356 if (!isdigit(ch) || ch > '7') 357 break; 358 val = val * 8 + ch - '0'; 359 if (++cnt == 3) { 360 ++s->str; 361 break; 362 } 363 } 364 if (cnt) { 365 if (is_octal != NULL) 366 *is_octal = 1; 367 return (val); 368 } 369 if (ch != '\0') 370 ++s->str; 371 switch (ch) { 372 case 'a': /* escape characters */ 373 return ('\7'); 374 case 'b': 375 return ('\b'); 376 case 'f': 377 return ('\f'); 378 case 'n': 379 return ('\n'); 380 case 'r': 381 return ('\r'); 382 case 't': 383 return ('\t'); 384 case 'v': 385 return ('\13'); 386 case '\0': /* \" -> \ */ 387 s->state = EOS; 388 return ('\\'); 389 default: /* \x" -> x */ 390 return (ch); 391 } 392 } 393