1 /*- 2 * Copyright (c) 1991, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 4. Neither the name of the University nor the names of its contributors 14 * may be used to endorse or promote products derived from this software 15 * without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30 #include <sys/cdefs.h> 31 32 __FBSDID("$FreeBSD$"); 33 34 #ifndef lint 35 static const char sccsid[] = "@(#)str.c 8.2 (Berkeley) 4/28/95"; 36 #endif 37 38 #include <sys/types.h> 39 40 #include <ctype.h> 41 #include <err.h> 42 #include <errno.h> 43 #include <stddef.h> 44 #include <stdio.h> 45 #include <stdlib.h> 46 #include <string.h> 47 #include <wchar.h> 48 #include <wctype.h> 49 50 #include "extern.h" 51 52 static int backslash(STR *, int *); 53 static int bracket(STR *); 54 static void genclass(STR *); 55 static void genequiv(STR *); 56 static int genrange(STR *); 57 static void genseq(STR *); 58 59 wint_t 60 next(STR *s) 61 { 62 int is_octal; 63 wint_t ch; 64 wchar_t wch; 65 size_t clen; 66 67 switch (s->state) { 68 case EOS: 69 return (0); 70 case INFINITE: 71 return (1); 72 case NORMAL: 73 switch (*s->str) { 74 case '\0': 75 s->state = EOS; 76 return (0); 77 case '\\': 78 s->lastch = backslash(s, &is_octal); 79 break; 80 case '[': 81 if (bracket(s)) 82 return (next(s)); 83 /* FALLTHROUGH */ 84 default: 85 clen = mbrtowc(&wch, s->str, MB_LEN_MAX, NULL); 86 if (clen == (size_t)-1 || clen == (size_t)-2 || 87 clen == 0) 88 errc(1, EILSEQ, NULL); 89 is_octal = 0; 90 s->lastch = wch; 91 s->str += clen; 92 break; 93 } 94 95 /* We can start a range at any time. */ 96 if (s->str[0] == '-' && genrange(s)) 97 return (next(s)); 98 return (1); 99 case RANGE: 100 if (s->cnt-- == 0) { 101 s->state = NORMAL; 102 return (next(s)); 103 } 104 ++s->lastch; 105 return (1); 106 case SEQUENCE: 107 if (s->cnt-- == 0) { 108 s->state = NORMAL; 109 return (next(s)); 110 } 111 return (1); 112 case CCLASS: 113 case CCLASS_UPPER: 114 case CCLASS_LOWER: 115 s->cnt++; 116 ch = nextwctype(s->lastch, s->cclass); 117 if (ch == -1) { 118 s->state = NORMAL; 119 return (next(s)); 120 } 121 s->lastch = ch; 122 return (1); 123 case SET: 124 if ((ch = s->set[s->cnt++]) == OOBCH) { 125 s->state = NORMAL; 126 return (next(s)); 127 } 128 s->lastch = ch; 129 return (1); 130 default: 131 return (0); 132 } 133 /* NOTREACHED */ 134 } 135 136 static int 137 bracket(STR *s) 138 { 139 char *p; 140 141 switch (s->str[1]) { 142 case ':': /* "[:class:]" */ 143 if ((p = strchr(s->str + 2, ']')) == NULL) 144 return (0); 145 if (*(p - 1) != ':' || p - s->str < 4) 146 goto repeat; 147 *(p - 1) = '\0'; 148 s->str += 2; 149 genclass(s); 150 s->str = p + 1; 151 return (1); 152 case '=': /* "[=equiv=]" */ 153 if (s->str[2] == '\0' || (p = strchr(s->str + 3, ']')) == NULL) 154 return (0); 155 if (*(p - 1) != '=' || p - s->str < 4) 156 goto repeat; 157 s->str += 2; 158 genequiv(s); 159 return (1); 160 default: /* "[\###*n]" or "[#*n]" */ 161 repeat: 162 if ((p = strpbrk(s->str + 2, "*]")) == NULL) 163 return (0); 164 if (p[0] != '*' || strchr(p, ']') == NULL) 165 return (0); 166 s->str += 1; 167 genseq(s); 168 return (1); 169 } 170 /* NOTREACHED */ 171 } 172 173 static void 174 genclass(STR *s) 175 { 176 177 if ((s->cclass = wctype(s->str)) == 0) 178 errx(1, "unknown class %s", s->str); 179 s->cnt = 0; 180 s->lastch = -1; /* incremented before check in next() */ 181 if (strcmp(s->str, "upper") == 0) 182 s->state = CCLASS_UPPER; 183 else if (strcmp(s->str, "lower") == 0) 184 s->state = CCLASS_LOWER; 185 else 186 s->state = CCLASS; 187 } 188 189 static void 190 genequiv(STR *s) 191 { 192 int i, p, pri; 193 char src[2], dst[3]; 194 size_t clen; 195 wchar_t wc; 196 197 if (*s->str == '\\') { 198 s->equiv[0] = backslash(s, NULL); 199 if (*s->str != '=') 200 errx(1, "misplaced equivalence equals sign"); 201 s->str += 2; 202 } else { 203 clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL); 204 if (clen == (size_t)-1 || clen == (size_t)-2 || clen == 0) 205 errc(1, EILSEQ, NULL); 206 s->equiv[0] = wc; 207 if (s->str[clen] != '=') 208 errx(1, "misplaced equivalence equals sign"); 209 s->str += clen + 2; 210 } 211 212 /* 213 * Calculate the set of all characters in the same equivalence class 214 * as the specified character (they will have the same primary 215 * collation weights). 216 * XXX Knows too much about how strxfrm() is implemented. Assumes 217 * it fills the string with primary collation weight bytes. Only one- 218 * to-one mappings are supported. 219 * XXX Equivalence classes not supported in multibyte locales. 220 */ 221 src[0] = (char)s->equiv[0]; 222 src[1] = '\0'; 223 if (MB_CUR_MAX == 1 && strxfrm(dst, src, sizeof(dst)) == 1) { 224 pri = (unsigned char)*dst; 225 for (p = 1, i = 1; i < NCHARS_SB; i++) { 226 *src = i; 227 if (strxfrm(dst, src, sizeof(dst)) == 1 && pri && 228 pri == (unsigned char)*dst) 229 s->equiv[p++] = i; 230 } 231 s->equiv[p] = OOBCH; 232 } 233 234 s->cnt = 0; 235 s->state = SET; 236 s->set = s->equiv; 237 } 238 239 static int 240 genrange(STR *s) 241 { 242 int stopval; 243 char *savestart; 244 size_t clen; 245 wchar_t wc; 246 247 savestart = s->str; 248 if (*++s->str == '\\') 249 stopval = backslash(s, NULL); 250 else { 251 clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL); 252 if (clen == (size_t)-1 || clen == (size_t)-2) 253 errc(1, EILSEQ, NULL); 254 stopval = wc; 255 s->str += clen; 256 } 257 if (stopval < s->lastch) { 258 s->str = savestart; 259 return (0); 260 } 261 s->cnt = stopval - s->lastch + 1; 262 s->state = RANGE; 263 --s->lastch; 264 return (1); 265 } 266 267 static void 268 genseq(STR *s) 269 { 270 char *ep; 271 wchar_t wc; 272 size_t clen; 273 274 if (s->which == STRING1) 275 errx(1, "sequences only valid in string2"); 276 277 if (*s->str == '\\') 278 s->lastch = backslash(s, NULL); 279 else { 280 clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL); 281 if (clen == (size_t)-1 || clen == (size_t)-2) 282 errc(1, EILSEQ, NULL); 283 s->lastch = wc; 284 s->str += clen; 285 } 286 if (*s->str != '*') 287 errx(1, "misplaced sequence asterisk"); 288 289 switch (*++s->str) { 290 case '\\': 291 s->cnt = backslash(s, NULL); 292 break; 293 case ']': 294 s->cnt = 0; 295 ++s->str; 296 break; 297 default: 298 if (isdigit((u_char)*s->str)) { 299 s->cnt = strtol(s->str, &ep, 0); 300 if (*ep == ']') { 301 s->str = ep + 1; 302 break; 303 } 304 } 305 errx(1, "illegal sequence count"); 306 /* NOTREACHED */ 307 } 308 309 s->state = s->cnt ? SEQUENCE : INFINITE; 310 } 311 312 /* 313 * Translate \??? into a character. Up to 3 octal digits, if no digits either 314 * an escape code or a literal character. 315 */ 316 static int 317 backslash(STR *s, int *is_octal) 318 { 319 int ch, cnt, val; 320 321 if (is_octal != NULL) 322 *is_octal = 0; 323 for (cnt = val = 0;;) { 324 ch = (u_char)*++s->str; 325 if (!isdigit(ch) || ch > '7') 326 break; 327 val = val * 8 + ch - '0'; 328 if (++cnt == 3) { 329 ++s->str; 330 break; 331 } 332 } 333 if (cnt) { 334 if (is_octal != NULL) 335 *is_octal = 1; 336 return (val); 337 } 338 if (ch != '\0') 339 ++s->str; 340 switch (ch) { 341 case 'a': /* escape characters */ 342 return ('\7'); 343 case 'b': 344 return ('\b'); 345 case 'f': 346 return ('\f'); 347 case 'n': 348 return ('\n'); 349 case 'r': 350 return ('\r'); 351 case 't': 352 return ('\t'); 353 case 'v': 354 return ('\13'); 355 case '\0': /* \" -> \ */ 356 s->state = EOS; 357 return ('\\'); 358 default: /* \x" -> x */ 359 return (ch); 360 } 361 } 362