1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 12 /* 13 * Copyright 2010 Nexenta Systems, Inc. All rights reserved. 14 * Copyright 2013 DEY Storage Systems, Inc. 15 */ 16 17 /* 18 * This file contains the "scanner", which tokenizes the input files 19 * for localedef for processing by the higher level grammar processor. 20 */ 21 22 #include <stdio.h> 23 #include <stdlib.h> 24 #include <ctype.h> 25 #include <limits.h> 26 #include <string.h> 27 #include <widec.h> 28 #include <sys/types.h> 29 #include <assert.h> 30 #include "localedef.h" 31 #include "parser.tab.h" 32 33 int com_char = '#'; 34 int esc_char = '\\'; 35 int mb_cur_min = 1; 36 int mb_cur_max = 1; 37 int lineno = 1; 38 int warnings = 0; 39 static int nextline; 40 static FILE *input = stdin; 41 static const char *filename = "<stdin>"; 42 static int instring = 0; 43 static int escaped = 0; 44 45 /* 46 * Token space ... grows on demand. 47 */ 48 static char *token = NULL; 49 static int tokidx; 50 static int toksz = 0; 51 static int hadtok = 0; 52 53 /* 54 * Wide string space ... grows on demand. 55 */ 56 static wchar_t *widestr = NULL; 57 static int wideidx = 0; 58 static int widesz = 0; 59 60 /* 61 * The last keyword seen. This is useful to trigger the special lexer rules 62 * for "copy" and also collating symbols and elements. 63 */ 64 int last_kw = 0; 65 static int category = T_END; 66 67 static struct token { 68 int id; 69 const char *name; 70 } keywords[] = { 71 { T_COM_CHAR, "comment_char" }, 72 { T_ESC_CHAR, "escape_char" }, 73 { T_END, "END" }, 74 { T_COPY, "copy" }, 75 { T_MESSAGES, "LC_MESSAGES" }, 76 { T_YESSTR, "yesstr" }, 77 { T_YESEXPR, "yesexpr" }, 78 { T_NOSTR, "nostr" }, 79 { T_NOEXPR, "noexpr" }, 80 { T_MONETARY, "LC_MONETARY" }, 81 { T_INT_CURR_SYMBOL, "int_curr_symbol" }, 82 { T_CURRENCY_SYMBOL, "currency_symbol" }, 83 { T_MON_DECIMAL_POINT, "mon_decimal_point" }, 84 { T_MON_THOUSANDS_SEP, "mon_thousands_sep" }, 85 { T_POSITIVE_SIGN, "positive_sign" }, 86 { T_NEGATIVE_SIGN, "negative_sign" }, 87 { T_MON_GROUPING, "mon_grouping" }, 88 { T_INT_FRAC_DIGITS, "int_frac_digits" }, 89 { T_FRAC_DIGITS, "frac_digits" }, 90 { T_P_CS_PRECEDES, "p_cs_precedes" }, 91 { T_P_SEP_BY_SPACE, "p_sep_by_space" }, 92 { T_N_CS_PRECEDES, "n_cs_precedes" }, 93 { T_N_SEP_BY_SPACE, "n_sep_by_space" }, 94 { T_P_SIGN_POSN, "p_sign_posn" }, 95 { T_N_SIGN_POSN, "n_sign_posn" }, 96 { T_INT_P_CS_PRECEDES, "int_p_cs_precedes" }, 97 { T_INT_N_CS_PRECEDES, "int_n_cs_precedes" }, 98 { T_INT_P_SEP_BY_SPACE, "int_p_sep_by_space" }, 99 { T_INT_N_SEP_BY_SPACE, "int_n_sep_by_space" }, 100 { T_INT_P_SIGN_POSN, "int_p_sign_posn" }, 101 { T_INT_N_SIGN_POSN, "int_n_sign_posn" }, 102 { T_COLLATE, "LC_COLLATE" }, 103 { T_COLLATING_SYMBOL, "collating-symbol" }, 104 { T_COLLATING_ELEMENT, "collating-element" }, 105 { T_FROM, "from" }, 106 { T_ORDER_START, "order_start" }, 107 { T_ORDER_END, "order_end" }, 108 { T_FORWARD, "forward" }, 109 { T_BACKWARD, "backward" }, 110 { T_POSITION, "position" }, 111 { T_IGNORE, "IGNORE" }, 112 { T_UNDEFINED, "UNDEFINED" }, 113 { T_NUMERIC, "LC_NUMERIC" }, 114 { T_DECIMAL_POINT, "decimal_point" }, 115 { T_THOUSANDS_SEP, "thousands_sep" }, 116 { T_GROUPING, "grouping" }, 117 { T_TIME, "LC_TIME" }, 118 { T_ABDAY, "abday" }, 119 { T_DAY, "day" }, 120 { T_ABMON, "abmon" }, 121 { T_MON, "mon" }, 122 { T_D_T_FMT, "d_t_fmt" }, 123 { T_D_FMT, "d_fmt" }, 124 { T_T_FMT, "t_fmt" }, 125 { T_AM_PM, "am_pm" }, 126 { T_T_FMT_AMPM, "t_fmt_ampm" }, 127 { T_ERA, "era" }, 128 { T_ERA_D_FMT, "era_d_fmt" }, 129 { T_ERA_T_FMT, "era_t_fmt" }, 130 { T_ERA_D_T_FMT, "era_d_t_fmt" }, 131 { T_ALT_DIGITS, "alt_digits" }, 132 { T_CTYPE, "LC_CTYPE" }, 133 { T_ISUPPER, "upper" }, 134 { T_ISLOWER, "lower" }, 135 { T_ISALPHA, "alpha" }, 136 { T_ISDIGIT, "digit" }, 137 { T_ISPUNCT, "punct" }, 138 { T_ISXDIGIT, "xdigit" }, 139 { T_ISSPACE, "space" }, 140 { T_ISPRINT, "print" }, 141 { T_ISGRAPH, "graph" }, 142 { T_ISBLANK, "blank" }, 143 { T_ISCNTRL, "cntrl" }, 144 /* 145 * These entries are local additions, and not specified by 146 * TOG. Note that they are not guaranteed to be accurate for 147 * all locales, and so applications should not depend on them. 148 */ 149 { T_ISSPECIAL, "special" }, 150 { T_ISENGLISH, "english" }, 151 { T_ISPHONOGRAM, "phonogram" }, 152 { T_ISIDEOGRAM, "ideogram" }, 153 { T_ISNUMBER, "number" }, 154 /* 155 * We have to support this in the grammar, but it would be a 156 * syntax error to define a character as one of these without 157 * also defining it as an alpha or digit. We ignore it in our 158 * parsing. 159 */ 160 { T_ISALNUM, "alnum" }, 161 { T_TOUPPER, "toupper" }, 162 { T_TOLOWER, "tolower" }, 163 164 /* 165 * These are keywords used in the charmap file. Note that 166 * Solaris originally used angle brackets to wrap some of them, 167 * but we removed that to simplify our parser. The first of these 168 * items are "global items." 169 */ 170 { T_CHARMAP, "CHARMAP" }, 171 { T_WIDTH, "WIDTH" }, 172 173 { -1, NULL }, 174 }; 175 176 /* 177 * These special words are only used in a charmap file, enclosed in <>. 178 */ 179 static struct token symwords[] = { 180 { T_COM_CHAR, "comment_char" }, 181 { T_ESC_CHAR, "escape_char" }, 182 { T_CODE_SET, "code_set_name" }, 183 { T_MB_CUR_MAX, "mb_cur_max" }, 184 { T_MB_CUR_MIN, "mb_cur_min" }, 185 { -1, NULL }, 186 }; 187 188 static int categories[] = { 189 T_CHARMAP, 190 T_CTYPE, 191 T_COLLATE, 192 T_MESSAGES, 193 T_MONETARY, 194 T_NUMERIC, 195 T_TIME, 196 T_WIDTH, 197 0 198 }; 199 200 void 201 reset_scanner(const char *fname) 202 { 203 if (fname == NULL) { 204 filename = "<stdin>"; 205 input = stdin; 206 } else { 207 if (input != stdin) 208 (void) fclose(input); 209 if ((input = fopen(fname, "r")) == NULL) { 210 perror("fopen"); 211 exit(4); 212 } 213 filename = fname; 214 } 215 com_char = '#'; 216 esc_char = '\\'; 217 instring = 0; 218 escaped = 0; 219 lineno = 1; 220 nextline = 1; 221 tokidx = 0; 222 wideidx = 0; 223 } 224 225 #define hex(x) \ 226 (isdigit(x) ? (x - '0') : ((islower(x) ? (x - 'a') : (x - 'A')) + 10)) 227 #define isodigit(x) ((x >= '0') && (x <= '7')) 228 229 static int 230 scanc(void) 231 { 232 int c; 233 234 c = getc(input); 235 lineno = nextline; 236 if (c == '\n') { 237 nextline++; 238 } 239 return (c); 240 } 241 242 static void 243 unscanc(int c) 244 { 245 if (c == '\n') { 246 nextline--; 247 } 248 if (ungetc(c, input) < 0) { 249 (void) yyerror(_("ungetc failed")); 250 } 251 } 252 253 static int 254 scan_hex_byte(void) 255 { 256 int c1, c2; 257 int v; 258 259 c1 = scanc(); 260 if (!isxdigit(c1)) { 261 (void) yyerror(_("malformed hex digit")); 262 } 263 c2 = scanc(); 264 if (!isxdigit(c2)) { 265 (void) yyerror(_("malformed hex digit")); 266 } 267 v = ((hex(c1) << 4) | hex(c2)); 268 return (v); 269 } 270 271 static int 272 scan_dec_byte(void) 273 { 274 int c1, c2, c3; 275 int b; 276 277 c1 = scanc(); 278 if (!isdigit(c1)) { 279 (void) yyerror(_("malformed decimal digit")); 280 } 281 b = c1 - '0'; 282 c2 = scanc(); 283 if (!isdigit(c2)) { 284 (void) yyerror(_("malformed decimal digit")); 285 } 286 b *= 10; 287 b += (c2 - '0'); 288 c3 = scanc(); 289 if (!isdigit(c3)) { 290 unscanc(c3); 291 } else { 292 b *= 10; 293 b += (c3 - '0'); 294 } 295 return (b); 296 } 297 298 static int 299 scan_oct_byte(void) 300 { 301 int c1, c2, c3; 302 int b; 303 304 b = 0; 305 306 c1 = scanc(); 307 if (!isodigit(c1)) { 308 (void) yyerror(_("malformed octal digit")); 309 } 310 b = c1 - '0'; 311 c2 = scanc(); 312 if (!isodigit(c2)) { 313 (void) yyerror(_("malformed octal digit")); 314 } 315 b *= 8; 316 b += (c2 - '0'); 317 c3 = scanc(); 318 if (!isodigit(c3)) { 319 unscanc(c3); 320 } else { 321 b *= 8; 322 b += (c3 - '0'); 323 } 324 return (b); 325 } 326 327 void 328 add_tok(int c) 329 { 330 if ((tokidx + 1) >= toksz) { 331 toksz += 64; 332 if ((token = realloc(token, toksz)) == NULL) { 333 (void) yyerror(_("out of memory")); 334 } 335 } 336 337 token[tokidx++] = (char)c; 338 token[tokidx] = 0; 339 } 340 void 341 add_wcs(wchar_t c) 342 { 343 if ((wideidx + 1) >= widesz) { 344 widesz += 64; 345 widestr = realloc(widestr, (widesz * sizeof (wchar_t))); 346 if (widestr == NULL) { 347 (void) yyerror(_("out of memory")); 348 } 349 } 350 351 widestr[wideidx++] = c; 352 widestr[wideidx] = 0; 353 } 354 355 wchar_t * 356 get_wcs(void) 357 { 358 wchar_t *ws = widestr; 359 wideidx = 0; 360 widestr = NULL; 361 widesz = 0; 362 if (ws == NULL) { 363 if ((ws = wsdup(L"")) == NULL) { 364 (void) yyerror(_("out of memory")); 365 } 366 } 367 return (ws); 368 } 369 370 static int 371 get_byte(void) 372 { 373 int c; 374 375 if ((c = scanc()) != esc_char) { 376 unscanc(c); 377 return (EOF); 378 } 379 c = scanc(); 380 381 switch (c) { 382 case 'd': 383 case 'D': 384 return (scan_dec_byte()); 385 case 'x': 386 case 'X': 387 return (scan_hex_byte()); 388 case '0': 389 case '1': 390 case '2': 391 case '3': 392 case '4': 393 case '5': 394 case '6': 395 case '7': 396 /* put the character back so we can get it */ 397 unscanc(c); 398 return (scan_oct_byte()); 399 default: 400 unscanc(c); 401 unscanc(esc_char); 402 return (EOF); 403 } 404 } 405 406 int 407 get_escaped(int c) 408 { 409 switch (c) { 410 case 'n': 411 return ('\n'); 412 case 'r': 413 return ('\r'); 414 case 't': 415 return ('\t'); 416 case 'f': 417 return ('\f'); 418 case 'v': 419 return ('\v'); 420 case 'b': 421 return ('\b'); 422 case 'a': 423 return ('\a'); 424 default: 425 return (c); 426 } 427 } 428 429 int 430 get_wide(void) 431 { 432 static char mbs[MB_LEN_MAX + 1] = ""; 433 static int mbi = 0; 434 int c; 435 wchar_t wc; 436 437 if (mb_cur_max >= sizeof (mbs)) { 438 (void) yyerror(_("max multibyte character size too big")); 439 } 440 for (;;) { 441 if ((mbi == mb_cur_max) || ((c = get_byte()) == EOF)) { 442 /* 443 * end of the byte sequence reached, but no 444 * valid wide decoding. fatal error. 445 */ 446 mbi = 0; 447 (void) yyerror(_("not a valid character encoding")); 448 } 449 mbs[mbi++] = c; 450 mbs[mbi] = 0; 451 452 /* does it decode? */ 453 if (to_wide(&wc, mbs) >= 0) { 454 break; 455 } 456 } 457 458 mbi = 0; 459 if ((category != T_CHARMAP) && (category != T_WIDTH)) { 460 if (check_charmap(wc) < 0) { 461 (void) yyerror(_("no symbolic name for character")); 462 } 463 } 464 465 yylval.wc = wc; 466 return (T_CHAR); 467 } 468 469 int 470 get_symbol(void) 471 { 472 int c; 473 474 while ((c = scanc()) != EOF) { 475 if (escaped) { 476 escaped = 0; 477 if (c == '\n') 478 continue; 479 add_tok(get_escaped(c)); 480 continue; 481 } 482 if (c == esc_char) { 483 escaped = 1; 484 continue; 485 } 486 if (c == '\n') { /* well that's strange! */ 487 (void) yyerror(_("unterminated symbolic name")); 488 } 489 if (c == '>') { /* end of symbol */ 490 491 /* 492 * This restarts the token from the beginning 493 * the next time we scan a character. (This 494 * token is complete.) 495 */ 496 497 if (token == NULL) { 498 (void) yyerror(_("missing symbolic name")); 499 } 500 tokidx = 0; 501 502 /* 503 * A few symbols are handled as keywords outside 504 * of the normal categories. 505 */ 506 if (category == T_END) { 507 int i; 508 for (i = 0; symwords[i].name != 0; i++) { 509 if (strcmp(token, symwords[i].name) == 510 0) { 511 last_kw = symwords[i].id; 512 return (last_kw); 513 } 514 } 515 } 516 /* 517 * Contextual rule: Only literal characters are 518 * permitted in CHARMAP. Anywhere else the symbolic 519 * forms are fine. 520 */ 521 if ((category != T_CHARMAP) && 522 (lookup_charmap(token, &yylval.wc)) != -1) { 523 return (T_CHAR); 524 } 525 if ((yylval.collsym = lookup_collsym(token)) != NULL) { 526 return (T_COLLSYM); 527 } 528 if ((yylval.collelem = lookup_collelem(token)) != 529 NULL) { 530 return (T_COLLELEM); 531 } 532 /* its an undefined symbol */ 533 yylval.token = strdup(token); 534 token = NULL; 535 toksz = 0; 536 tokidx = 0; 537 return (T_SYMBOL); 538 } 539 add_tok(c); 540 } 541 542 (void) yyerror(_("unterminated symbolic name")); 543 return (EOF); 544 } 545 546 int 547 get_category(void) 548 { 549 return (category); 550 } 551 552 static int 553 consume_token(void) 554 { 555 int len = tokidx; 556 int i; 557 558 tokidx = 0; 559 if (token == NULL) 560 return (T_NULL); 561 562 /* 563 * this one is special, because we don't want it to alter the 564 * last_kw field. 565 */ 566 if (strcmp(token, "...") == 0) { 567 return (T_ELLIPSIS); 568 } 569 570 /* search for reserved words first */ 571 for (i = 0; keywords[i].name; i++) { 572 int j; 573 if (strcmp(keywords[i].name, token) != 0) { 574 continue; 575 } 576 577 last_kw = keywords[i].id; 578 579 /* clear the top level category if we're done with it */ 580 if (last_kw == T_END) { 581 category = T_END; 582 } 583 584 /* set the top level category if we're changing */ 585 for (j = 0; categories[j]; j++) { 586 if (categories[j] != last_kw) 587 continue; 588 category = last_kw; 589 } 590 591 return (keywords[i].id); 592 } 593 594 /* maybe its a numeric constant? */ 595 if (isdigit(*token) || (*token == '-' && isdigit(token[1]))) { 596 char *eptr; 597 yylval.num = strtol(token, &eptr, 10); 598 if (*eptr != 0) 599 (void) yyerror(_("malformed number")); 600 return (T_NUMBER); 601 } 602 603 /* 604 * A single lone character is treated as a character literal. 605 * To avoid duplication of effort, we stick in the charmap. 606 */ 607 if (len == 1) { 608 yylval.wc = token[0]; 609 return (T_CHAR); 610 } 611 612 /* anything else is treated as a symbolic name */ 613 yylval.token = strdup(token); 614 token = NULL; 615 toksz = 0; 616 tokidx = 0; 617 return (T_NAME); 618 } 619 620 void 621 scan_to_eol(void) 622 { 623 int c; 624 while ((c = scanc()) != '\n') { 625 if (c == EOF) { 626 /* end of file without newline! */ 627 errf(_("missing newline")); 628 return; 629 } 630 } 631 assert(c == '\n'); 632 } 633 634 int 635 yylex(void) 636 { 637 int c; 638 639 while ((c = scanc()) != EOF) { 640 641 /* special handling for quoted string */ 642 if (instring) { 643 if (escaped) { 644 escaped = 0; 645 646 /* if newline, just eat and forget it */ 647 if (c == '\n') 648 continue; 649 650 if (strchr("xXd01234567", c)) { 651 unscanc(c); 652 unscanc(esc_char); 653 return (get_wide()); 654 } 655 yylval.wc = get_escaped(c); 656 return (T_CHAR); 657 } 658 if (c == esc_char) { 659 escaped = 1; 660 continue; 661 } 662 switch (c) { 663 case '<': 664 return (get_symbol()); 665 case '>': 666 /* oops! should generate syntax error */ 667 return (T_GT); 668 case '"': 669 instring = 0; 670 return (T_QUOTE); 671 default: 672 yylval.wc = c; 673 return (T_CHAR); 674 } 675 } 676 677 /* escaped characters first */ 678 if (escaped) { 679 escaped = 0; 680 if (c == '\n') { 681 /* eat the newline */ 682 continue; 683 } 684 hadtok = 1; 685 if (tokidx) { 686 /* an escape mid-token is nonsense */ 687 return (T_NULL); 688 } 689 690 /* numeric escapes are treated as wide characters */ 691 if (strchr("xXd01234567", c)) { 692 unscanc(c); 693 unscanc(esc_char); 694 return (get_wide()); 695 } 696 697 add_tok(get_escaped(c)); 698 continue; 699 } 700 701 /* if it is the escape charter itself note it */ 702 if (c == esc_char) { 703 escaped = 1; 704 continue; 705 } 706 707 /* remove from the comment char to end of line */ 708 if (c == com_char) { 709 while (c != '\n') { 710 if ((c = scanc()) == EOF) { 711 /* end of file without newline! */ 712 return (EOF); 713 } 714 } 715 assert(c == '\n'); 716 if (!hadtok) { 717 /* 718 * If there were no tokens on this line, 719 * then just pretend it didn't exist at all. 720 */ 721 continue; 722 } 723 hadtok = 0; 724 return (T_NL); 725 } 726 727 if (strchr(" \t\n;()<>,\"", c) && (tokidx != 0)) { 728 /* 729 * These are all token delimiters. If there 730 * is a token already in progress, we need to 731 * process it. 732 */ 733 unscanc(c); 734 return (consume_token()); 735 } 736 737 switch (c) { 738 case '\n': 739 if (!hadtok) { 740 /* 741 * If the line was completely devoid of tokens, 742 * then just ignore it. 743 */ 744 continue; 745 } 746 /* we're starting a new line, reset the token state */ 747 hadtok = 0; 748 return (T_NL); 749 case ',': 750 hadtok = 1; 751 return (T_COMMA); 752 case ';': 753 hadtok = 1; 754 return (T_SEMI); 755 case '(': 756 hadtok = 1; 757 return (T_LPAREN); 758 case ')': 759 hadtok = 1; 760 return (T_RPAREN); 761 case '>': 762 hadtok = 1; 763 return (T_GT); 764 case '<': 765 /* symbol start! */ 766 hadtok = 1; 767 return (get_symbol()); 768 case ' ': 769 case '\t': 770 /* whitespace, just ignore it */ 771 continue; 772 case '"': 773 hadtok = 1; 774 instring = 1; 775 return (T_QUOTE); 776 default: 777 hadtok = 1; 778 add_tok(c); 779 continue; 780 } 781 } 782 return (EOF); 783 } 784 785 int 786 yyerror(const char *msg) 787 { 788 (void) fprintf(stderr, _("%s: %d: error: %s\n"), 789 filename, lineno, msg); 790 exit(4); 791 } 792 793 void 794 errf(const char *fmt, ...) 795 { 796 char *msg; 797 798 va_list va; 799 va_start(va, fmt); 800 (void) vasprintf(&msg, fmt, va); 801 va_end(va); 802 803 (void) fprintf(stderr, _("%s: %d: error: %s\n"), 804 filename, lineno, msg); 805 free(msg); 806 exit(4); 807 } 808 809 void 810 warn(const char *fmt, ...) 811 { 812 char *msg; 813 814 va_list va; 815 va_start(va, fmt); 816 (void) vasprintf(&msg, fmt, va); 817 va_end(va); 818 819 (void) fprintf(stderr, _("%s: %d: warning: %s\n"), 820 filename, lineno, msg); 821 free(msg); 822 warnings++; 823 if (!warnok) 824 exit(4); 825 } 826