1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 12 /* 13 * Copyright 2010 Nexenta Systems, Inc. All rights reserved. 14 * Copyright 2013 DEY Storage Systems, Inc. 15 */ 16 17 /* 18 * This file contains the "scanner", which tokenizes the input files 19 * for localedef for processing by the higher level grammar processor. 20 */ 21 22 #include <stdio.h> 23 #include <stdlib.h> 24 #include <ctype.h> 25 #include <limits.h> 26 #include <string.h> 27 #include <widec.h> 28 #include <sys/types.h> 29 #include <assert.h> 30 #include "localedef.h" 31 #include "parser.tab.h" 32 33 int com_char = '#'; 34 int esc_char = '\\'; 35 int mb_cur_min = 1; 36 int mb_cur_max = 1; 37 int lineno = 1; 38 int warnings = 0; 39 static int nextline; 40 static FILE *input = stdin; 41 static const char *filename = "<stdin>"; 42 static int instring = 0; 43 static int escaped = 0; 44 45 /* 46 * Token space ... grows on demand. 47 */ 48 static char *token = NULL; 49 static int tokidx; 50 static int toksz = 0; 51 static int hadtok = 0; 52 53 /* 54 * Wide string space ... grows on demand. 55 */ 56 static wchar_t *widestr = NULL; 57 static int wideidx = 0; 58 static int widesz = 0; 59 60 /* 61 * The last keyword seen. This is useful to trigger the special lexer rules 62 * for "copy" and also collating symbols and elements. 63 */ 64 int last_kw = 0; 65 static int category = T_END; 66 67 static struct token { 68 int id; 69 const char *name; 70 } keywords[] = { 71 { T_COM_CHAR, "comment_char" }, 72 { T_ESC_CHAR, "escape_char" }, 73 { T_END, "END" }, 74 { T_COPY, "copy" }, 75 { T_MESSAGES, "LC_MESSAGES" }, 76 { T_YESSTR, "yesstr" }, 77 { T_YESEXPR, "yesexpr" }, 78 { T_NOSTR, "nostr" }, 79 { T_NOEXPR, "noexpr" }, 80 { T_MONETARY, "LC_MONETARY" }, 81 { T_INT_CURR_SYMBOL, "int_curr_symbol" }, 82 { T_CURRENCY_SYMBOL, "currency_symbol" }, 83 { T_MON_DECIMAL_POINT, "mon_decimal_point" }, 84 { T_MON_THOUSANDS_SEP, "mon_thousands_sep" }, 85 { T_POSITIVE_SIGN, "positive_sign" }, 86 { T_NEGATIVE_SIGN, "negative_sign" }, 87 { T_MON_GROUPING, "mon_grouping" }, 88 { T_INT_FRAC_DIGITS, "int_frac_digits" }, 89 { T_FRAC_DIGITS, "frac_digits" }, 90 { T_P_CS_PRECEDES, "p_cs_precedes" }, 91 { T_P_SEP_BY_SPACE, "p_sep_by_space" }, 92 { T_N_CS_PRECEDES, "n_cs_precedes" }, 93 { T_N_SEP_BY_SPACE, "n_sep_by_space" }, 94 { T_P_SIGN_POSN, "p_sign_posn" }, 95 { T_N_SIGN_POSN, "n_sign_posn" }, 96 { T_INT_P_CS_PRECEDES, "int_p_cs_precedes" }, 97 { T_INT_N_CS_PRECEDES, "int_n_cs_precedes" }, 98 { T_INT_P_SEP_BY_SPACE, "int_p_sep_by_space" }, 99 { T_INT_N_SEP_BY_SPACE, "int_n_sep_by_space" }, 100 { T_INT_P_SIGN_POSN, "int_p_sign_posn" }, 101 { T_INT_N_SIGN_POSN, "int_n_sign_posn" }, 102 { T_COLLATE, "LC_COLLATE" }, 103 { T_COLLATING_SYMBOL, "collating-symbol" }, 104 { T_COLLATING_ELEMENT, "collating-element" }, 105 { T_FROM, "from" }, 106 { T_ORDER_START, "order_start" }, 107 { T_ORDER_END, "order_end" }, 108 { T_FORWARD, "forward" }, 109 { T_BACKWARD, "backward" }, 110 { T_POSITION, "position" }, 111 { T_IGNORE, "IGNORE" }, 112 { T_UNDEFINED, "UNDEFINED" }, 113 { T_NUMERIC, "LC_NUMERIC" }, 114 { T_DECIMAL_POINT, "decimal_point" }, 115 { T_THOUSANDS_SEP, "thousands_sep" }, 116 { T_GROUPING, "grouping" }, 117 { T_TIME, "LC_TIME" }, 118 { T_ABDAY, "abday" }, 119 { T_DAY, "day" }, 120 { T_ABMON, "abmon" }, 121 { T_MON, "mon" }, 122 { T_D_T_FMT, "d_t_fmt" }, 123 { T_D_FMT, "d_fmt" }, 124 { T_T_FMT, "t_fmt" }, 125 { T_AM_PM, "am_pm" }, 126 { T_T_FMT_AMPM, "t_fmt_ampm" }, 127 { T_ERA, "era" }, 128 { T_ERA_D_FMT, "era_d_fmt" }, 129 { T_ERA_T_FMT, "era_t_fmt" }, 130 { T_ERA_D_T_FMT, "era_d_t_fmt" }, 131 { T_ALT_DIGITS, "alt_digits" }, 132 { T_CTYPE, "LC_CTYPE" }, 133 { T_ISUPPER, "upper" }, 134 { T_ISLOWER, "lower" }, 135 { T_ISALPHA, "alpha" }, 136 { T_ISDIGIT, "digit" }, 137 { T_ISPUNCT, "punct" }, 138 { T_ISXDIGIT, "xdigit" }, 139 { T_ISSPACE, "space" }, 140 { T_ISPRINT, "print" }, 141 { T_ISGRAPH, "graph" }, 142 { T_ISBLANK, "blank" }, 143 { T_ISCNTRL, "cntrl" }, 144 /* 145 * These entries are local additions, and not specified by 146 * TOG. Note that they are not guaranteed to be accurate for 147 * all locales, and so applications should not depend on them. 148 */ 149 { T_ISSPECIAL, "special" }, 150 { T_ISENGLISH, "english" }, 151 { T_ISPHONOGRAM, "phonogram" }, 152 { T_ISIDEOGRAM, "ideogram" }, 153 { T_ISNUMBER, "number" }, 154 /* 155 * We have to support this in the grammar, but it would be a 156 * syntax error to define a character as one of these without 157 * also defining it as an alpha or digit. We ignore it in our 158 * parsing. 159 */ 160 { T_ISALNUM, "alnum" }, 161 { T_TOUPPER, "toupper" }, 162 { T_TOLOWER, "tolower" }, 163 164 /* 165 * These are keywords used in the charmap file. Note that 166 * Solaris orginally used angle brackets to wrap some of them, 167 * but we removed that to simplify our parser. The first of these 168 * items are "global items." 169 */ 170 { T_CHARMAP, "CHARMAP" }, 171 { T_WIDTH, "WIDTH" }, 172 173 { -1, NULL }, 174 }; 175 176 /* 177 * These special words are only used in a charmap file, enclosed in <>. 178 */ 179 static struct token symwords[] = { 180 { T_COM_CHAR, "comment_char" }, 181 { T_ESC_CHAR, "escape_char" }, 182 { T_CODE_SET, "code_set_name" }, 183 { T_MB_CUR_MAX, "mb_cur_max" }, 184 { T_MB_CUR_MIN, "mb_cur_min" }, 185 { -1, NULL }, 186 }; 187 188 static int categories[] = { 189 T_CHARMAP, 190 T_CTYPE, 191 T_COLLATE, 192 T_MESSAGES, 193 T_MONETARY, 194 T_NUMERIC, 195 T_TIME, 196 T_WIDTH, 197 0 198 }; 199 200 void 201 reset_scanner(const char *fname) 202 { 203 if (fname == NULL) { 204 filename = "<stdin>"; 205 input = stdin; 206 } else { 207 if (input != stdin) 208 (void) fclose(input); 209 if ((input = fopen(fname, "r")) == NULL) { 210 perror("fopen"); 211 exit(4); 212 } 213 filename = fname; 214 } 215 com_char = '#'; 216 esc_char = '\\'; 217 instring = 0; 218 escaped = 0; 219 lineno = 1; 220 nextline = 1; 221 tokidx = 0; 222 wideidx = 0; 223 } 224 225 #define hex(x) \ 226 (isdigit(x) ? (x - '0') : ((islower(x) ? (x - 'a') : (x - 'A')) + 10)) 227 #define isodigit(x) ((x >= '0') && (x <= '7')) 228 229 static int 230 scanc(void) 231 { 232 int c; 233 234 c = getc(input); 235 lineno = nextline; 236 if (c == '\n') { 237 nextline++; 238 } 239 return (c); 240 } 241 242 static void 243 unscanc(int c) 244 { 245 if (c == '\n') { 246 nextline--; 247 } 248 if (ungetc(c, input) < 0) { 249 yyerror(_("ungetc failed")); 250 } 251 } 252 253 static int 254 scan_hex_byte(void) 255 { 256 int c1, c2; 257 int v; 258 259 c1 = scanc(); 260 if (!isxdigit(c1)) { 261 yyerror(_("malformed hex digit")); 262 return (0); 263 } 264 c2 = scanc(); 265 if (!isxdigit(c2)) { 266 yyerror(_("malformed hex digit")); 267 return (0); 268 } 269 v = ((hex(c1) << 4) | hex(c2)); 270 return (v); 271 } 272 273 static int 274 scan_dec_byte(void) 275 { 276 int c1, c2, c3; 277 int b; 278 279 c1 = scanc(); 280 if (!isdigit(c1)) { 281 yyerror(_("malformed decimal digit")); 282 return (0); 283 } 284 b = c1 - '0'; 285 c2 = scanc(); 286 if (!isdigit(c2)) { 287 yyerror(_("malformed decimal digit")); 288 return (0); 289 } 290 b *= 10; 291 b += (c2 - '0'); 292 c3 = scanc(); 293 if (!isdigit(c3)) { 294 unscanc(c3); 295 } else { 296 b *= 10; 297 b += (c3 - '0'); 298 } 299 return (b); 300 } 301 302 static int 303 scan_oct_byte(void) 304 { 305 int c1, c2, c3; 306 int b; 307 308 b = 0; 309 310 c1 = scanc(); 311 if (!isodigit(c1)) { 312 yyerror(_("malformed octal digit")); 313 return (0); 314 } 315 b = c1 - '0'; 316 c2 = scanc(); 317 if (!isodigit(c2)) { 318 yyerror(_("malformed octal digit")); 319 return (0); 320 } 321 b *= 8; 322 b += (c2 - '0'); 323 c3 = scanc(); 324 if (!isodigit(c3)) { 325 unscanc(c3); 326 } else { 327 b *= 8; 328 b += (c3 - '0'); 329 } 330 return (b); 331 } 332 333 void 334 add_tok(int c) 335 { 336 if ((tokidx + 1) >= toksz) { 337 toksz += 64; 338 if ((token = realloc(token, toksz)) == NULL) { 339 yyerror(_("out of memory")); 340 tokidx = 0; 341 toksz = 0; 342 return; 343 } 344 } 345 346 token[tokidx++] = (char)c; 347 token[tokidx] = 0; 348 } 349 void 350 add_wcs(wchar_t c) 351 { 352 if ((wideidx + 1) >= widesz) { 353 widesz += 64; 354 widestr = realloc(widestr, (widesz * sizeof (wchar_t))); 355 if (widestr == NULL) { 356 yyerror(_("out of memory")); 357 wideidx = 0; 358 widesz = 0; 359 return; 360 } 361 } 362 363 widestr[wideidx++] = c; 364 widestr[wideidx] = 0; 365 } 366 367 wchar_t * 368 get_wcs(void) 369 { 370 wchar_t *ws = widestr; 371 wideidx = 0; 372 widestr = NULL; 373 widesz = 0; 374 if (ws == NULL) { 375 if ((ws = wsdup(L"")) == NULL) { 376 yyerror(_("out of memory")); 377 } 378 } 379 return (ws); 380 } 381 382 static int 383 get_byte(void) 384 { 385 int c; 386 387 if ((c = scanc()) != esc_char) { 388 unscanc(c); 389 return (EOF); 390 } 391 c = scanc(); 392 393 switch (c) { 394 case 'd': 395 case 'D': 396 return (scan_dec_byte()); 397 case 'x': 398 case 'X': 399 return (scan_hex_byte()); 400 case '0': 401 case '1': 402 case '2': 403 case '3': 404 case '4': 405 case '5': 406 case '6': 407 case '7': 408 /* put the character back so we can get it */ 409 unscanc(c); 410 return (scan_oct_byte()); 411 default: 412 unscanc(c); 413 unscanc(esc_char); 414 return (EOF); 415 } 416 } 417 418 int 419 get_escaped(int c) 420 { 421 switch (c) { 422 case 'n': 423 return ('\n'); 424 case 'r': 425 return ('\r'); 426 case 't': 427 return ('\t'); 428 case 'f': 429 return ('\f'); 430 case 'v': 431 return ('\v'); 432 case 'b': 433 return ('\b'); 434 case 'a': 435 return ('\a'); 436 default: 437 return (c); 438 } 439 } 440 441 int 442 get_wide(void) 443 { 444 static char mbs[MB_LEN_MAX + 1] = ""; 445 static int mbi = 0; 446 int c; 447 wchar_t wc; 448 449 if (mb_cur_max >= sizeof (mbs)) { 450 yyerror(_("max multibyte character size too big")); 451 mbi = 0; 452 return (T_NULL); 453 } 454 for (;;) { 455 if ((mbi == mb_cur_max) || ((c = get_byte()) == EOF)) { 456 /* 457 * end of the byte sequence reached, but no 458 * valid wide decoding. fatal error. 459 */ 460 mbi = 0; 461 yyerror(_("not a valid character encoding")); 462 return (T_NULL); 463 } 464 mbs[mbi++] = c; 465 mbs[mbi] = 0; 466 467 /* does it decode? */ 468 if (to_wide(&wc, mbs) >= 0) { 469 break; 470 } 471 } 472 473 mbi = 0; 474 if ((category != T_CHARMAP) && (category != T_WIDTH)) { 475 if (check_charmap(wc) < 0) { 476 yyerror(_("no symbolic name for character")); 477 return (T_NULL); 478 } 479 } 480 481 yylval.wc = wc; 482 return (T_CHAR); 483 } 484 485 int 486 get_symbol(void) 487 { 488 int c; 489 490 while ((c = scanc()) != EOF) { 491 if (escaped) { 492 escaped = 0; 493 if (c == '\n') 494 continue; 495 add_tok(get_escaped(c)); 496 continue; 497 } 498 if (c == esc_char) { 499 escaped = 1; 500 continue; 501 } 502 if (c == '\n') { /* well that's strange! */ 503 yyerror(_("unterminated symbolic name")); 504 continue; 505 } 506 if (c == '>') { /* end of symbol */ 507 508 /* 509 * This restarts the token from the beginning 510 * the next time we scan a character. (This 511 * token is complete.) 512 */ 513 514 if (token == NULL) { 515 yyerror(_("missing symbolic name")); 516 return (T_NULL); 517 } 518 tokidx = 0; 519 520 /* 521 * A few symbols are handled as keywords outside 522 * of the normal categories. 523 */ 524 if (category == T_END) { 525 int i; 526 for (i = 0; symwords[i].name != 0; i++) { 527 if (strcmp(token, symwords[i].name) == 528 0) { 529 last_kw = symwords[i].id; 530 return (last_kw); 531 } 532 } 533 } 534 /* 535 * Contextual rule: Only literal characters are 536 * permitted in CHARMAP. Anywhere else the symbolic 537 * forms are fine. 538 */ 539 if ((category != T_CHARMAP) && 540 (lookup_charmap(token, &yylval.wc)) != -1) { 541 return (T_CHAR); 542 } 543 if ((yylval.collsym = lookup_collsym(token)) != NULL) { 544 return (T_COLLSYM); 545 } 546 if ((yylval.collelem = lookup_collelem(token)) != 547 NULL) { 548 return (T_COLLELEM); 549 } 550 /* its an undefined symbol */ 551 yylval.token = strdup(token); 552 token = NULL; 553 toksz = 0; 554 tokidx = 0; 555 return (T_SYMBOL); 556 } 557 add_tok(c); 558 } 559 560 yyerror(_("unterminated symbolic name")); 561 return (EOF); 562 } 563 564 int 565 get_category(void) 566 { 567 return (category); 568 } 569 570 static int 571 consume_token(void) 572 { 573 int len = tokidx; 574 int i; 575 576 tokidx = 0; 577 if (token == NULL) 578 return (T_NULL); 579 580 /* 581 * this one is special, because we don't want it to alter the 582 * last_kw field. 583 */ 584 if (strcmp(token, "...") == 0) { 585 return (T_ELLIPSIS); 586 } 587 588 /* search for reserved words first */ 589 for (i = 0; keywords[i].name; i++) { 590 int j; 591 if (strcmp(keywords[i].name, token) != 0) { 592 continue; 593 } 594 595 last_kw = keywords[i].id; 596 597 /* clear the top level category if we're done with it */ 598 if (last_kw == T_END) { 599 category = T_END; 600 } 601 602 /* set the top level category if we're changing */ 603 for (j = 0; categories[j]; j++) { 604 if (categories[j] != last_kw) 605 continue; 606 category = last_kw; 607 } 608 609 return (keywords[i].id); 610 } 611 612 /* maybe its a numeric constant? */ 613 if (isdigit(*token) || (*token == '-' && isdigit(token[1]))) { 614 char *eptr; 615 yylval.num = strtol(token, &eptr, 10); 616 if (*eptr != 0) 617 yyerror(_("malformed number")); 618 return (T_NUMBER); 619 } 620 621 /* 622 * A single lone character is treated as a character literal. 623 * To avoid duplication of effort, we stick in the charmap. 624 */ 625 if (len == 1) { 626 yylval.wc = token[0]; 627 return (T_CHAR); 628 } 629 630 /* anything else is treated as a symbolic name */ 631 yylval.token = strdup(token); 632 token = NULL; 633 toksz = 0; 634 tokidx = 0; 635 return (T_NAME); 636 } 637 638 void 639 scan_to_eol(void) 640 { 641 int c; 642 while ((c = scanc()) != '\n') { 643 if (c == EOF) { 644 /* end of file without newline! */ 645 errf(_("missing newline")); 646 return; 647 } 648 } 649 assert(c == '\n'); 650 } 651 652 int 653 yylex(void) 654 { 655 int c; 656 657 while ((c = scanc()) != EOF) { 658 659 /* special handling for quoted string */ 660 if (instring) { 661 if (escaped) { 662 escaped = 0; 663 664 /* if newline, just eat and forget it */ 665 if (c == '\n') 666 continue; 667 668 if (strchr("xXd01234567", c)) { 669 unscanc(c); 670 unscanc(esc_char); 671 return (get_wide()); 672 } 673 yylval.wc = get_escaped(c); 674 return (T_CHAR); 675 } 676 if (c == esc_char) { 677 escaped = 1; 678 continue; 679 } 680 switch (c) { 681 case '<': 682 return (get_symbol()); 683 case '>': 684 /* oops! should generate syntax error */ 685 return (T_GT); 686 case '"': 687 instring = 0; 688 return (T_QUOTE); 689 default: 690 yylval.wc = c; 691 return (T_CHAR); 692 } 693 } 694 695 /* escaped characters first */ 696 if (escaped) { 697 escaped = 0; 698 if (c == '\n') { 699 /* eat the newline */ 700 continue; 701 } 702 hadtok = 1; 703 if (tokidx) { 704 /* an escape mid-token is nonsense */ 705 return (T_NULL); 706 } 707 708 /* numeric escapes are treated as wide characters */ 709 if (strchr("xXd01234567", c)) { 710 unscanc(c); 711 unscanc(esc_char); 712 return (get_wide()); 713 } 714 715 add_tok(get_escaped(c)); 716 continue; 717 } 718 719 /* if it is the escape charter itself note it */ 720 if (c == esc_char) { 721 escaped = 1; 722 continue; 723 } 724 725 /* remove from the comment char to end of line */ 726 if (c == com_char) { 727 while (c != '\n') { 728 if ((c = scanc()) == EOF) { 729 /* end of file without newline! */ 730 return (EOF); 731 } 732 } 733 assert(c == '\n'); 734 if (!hadtok) { 735 /* 736 * If there were no tokens on this line, 737 * then just pretend it didn't exist at all. 738 */ 739 continue; 740 } 741 hadtok = 0; 742 return (T_NL); 743 } 744 745 if (strchr(" \t\n;()<>,\"", c) && (tokidx != 0)) { 746 /* 747 * These are all token delimiters. If there 748 * is a token already in progress, we need to 749 * process it. 750 */ 751 unscanc(c); 752 return (consume_token()); 753 } 754 755 switch (c) { 756 case '\n': 757 if (!hadtok) { 758 /* 759 * If the line was completely devoid of tokens, 760 * then just ignore it. 761 */ 762 continue; 763 } 764 /* we're starting a new line, reset the token state */ 765 hadtok = 0; 766 return (T_NL); 767 case ',': 768 hadtok = 1; 769 return (T_COMMA); 770 case ';': 771 hadtok = 1; 772 return (T_SEMI); 773 case '(': 774 hadtok = 1; 775 return (T_LPAREN); 776 case ')': 777 hadtok = 1; 778 return (T_RPAREN); 779 case '>': 780 hadtok = 1; 781 return (T_GT); 782 case '<': 783 /* symbol start! */ 784 hadtok = 1; 785 return (get_symbol()); 786 case ' ': 787 case '\t': 788 /* whitespace, just ignore it */ 789 continue; 790 case '"': 791 hadtok = 1; 792 instring = 1; 793 return (T_QUOTE); 794 default: 795 hadtok = 1; 796 add_tok(c); 797 continue; 798 } 799 } 800 return (EOF); 801 } 802 803 void 804 yyerror(const char *msg) 805 { 806 (void) fprintf(stderr, _("%s: %d: error: %s\n"), 807 filename, lineno, msg); 808 exit(4); 809 } 810 811 void 812 errf(const char *fmt, ...) 813 { 814 char *msg; 815 816 va_list va; 817 va_start(va, fmt); 818 (void) vasprintf(&msg, fmt, va); 819 va_end(va); 820 821 (void) fprintf(stderr, _("%s: %d: error: %s\n"), 822 filename, lineno, msg); 823 free(msg); 824 exit(4); 825 } 826 827 void 828 warn(const char *fmt, ...) 829 { 830 char *msg; 831 832 va_list va; 833 va_start(va, fmt); 834 (void) vasprintf(&msg, fmt, va); 835 va_end(va); 836 837 (void) fprintf(stderr, _("%s: %d: warning: %s\n"), 838 filename, lineno, msg); 839 free(msg); 840 warnings++; 841 if (!warnok) 842 exit(4); 843 } 844