1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 12 /* 13 * Copyright 2010 Nexenta Systems, Inc. All rights reserved. 14 */ 15 16 /* 17 * This file contains the "scanner", which tokenizes the input files 18 * for localedef for processing by the higher level grammar processor. 19 */ 20 21 #include <stdio.h> 22 #include <stdlib.h> 23 #include <ctype.h> 24 #include <limits.h> 25 #include <string.h> 26 #include <widec.h> 27 #include <sys/types.h> 28 #include <assert.h> 29 #include "localedef.h" 30 #include "parser.tab.h" 31 32 int com_char = '#'; 33 int esc_char = '\\'; 34 int mb_cur_min = 1; 35 int mb_cur_max = 1; 36 int lineno = 1; 37 int warnings = 0; 38 static int nextline; 39 static FILE *input = stdin; 40 static const char *filename = "<stdin>"; 41 static int instring = 0; 42 static int escaped = 0; 43 44 /* 45 * Token space ... grows on demand. 46 */ 47 static char *token = NULL; 48 static int tokidx; 49 static int toksz = 0; 50 static int hadtok = 0; 51 52 /* 53 * Wide string space ... grows on demand. 54 */ 55 static wchar_t *widestr = NULL; 56 static int wideidx = 0; 57 static int widesz = 0; 58 59 /* 60 * The last keyword seen. This is useful to trigger the special lexer rules 61 * for "copy" and also collating symbols and elements. 62 */ 63 int last_kw = 0; 64 static int category = T_END; 65 66 static struct token { 67 int id; 68 const char *name; 69 } keywords[] = { 70 { T_COM_CHAR, "comment_char" }, 71 { T_ESC_CHAR, "escape_char" }, 72 { T_END, "END" }, 73 { T_COPY, "copy" }, 74 { T_MESSAGES, "LC_MESSAGES" }, 75 { T_YESSTR, "yesstr" }, 76 { T_YESEXPR, "yesexpr" }, 77 { T_NOSTR, "nostr" }, 78 { T_NOEXPR, "noexpr" }, 79 { T_MONETARY, "LC_MONETARY" }, 80 { T_INT_CURR_SYMBOL, "int_curr_symbol" }, 81 { T_CURRENCY_SYMBOL, "currency_symbol" }, 82 { T_MON_DECIMAL_POINT, "mon_decimal_point" }, 83 { T_MON_THOUSANDS_SEP, "mon_thousands_sep" }, 84 { T_POSITIVE_SIGN, "positive_sign" }, 85 { T_NEGATIVE_SIGN, "negative_sign" }, 86 { T_MON_GROUPING, "mon_grouping" }, 87 { T_INT_FRAC_DIGITS, "int_frac_digits" }, 88 { T_FRAC_DIGITS, "frac_digits" }, 89 { T_P_CS_PRECEDES, "p_cs_precedes" }, 90 { T_P_SEP_BY_SPACE, "p_sep_by_space" }, 91 { T_N_CS_PRECEDES, "n_cs_precedes" }, 92 { T_N_SEP_BY_SPACE, "n_sep_by_space" }, 93 { T_P_SIGN_POSN, "p_sign_posn" }, 94 { T_N_SIGN_POSN, "n_sign_posn" }, 95 { T_INT_P_CS_PRECEDES, "int_p_cs_precedes" }, 96 { T_INT_N_CS_PRECEDES, "int_n_cs_precedes" }, 97 { T_INT_P_SEP_BY_SPACE, "int_p_sep_by_space" }, 98 { T_INT_N_SEP_BY_SPACE, "int_n_sep_by_space" }, 99 { T_INT_P_SIGN_POSN, "int_p_sign_posn" }, 100 { T_INT_N_SIGN_POSN, "int_n_sign_posn" }, 101 { T_COLLATE, "LC_COLLATE" }, 102 { T_COLLATING_SYMBOL, "collating-symbol" }, 103 { T_COLLATING_ELEMENT, "collating-element" }, 104 { T_FROM, "from" }, 105 { T_ORDER_START, "order_start" }, 106 { T_ORDER_END, "order_end" }, 107 { T_FORWARD, "forward" }, 108 { T_BACKWARD, "backward" }, 109 { T_POSITION, "position" }, 110 { T_IGNORE, "IGNORE" }, 111 { T_UNDEFINED, "UNDEFINED" }, 112 { T_NUMERIC, "LC_NUMERIC" }, 113 { T_DECIMAL_POINT, "decimal_point" }, 114 { T_THOUSANDS_SEP, "thousands_sep" }, 115 { T_GROUPING, "grouping" }, 116 { T_TIME, "LC_TIME" }, 117 { T_ABDAY, "abday" }, 118 { T_DAY, "day" }, 119 { T_ABMON, "abmon" }, 120 { T_MON, "mon" }, 121 { T_D_T_FMT, "d_t_fmt" }, 122 { T_D_FMT, "d_fmt" }, 123 { T_T_FMT, "t_fmt" }, 124 { T_AM_PM, "am_pm" }, 125 { T_T_FMT_AMPM, "t_fmt_ampm" }, 126 { T_ERA, "era" }, 127 { T_ERA_D_FMT, "era_d_fmt" }, 128 { T_ERA_T_FMT, "era_t_fmt" }, 129 { T_ERA_D_T_FMT, "era_d_t_fmt" }, 130 { T_ALT_DIGITS, "alt_digits" }, 131 { T_CTYPE, "LC_CTYPE" }, 132 { T_ISUPPER, "upper" }, 133 { T_ISLOWER, "lower" }, 134 { T_ISALPHA, "alpha" }, 135 { T_ISDIGIT, "digit" }, 136 { T_ISPUNCT, "punct" }, 137 { T_ISXDIGIT, "xdigit" }, 138 { T_ISSPACE, "space" }, 139 { T_ISPRINT, "print" }, 140 { T_ISGRAPH, "graph" }, 141 { T_ISBLANK, "blank" }, 142 { T_ISCNTRL, "cntrl" }, 143 /* 144 * These entries are local additions, and not specified by 145 * TOG. Note that they are not guaranteed to be accurate for 146 * all locales, and so applications should not depend on them. 147 */ 148 { T_ISSPECIAL, "special" }, 149 { T_ISENGLISH, "english" }, 150 { T_ISPHONOGRAM, "phonogram" }, 151 { T_ISIDEOGRAM, "ideogram" }, 152 { T_ISNUMBER, "number" }, 153 /* 154 * We have to support this in the grammar, but it would be a 155 * syntax error to define a character as one of these without 156 * also defining it as an alpha or digit. We ignore it in our 157 * parsing. 158 */ 159 { T_ISALNUM, "alnum" }, 160 { T_TOUPPER, "toupper" }, 161 { T_TOLOWER, "tolower" }, 162 163 /* 164 * These are keywords used in the charmap file. Note that 165 * Solaris orginally used angle brackets to wrap some of them, 166 * but we removed that to simplify our parser. The first of these 167 * items are "global items." 168 */ 169 { T_CHARMAP, "CHARMAP" }, 170 { T_WIDTH, "WIDTH" }, 171 { T_WIDTH_DEFAULT, "WIDTH_DEFAULT" }, 172 173 { -1, NULL }, 174 }; 175 176 /* 177 * These special words are only used in a charmap file, enclosed in <>. 178 */ 179 static struct token symwords[] = { 180 { T_COM_CHAR, "comment_char" }, 181 { T_ESC_CHAR, "escape_char" }, 182 { T_CODE_SET, "code_set_name" }, 183 { T_MB_CUR_MAX, "mb_cur_max" }, 184 { T_MB_CUR_MIN, "mb_cur_min" }, 185 { -1, NULL }, 186 }; 187 188 static int categories[] = { 189 T_CHARMAP, 190 T_CTYPE, 191 T_COLLATE, 192 T_MESSAGES, 193 T_MONETARY, 194 T_NUMERIC, 195 T_TIME, 196 0 197 }; 198 199 void 200 reset_scanner(const char *fname) 201 { 202 if (fname == NULL) { 203 filename = "<stdin>"; 204 input = stdin; 205 } else { 206 if (input != stdin) 207 (void) fclose(input); 208 if ((input = fopen(fname, "r")) == NULL) { 209 perror("fopen"); 210 exit(4); 211 } 212 filename = fname; 213 } 214 com_char = '#'; 215 esc_char = '\\'; 216 instring = 0; 217 escaped = 0; 218 lineno = 1; 219 nextline = 1; 220 tokidx = 0; 221 wideidx = 0; 222 } 223 224 #define hex(x) \ 225 (isdigit(x) ? (x - '0') : ((islower(x) ? (x - 'a') : (x - 'A')) + 10)) 226 #define isodigit(x) ((x >= '0') && (x <= '7')) 227 228 static int 229 scanc(void) 230 { 231 int c; 232 233 c = getc(input); 234 lineno = nextline; 235 if (c == '\n') { 236 nextline++; 237 } 238 return (c); 239 } 240 241 static void 242 unscanc(int c) 243 { 244 if (c == '\n') { 245 nextline--; 246 } 247 if (ungetc(c, input) < 0) { 248 yyerror(_("ungetc failed")); 249 } 250 } 251 252 static int 253 scan_hex_byte(void) 254 { 255 int c1, c2; 256 int v; 257 258 c1 = scanc(); 259 if (!isxdigit(c1)) { 260 yyerror(_("malformed hex digit")); 261 return (0); 262 } 263 c2 = scanc(); 264 if (!isxdigit(c2)) { 265 yyerror(_("malformed hex digit")); 266 return (0); 267 } 268 v = ((hex(c1) << 4) | hex(c2)); 269 return (v); 270 } 271 272 static int 273 scan_dec_byte(void) 274 { 275 int c1, c2, c3; 276 int b; 277 278 c1 = scanc(); 279 if (!isdigit(c1)) { 280 yyerror(_("malformed decimal digit")); 281 return (0); 282 } 283 b = c1 - '0'; 284 c2 = scanc(); 285 if (!isdigit(c2)) { 286 yyerror(_("malformed decimal digit")); 287 return (0); 288 } 289 b *= 10; 290 b += (c2 - '0'); 291 c3 = scanc(); 292 if (!isdigit(c3)) { 293 unscanc(c3); 294 } else { 295 b *= 10; 296 b += (c3 - '0'); 297 } 298 return (b); 299 } 300 301 static int 302 scan_oct_byte(void) 303 { 304 int c1, c2, c3; 305 int b; 306 307 b = 0; 308 309 c1 = scanc(); 310 if (!isodigit(c1)) { 311 yyerror(_("malformed octal digit")); 312 return (0); 313 } 314 b = c1 - '0'; 315 c2 = scanc(); 316 if (!isodigit(c2)) { 317 yyerror(_("malformed octal digit")); 318 return (0); 319 } 320 b *= 8; 321 b += (c2 - '0'); 322 c3 = scanc(); 323 if (!isodigit(c3)) { 324 unscanc(c3); 325 } else { 326 b *= 8; 327 b += (c3 - '0'); 328 } 329 return (b); 330 } 331 332 void 333 add_tok(int c) 334 { 335 if ((tokidx + 1) >= toksz) { 336 toksz += 64; 337 if ((token = realloc(token, toksz)) == NULL) { 338 yyerror(_("out of memory")); 339 tokidx = 0; 340 toksz = 0; 341 return; 342 } 343 } 344 345 token[tokidx++] = (char)c; 346 token[tokidx] = 0; 347 } 348 void 349 add_wcs(wchar_t c) 350 { 351 if ((wideidx + 1) >= widesz) { 352 widesz += 64; 353 widestr = realloc(widestr, (widesz * sizeof (wchar_t))); 354 if (widestr == NULL) { 355 yyerror(_("out of memory")); 356 wideidx = 0; 357 widesz = 0; 358 return; 359 } 360 } 361 362 widestr[wideidx++] = c; 363 widestr[wideidx] = 0; 364 } 365 366 wchar_t * 367 get_wcs(void) 368 { 369 wchar_t *ws = widestr; 370 wideidx = 0; 371 widestr = NULL; 372 widesz = 0; 373 if (ws == NULL) { 374 if ((ws = wsdup(L"")) == NULL) { 375 yyerror(_("out of memory")); 376 } 377 } 378 return (ws); 379 } 380 381 static int 382 get_byte(void) 383 { 384 int c; 385 386 if ((c = scanc()) != esc_char) { 387 unscanc(c); 388 return (EOF); 389 } 390 c = scanc(); 391 392 switch (c) { 393 case 'd': 394 case 'D': 395 return (scan_dec_byte()); 396 case 'x': 397 case 'X': 398 return (scan_hex_byte()); 399 case '0': 400 case '1': 401 case '2': 402 case '3': 403 case '4': 404 case '5': 405 case '6': 406 case '7': 407 /* put the character back so we can get it */ 408 unscanc(c); 409 return (scan_oct_byte()); 410 default: 411 unscanc(c); 412 unscanc(esc_char); 413 return (EOF); 414 } 415 } 416 417 int 418 get_escaped(int c) 419 { 420 switch (c) { 421 case 'n': 422 return ('\n'); 423 case 'r': 424 return ('\r'); 425 case 't': 426 return ('\t'); 427 case 'f': 428 return ('\f'); 429 case 'v': 430 return ('\v'); 431 case 'b': 432 return ('\b'); 433 case 'a': 434 return ('\a'); 435 default: 436 return (c); 437 } 438 } 439 440 int 441 get_wide(void) 442 { 443 static char mbs[MB_LEN_MAX + 1] = ""; 444 static int mbi = 0; 445 int c; 446 wchar_t wc; 447 448 if (mb_cur_max >= sizeof (mbs)) { 449 yyerror(_("max multibyte character size too big")); 450 mbi = 0; 451 return (T_NULL); 452 } 453 for (;;) { 454 if ((mbi == mb_cur_max) || ((c = get_byte()) == EOF)) { 455 /* 456 * end of the byte sequence reached, but no 457 * valid wide decoding. fatal error. 458 */ 459 mbi = 0; 460 yyerror(_("not a valid character encoding")); 461 return (T_NULL); 462 } 463 mbs[mbi++] = c; 464 mbs[mbi] = 0; 465 466 /* does it decode? */ 467 if (to_wide(&wc, mbs) >= 0) { 468 break; 469 } 470 } 471 472 mbi = 0; 473 if (category != T_CHARMAP) { 474 if (check_charmap(wc) < 0) { 475 yyerror(_("no symbolic name for character")); 476 return (T_NULL); 477 } 478 } 479 480 yylval.wc = wc; 481 return (T_CHAR); 482 } 483 484 int 485 get_symbol(void) 486 { 487 int c; 488 489 while ((c = scanc()) != EOF) { 490 if (escaped) { 491 escaped = 0; 492 if (c == '\n') 493 continue; 494 add_tok(get_escaped(c)); 495 continue; 496 } 497 if (c == esc_char) { 498 escaped = 1; 499 continue; 500 } 501 if (c == '\n') { /* well that's strange! */ 502 yyerror(_("unterminated symbolic name")); 503 continue; 504 } 505 if (c == '>') { /* end of symbol */ 506 507 /* 508 * This restarts the token from the beginning 509 * the next time we scan a character. (This 510 * token is complete.) 511 */ 512 513 if (token == NULL) { 514 yyerror(_("missing symbolic name")); 515 return (T_NULL); 516 } 517 tokidx = 0; 518 519 /* 520 * A few symbols are handled as keywords outside 521 * of the normal categories. 522 */ 523 if (category == T_END) { 524 int i; 525 for (i = 0; symwords[i].name != 0; i++) { 526 if (strcmp(token, symwords[i].name) == 527 0) { 528 last_kw = symwords[i].id; 529 return (last_kw); 530 } 531 } 532 } 533 /* 534 * Contextual rule: Only literal characters are 535 * permitted in CHARMAP. Anywhere else the symbolic 536 * forms are fine. 537 */ 538 if ((category != T_CHARMAP) && 539 (lookup_charmap(token, &yylval.wc)) != -1) { 540 return (T_CHAR); 541 } 542 if ((yylval.collsym = lookup_collsym(token)) != NULL) { 543 return (T_COLLSYM); 544 } 545 if ((yylval.collelem = lookup_collelem(token)) != 546 NULL) { 547 return (T_COLLELEM); 548 } 549 /* its an undefined symbol */ 550 yylval.token = strdup(token); 551 token = NULL; 552 toksz = 0; 553 tokidx = 0; 554 return (T_SYMBOL); 555 } 556 add_tok(c); 557 } 558 559 yyerror(_("unterminated symbolic name")); 560 return (EOF); 561 } 562 563 int 564 get_category(void) 565 { 566 return (category); 567 } 568 569 static int 570 consume_token(void) 571 { 572 int len = tokidx; 573 int i; 574 575 tokidx = 0; 576 if (token == NULL) 577 return (T_NULL); 578 579 /* 580 * this one is special, because we don't want it to alter the 581 * last_kw field. 582 */ 583 if (strcmp(token, "...") == 0) { 584 return (T_ELLIPSIS); 585 } 586 587 /* search for reserved words first */ 588 for (i = 0; keywords[i].name; i++) { 589 int j; 590 if (strcmp(keywords[i].name, token) != 0) { 591 continue; 592 } 593 594 last_kw = keywords[i].id; 595 596 /* clear the top level category if we're done with it */ 597 if (last_kw == T_END) { 598 category = T_END; 599 } 600 601 /* set the top level category if we're changing */ 602 for (j = 0; categories[j]; j++) { 603 if (categories[j] != last_kw) 604 continue; 605 category = last_kw; 606 } 607 608 return (keywords[i].id); 609 } 610 611 /* maybe its a numeric constant? */ 612 if (isdigit(*token) || (*token == '-' && isdigit(token[1]))) { 613 char *eptr; 614 yylval.num = strtol(token, &eptr, 10); 615 if (*eptr != 0) 616 yyerror(_("malformed number")); 617 return (T_NUMBER); 618 } 619 620 /* 621 * A single lone character is treated as a character literal. 622 * To avoid duplication of effort, we stick in the charmap. 623 */ 624 if (len == 1) { 625 yylval.wc = token[0]; 626 return (T_CHAR); 627 } 628 629 /* anything else is treated as a symbolic name */ 630 yylval.token = strdup(token); 631 token = NULL; 632 toksz = 0; 633 tokidx = 0; 634 return (T_NAME); 635 } 636 637 void 638 scan_to_eol(void) 639 { 640 int c; 641 while ((c = scanc()) != '\n') { 642 if (c == EOF) { 643 /* end of file without newline! */ 644 errf(_("missing newline")); 645 return; 646 } 647 } 648 assert(c == '\n'); 649 } 650 651 int 652 yylex(void) 653 { 654 int c; 655 656 while ((c = scanc()) != EOF) { 657 658 /* special handling for quoted string */ 659 if (instring) { 660 if (escaped) { 661 escaped = 0; 662 663 /* if newline, just eat and forget it */ 664 if (c == '\n') 665 continue; 666 667 if (strchr("xXd01234567", c)) { 668 unscanc(c); 669 unscanc(esc_char); 670 return (get_wide()); 671 } 672 yylval.wc = get_escaped(c); 673 return (T_CHAR); 674 } 675 if (c == esc_char) { 676 escaped = 1; 677 continue; 678 } 679 switch (c) { 680 case '<': 681 return (get_symbol()); 682 case '>': 683 /* oops! should generate syntax error */ 684 return (T_GT); 685 case '"': 686 instring = 0; 687 return (T_QUOTE); 688 default: 689 yylval.wc = c; 690 return (T_CHAR); 691 } 692 } 693 694 /* escaped characters first */ 695 if (escaped) { 696 escaped = 0; 697 if (c == '\n') { 698 /* eat the newline */ 699 continue; 700 } 701 hadtok = 1; 702 if (tokidx) { 703 /* an escape mid-token is nonsense */ 704 return (T_NULL); 705 } 706 707 /* numeric escapes are treated as wide characters */ 708 if (strchr("xXd01234567", c)) { 709 unscanc(c); 710 unscanc(esc_char); 711 return (get_wide()); 712 } 713 714 add_tok(get_escaped(c)); 715 continue; 716 } 717 718 /* if it is the escape charter itself note it */ 719 if (c == esc_char) { 720 escaped = 1; 721 continue; 722 } 723 724 /* remove from the comment char to end of line */ 725 if (c == com_char) { 726 while (c != '\n') { 727 if ((c = scanc()) == EOF) { 728 /* end of file without newline! */ 729 return (EOF); 730 } 731 } 732 assert(c == '\n'); 733 if (!hadtok) { 734 /* 735 * If there were no tokens on this line, 736 * then just pretend it didn't exist at all. 737 */ 738 continue; 739 } 740 hadtok = 0; 741 return (T_NL); 742 } 743 744 if (strchr(" \t\n;()<>,\"", c) && (tokidx != 0)) { 745 /* 746 * These are all token delimiters. If there 747 * is a token already in progress, we need to 748 * process it. 749 */ 750 unscanc(c); 751 return (consume_token()); 752 } 753 754 switch (c) { 755 case '\n': 756 if (!hadtok) { 757 /* 758 * If the line was completely devoid of tokens, 759 * then just ignore it. 760 */ 761 continue; 762 } 763 /* we're starting a new line, reset the token state */ 764 hadtok = 0; 765 return (T_NL); 766 case ',': 767 hadtok = 1; 768 return (T_COMMA); 769 case ';': 770 hadtok = 1; 771 return (T_SEMI); 772 case '(': 773 hadtok = 1; 774 return (T_LPAREN); 775 case ')': 776 hadtok = 1; 777 return (T_RPAREN); 778 case '>': 779 hadtok = 1; 780 return (T_GT); 781 case '<': 782 /* symbol start! */ 783 hadtok = 1; 784 return (get_symbol()); 785 case ' ': 786 case '\t': 787 /* whitespace, just ignore it */ 788 continue; 789 case '"': 790 hadtok = 1; 791 instring = 1; 792 return (T_QUOTE); 793 default: 794 hadtok = 1; 795 add_tok(c); 796 continue; 797 } 798 } 799 return (EOF); 800 } 801 802 void 803 yyerror(const char *msg) 804 { 805 (void) fprintf(stderr, _("%s: %d: error: %s\n"), 806 filename, lineno, msg); 807 exit(4); 808 } 809 810 void 811 errf(const char *fmt, ...) 812 { 813 char *msg; 814 815 va_list va; 816 va_start(va, fmt); 817 (void) vasprintf(&msg, fmt, va); 818 va_end(va); 819 820 (void) fprintf(stderr, _("%s: %d: error: %s\n"), 821 filename, lineno, msg); 822 free(msg); 823 exit(4); 824 } 825 826 void 827 warn(const char *fmt, ...) 828 { 829 char *msg; 830 831 va_list va; 832 va_start(va, fmt); 833 (void) vasprintf(&msg, fmt, va); 834 va_end(va); 835 836 (void) fprintf(stderr, _("%s: %d: warning: %s\n"), 837 filename, lineno, msg); 838 free(msg); 839 warnings++; 840 if (!warnok) 841 exit(4); 842 } 843