1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 12 /* 13 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 14 */ 15 16 /* 17 * This file contains the "scanner", which tokenizes charmap files 18 * for iconv for processing by the higher level grammar processor. 19 */ 20 21 #include <stdio.h> 22 #include <stdlib.h> 23 #include <ctype.h> 24 #include <limits.h> 25 #include <string.h> 26 #include <widec.h> 27 #include <sys/types.h> 28 #include <assert.h> 29 #include "charmap.h" 30 #include "parser.tab.h" 31 32 int com_char = '#'; 33 int esc_char = '\\'; 34 int mb_cur_min = 1; 35 int mb_cur_max = MB_LEN_MAX; 36 int lineno = 1; 37 int warnings = 0; 38 static int nextline; 39 static FILE *input = stdin; 40 static const char *filename = "<stdin>"; 41 static int instring = 0; 42 static int escaped = 0; 43 44 /* 45 * Token space ... grows on demand. 46 */ 47 static char *token = NULL; 48 static int tokidx; 49 static int toksz = 0; 50 static int hadtok = 0; 51 52 /* 53 * The last keyword seen. This is useful to trigger the special lexer rules 54 * for "copy" and also collating symbols and elements. 55 */ 56 int last_kw = 0; 57 static int category = T_END; 58 59 static struct token { 60 int id; 61 const char *name; 62 } keywords[] = { 63 { T_COM_CHAR, "comment_char" }, 64 { T_ESC_CHAR, "escape_char" }, 65 { T_END, "END" }, 66 67 /* 68 * These are keywords used in the charmap file. Note that 69 * Solaris orginally used angle brackets to wrap some of them, 70 * but we removed that to simplify our parser. The first of these 71 * items are "global items." 72 */ 73 { T_CHARMAP, "CHARMAP" }, 74 { T_WIDTH, "WIDTH" }, 75 { T_WIDTH_DEFAULT, "WIDTH_DEFAULT" }, 76 77 { -1, NULL }, 78 }; 79 80 /* 81 * These special words are only used in a charmap file, enclosed in <>. 82 */ 83 static struct token symwords[] = { 84 { T_COM_CHAR, "comment_char" }, 85 { T_ESC_CHAR, "escape_char" }, 86 { T_CODE_SET, "code_set_name" }, 87 { T_MB_CUR_MAX, "mb_cur_max" }, 88 { T_MB_CUR_MIN, "mb_cur_min" }, 89 { -1, NULL }, 90 }; 91 92 static int categories[] = { 93 T_CHARMAP, 94 0 95 }; 96 97 void 98 reset_scanner(const char *fname) 99 { 100 if (fname == NULL) { 101 filename = "<stdin>"; 102 input = stdin; 103 } else { 104 if (input != stdin) 105 (void) fclose(input); 106 if ((input = fopen(fname, "r")) == NULL) { 107 perror(fname); 108 exit(1); 109 } 110 filename = fname; 111 } 112 com_char = '#'; 113 esc_char = '\\'; 114 instring = 0; 115 escaped = 0; 116 lineno = 1; 117 nextline = 1; 118 tokidx = 0; 119 last_kw = 0; 120 category = T_END; 121 } 122 123 #define hex(x) \ 124 (isdigit(x) ? (x - '0') : ((islower(x) ? (x - 'a') : (x - 'A')) + 10)) 125 #define isodigit(x) ((x >= '0') && (x <= '7')) 126 127 static int 128 scanc(void) 129 { 130 int c; 131 132 c = getc(input); 133 lineno = nextline; 134 if (c == '\n') { 135 nextline++; 136 } 137 return (c); 138 } 139 140 static void 141 unscanc(int c) 142 { 143 if (c == '\n') { 144 nextline--; 145 } 146 if (ungetc(c, input) < 0) { 147 yyerror(_("ungetc failed")); 148 } 149 } 150 151 static int 152 scan_hex_byte(void) 153 { 154 int c1, c2; 155 int v; 156 157 c1 = scanc(); 158 if (!isxdigit(c1)) { 159 yyerror(_("malformed hex digit")); 160 return (0); 161 } 162 c2 = scanc(); 163 if (!isxdigit(c2)) { 164 yyerror(_("malformed hex digit")); 165 return (0); 166 } 167 v = ((hex(c1) << 4) | hex(c2)); 168 return (v); 169 } 170 171 static int 172 scan_dec_byte(void) 173 { 174 int c1, c2, c3; 175 int b; 176 177 c1 = scanc(); 178 if (!isdigit(c1)) { 179 yyerror(_("malformed decimal digit")); 180 return (0); 181 } 182 b = c1 - '0'; 183 c2 = scanc(); 184 if (!isdigit(c2)) { 185 yyerror(_("malformed decimal digit")); 186 return (0); 187 } 188 b *= 10; 189 b += (c2 - '0'); 190 c3 = scanc(); 191 if (!isdigit(c3)) { 192 unscanc(c3); 193 } else { 194 b *= 10; 195 b += (c3 - '0'); 196 } 197 return (b); 198 } 199 200 static int 201 scan_oct_byte(void) 202 { 203 int c1, c2, c3; 204 int b; 205 206 b = 0; 207 208 c1 = scanc(); 209 if (!isodigit(c1)) { 210 yyerror(_("malformed octal digit")); 211 return (0); 212 } 213 b = c1 - '0'; 214 c2 = scanc(); 215 if (!isodigit(c2)) { 216 yyerror(_("malformed octal digit")); 217 return (0); 218 } 219 b *= 8; 220 b += (c2 - '0'); 221 c3 = scanc(); 222 if (!isodigit(c3)) { 223 unscanc(c3); 224 } else { 225 b *= 8; 226 b += (c3 - '0'); 227 } 228 return (b); 229 } 230 231 void 232 add_tok(int c) 233 { 234 if ((tokidx + 1) >= toksz) { 235 toksz += 64; 236 if ((token = realloc(token, toksz)) == NULL) { 237 yyerror(_("out of memory")); 238 tokidx = 0; 239 toksz = 0; 240 return; 241 } 242 } 243 244 token[tokidx++] = (char)c; 245 token[tokidx] = 0; 246 } 247 248 static int 249 get_byte(void) 250 { 251 int c; 252 253 if ((c = scanc()) != esc_char) { 254 unscanc(c); 255 return (EOF); 256 } 257 c = scanc(); 258 259 switch (c) { 260 case 'd': 261 case 'D': 262 return (scan_dec_byte()); 263 case 'x': 264 case 'X': 265 return (scan_hex_byte()); 266 case '0': 267 case '1': 268 case '2': 269 case '3': 270 case '4': 271 case '5': 272 case '6': 273 case '7': 274 /* put the character back so we can get it */ 275 unscanc(c); 276 return (scan_oct_byte()); 277 default: 278 unscanc(c); 279 unscanc(esc_char); 280 return (EOF); 281 } 282 } 283 284 int 285 get_escaped(int c) 286 { 287 switch (c) { 288 case 'n': 289 return ('\n'); 290 case 'r': 291 return ('\r'); 292 case 't': 293 return ('\t'); 294 case 'f': 295 return ('\f'); 296 case 'v': 297 return ('\v'); 298 case 'b': 299 return ('\b'); 300 case 'a': 301 return ('\a'); 302 default: 303 return (c); 304 } 305 } 306 307 int 308 get_wide(void) 309 { 310 /* NB: yylval.mbs[0] is the length */ 311 char *mbs = &yylval.mbs[1]; 312 int mbi = 0; 313 int c; 314 315 mbs[mbi] = 0; 316 if (mb_cur_max > MB_LEN_MAX) { 317 yyerror(_("max multibyte character size too big")); 318 return (T_NULL); 319 } 320 for (;;) { 321 if ((c = get_byte()) == EOF) 322 break; 323 if (mbi == mb_cur_max) { 324 unscanc(c); 325 yyerror(_("length > mb_cur_max")); 326 return (T_NULL); 327 } 328 mbs[mbi++] = c; 329 mbs[mbi] = 0; 330 } 331 332 /* result in yylval.mbs */ 333 mbs[-1] = mbi; 334 return (T_CHAR); 335 } 336 337 int 338 get_symbol(void) 339 { 340 int c; 341 342 while ((c = scanc()) != EOF) { 343 if (escaped) { 344 escaped = 0; 345 if (c == '\n') 346 continue; 347 add_tok(get_escaped(c)); 348 continue; 349 } 350 if (c == esc_char) { 351 escaped = 1; 352 continue; 353 } 354 if (c == '\n') { /* well that's strange! */ 355 yyerror(_("unterminated symbolic name")); 356 continue; 357 } 358 if (c == '>') { /* end of symbol */ 359 360 /* 361 * This restarts the token from the beginning 362 * the next time we scan a character. (This 363 * token is complete.) 364 */ 365 366 if (token == NULL) { 367 yyerror(_("missing symbolic name")); 368 return (T_NULL); 369 } 370 tokidx = 0; 371 372 /* 373 * A few symbols are handled as keywords outside 374 * of the normal categories. 375 */ 376 if (category == T_END) { 377 int i; 378 for (i = 0; symwords[i].name != 0; i++) { 379 if (strcmp(token, symwords[i].name) == 380 0) { 381 last_kw = symwords[i].id; 382 return (last_kw); 383 } 384 } 385 } 386 /* its an undefined symbol */ 387 yylval.token = strdup(token); 388 if (yylval.token == NULL) { 389 perror("malloc"); 390 exit(1); 391 } 392 token = NULL; 393 toksz = 0; 394 tokidx = 0; 395 return (T_SYMBOL); 396 } 397 add_tok(c); 398 } 399 400 yyerror(_("unterminated symbolic name")); 401 return (EOF); 402 } 403 404 405 static int 406 consume_token(void) 407 { 408 int len = tokidx; 409 int i; 410 411 tokidx = 0; 412 if (token == NULL) 413 return (T_NULL); 414 415 /* 416 * this one is special, because we don't want it to alter the 417 * last_kw field. 418 */ 419 if (strcmp(token, "...") == 0) { 420 return (T_ELLIPSIS); 421 } 422 423 /* search for reserved words first */ 424 for (i = 0; keywords[i].name; i++) { 425 int j; 426 if (strcmp(keywords[i].name, token) != 0) { 427 continue; 428 } 429 430 last_kw = keywords[i].id; 431 432 /* clear the top level category if we're done with it */ 433 if (last_kw == T_END) { 434 category = T_END; 435 } 436 437 /* set the top level category if we're changing */ 438 for (j = 0; categories[j]; j++) { 439 if (categories[j] != last_kw) 440 continue; 441 category = last_kw; 442 } 443 444 return (keywords[i].id); 445 } 446 447 /* maybe its a numeric constant? */ 448 if (isdigit(*token) || (*token == '-' && isdigit(token[1]))) { 449 char *eptr; 450 yylval.num = strtol(token, &eptr, 10); 451 if (*eptr != 0) 452 yyerror(_("malformed number")); 453 return (T_NUMBER); 454 } 455 456 /* 457 * A single lone character is treated as a character literal. 458 * To avoid duplication of effort, we stick in the charmap. 459 */ 460 if (len == 1) { 461 yylval.mbs[0] = 1; /* length */ 462 yylval.mbs[1] = token[0]; 463 yylval.mbs[2] = '\0'; 464 return (T_CHAR); 465 } 466 467 /* anything else is treated as a symbolic name */ 468 yylval.token = strdup(token); 469 token = NULL; 470 toksz = 0; 471 tokidx = 0; 472 return (T_NAME); 473 } 474 475 void 476 scan_to_eol(void) 477 { 478 int c; 479 while ((c = scanc()) != '\n') { 480 if (c == EOF) { 481 /* end of file without newline! */ 482 errf(_("missing newline")); 483 return; 484 } 485 } 486 assert(c == '\n'); 487 } 488 489 int 490 yylex(void) 491 { 492 int c; 493 494 while ((c = scanc()) != EOF) { 495 496 /* special handling for quoted string */ 497 if (instring) { 498 if (escaped) { 499 escaped = 0; 500 501 /* if newline, just eat and forget it */ 502 if (c == '\n') 503 continue; 504 505 if (strchr("xXd01234567", c)) { 506 unscanc(c); 507 unscanc(esc_char); 508 return (get_wide()); 509 } 510 yylval.mbs[0] = 1; /* length */ 511 yylval.mbs[1] = get_escaped(c); 512 yylval.mbs[2] = '\0'; 513 return (T_CHAR); 514 } 515 if (c == esc_char) { 516 escaped = 1; 517 continue; 518 } 519 switch (c) { 520 case '<': 521 return (get_symbol()); 522 case '>': 523 /* oops! should generate syntax error */ 524 return (T_GT); 525 case '"': 526 instring = 0; 527 return (T_QUOTE); 528 default: 529 yylval.mbs[0] = 1; /* length */ 530 yylval.mbs[1] = c; 531 yylval.mbs[2] = '\0'; 532 return (T_CHAR); 533 } 534 } 535 536 /* escaped characters first */ 537 if (escaped) { 538 escaped = 0; 539 if (c == '\n') { 540 /* eat the newline */ 541 continue; 542 } 543 hadtok = 1; 544 if (tokidx) { 545 /* an escape mid-token is nonsense */ 546 return (T_NULL); 547 } 548 549 /* numeric escapes are treated as wide characters */ 550 if (strchr("xXd01234567", c)) { 551 unscanc(c); 552 unscanc(esc_char); 553 return (get_wide()); 554 } 555 556 add_tok(get_escaped(c)); 557 continue; 558 } 559 560 /* if it is the escape charter itself note it */ 561 if (c == esc_char) { 562 escaped = 1; 563 continue; 564 } 565 566 /* remove from the comment char to end of line */ 567 if (c == com_char) { 568 while (c != '\n') { 569 if ((c = scanc()) == EOF) { 570 /* end of file without newline! */ 571 return (EOF); 572 } 573 } 574 assert(c == '\n'); 575 if (!hadtok) { 576 /* 577 * If there were no tokens on this line, 578 * then just pretend it didn't exist at all. 579 */ 580 continue; 581 } 582 hadtok = 0; 583 return (T_NL); 584 } 585 586 if (strchr(" \t\n;()<>,\"", c) && (tokidx != 0)) { 587 /* 588 * These are all token delimiters. If there 589 * is a token already in progress, we need to 590 * process it. 591 */ 592 unscanc(c); 593 return (consume_token()); 594 } 595 596 switch (c) { 597 case '\n': 598 if (!hadtok) { 599 /* 600 * If the line was completely devoid of tokens, 601 * then just ignore it. 602 */ 603 continue; 604 } 605 /* we're starting a new line, reset the token state */ 606 hadtok = 0; 607 return (T_NL); 608 case ',': 609 hadtok = 1; 610 return (T_COMMA); 611 case ';': 612 hadtok = 1; 613 return (T_SEMI); 614 case '(': 615 hadtok = 1; 616 return (T_LPAREN); 617 case ')': 618 hadtok = 1; 619 return (T_RPAREN); 620 case '>': 621 hadtok = 1; 622 return (T_GT); 623 case '<': 624 /* symbol start! */ 625 hadtok = 1; 626 return (get_symbol()); 627 case ' ': 628 case '\t': 629 /* whitespace, just ignore it */ 630 continue; 631 case '"': 632 hadtok = 1; 633 instring = 1; 634 return (T_QUOTE); 635 default: 636 hadtok = 1; 637 add_tok(c); 638 continue; 639 } 640 } 641 return (EOF); 642 } 643 644 void 645 yyerror(const char *msg) 646 { 647 (void) fprintf(stderr, _("%s: %d: error: %s\n"), 648 filename, lineno, msg); 649 exit(1); 650 } 651 652 void 653 errf(const char *fmt, ...) 654 { 655 char *msg; 656 657 va_list va; 658 va_start(va, fmt); 659 (void) vasprintf(&msg, fmt, va); 660 va_end(va); 661 662 (void) fprintf(stderr, _("%s: %d: error: %s\n"), 663 filename, lineno, msg); 664 free(msg); 665 exit(1); 666 } 667 668 void 669 warn(const char *fmt, ...) 670 { 671 char *msg; 672 673 va_list va; 674 va_start(va, fmt); 675 (void) vasprintf(&msg, fmt, va); 676 va_end(va); 677 678 (void) fprintf(stderr, _("%s: %d: warning: %s\n"), 679 filename, lineno, msg); 680 free(msg); 681 warnings++; 682 } 683