1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* 28 * Copyright 1986, 1994 by Mortice Kern Systems Inc. All rights reserved. 29 */ 30 31 #pragma ident "%Z%%M% %I% %E% SMI" 32 33 /* 34 * awk -- mainline, yylex, etc. 
35 * 36 * Based on MKS awk(1) ported to be /usr/xpg4/bin/awk with POSIX/XCU4 changes 37 */ 38 39 #include "awk.h" 40 #include "y.tab.h" 41 #include <stdarg.h> 42 #include <unistd.h> 43 #include <locale.h> 44 45 static char *progfiles[NPFILE]; /* Programmes files for yylex */ 46 static char **progfilep = &progfiles[0]; /* Pointer to last file */ 47 static wchar_t *progptr; /* In-memory programme */ 48 static int proglen; /* Length of progptr */ 49 static wchar_t context[NCONTEXT]; /* Circular buffer of context */ 50 static wchar_t *conptr = &context[0]; /* context ptr */ 51 static FILE *progfp; /* Stdio stream for programme */ 52 static char *filename; 53 #ifdef DEBUG 54 static int dflag; 55 #endif 56 57 #define AWK_EXEC_MAGIC "<MKS AWKC>" 58 #define LEN_EXEC_MAGIC 10 59 60 static char unbal[] = "unbalanced E char"; 61 62 static void awkarginit(int c, char **av); 63 static int lexid(wint_t c); 64 static int lexnumber(wint_t c); 65 static int lexstring(wint_t endc); 66 static int lexregexp(wint_t endc); 67 68 static void awkvarinit(void); 69 static wint_t lexgetc(void); 70 static void lexungetc(wint_t c); 71 static size_t lexescape(wint_t endc, int regx, int cmd_line_operand); 72 static void awkierr(int perr, char *fmt, va_list ap); 73 static int usage(void); 74 void strescape(wchar_t *str); 75 static const char *toprint(wint_t); 76 char *_cmdname; 77 static wchar_t *mbconvert(char *str); 78 79 extern int isclvar(wchar_t *arg); 80 81 /* 82 * mainline for awk 83 */ 84 int 85 main(int argc, char *argv[]) 86 { 87 wchar_t *ap; 88 char *cmd; 89 90 cmd = argv[0]; 91 _cmdname = cmd; 92 93 linebuf = emalloc(NLINE * sizeof (wchar_t)); 94 95 /* 96 * At this point only messaging should be internationalized. 97 * numbers are still scanned as in the Posix locale. 
98 */ 99 (void) setlocale(LC_ALL, ""); 100 (void) setlocale(LC_NUMERIC, "C"); 101 #if !defined(TEXT_DOMAIN) 102 #define TEXT_DOMAIN "SYS_TEST" 103 #endif 104 (void) textdomain(TEXT_DOMAIN); 105 106 awkvarinit(); 107 /* running = 1; */ 108 while (argc > 1 && *argv[1] == '-') { 109 void *save_ptr = NULL; 110 ap = mbstowcsdup(&argv[1][1]); 111 if (ap == NULL) 112 break; 113 if (*ap == '\0') { 114 free(ap); 115 break; 116 } 117 save_ptr = (void *) ap; 118 ++argv; 119 --argc; 120 if (*ap == '-' && ap[1] == '\0') 121 break; 122 for (; *ap != '\0'; ++ap) { 123 switch (*ap) { 124 #ifdef DEBUG 125 case 'd': 126 dflag = 1; 127 continue; 128 129 #endif 130 case 'f': 131 if (argc < 2) { 132 (void) fprintf(stderr, 133 gettext("Missing script file\n")); 134 return (1); 135 } 136 *progfilep++ = argv[1]; 137 --argc; 138 ++argv; 139 continue; 140 141 case 'F': 142 if (ap[1] == '\0') { 143 if (argc < 2) { 144 (void) fprintf(stderr, 145 gettext("Missing field separator\n")); 146 return (1); 147 } 148 ap = mbstowcsdup(argv[1]); 149 --argc; 150 ++argv; 151 } else 152 ++ap; 153 strescape(ap); 154 strassign(varFS, linebuf, FALLOC, 155 wcslen(linebuf)); 156 break; 157 158 case 'v': { 159 wchar_t *vp; 160 wchar_t *arg; 161 162 if (argc < 2) { 163 (void) fprintf(stderr, 164 gettext("Missing variable assignment\n")); 165 return (1); 166 } 167 arg = mbconvert(argv[1]); 168 /* 169 * Ensure the variable expression 170 * is valid (correct form). 
171 */ 172 if (((vp = wcschr(arg, '=')) != NULL) && 173 isclvar(arg)) { 174 *vp = '\0'; 175 strescape(vp+1); 176 strassign(vlook(arg), linebuf, 177 FALLOC|FSENSE, 178 wcslen(linebuf)); 179 *vp = '='; 180 } else { 181 (void) fprintf(stderr, gettext( 182 "Invalid form for variable " 183 "assignment: %S\n"), arg); 184 return (1); 185 } 186 --argc; 187 ++argv; 188 continue; 189 } 190 191 default: 192 (void) fprintf(stderr, 193 gettext("Unknown option \"-%S\"\n"), ap); 194 return (usage()); 195 } 196 break; 197 } 198 if (save_ptr) 199 free(save_ptr); 200 } 201 if (progfilep == &progfiles[0]) { 202 if (argc < 2) 203 return (usage()); 204 filename = "[command line]"; /* BUG: NEEDS TRANSLATION */ 205 progptr = mbstowcsdup(argv[1]); 206 proglen = wcslen(progptr); 207 --argc; 208 ++argv; 209 } 210 211 argv[0] = cmd; 212 213 awkarginit(argc, argv); 214 215 /* running = 0; */ 216 (void) yyparse(); 217 218 lineno = 0; 219 /* 220 * Ok, done parsing, so now activate the rest of the nls stuff, set 221 * the radix character. 222 */ 223 (void) setlocale(LC_ALL, ""); 224 radixpoint = *localeconv()->decimal_point; 225 awk(); 226 /* NOTREACHED */ 227 return (0); 228 } 229 230 /* 231 * Do initial setup of buffers, etc. 232 * This must be called before most processing 233 * and especially before lexical analysis. 234 * Variables initialised here will be overruled by command 235 * line parameter initialisation. 
236 */ 237 static void 238 awkvarinit() 239 { 240 NODE *np; 241 242 (void) setvbuf(stderr, NULL, _IONBF, 0); 243 244 if ((NIOSTREAM = sysconf(_SC_OPEN_MAX) - 4) <= 0) { 245 (void) fprintf(stderr, 246 gettext("not enough available file descriptors")); 247 exit(1); 248 } 249 ofiles = (OFILE *)emalloc(sizeof (OFILE)*NIOSTREAM); 250 #ifdef A_ZERO_POINTERS 251 (void) memset((wchar_t *)ofiles, 0, sizeof (OFILE) * NIOSTREAM); 252 #else 253 { 254 /* initialize file descriptor table */ 255 OFILE *fp; 256 for (fp = ofiles; fp < &ofiles[NIOSTREAM]; fp += 1) { 257 fp->f_fp = FNULL; 258 fp->f_mode = 0; 259 fp->f_name = (char *)0; 260 } 261 } 262 #endif 263 constant = intnode((INT)0); 264 265 const0 = intnode((INT)0); 266 const1 = intnode((INT)1); 267 constundef = emptynode(CONSTANT, 0); 268 constundef->n_flags = FSTRING|FVINT; 269 constundef->n_string = _null; 270 constundef->n_strlen = 0; 271 inc_oper = emptynode(ADD, 0); 272 inc_oper->n_right = const1; 273 asn_oper = emptynode(ADD, 0); 274 field0 = node(FIELD, const0, NNULL); 275 276 { 277 RESFUNC near*rp; 278 279 for (rp = &resfuncs[0]; rp->rf_name != (LOCCHARP)NULL; ++rp) { 280 np = finstall(rp->rf_name, rp->rf_func, rp->rf_type); 281 } 282 } 283 { 284 RESERVED near*rp; 285 286 for (rp = &reserved[0]; rp->r_name != (LOCCHARP)NULL; ++rp) { 287 switch (rp->r_type) { 288 case SVAR: 289 case VAR: 290 running = 1; 291 np = vlook(rp->r_name); 292 if (rp->r_type == SVAR) 293 np->n_flags |= FSPECIAL; 294 if (rp->r_svalue != NULL) 295 strassign(np, rp->r_svalue, FSTATIC, 296 (size_t)rp->r_ivalue); 297 else { 298 constant->n_int = rp->r_ivalue; 299 (void) assign(np, constant); 300 } 301 running = 0; 302 break; 303 304 case KEYWORD: 305 kinstall(rp->r_name, (int)rp->r_ivalue); 306 break; 307 } 308 } 309 } 310 311 varNR = vlook(s_NR); 312 varFNR = vlook(s_FNR); 313 varNF = vlook(s_NF); 314 varOFMT = vlook(s_OFMT); 315 varCONVFMT = vlook(s_CONVFMT); 316 varOFS = vlook(s_OFS); 317 varORS = vlook(s_ORS); 318 varRS = vlook(s_RS); 319 varFS 
= vlook(s_FS); 320 varARGC = vlook(s_ARGC); 321 varSUBSEP = vlook(s_SUBSEP); 322 varENVIRON = vlook(s_ENVIRON); 323 varFILENAME = vlook(s_FILENAME); 324 varSYMTAB = vlook(s_SYMTAB); 325 incNR = node(ASG, varNR, node(ADD, varNR, const1)); 326 incFNR = node(ASG, varFNR, node(ADD, varFNR, const1)); 327 clrFNR = node(ASG, varFNR, const0); 328 } 329 330 /* 331 * Initialise awk ARGC, ARGV variables. 332 */ 333 static void 334 awkarginit(int ac, char **av) 335 { 336 int i; 337 wchar_t *cp; 338 339 ARGVsubi = node(INDEX, vlook(s_ARGV), constant); 340 running = 1; 341 constant->n_int = ac; 342 (void) assign(varARGC, constant); 343 for (i = 0; i < ac; ++i) { 344 cp = mbstowcsdup(av[i]); 345 constant->n_int = i; 346 strassign(exprreduce(ARGVsubi), cp, 347 FSTATIC|FSENSE, wcslen(cp)); 348 } 349 running = 0; 350 } 351 352 /* 353 * Clean up when done parsing a function. 354 * All formal parameters, because of a deal (funparm) in 355 * yylex, get put into the symbol table in front of any 356 * global variable of the same name. When the entire 357 * function is parsed, remove these formal dummy nodes 358 * from the symbol table but retain the nodes because 359 * the generated tree points at them. 360 */ 361 void 362 uexit(NODE *np) 363 { 364 NODE *formal; 365 366 while ((formal = getlist(&np)) != NNULL) 367 delsymtab(formal, 0); 368 } 369 370 /* 371 * The lexical analyzer. 
 */
/*
 * yylex() returns the next token code to the parser.
 *
 * Single characters are returned through the ctosym[] mapping at the
 * bottom; multi-character operators, identifiers, numbers, strings and
 * regular expressions are recognised here or by the lex*() helpers,
 * which leave the token value in yylval.  Besides tokenising, this
 * routine also
 *   - turns significant newlines into ';' and inserts a ';' before
 *     '}' when the previous token was not one,
 *   - inserts the implicit CONCAT token between adjacent terms
 *     (tracked with `catterm'), queuing the displaced token in
 *     `savetoken' for the next call,
 *   - keeps the (), {} and [] nesting counts and reports unbalanced
 *     closers.
 *
 * When built with DEBUG, yylex() is a wrapper that prints each token
 * code if dflag is set and the real lexer is compiled as yyhex().
 */
int
yylex()
#ifdef	DEBUG
{
	int l;

	l = yyhex();
	if (dflag)
		(void) printf("%d\n", l);
	return (l);
}
yyhex()
#endif
{
	wint_t c, c1;
	int i;
	static int savetoken = 0;	/* token queued for the next call */
	static int wasfield;		/* countdown set after `$' CONSTANT */
	static int isfuncdef;		/* previous token was DEFFUNC */
	static int nbrace, nparen, nbracket;	/* current nesting depths */
	/* Final mapping from punctuation characters to grammar symbols. */
	static struct ctosymstruct {
		wint_t c, sym;
	} ctosym[] = {
		{ '|', BAR },		{ '^', CARAT },
		{ '~', TILDE },		{ '<', LANGLE },
		{ '>', RANGLE },	{ '+', PLUSC },
		{ '-', HYPHEN },	{ '*', STAR },
		{ '/', SLASH },		{ '%', PERCENT },
		{ '!', EXCLAMATION },	{ '$', DOLLAR },
		{ '[', LSQUARE },	{ ']', RSQUARE },
		{ '(', LPAREN },	{ ')', RPAREN },
		{ ';', SEMI },		{ '{', LBRACE },
		{ '}', RBRACE },	{ 0, 0 }
	};

	if (savetoken) {
		/* Deliver the token that was queued on the previous call. */
		c = savetoken;
		savetoken = 0;
	} else if (redelim != '\0') {
		/*
		 * A regular expression delimiter is pending (redelim is set
		 * outside this function - presumably by the parser): read
		 * the expression up to that delimiter and queue the
		 * delimiter itself as the following token.
		 */
		c = redelim;
		redelim = 0;
		catterm = 0;
		savetoken = c;
		return (lexlast = lexregexp(c));
	} else while ((c = lexgetc()) != WEOF) {
		if (iswalpha(c) || c == '_') {
			c = lexid(c);
		} else if (iswdigit(c) || c == '.') {
			c = lexnumber(c);
		} else if (isWblank(c)) {
			continue;
		} else switch (c) {
#if DOS || OS2
		case 032:			/* ^Z */
			continue;
#endif

		case '"':
			c = lexstring(c);
			break;

		case '#':
			/* Comment: skip to, but do not consume, the newline */
			while ((c = lexgetc()) != '\n' && c != WEOF)
				;
			lexungetc(c);
			continue;

		case '+':
			if ((c1 = lexgetc()) == '+')
				c = INC;
			else if (c1 == '=')
				c = AADD;
			else
				lexungetc(c1);
			break;

		case '-':
			if ((c1 = lexgetc()) == '-')
				c = DEC;
			else if (c1 == '=')
				c = ASUB;
			else
				lexungetc(c1);
			break;

		case '*':
			/* "*=", "**=" and "**" */
			if ((c1 = lexgetc()) == '=')
				c = AMUL;
			else if (c1 == '*') {
				if ((c1 = lexgetc()) == '=')
					c = AEXP;
				else {
					c = EXP;
					lexungetc(c1);
				}
			} else
				lexungetc(c1);
			break;

		case '^':
			if ((c1 = lexgetc()) == '=') {
				c = AEXP;
			} else {
				c = EXP;
				lexungetc(c1);
			}
			break;

		case '/':
			/*
			 * "/=" is the divide-assign operator only when the
			 * previous token is not one of those listed here;
			 * otherwise the '=' is pushed back and a plain '/'
			 * is returned.
			 */
			if ((c1 = lexgetc()) == '=' &&
			    lexlast != RE && lexlast != NRE &&
			    lexlast != ';' && lexlast != '\n' &&
			    lexlast != ',' && lexlast != '(')
				c = ADIV;
			else
				lexungetc(c1);
			break;

		case '%':
			if ((c1 = lexgetc()) == '=')
				c = AREM;
			else
				lexungetc(c1);
			break;

		case '&':
			if ((c1 = lexgetc()) == '&')
				c = AND;
			else
				lexungetc(c1);
			break;

		case '|':
			/* "||", or a PIPE when inside print/printf */
			if ((c1 = lexgetc()) == '|')
				c = OR;
			else {
				lexungetc(c1);
				if (inprint)
					c = PIPE;
			}
			break;

		case '>':
			/*
			 * ">=", ">>", or an output redirection when inside
			 * an unparenthesised print/printf.
			 */
			if ((c1 = lexgetc()) == '=')
				c = GE;
			else if (c1 == '>')
				c = APPEND;
			else {
				lexungetc(c1);
				if (nparen == 0 && inprint)
					c = WRITE;
			}
			break;

		case '<':
			if ((c1 = lexgetc()) == '=')
				c = LE;
			else
				lexungetc(c1);
			break;

		case '!':
			if ((c1 = lexgetc()) == '=')
				c = NE;
			else if (c1 == '~')
				c = NRE;
			else
				lexungetc(c1);
			break;

		case '=':
			if ((c1 = lexgetc()) == '=')
				c = EQ;
			else {
				lexungetc(c1);
				c = ASG;
			}
			break;

		case '\n':
			/*
			 * A newline is either ignored (`continue') or turned
			 * into ';', depending on the previous token.
			 */
			switch (lexlast) {
			case ')':
				if (catterm || inprint) {
					c = ';';
					break;
				}
				/* FALLTHROUGH: otherwise ignore the newline */
			case AND:
			case OR:
			case COMMA:
			case '{':
			case ELSE:
			case ';':
			case DO:
				continue;

			case '}':
				if (nbrace != 0)
					continue;
				/* FALLTHROUGH: outermost '}' ends a statement */

			default:
				c = ';';
				break;
			}
			break;

		case ELSE:
			/*
			 * NOTE(review): this switch is only reached for
			 * characters that are not letters, digits or blanks,
			 * so this case matches only an input character whose
			 * code equals the ELSE token value - confirm intent.
			 */
			if (lexlast != ';') {
				savetoken = ELSE;
				c = ';';
			}
			break;

		case '(':
			++nparen;
			break;

		case ')':
			if (--nparen < 0)
				awkerr(unbal, "()");
			break;

		case '{':
			nbrace++;
			break;

		case '}':
			if (--nbrace < 0) {
				char brk[3];

				brk[0] = '{';
				brk[1] = '}';
				brk[2] = '\0';
				awkerr(unbal, brk);
			}
			/* Supply the ';' that may be omitted before '}'. */
			if (lexlast != ';') {
				savetoken = c;
				c = ';';
			}
			break;

		case '[':
			++nbracket;
			break;

		case ']':
			if (--nbracket < 0) {
				char brk[3];

				brk[0] = '[';
				brk[1] = ']';
				brk[2] = '\0';
				awkerr(unbal, brk);
			}
			break;

		case '\\':
			/* Backslash-newline is a line continuation. */
			if ((c1 = lexgetc()) == '\n')
				continue;
			lexungetc(c1);
			break;

		case ',':
			c = COMMA;
			break;

		case '?':
			c = QUEST;
			break;

		case ':':
			c = COLON;
			break;

		default:
			if (!iswprint(c))
				awkerr(
				    gettext("invalid character \"%s\""),
				    toprint(c));
			break;
		}
		/* A token has been recognised: leave the read loop. */
		break;
	}

	/*
	 * Concatenation tracking.  `catterm' is non-zero when the token
	 * just returned ended a term; if the current token can start a
	 * term, CONCAT is returned now and the current token is queued
	 * in savetoken.
	 */
	switch (c) {
	case ']':
		++catterm;
		break;

	case VAR:
		if (catterm) {
			savetoken = c;
			c = CONCAT;
			catterm = 0;
		} else if (!isfuncdef) {
			/* A name followed by '(' does not end a term. */
			if ((c1 = lexgetc()) != '(')
				++catterm;
			lexungetc(c1);
		}
		isfuncdef = 0;
		break;

	case PARM:
	case CONSTANT:
		if (catterm) {
			savetoken = c;
			c = CONCAT;
			catterm = 0;
		} else {
			if (lexlast == '$')
				wasfield = 2;
			++catterm;
		}
		break;

	case INC:
	case DEC:
		if (!catterm || lexlast != CONSTANT || wasfield)
			break;
		/* FALLTHROUGH */

	case UFUNC:
	case FUNC:
	case GETLINE:
	case '!':
	case '$':
	case '(':
		if (catterm) {
			savetoken = c;
			c = CONCAT;
			catterm = 0;
		}
		break;

	/* { */ case '}':
		if (nbrace == 0)
			savetoken = ';';
		/* FALLTHROUGH */
	case ';':
		inprint = 0;
		/* FALLTHROUGH */
	default:
		if (c == DEFFUNC)
			isfuncdef = 1;
		catterm = 0;
	}
	lexlast = c;
	if (wasfield)
		wasfield--;
	/*
	 * Map character constants to symbolic names.
	 */
	for (i = 0; ctosym[i].c != 0; i++)
		if (c == ctosym[i].c) {
			c = ctosym[i].sym;
			break;
		}
	return ((int)c);
}

/*
 * Read a number for the lexical analyzer.
 * Input is the first character of the number.
 * Return value is the lexical type.
733 */ 734 static int 735 lexnumber(wint_t c) 736 { 737 wchar_t *cp; 738 int dotfound = 0; 739 int efound = 0; 740 INT number; 741 742 cp = linebuf; 743 do { 744 if (iswdigit(c)) 745 ; 746 else if (c == '.') { 747 if (dotfound++) 748 break; 749 } else if (c == 'e' || c == 'E') { 750 if ((c = lexgetc()) != '-' && c != '+') { 751 lexungetc(c); 752 c = 'e'; 753 } else 754 *cp++ = 'e'; 755 if (efound++) 756 break; 757 } else 758 break; 759 *cp++ = c; 760 } while ((c = lexgetc()) != WEOF); 761 *cp = '\0'; 762 if (dotfound && cp == linebuf+1) 763 return (DOT); 764 lexungetc(c); 765 errno = 0; 766 if (!dotfound && !efound && 767 ((number = wcstol(linebuf, (wchar_t **)0, 10)), errno != ERANGE)) 768 yylval.node = intnode(number); 769 else 770 yylval.node = realnode((REAL)wcstod(linebuf, (wchar_t **)0)); 771 return (CONSTANT); 772 } 773 774 /* 775 * Read an identifier. 776 * Input is first character of identifier. 777 * Return VAR. 778 */ 779 static int 780 lexid(wint_t c) 781 { 782 wchar_t *cp; 783 size_t i; 784 NODE *np; 785 786 cp = linebuf; 787 do { 788 *cp++ = c; 789 c = lexgetc(); 790 } while (iswalpha(c) || iswdigit(c) || c == '_'); 791 *cp = '\0'; 792 lexungetc(c); 793 yylval.node = np = vlook(linebuf); 794 795 switch (np->n_type) { 796 case KEYWORD: 797 switch (np->n_keywtype) { 798 case PRINT: 799 case PRINTF: 800 ++inprint; 801 default: 802 return ((int)np->n_keywtype); 803 } 804 /* NOTREACHED */ 805 806 case ARRAY: 807 case VAR: 808 /* 809 * If reading the argument list, create a dummy node 810 * for the duration of that function. These variables 811 * can be removed from the symbol table at function end 812 * but they must still exist because the execution tree 813 * knows about them. 
814 */ 815 if (funparm) { 816 do_funparm: 817 np = emptynode(PARM, i = (cp-linebuf)); 818 np->n_flags = FSTRING; 819 np->n_string = _null; 820 np->n_strlen = 0; 821 (void) memcpy(np->n_name, linebuf, 822 (i+1) * sizeof (wchar_t)); 823 addsymtab(np); 824 yylval.node = np; 825 } else if (np == varNF || (np == varFS && 826 (!doing_begin || begin_getline))) { 827 /* 828 * If the user program references NF or sets 829 * FS either outside of a begin block or 830 * in a begin block after a getline then the 831 * input line will be split immediately upon read 832 * rather than when a field is first referenced. 833 */ 834 needsplit = 1; 835 } else if (np == varENVIRON) 836 needenviron = 1; 837 case PARM: 838 return (VAR); 839 840 case UFUNC: 841 /* 842 * It is ok to redefine functions as parameters 843 */ 844 if (funparm) goto do_funparm; 845 case FUNC: 846 case GETLINE: 847 /* 848 * When a getline is encountered, clear the 'doing_begin' flag. 849 * This will force the 'needsplit' flag to be set, even inside 850 * a begin block, if FS is altered. (See VAR case above) 851 */ 852 if (doing_begin) 853 begin_getline = 1; 854 return (np->n_type); 855 } 856 /* NOTREACHED */ 857 return (0); 858 } 859 860 /* 861 * Read a string for the lexical analyzer. 862 * `endc' terminates the string. 863 */ 864 static int 865 lexstring(wint_t endc) 866 { 867 size_t length = lexescape(endc, 0, 0); 868 869 yylval.node = stringnode(linebuf, FALLOC, length); 870 return (CONSTANT); 871 } 872 873 /* 874 * Read a regular expression. 875 */ 876 static int 877 lexregexp(wint_t endc) 878 { 879 (void) lexescape(endc, 1, 0); 880 yylval.node = renode(linebuf); 881 return (URE); 882 } 883 884 /* 885 * Process a string, converting the escape characters as required by 886 * 1003.2. The processed string ends up in the global linebuf[]. This 887 * routine also changes the value of 'progfd' - the program file 888 * descriptor, so it should be used with some care. 
It is presently used to 889 * process -v (awk1.c) and var=str type arguments (awk2.c, nextrecord()). 890 */ 891 void 892 strescape(wchar_t *str) 893 { 894 progptr = str; 895 proglen = wcslen(str) + 1; /* Include \0 */ 896 (void) lexescape('\0', 0, 1); 897 progptr = NULL; 898 } 899 900 /* 901 * Read a string or regular expression, terminated by ``endc'', 902 * for lexical analyzer, processing escape sequences. 903 * Return string length. 904 */ 905 static size_t 906 lexescape(wint_t endc, int regx, int cmd_line_operand) 907 { 908 static char nlre[256]; 909 static char nlstr[256]; 910 static char eofre[256]; 911 static char eofstr[256]; 912 int first_time = 1; 913 wint_t c; 914 wchar_t *cp; 915 int n, max; 916 917 if (first_time == 1) { 918 (void) strcpy(nlre, gettext("Newline in regular expression\n")); 919 (void) strcpy(nlstr, gettext("Newline in string\n")); 920 (void) strcpy(eofre, gettext("EOF in regular expression\n")); 921 (void) strcpy(eofstr, gettext("EOF in string\n")); 922 first_time = 0; 923 } 924 925 cp = linebuf; 926 while ((c = lexgetc()) != endc) { 927 if (c == '\n') 928 awkerr(regx ? 
nlre : nlstr); 929 if (c == '\\') { 930 switch (c = lexgetc(), c) { 931 case '\\': 932 if (regx) 933 *cp++ = '\\'; 934 break; 935 936 case '/': 937 c = '/'; 938 break; 939 940 case 'n': 941 c = '\n'; 942 break; 943 944 case 'b': 945 c = '\b'; 946 break; 947 948 case 't': 949 c = '\t'; 950 break; 951 952 case 'r': 953 c = '\r'; 954 break; 955 956 case 'f': 957 c = '\f'; 958 break; 959 960 case 'v': 961 c = '\v'; 962 break; 963 964 case 'a': 965 c = (char)0x07; 966 break; 967 968 case 'x': 969 n = 0; 970 while (iswxdigit(c = lexgetc())) { 971 if (iswdigit(c)) 972 c -= '0'; 973 else if (iswupper(c)) 974 c -= 'A'-10; 975 else 976 c -= 'a'-10; 977 n = (n<<4) + c; 978 } 979 lexungetc(c); 980 c = n; 981 break; 982 983 case '0': 984 case '1': 985 case '2': 986 case '3': 987 case '4': 988 case '5': 989 case '6': 990 case '7': 991 #if 0 992 /* 993 * Posix.2 draft 10 disallows the use of back-referencing - it explicitly 994 * requires processing of the octal escapes both in strings and 995 * regular expressions. The following code is disabled instead of 996 * removed as back-referencing may be reintroduced in a future draft 997 * of the standard. 998 */ 999 /* 1000 * For regular expressions, we disallow 1001 * \ooo to mean octal character, in favour 1002 * of back referencing. 1003 */ 1004 if (regx) { 1005 *cp++ = '\\'; 1006 break; 1007 } 1008 #endif 1009 max = 3; 1010 n = 0; 1011 do { 1012 n = (n<<3) + c-'0'; 1013 if ((c = lexgetc()) > '7' || c < '0') 1014 break; 1015 } while (--max); 1016 lexungetc(c); 1017 /* 1018 * an octal escape sequence must have at least 1019 * 2 digits after the backslash, otherwise 1020 * it gets passed straight thru for possible 1021 * use in backreferencing. 
1022 */ 1023 if (max == 3) { 1024 *cp++ = '\\'; 1025 n += '0'; 1026 } 1027 c = n; 1028 break; 1029 1030 case '\n': 1031 continue; 1032 1033 default: 1034 if (c != endc || cmd_line_operand) { 1035 *cp++ = '\\'; 1036 if (c == endc) 1037 lexungetc(c); 1038 } 1039 } 1040 } 1041 if (c == WEOF) 1042 awkerr(regx ? eofre : eofstr); 1043 *cp++ = c; 1044 } 1045 *cp = '\0'; 1046 return (cp - linebuf); 1047 } 1048 1049 /* 1050 * Build a regular expression NODE. 1051 * Argument is the string holding the expression. 1052 */ 1053 NODE * 1054 renode(wchar_t *s) 1055 { 1056 NODE *np; 1057 int n; 1058 1059 np = emptynode(RE, 0); 1060 np->n_left = np->n_right = NNULL; 1061 np->n_regexp = (REGEXP)emalloc(sizeof (regex_t)); 1062 if ((n = REGWCOMP(np->n_regexp, s, REG_EXTENDED)) != REG_OK) { 1063 int m; 1064 char *p; 1065 1066 m = regerror(n, np->n_regexp, NULL, 0); 1067 p = (char *)emalloc(m); 1068 regerror(n, np->n_regexp, p, m); 1069 awkerr("/%S/: %s", s, p); 1070 } 1071 return (np); 1072 } 1073 /* 1074 * Get a character for the lexical analyser routine. 
1075 */ 1076 static wint_t 1077 lexgetc() 1078 { 1079 wint_t c; 1080 static char **files = &progfiles[0]; 1081 1082 if (progfp != FNULL && (c = fgetwc(progfp)) != WEOF) 1083 ; 1084 else { 1085 if (progptr != NULL) { 1086 if (proglen-- <= 0) 1087 c = WEOF; 1088 else 1089 c = *progptr++; 1090 } else { 1091 if (progfp != FNULL) 1092 if (progfp != stdin) 1093 (void) fclose(progfp); 1094 else 1095 clearerr(progfp); 1096 progfp = FNULL; 1097 if (files < progfilep) { 1098 filename = *files++; 1099 lineno = 1; 1100 if (filename[0] == '-' && filename[1] == '\0') 1101 progfp = stdin; 1102 else if ((progfp = fopen(filename, r)) 1103 == FNULL) { 1104 (void) fprintf(stderr, 1105 gettext("script file \"%s\""), filename); 1106 exit(1); 1107 } 1108 c = fgetwc(progfp); 1109 } 1110 } 1111 } 1112 if (c == '\n') 1113 ++lineno; 1114 if (conptr >= &context[NCONTEXT]) 1115 conptr = &context[0]; 1116 if (c != WEOF) 1117 *conptr++ = c; 1118 return (c); 1119 } 1120 1121 /* 1122 * Return a character for lexical analyser. 1123 * Only one returned character is (not enforced) legitimite. 1124 */ 1125 static void 1126 lexungetc(wint_t c) 1127 { 1128 if (c == '\n') 1129 --lineno; 1130 if (c != WEOF) { 1131 if (conptr == &context[0]) 1132 conptr = &context[NCONTEXT]; 1133 *--conptr = '\0'; 1134 } 1135 if (progfp != FNULL) { 1136 (void) ungetwc(c, progfp); 1137 return; 1138 } 1139 if (c == WEOF) 1140 return; 1141 *--progptr = c; 1142 proglen++; 1143 } 1144 1145 /* 1146 * Syntax errors during parsing. 1147 */ 1148 void 1149 yyerror(char *s, ...) 1150 { 1151 if (lexlast == FUNC || lexlast == GETLINE || lexlast == KEYWORD) 1152 if (lexlast == KEYWORD) 1153 awkerr(gettext("inadmissible use of reserved keyword")); 1154 else 1155 awkerr(gettext("attempt to redefine builtin function")); 1156 awkerr(s); 1157 } 1158 1159 /* 1160 * Error routine for all awk errors. 1161 */ 1162 /* ARGSUSED */ 1163 void 1164 awkerr(char *fmt, ...) 
1165 { 1166 va_list args; 1167 1168 va_start(args, fmt); 1169 awkierr(0, fmt, args); 1170 va_end(args); 1171 } 1172 1173 /* 1174 * Error routine like "awkerr" except that it prints out 1175 * a message that includes an errno-specific indication. 1176 */ 1177 /* ARGSUSED */ 1178 void 1179 awkperr(char *fmt, ...) 1180 { 1181 va_list args; 1182 1183 va_start(args, fmt); 1184 awkierr(1, fmt, args); 1185 va_end(args); 1186 } 1187 1188 /* 1189 * Common internal routine for awkerr, awkperr 1190 */ 1191 static void 1192 awkierr(int perr, char *fmt, va_list ap) 1193 { 1194 static char sep1[] = "\n>>>\t"; 1195 static char sep2[] = "\t<<<"; 1196 int saveerr = errno; 1197 1198 (void) fprintf(stderr, "%s: ", _cmdname); 1199 if (running) { 1200 (void) fprintf(stderr, gettext("line %u ("), 1201 curnode == NNULL ? 0 : curnode->n_lineno); 1202 if (phase == 0) 1203 (void) fprintf(stderr, "NR=%lld): ", 1204 (INT)exprint(varNR)); 1205 else 1206 (void) fprintf(stderr, "%s): ", 1207 phase == BEGIN ? s_BEGIN : s_END); 1208 } else if (lineno != 0) { 1209 (void) fprintf(stderr, gettext("file \"%s\": "), filename); 1210 (void) fprintf(stderr, gettext("line %u: "), lineno); 1211 } 1212 (void) vfprintf(stderr, gettext(fmt), ap); 1213 if (perr == 1) 1214 (void) fprintf(stderr, ": %s", strerror(saveerr)); 1215 if (perr != 2 && !running) { 1216 wchar_t *cp; 1217 int n; 1218 int c; 1219 1220 (void) fprintf(stderr, gettext(" Context is:%s"), sep1); 1221 cp = conptr; 1222 n = NCONTEXT; 1223 do { 1224 if (cp >= &context[NCONTEXT]) 1225 cp = &context[0]; 1226 if ((c = *cp++) != '\0') 1227 (void) fputs(c == '\n' ? 
sep1 : toprint(c), 1228 stderr); 1229 } while (--n != 0); 1230 (void) fputs(sep2, stderr); 1231 } 1232 (void) fprintf(stderr, "\n"); 1233 exit(1); 1234 } 1235 1236 wchar_t * 1237 emalloc(unsigned n) 1238 { 1239 wchar_t *cp; 1240 1241 if ((cp = malloc(n)) == NULL) 1242 awkerr(nomem); 1243 return (cp); 1244 } 1245 1246 wchar_t * 1247 erealloc(wchar_t *p, unsigned n) 1248 { 1249 wchar_t *cp; 1250 1251 if ((cp = realloc(p, n)) == NULL) 1252 awkerr(nomem); 1253 return (cp); 1254 } 1255 1256 1257 /* 1258 * usage message for awk 1259 */ 1260 static int 1261 usage() 1262 { 1263 (void) fprintf(stderr, gettext( 1264 "Usage: awk [-F ERE] [-v var=val] 'program' [var=val ...] [file ...]\n" 1265 " awk [-F ERE] -f progfile ... [-v var=val] [var=val ...] [file ...]\n")); 1266 return (2); 1267 } 1268 1269 1270 static wchar_t * 1271 mbconvert(char *str) 1272 { 1273 static wchar_t *op = 0; 1274 1275 if (op != 0) 1276 free(op); 1277 return (op = mbstowcsdup(str)); 1278 } 1279 1280 char * 1281 mbunconvert(wchar_t *str) 1282 { 1283 static char *op = 0; 1284 1285 if (op != 0) 1286 free(op); 1287 return (op = wcstombsdup(str)); 1288 } 1289 1290 /* 1291 * Solaris port - following functions are typical MKS functions written 1292 * to work for Solaris. 
1293 */ 1294 1295 wchar_t * 1296 mbstowcsdup(s) 1297 char *s; 1298 { 1299 int n; 1300 wchar_t *w; 1301 1302 n = strlen(s) + 1; 1303 if ((w = (wchar_t *)malloc(n * sizeof (wchar_t))) == NULL) 1304 return (NULL); 1305 1306 if (mbstowcs(w, s, n) == -1) 1307 return (NULL); 1308 return (w); 1309 1310 } 1311 1312 char * 1313 wcstombsdup(wchar_t *w) 1314 { 1315 int n; 1316 char *mb; 1317 1318 /* Fetch memory for worst case string length */ 1319 n = wslen(w) + 1; 1320 n *= MB_CUR_MAX; 1321 if ((mb = (char *)malloc(n)) == NULL) { 1322 return (NULL); 1323 } 1324 1325 /* Convert the string */ 1326 if ((n = wcstombs(mb, w, n)) == -1) { 1327 int saverr = errno; 1328 1329 free(mb); 1330 errno = saverr; 1331 return (0); 1332 } 1333 1334 /* Shrink the string down */ 1335 if ((mb = (char *)realloc(mb, strlen(mb)+1)) == NULL) { 1336 return (NULL); 1337 } 1338 return (mb); 1339 } 1340 1341 /* 1342 * The upe_ctrls[] table contains the printable 'control-sequences' for the 1343 * character values 0..31 and 127. The first entry is for value 127, thus the 1344 * entries for the remaining character values are from 1..32. 1345 */ 1346 static const char *const upe_ctrls[] = 1347 { 1348 "^?", 1349 "^@", "^A", "^B", "^C", "^D", "^E", "^F", "^G", 1350 "^H", "^I", "^J", "^K", "^L", "^M", "^N", "^O", 1351 "^P", "^Q", "^R", "^S", "^T", "^U", "^V", "^W", 1352 "^X", "^Y", "^Z", "^[", "^\\", "^]", "^^", "^_" 1353 }; 1354 1355 1356 /* 1357 * Return a printable string corresponding to the given character value. If 1358 * the character is printable, simply return it as the string. If it is in 1359 * the range specified by table 5-101 in the UPE, return the corresponding 1360 * string. Otherwise, return an octal escape sequence. 
1361 */ 1362 static const char * 1363 toprint(c) 1364 wchar_t c; 1365 { 1366 int n, len; 1367 unsigned char *ptr; 1368 static char mbch[MB_LEN_MAX+1]; 1369 static char buf[5 * MB_LEN_MAX + 1]; 1370 1371 if ((n = wctomb(mbch, c)) == -1) { 1372 /* Should never happen */ 1373 (void) sprintf(buf, "\\%x", c); 1374 return (buf); 1375 } 1376 mbch[n] = '\0'; 1377 if (iswprint(c)) { 1378 return (mbch); 1379 } else if (c == 127) { 1380 return (upe_ctrls[0]); 1381 } else if (c < 32) { 1382 /* Print as in Table 5-101 in the UPE */ 1383 return (upe_ctrls[c+1]); 1384 } else { 1385 /* Print as an octal escape sequence */ 1386 for (len = 0, ptr = (unsigned char *) mbch; 0 < n; --n, ++ptr) 1387 len += sprintf(buf+len, "\\%03o", *ptr); 1388 } 1389 return (buf); 1390 } 1391 1392 static int 1393 wcoff(const wchar_t *astring, const int off) 1394 { 1395 const wchar_t *s = astring; 1396 int c = 0; 1397 char mb[MB_LEN_MAX]; 1398 1399 while (c < off) { 1400 int n; 1401 if ((n = wctomb(mb, *s)) == 0) 1402 break; 1403 if (n == -1) 1404 n = 1; 1405 c += n; 1406 s++; 1407 } 1408 1409 return (s - astring); 1410 } 1411 1412 int 1413 int_regwcomp(regex_t *r, const wchar_t *pattern, int uflags) 1414 { 1415 char *mbpattern; 1416 int ret; 1417 1418 if ((mbpattern = wcstombsdup((wchar_t *)pattern)) == NULL) 1419 return (REG_ESPACE); 1420 1421 ret = regcomp(r, mbpattern, uflags); 1422 1423 free(mbpattern); 1424 1425 return (ret); 1426 } 1427 1428 int 1429 int_regwexec(const regex_t *r, /* compiled RE */ 1430 const wchar_t *astring, /* subject string */ 1431 size_t nsub, /* number of subexpressions */ 1432 int_regwmatch_t *sub, /* subexpression pointers */ 1433 int flags) 1434 { 1435 char *mbs; 1436 regmatch_t *mbsub = NULL; 1437 int i; 1438 1439 if ((mbs = wcstombsdup((wchar_t *)astring)) == NULL) 1440 return (REG_ESPACE); 1441 1442 if (nsub > 0 && sub) { 1443 if ((mbsub = malloc(nsub * sizeof (regmatch_t))) == NULL) 1444 return (REG_ESPACE); 1445 } 1446 1447 i = regexec(r, mbs, nsub, mbsub, flags); 
1448 1449 /* Now, adjust the pointers/counts in sub */ 1450 if (i == REG_OK && nsub > 0 && mbsub) { 1451 int j, k; 1452 1453 for (j = 0; j < nsub; j++) { 1454 regmatch_t *ms = &mbsub[j]; 1455 int_regwmatch_t *ws = &sub[j]; 1456 1457 if ((k = ms->rm_so) >= 0) { 1458 ws->rm_so = wcoff(astring, k); 1459 ws->rm_sp = astring + ws->rm_so; 1460 } 1461 if ((k = ms->rm_eo) >= 0) { 1462 ws->rm_eo = wcoff(astring, k); 1463 ws->rm_ep = astring + ws->rm_eo; 1464 } 1465 } 1466 } 1467 1468 free(mbs); 1469 if (mbsub) 1470 free(mbsub); 1471 return (i); 1472 } 1473 1474 int 1475 int_regwdosuba(regex_t *rp, /* compiled RE: Pattern */ 1476 const wchar_t *rpl, /* replacement string: /rpl/ */ 1477 const wchar_t *src, /* source string */ 1478 wchar_t **dstp, /* destination string */ 1479 int len, /* destination length */ 1480 int *globp) /* IN: occurence, 0 for all; OUT: substitutions */ 1481 { 1482 wchar_t *dst, *odst; 1483 const wchar_t *ip, *xp; 1484 wchar_t *op; 1485 int i; 1486 wchar_t c; 1487 int glob, iglob = *globp, oglob = 0; 1488 #define NSUB 10 1489 int_regwmatch_t rm[NSUB], *rmp; 1490 int flags; 1491 wchar_t *end; 1492 int regerr; 1493 1494 /* handle overflow of dst. 
we need "i" more bytes */ 1495 #ifdef OVERFLOW 1496 #undef OVERFLOW 1497 #define OVERFLOW(i) if (1) { \ 1498 int pos = op - dst; \ 1499 dst = (wchar_t *)realloc(odst = dst, \ 1500 (len += len + i) * sizeof (wchar_t)); \ 1501 if (dst == NULL) \ 1502 goto nospace; \ 1503 op = dst + pos; \ 1504 end = dst + len; \ 1505 } else 1506 #endif 1507 1508 *dstp = dst = (wchar_t *)malloc(len * sizeof (wchar_t)); 1509 if (dst == NULL) 1510 return (REG_ESPACE); 1511 1512 if (rp == NULL || rpl == NULL || src == NULL || dst == NULL) 1513 return (REG_EFATAL); 1514 1515 glob = 0; /* match count */ 1516 ip = src; /* source position */ 1517 op = dst; /* destination position */ 1518 end = dst + len; 1519 1520 flags = 0; 1521 while ((regerr = int_regwexec(rp, ip, NSUB, rm, flags)) == REG_OK) { 1522 /* Copy text preceding match */ 1523 if (op + (i = rm[0].rm_sp - ip) >= end) 1524 OVERFLOW(i); 1525 while (i--) 1526 *op++ = *ip++; 1527 1528 if (iglob == 0 || ++glob == iglob) { 1529 oglob++; 1530 xp = rpl; /* do substitute */ 1531 } else 1532 xp = L"&"; /* preserve text */ 1533 1534 /* Perform replacement of matched substing */ 1535 while ((c = *xp++) != '\0') { 1536 rmp = NULL; 1537 if (c == '&') 1538 rmp = &rm[0]; 1539 else if (c == '\\') { 1540 if ('0' <= *xp && *xp <= '9') 1541 rmp = &rm[*xp++ - '0']; 1542 else if (*xp != '\0') 1543 c = *xp++; 1544 } 1545 1546 if (rmp == NULL) { /* Ordinary character. 
*/ 1547 *op++ = c; 1548 if (op >= end) 1549 OVERFLOW(1); 1550 } else if (rmp->rm_sp != NULL && rmp->rm_ep != NULL) { 1551 ip = rmp->rm_sp; 1552 if (op + (i = rmp->rm_ep - rmp->rm_sp) >= end) 1553 OVERFLOW(i); 1554 while (i--) 1555 *op++ = *ip++; 1556 } 1557 } 1558 1559 ip = rm[0].rm_ep; 1560 if (*ip == '\0') /* If at end break */ 1561 break; 1562 else if (rm[0].rm_sp == rm[0].rm_ep) { 1563 /* If empty match copy next char */ 1564 *op++ = *ip++; 1565 if (op >= end) 1566 OVERFLOW(1); 1567 } 1568 flags = REG_NOTBOL; 1569 } 1570 1571 if (regerr != REG_OK && regerr != REG_NOMATCH) 1572 return (regerr); 1573 1574 /* Copy rest of text */ 1575 if (op + (i = wcslen(ip)) >= end) 1576 OVERFLOW(i); 1577 while (i--) 1578 *op++ = *ip++; 1579 *op++ = '\0'; 1580 1581 if ((*dstp = dst = (wchar_t *)realloc(odst = dst, 1582 sizeof (wchar_t) * (size_t)(op - dst))) == NULL) { 1583 nospace: 1584 free(odst); 1585 return (REG_ESPACE); 1586 } 1587 1588 *globp = oglob; 1589 1590 return ((oglob == 0) ? REG_NOMATCH : REG_OK); 1591 } 1592