1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2003 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* 28 * awk -- mainline, yylex, etc. 29 * 30 * Copyright 1986, 1994 by Mortice Kern Systems Inc. All rights reserved. 31 * 32 * Based on MKS awk(1) ported to be /usr/xpg4/bin/awk with POSIX/XCU4 changes 33 */ 34 35 #pragma ident "%Z%%M% %I% %E% SMI" 36 37 #include "awk.h" 38 #include "y.tab.h" 39 #include <stdarg.h> 40 #include <unistd.h> 41 #include <locale.h> 42 43 static char *progfiles[NPFILE]; /* Programmes files for yylex */ 44 static char **progfilep = &progfiles[0]; /* Pointer to last file */ 45 static wchar_t *progptr; /* In-memory programme */ 46 static int proglen; /* Length of progptr */ 47 static wchar_t context[NCONTEXT]; /* Circular buffer of context */ 48 static wchar_t *conptr = &context[0]; /* context ptr */ 49 static FILE *progfp; /* Stdio stream for programme */ 50 static char *filename; 51 #ifdef DEBUG 52 static int dflag; 53 #endif 54 55 #define AWK_EXEC_MAGIC "<MKS AWKC>" 56 #define LEN_EXEC_MAGIC 10 57 58 static char unbal[] = "unbalanced E char"; 59 60 static void awkarginit(int c, char **av); 61 static int lexid(wint_t c); 62 static int lexnumber(wint_t c); 63 static int lexstring(wint_t endc); 64 static int lexregexp(register wint_t endc); 65 66 static void awkvarinit(void); 67 static wint_t lexgetc(void); 68 static void lexungetc(wint_t c); 69 static size_t lexescape(wint_t endc, int regx, int cmd_line_operand); 70 static void awkierr(int perr, char *fmt, va_list ap); 71 static int usage(void); 72 void strescape(wchar_t *str); 73 static const char *toprint(wint_t); 74 char *_cmdname; 75 static wchar_t *mbconvert(char *str); 76 77 78 /* 79 * mainline for awk 80 */ 81 int 82 main(int argc, char *argv[]) 83 { 84 register wchar_t *ap; 85 register char *cmd; 86 87 cmd = argv[0]; 88 _cmdname = cmd; 89 90 linebuf = emalloc(NLINE * sizeof(wchar_t)); 91 92 /*l 93 * At this point only messaging should be internationalized. 94 * numbers are still scanned as in the Posix locale. 95 */ 96 (void) setlocale(LC_ALL,""); 97 (void) setlocale(LC_NUMERIC,"C"); 98 #if !defined(TEXT_DOMAIN) 99 #define TEXT_DOMAIN "SYS_TEST" 100 #endif 101 (void) textdomain(TEXT_DOMAIN); 102 103 awkvarinit(); 104 /*running = 1;*/ 105 while (argc>1 && *argv[1]=='-') { 106 void *save_ptr = NULL; 107 ap = mbstowcsdup(&argv[1][1]); 108 if (ap == NULL) 109 break; 110 if (*ap == '\0') { 111 free(ap); 112 break; 113 } 114 save_ptr = (void *) ap; 115 ++argv; 116 --argc; 117 if (*ap=='-' && ap[1]=='\0') 118 break; 119 for ( ; *ap != '\0'; ++ap) { 120 switch (*ap) { 121 #ifdef DEBUG 122 case 'd': 123 dflag = 1; 124 continue; 125 126 #endif 127 case 'f': 128 if (argc < 2) { 129 (void) fprintf(stderr, 130 gettext("Missing script file\n")); 131 return (1); 132 } 133 *progfilep++ = argv[1]; 134 --argc; 135 ++argv; 136 continue; 137 138 case 'F': 139 if (ap[1] == '\0') { 140 if (argc < 2) { 141 (void) fprintf(stderr, 142 gettext("Missing field separator\n")); 143 return (1); 144 } 145 ap = mbstowcsdup(argv[1]); 146 --argc; 147 ++argv; 148 } else 149 ++ap; 150 strescape(ap); 151 strassign(varFS, linebuf, FALLOC, 152 wcslen(linebuf)); 153 break; 154 155 case 'v': { 156 register wchar_t *vp; 157 register wchar_t *arg; 158 159 if (argc < 2) { 160 (void) fprintf(stderr, 161 gettext("Missing variable assignment\n")); 162 return (1); 163 } 164 arg = mbconvert(argv[1]); 165 if ((vp = wcschr(arg, '=')) != NULL) { 166 *vp = '\0'; 167 strescape(vp+1); 168 strassign(vlook(arg), linebuf, 169 FALLOC|FSENSE, wcslen(linebuf)); 170 *vp = '='; 171 } 172 --argc; 173 ++argv; 174 continue; 175 } 176 177 default: 178 (void) fprintf(stderr, 179 gettext("Unknown option \"-%S\"\n"), ap); 180 return (usage()); 181 } 182 break; 183 } 184 if (save_ptr) 185 free(save_ptr); 186 } 187 if (progfilep == &progfiles[0]) { 188 if (argc < 2) 189 return (usage()); 190 filename = "[command line]"; /* BUG: NEEDS TRANSLATION */ 191 progptr = mbstowcsdup(argv[1]); 192 proglen = wcslen(progptr); 193 --argc; 194 ++argv; 195 } 196 197 argv[0] = cmd; 198 199 awkarginit(argc, argv); 200 201 /*running = 0;*/ 202 (void)yyparse(); 203 204 lineno = 0; 205 /* 206 * Ok, done parsing, so now activate the rest of the nls stuff, set 207 * the radix character. 208 */ 209 (void) setlocale(LC_ALL,""); 210 radixpoint = *localeconv()->decimal_point; 211 awk(); 212 /* NOTREACHED */ 213 return (0); 214 } 215 216 /* 217 * Do initial setup of buffers, etc. 218 * This must be called before most processing 219 * and especially before lexical analysis. 220 * Variables initialised here will be overruled by command 221 * line parameter initialisation. 222 */ 223 static void 224 awkvarinit() 225 { 226 register NODE *np; 227 228 (void) setvbuf(stderr, NULL, _IONBF, 0); 229 230 if ((NIOSTREAM = sysconf(_SC_OPEN_MAX) - 4) <= 0) { 231 (void) fprintf(stderr, 232 gettext("not enough available file descriptors")); 233 exit(1); 234 } 235 ofiles = (OFILE *) emalloc(sizeof(OFILE)*NIOSTREAM); 236 #ifdef A_ZERO_POINTERS 237 (void) memset((wchar_t *) ofiles, 0, sizeof(OFILE) * NIOSTREAM); 238 #else 239 { 240 /* initialize file descriptor table */ 241 OFILE *fp; 242 for (fp = ofiles; fp < &ofiles[NIOSTREAM]; fp += 1) { 243 fp->f_fp = FNULL; 244 fp->f_mode = 0; 245 fp->f_name = (char *)0; 246 } 247 } 248 #endif 249 constant = intnode((INT)0); 250 251 const0 = intnode((INT)0); 252 const1 = intnode((INT)1); 253 constundef = emptynode(CONSTANT, 0); 254 constundef->n_flags = FSTRING|FVINT; 255 constundef->n_string = _null; 256 constundef->n_strlen = 0; 257 inc_oper = emptynode(ADD, 0); 258 inc_oper->n_right = const1; 259 asn_oper = emptynode(ADD, 0); 260 field0 = node(FIELD, const0, NNULL); 261 262 { 263 register RESFUNC near*rp; 264 265 for (rp = &resfuncs[0]; rp->rf_name != (LOCCHARP)NULL; ++rp) { 266 np = finstall(rp->rf_name, rp->rf_func, rp->rf_type); 267 } 268 } 269 { 270 register RESERVED near*rp; 271 272 for (rp = &reserved[0]; rp->r_name != (LOCCHARP)NULL; ++rp) { 273 switch (rp->r_type) { 274 case SVAR: 275 case VAR: 276 running = 1; 277 np = vlook(rp->r_name); 278 if (rp->r_type == SVAR) 279 np->n_flags |= FSPECIAL; 280 if (rp->r_svalue != NULL) 281 strassign(np, rp->r_svalue, FSTATIC, 282 (size_t)rp->r_ivalue); 283 else { 284 constant->n_int = rp->r_ivalue; 285 (void)assign(np, constant); 286 } 287 running = 0; 288 break; 289 290 case KEYWORD: 291 kinstall(rp->r_name, (int)rp->r_ivalue); 292 break; 293 } 294 } 295 } 296 297 varNR = vlook(s_NR); 298 varFNR = vlook(s_FNR); 299 varNF = vlook(s_NF); 300 varOFMT = vlook(s_OFMT); 301 varCONVFMT = vlook(s_CONVFMT); 302 varOFS = vlook(s_OFS); 303 varORS = vlook(s_ORS); 304 varRS = vlook(s_RS); 305 varFS = vlook(s_FS); 306 varARGC = vlook(s_ARGC); 307 varSUBSEP = vlook(s_SUBSEP); 308 varENVIRON = vlook(s_ENVIRON); 309 varFILENAME = vlook(s_FILENAME); 310 varSYMTAB = vlook(s_SYMTAB); 311 incNR = node(ASG, varNR, node(ADD, varNR, const1)); 312 incFNR = node(ASG, varFNR, node(ADD, varFNR, const1)); 313 clrFNR = node(ASG, varFNR, const0); 314 } 315 316 /* 317 * Initialise awk ARGC, ARGV variables. 318 */ 319 static void 320 awkarginit(int ac, char **av) 321 { 322 register int i; 323 register wchar_t *cp; 324 325 ARGVsubi = node(INDEX, vlook(s_ARGV), constant); 326 running = 1; 327 constant->n_int = ac; 328 (void)assign(varARGC, constant); 329 for (i = 0; i < ac; ++i) { 330 cp = mbstowcsdup(av[i]); 331 constant->n_int = i; 332 strassign(exprreduce(ARGVsubi), cp, 333 FSTATIC|FSENSE, wcslen(cp)); 334 } 335 running = 0; 336 } 337 338 /* 339 * Clean up when done parsing a function. 340 * All formal parameters, because of a deal (funparm) in 341 * yylex, get put into the symbol table in front of any 342 * global variable of the same name. When the entire 343 * function is parsed, remove these formal dummy nodes 344 * from the symbol table but retain the nodes because 345 * the generated tree points at them. 346 */ 347 void 348 uexit(NODE *np) 349 { 350 register NODE *formal; 351 352 while ((formal = getlist(&np)) != NNULL) 353 delsymtab(formal, 0); 354 } 355 356 /* 357 * The lexical analyzer. 358 */ 359 int 360 yylex() 361 #ifdef DEBUG 362 { 363 register int l; 364 365 l = yyhex(); 366 if (dflag) 367 (void) printf("%d\n", l); 368 return (l); 369 } 370 yyhex() 371 #endif 372 { 373 register wint_t c, c1; 374 int i; 375 static int savetoken = 0; 376 static wasfield; 377 static int isfuncdef; 378 static int nbrace, nparen, nbracket; 379 static struct ctosymstruct { 380 wint_t c, sym; 381 } ctosym[] = { 382 { '|', BAR }, { '^', CARAT }, 383 { '~', TILDE }, { '<', LANGLE }, 384 { '>', RANGLE }, { '+', PLUSC }, 385 { '-', HYPHEN }, { '*', STAR }, 386 { '/', SLASH }, { '%', PERCENT }, 387 { '!', EXCLAMATION }, { '$', DOLLAR }, 388 { '[', LSQUARE }, { ']', RSQUARE }, 389 { '(', LPAREN }, { ')', RPAREN }, 390 { ';', SEMI }, { '{', LBRACE }, 391 { '}', RBRACE }, { 0, 0 } 392 }; 393 394 if (savetoken) { 395 c = savetoken; 396 savetoken = 0; 397 } else if (redelim != '\0') { 398 c = redelim; 399 redelim = 0; 400 catterm = 0; 401 savetoken = c; 402 return (lexlast = lexregexp(c)); 403 } else while ((c = lexgetc()) != WEOF) { 404 if (iswalpha(c) || c=='_') { 405 c = lexid(c); 406 } else if (iswdigit(c) || c=='.') { 407 c = lexnumber(c); 408 } else if (isWblank(c)) { 409 continue; 410 } else switch (c) { 411 #if DOS || OS2 412 case 032: /* ^Z */ 413 continue; 414 #endif 415 416 case '"': 417 c = lexstring(c); 418 break; 419 420 case '#': 421 while ((c = lexgetc())!='\n' && c!=WEOF) 422 ; 423 lexungetc(c); 424 continue; 425 426 case '+': 427 if ((c1 = lexgetc()) == '+') 428 c = INC; 429 else if (c1 == '=') 430 c = AADD; 431 else 432 lexungetc(c1); 433 break; 434 435 case '-': 436 if ((c1 = lexgetc()) == '-') 437 c = DEC; 438 else if (c1 == '=') 439 c = ASUB; 440 else 441 lexungetc(c1); 442 break; 443 444 case '*': 445 if ((c1 = lexgetc()) == '=') 446 c = AMUL; 447 else if (c1 == '*') { 448 if ((c1 = lexgetc()) == '=') 449 c = AEXP; 450 else { 451 c = EXP; 452 lexungetc(c1); 453 } 454 } else 455 lexungetc(c1); 456 break; 457 458 case '^': 459 if ((c1 = lexgetc()) == '=') { 460 c = AEXP; 461 } else { 462 c = EXP; 463 lexungetc(c1); 464 } 465 break; 466 467 case '/': 468 if ((c1 = lexgetc()) == '=' 469 && lexlast!=RE && lexlast!=NRE 470 && lexlast!=';' && lexlast!='\n' 471 && lexlast!=',' && lexlast!='(') 472 c = ADIV; 473 else 474 lexungetc(c1); 475 break; 476 477 case '%': 478 if ((c1 = lexgetc()) == '=') 479 c = AREM; 480 else 481 lexungetc(c1); 482 break; 483 484 case '&': 485 if ((c1 = lexgetc()) == '&') 486 c = AND; 487 else 488 lexungetc(c1); 489 break; 490 491 case '|': 492 if ((c1 = lexgetc()) == '|') 493 c = OR; 494 else { 495 lexungetc(c1); 496 if (inprint) 497 c = PIPE; 498 } 499 break; 500 501 case '>': 502 if ((c1 = lexgetc()) == '=') 503 c = GE; 504 else if (c1 == '>') 505 c = APPEND; 506 else { 507 lexungetc(c1); 508 if (nparen==0 && inprint) 509 c = WRITE; 510 } 511 break; 512 513 case '<': 514 if ((c1 = lexgetc()) == '=') 515 c = LE; 516 else 517 lexungetc(c1); 518 break; 519 520 case '!': 521 if ((c1 = lexgetc()) == '=') 522 c = NE; 523 else if (c1 == '~') 524 c = NRE; 525 else 526 lexungetc(c1); 527 break; 528 529 case '=': 530 if ((c1 = lexgetc()) == '=') 531 c = EQ; 532 else { 533 lexungetc(c1); 534 c = ASG; 535 } 536 break; 537 538 case '\n': 539 switch (lexlast) { 540 case ')': 541 if (catterm || inprint) { 542 c = ';'; 543 break; 544 } 545 case AND: 546 case OR: 547 case COMMA: 548 case '{': 549 case ELSE: 550 case ';': 551 case DO: 552 continue; 553 554 case '}': 555 if (nbrace != 0) 556 continue; 557 558 default: 559 c = ';'; 560 break; 561 } 562 break; 563 564 case ELSE: 565 if (lexlast != ';') { 566 savetoken = ELSE; 567 c = ';'; 568 } 569 break; 570 571 case '(': 572 ++nparen; 573 break; 574 575 case ')': 576 if (--nparen < 0) 577 awkerr(unbal, "()"); 578 break; 579 580 case '{': 581 nbrace++; 582 break; 583 584 case '}': 585 if (--nbrace < 0) { 586 char brk[3]; 587 588 brk[0] = '{'; 589 brk[1] = '}'; 590 brk[2] = '\0'; 591 awkerr(unbal, brk); 592 } 593 if (lexlast != ';') { 594 savetoken = c; 595 c = ';'; 596 } 597 break; 598 599 case '[': 600 ++nbracket; 601 break; 602 603 case ']': 604 if (--nbracket < 0) { 605 char brk[3]; 606 607 brk[0] = '['; 608 brk[1] = ']'; 609 brk[2] = '\0'; 610 awkerr(unbal, brk); 611 } 612 break; 613 614 case '\\': 615 if ((c1 = lexgetc()) == '\n') 616 continue; 617 lexungetc(c1); 618 break; 619 620 case ',': 621 c = COMMA; 622 break; 623 624 case '?': 625 c = QUEST; 626 break; 627 628 case ':': 629 c = COLON; 630 break; 631 632 default: 633 if (!iswprint(c)) 634 awkerr( 635 gettext("invalid character \"%s\""), 636 toprint(c)); 637 break; 638 } 639 break; 640 } 641 642 switch (c) { 643 case ']': 644 ++catterm; 645 break; 646 647 case VAR: 648 if (catterm) { 649 savetoken = c; 650 c = CONCAT; 651 catterm = 0; 652 } else if (!isfuncdef) { 653 if ((c1=lexgetc()) != '(') 654 ++catterm; 655 lexungetc(c1); 656 } 657 isfuncdef = 0; 658 break; 659 660 case PARM: 661 case CONSTANT: 662 if (catterm) { 663 savetoken = c; 664 c = CONCAT; 665 catterm = 0; 666 } else { 667 if (lexlast == '$') 668 wasfield = 2; 669 ++catterm; 670 } 671 break; 672 673 case INC: 674 case DEC: 675 if (!catterm || lexlast!=CONSTANT || wasfield) 676 break; 677 678 case UFUNC: 679 case FUNC: 680 case GETLINE: 681 case '!': 682 case '$': 683 case '(': 684 if (catterm) { 685 savetoken = c; 686 c = CONCAT; 687 catterm = 0; 688 } 689 break; 690 691 /*{*/case '}': 692 if (nbrace == 0) 693 savetoken = ';'; 694 case ';': 695 inprint = 0; 696 default: 697 if (c == DEFFUNC) 698 isfuncdef = 1; 699 catterm = 0; 700 } 701 lexlast = c; 702 if (wasfield) 703 wasfield--; 704 /* 705 * Map character constants to symbolic names. 706 */ 707 for (i = 0; ctosym[i].c != 0; i++) 708 if (c == ctosym[i].c) { 709 c = ctosym[i].sym; 710 break; 711 } 712 return ((int)c); 713 } 714 715 /* 716 * Read a number for the lexical analyzer. 717 * Input is the first character of the number. 718 * Return value is the lexical type. 719 */ 720 static int 721 lexnumber(wint_t c) 722 { 723 register wchar_t *cp; 724 register int dotfound = 0; 725 register int efound = 0; 726 INT number; 727 728 cp = linebuf; 729 do { 730 if (iswdigit(c)) 731 ; 732 else if (c == '.') { 733 if (dotfound++) 734 break; 735 } else if (c=='e' || c=='E') { 736 if ((c = lexgetc())!='-' && c!='+') { 737 lexungetc(c); 738 c = 'e'; 739 } else 740 *cp++ = 'e'; 741 if (efound++) 742 break; 743 } else 744 break; 745 *cp++ = c; 746 } while ((c = lexgetc()) != WEOF); 747 *cp = '\0'; 748 if (dotfound && cp==linebuf+1) 749 return (DOT); 750 lexungetc(c); 751 errno = 0; 752 if (!dotfound 753 && !efound 754 && ((number=wcstol(linebuf, (wchar_t **)0, 10)), errno!=ERANGE)) 755 yylval.node = intnode(number); 756 else 757 yylval.node = realnode((REAL)wcstod(linebuf, (wchar_t **)0)); 758 return (CONSTANT); 759 } 760 761 /* 762 * Read an identifier. 763 * Input is first character of identifier. 764 * Return VAR. 765 */ 766 static int 767 lexid(wint_t c) 768 { 769 register wchar_t *cp; 770 register size_t i; 771 register NODE *np; 772 773 cp = linebuf; 774 do { 775 *cp++ = c; 776 c = lexgetc(); 777 } while (iswalpha(c) || iswdigit(c) || c=='_'); 778 *cp = '\0'; 779 lexungetc(c); 780 yylval.node = np = vlook(linebuf); 781 782 switch(np->n_type) { 783 case KEYWORD: 784 switch (np->n_keywtype) { 785 case PRINT: 786 case PRINTF: 787 ++inprint; 788 default: 789 return ((int)np->n_keywtype); 790 } 791 /* NOTREACHED */ 792 793 case ARRAY: 794 case VAR: 795 /* 796 * If reading the argument list, create a dummy node 797 * for the duration of that function. These variables 798 * can be removed from the symbol table at function end 799 * but they must still exist because the execution tree 800 * knows about them. 801 */ 802 if (funparm) { 803 do_funparm: 804 np = emptynode(PARM, i=(cp-linebuf)); 805 np->n_flags = FSTRING; 806 np->n_string = _null; 807 np->n_strlen = 0; 808 (void) memcpy(np->n_name, linebuf, 809 (i+1) * sizeof(wchar_t)); 810 addsymtab(np); 811 yylval.node = np; 812 } else if (np == varNF || (np == varFS && 813 (!doing_begin || begin_getline))) { 814 /* 815 * If the user program references NF or sets 816 * FS either outside of a begin block or 817 * in a begin block after a getline then the 818 * input line will be split immediately upon read 819 * rather than when a field is first referenced. 820 */ 821 needsplit = 1; 822 } else if (np == varENVIRON) 823 needenviron = 1; 824 case PARM: 825 return (VAR); 826 827 case UFUNC: 828 /* 829 * It is ok to redefine functions as parameters 830 */ 831 if (funparm) goto do_funparm; 832 case FUNC: 833 case GETLINE: 834 /* 835 * When a getline is encountered, clear the 'doing_begin' flag. 836 * This will force the 'needsplit' flag to be set, even inside 837 * a begin block, if FS is altered. (See VAR case above) 838 */ 839 if (doing_begin) 840 begin_getline = 1; 841 return (np->n_type); 842 } 843 /* NOTREACHED */ 844 } 845 846 /* 847 * Read a string for the lexical analyzer. 848 * `endc' terminates the string. 849 */ 850 static int 851 lexstring(wint_t endc) 852 { 853 register size_t length = lexescape(endc, 0, 0); 854 855 yylval.node = stringnode(linebuf, FALLOC, length); 856 return (CONSTANT); 857 } 858 859 /* 860 * Read a regular expression. 861 */ 862 static int 863 lexregexp(wint_t endc) 864 { 865 (void) lexescape(endc, 1, 0); 866 yylval.node = renode(linebuf); 867 return (URE); 868 } 869 870 /* 871 * Process a string, converting the escape characters as required by 872 * 1003.2. The processed string ends up in the global linebuf[]. This 873 * routine also changes the value of 'progfd' - the program file 874 * descriptor, so it should be used with some care. It is presently used to 875 * process -v (awk1.c) and var=str type arguments (awk2.c, nextrecord()). 876 */ 877 void 878 strescape(wchar_t *str) 879 { 880 progptr = str; 881 proglen = wcslen(str) + 1; /* Include \0 */ 882 (void) lexescape('\0', 0, 1); 883 progptr = NULL; 884 } 885 886 /* 887 * Read a string or regular expression, terminated by ``endc'', 888 * for lexical analyzer, processing escape sequences. 889 * Return string length. 890 */ 891 static size_t 892 lexescape(wint_t endc, int regx, int cmd_line_operand) 893 { 894 static char nlre[256]; 895 static char nlstr[256]; 896 static char eofre[256]; 897 static char eofstr[256]; 898 int first_time = 1; 899 wint_t c; 900 wchar_t *cp; 901 int n, max; 902 903 if (first_time == 1) { 904 (void) strcpy(nlre, gettext("Newline in regular expression\n")); 905 (void) strcpy(nlstr, gettext("Newline in string\n")); 906 (void) strcpy(eofre, gettext("EOF in regular expression\n")); 907 (void) strcpy(eofstr, gettext("EOF in string\n")); 908 first_time = 0; 909 } 910 911 cp = linebuf; 912 while ((c = lexgetc()) != endc) { 913 if (c == '\n') 914 awkerr(regx ? nlre : nlstr); 915 if (c == '\\') { 916 switch (c = lexgetc(), c) { 917 case '\\': 918 if (regx) 919 *cp++ = '\\'; 920 break; 921 922 case '/': 923 c = '/'; 924 break; 925 926 case 'n': 927 c = '\n'; 928 break; 929 930 case 'b': 931 c = '\b'; 932 break; 933 934 case 't': 935 c = '\t'; 936 break; 937 938 case 'r': 939 c = '\r'; 940 break; 941 942 case 'f': 943 c = '\f'; 944 break; 945 946 case 'v': 947 c = '\v'; 948 break; 949 950 case 'a': 951 c = (char) 0x07; 952 break; 953 954 case 'x': 955 n = 0; 956 while (iswxdigit(c = lexgetc())) { 957 if (iswdigit(c)) 958 c -= '0'; 959 else if (iswupper(c)) 960 c -= 'A'-10; 961 else 962 c -= 'a'-10; 963 n = (n<<4) + c; 964 } 965 lexungetc(c); 966 c = n; 967 break; 968 969 case '0': 970 case '1': 971 case '2': 972 case '3': 973 case '4': 974 case '5': 975 case '6': 976 case '7': 977 #if 0 978 /* 979 * Posix.2 draft 10 disallows the use of back-referencing - it explicitly 980 * requires processing of the octal escapes both in strings and 981 * regular expressions. The following code is disabled instead of 982 * removed as back-referencing may be reintroduced in a future draft 983 * of the standard. 984 */ 985 /* 986 * For regular expressions, we disallow 987 * \ooo to mean octal character, in favour 988 * of back referencing. 989 */ 990 if (regx) { 991 *cp++ = '\\'; 992 break; 993 } 994 #endif 995 max = 3; 996 n = 0; 997 do { 998 n = (n<<3) + c-'0'; 999 if ((c = lexgetc())>'7' || c<'0') 1000 break; 1001 } while (--max); 1002 lexungetc(c); 1003 /* 1004 * an octal escape sequence must have at least 1005 * 2 digits after the backslash, otherwise 1006 * it gets passed straight thru for possible 1007 * use in backreferencing. 1008 */ 1009 if (max == 3) { 1010 *cp++ = '\\'; 1011 n += '0'; 1012 } 1013 c = n; 1014 break; 1015 1016 case '\n': 1017 continue; 1018 1019 default: 1020 if (c != endc || cmd_line_operand) { 1021 *cp++ = '\\'; 1022 if (c == endc) 1023 lexungetc(c); 1024 } 1025 } 1026 } 1027 if (c == WEOF) 1028 awkerr(regx ? eofre : eofstr); 1029 *cp++ = c; 1030 } 1031 *cp = '\0'; 1032 return (cp - linebuf); 1033 } 1034 1035 /* 1036 * Build a regular expression NODE. 1037 * Argument is the string holding the expression. 1038 */ 1039 NODE * 1040 renode(wchar_t *s) 1041 { 1042 register NODE *np; 1043 int n; 1044 1045 np = emptynode(RE, 0); 1046 np->n_left = np->n_right = NNULL; 1047 np->n_regexp = (REGEXP)emalloc(sizeof(regex_t)); 1048 if ((n = REGWCOMP(np->n_regexp, s, REG_EXTENDED)) != REG_OK) { 1049 int m; 1050 char *p; 1051 1052 m = regerror(n, np->n_regexp, NULL, 0); 1053 p = (char *)emalloc(m); 1054 regerror(n, np->n_regexp, p, m); 1055 awkerr("/%S/: %s", s, p); 1056 } 1057 return (np); 1058 } 1059 /* 1060 * Get a character for the lexical analyser routine. 1061 */ 1062 static wint_t 1063 lexgetc() 1064 { 1065 register wint_t c; 1066 static char **files = &progfiles[0]; 1067 1068 if (progfp!=FNULL && (c = fgetwc(progfp))!=WEOF) 1069 ; 1070 else { 1071 if (progptr != NULL) { 1072 if (proglen-- <= 0) 1073 c = WEOF; 1074 else 1075 c = *progptr++; 1076 } else { 1077 if (progfp != FNULL) 1078 if (progfp != stdin) 1079 (void)fclose(progfp); 1080 else 1081 clearerr(progfp); 1082 progfp = FNULL; 1083 if (files < progfilep) { 1084 filename = *files++; 1085 lineno = 1; 1086 if (filename[0]=='-' && filename[1]=='\0') 1087 progfp = stdin; 1088 else if ((progfp=fopen(filename, r)) == FNULL) { 1089 (void) fprintf(stderr, 1090 gettext("script file \"%s\""), filename); 1091 exit(1); 1092 } 1093 c = fgetwc(progfp); 1094 } 1095 } 1096 } 1097 if (c == '\n') 1098 ++lineno; 1099 if (conptr >= &context[NCONTEXT]) 1100 conptr = &context[0]; 1101 if (c != WEOF) 1102 *conptr++ = c; 1103 return (c); 1104 } 1105 1106 /* 1107 * Return a character for lexical analyser. 1108 * Only one returned character is (not enforced) legitimite. 1109 */ 1110 static void 1111 lexungetc(wint_t c) 1112 { 1113 if (c == '\n') 1114 --lineno; 1115 if (c != WEOF) { 1116 if (conptr == &context[0]) 1117 conptr = &context[NCONTEXT]; 1118 *--conptr = '\0'; 1119 } 1120 if (progfp != FNULL) { 1121 (void)ungetwc(c, progfp); 1122 return; 1123 } 1124 if (c == WEOF) 1125 return; 1126 *--progptr = c; 1127 proglen++; 1128 } 1129 1130 /* 1131 * Syntax errors during parsing. 1132 */ 1133 void 1134 yyerror(char *s, ...) 1135 { 1136 if (lexlast==FUNC || lexlast==GETLINE || lexlast==KEYWORD) 1137 if (lexlast == KEYWORD) 1138 awkerr(gettext("inadmissible use of reserved keyword")); 1139 else 1140 awkerr(gettext("attempt to redefine builtin function")); 1141 awkerr(s); 1142 } 1143 1144 /* 1145 * Error routine for all awk errors. 1146 */ 1147 /* ARGSUSED */ 1148 void 1149 awkerr(char *fmt, ...) 1150 { 1151 va_list args; 1152 1153 va_start(args, fmt); 1154 awkierr(0, fmt, args); 1155 va_end(args); 1156 } 1157 1158 /* 1159 * Error routine like "awkerr" except that it prints out 1160 * a message that includes an errno-specific indication. 1161 */ 1162 /* ARGSUSED */ 1163 void 1164 awkperr(char *fmt, ...) 1165 { 1166 va_list args; 1167 1168 va_start(args, fmt); 1169 awkierr(1, fmt, args); 1170 va_end(args); 1171 } 1172 1173 /* 1174 * Common internal routine for awkerr, awkperr 1175 */ 1176 static void 1177 awkierr(int perr, char *fmt, va_list ap) 1178 { 1179 static char sep1[] = "\n>>>\t"; 1180 static char sep2[] = "\t<<<"; 1181 int saveerr = errno; 1182 1183 (void) fprintf(stderr, "%s: ", _cmdname); 1184 if (running) { 1185 (void) fprintf(stderr, gettext("line %u ("), 1186 curnode==NNULL ? 0 : curnode->n_lineno); 1187 if (phase == 0) 1188 (void) fprintf(stderr, "NR=%lld): ", (INT)exprint(varNR)); 1189 else 1190 (void) fprintf(stderr, "%s): ", 1191 phase==BEGIN ? s_BEGIN : s_END); 1192 } else if (lineno != 0) { 1193 (void) fprintf(stderr, gettext("file \"%s\": "), filename); 1194 (void) fprintf(stderr, gettext("line %u: "), lineno); 1195 } 1196 (void) vfprintf(stderr, gettext(fmt), ap); 1197 if (perr == 1) 1198 (void) fprintf(stderr, ": %s", strerror(saveerr)); 1199 if (perr != 2 && !running) { 1200 register wchar_t *cp; 1201 register int n; 1202 register int c; 1203 1204 (void) fprintf(stderr, gettext(" Context is:%s"), sep1); 1205 cp = conptr; 1206 n = NCONTEXT; 1207 do { 1208 if (cp >= &context[NCONTEXT]) 1209 cp = &context[0]; 1210 if ((c = *cp++) != '\0') 1211 (void)fputs(c=='\n' ? sep1 : toprint(c), 1212 stderr); 1213 } while (--n != 0); 1214 (void)fputs(sep2, stderr); 1215 } 1216 (void) fprintf(stderr, "\n"); 1217 exit(1); 1218 } 1219 1220 wchar_t * 1221 emalloc(unsigned n) 1222 { 1223 wchar_t *cp; 1224 1225 if ((cp = malloc(n)) == NULL) 1226 awkerr(nomem); 1227 return cp; 1228 } 1229 1230 wchar_t * 1231 erealloc(wchar_t *p, unsigned n) 1232 { 1233 wchar_t *cp; 1234 1235 if ((cp = realloc(p, n)) == NULL) 1236 awkerr(nomem); 1237 return cp; 1238 } 1239 1240 1241 /* 1242 * usage message for awk 1243 */ 1244 static int 1245 usage() 1246 { 1247 (void) fprintf(stderr, gettext( 1248 "Usage: awk [-F ERE] [-v var=val] 'program' [var=val ...] [file ...]\n" 1249 " awk [-F ERE] -f progfile ... [-v var=val] [var=val ...] [file ...]\n")); 1250 return (2); 1251 } 1252 1253 1254 static wchar_t * 1255 mbconvert(char *str) 1256 { 1257 static wchar_t *op = 0; 1258 1259 if (op != 0) 1260 free(op); 1261 return (op = mbstowcsdup(str)); 1262 } 1263 1264 char * 1265 mbunconvert(wchar_t *str) 1266 { 1267 static char *op = 0; 1268 1269 if (op != 0) 1270 free(op); 1271 return (op = wcstombsdup(str)); 1272 } 1273 1274 /* 1275 * Solaris port - following functions are typical MKS functions written 1276 * to work for Solaris. 1277 */ 1278 1279 wchar_t * 1280 mbstowcsdup(s) 1281 char *s; 1282 { 1283 int n; 1284 wchar_t *w; 1285 1286 n = strlen(s) + 1; 1287 if ((w = (wchar_t *)malloc(n * sizeof (wchar_t))) == NULL) 1288 return (NULL); 1289 1290 if (mbstowcs(w, s, n) == -1) 1291 return (NULL); 1292 return (w); 1293 1294 } 1295 1296 char * 1297 wcstombsdup(wchar_t *w) 1298 { 1299 int n; 1300 char *mb; 1301 1302 /* Fetch memory for worst case string length */ 1303 n = wslen(w) + 1; 1304 n *= MB_CUR_MAX; 1305 if ((mb = (char *)malloc(n)) == NULL) { 1306 return (NULL); 1307 } 1308 1309 /* Convert the string */ 1310 if ((n = wcstombs(mb, w, n)) == -1) { 1311 int saverr = errno; 1312 1313 free(mb); 1314 errno = saverr; 1315 return (0); 1316 } 1317 1318 /* Shrink the string down */ 1319 if ((mb = (char *)realloc(mb, strlen(mb)+1)) == NULL) { 1320 return (NULL); 1321 } 1322 return (mb); 1323 } 1324 1325 /* 1326 * The upe_ctrls[] table contains the printable 'control-sequences' for the 1327 * character values 0..31 and 127. The first entry is for value 127, thus the 1328 * entries for the remaining character values are from 1..32. 1329 */ 1330 static const char *const upe_ctrls[] = 1331 { 1332 "^?", 1333 "^@", "^A", "^B", "^C", "^D", "^E", "^F", "^G", 1334 "^H", "^I", "^J", "^K", "^L", "^M", "^N", "^O", 1335 "^P", "^Q", "^R", "^S", "^T", "^U", "^V", "^W", 1336 "^X", "^Y", "^Z", "^[", "^\\", "^]", "^^", "^_" 1337 }; 1338 1339 1340 /* 1341 * Return a printable string corresponding to the given character value. If 1342 * the character is printable, simply return it as the string. If it is in 1343 * the range specified by table 5-101 in the UPE, return the corresponding 1344 * string. Otherwise, return an octal escape sequence. 1345 */ 1346 static const char * 1347 toprint(c) 1348 wchar_t c; 1349 { 1350 int n, len; 1351 unsigned char *ptr; 1352 static char mbch[MB_LEN_MAX+1]; 1353 static char buf[5 * MB_LEN_MAX + 1]; 1354 1355 if ((n = wctomb(mbch, c)) == -1) { 1356 /* Should never happen */ 1357 (void) sprintf(buf, "\\%x", c); 1358 return (buf); 1359 } 1360 mbch[n] = '\0'; 1361 if (iswprint(c)) { 1362 return (mbch); 1363 } else if (c == 127) { 1364 return (upe_ctrls[0]); 1365 } else if (c < 32) { 1366 /* Print as in Table 5-101 in the UPE */ 1367 return (upe_ctrls[c+1]); 1368 } else { 1369 /* Print as an octal escape sequence */ 1370 for (len = 0, ptr = (unsigned char *) mbch; 0 < n; --n, ++ptr) 1371 len += sprintf(buf+len, "\\%03o", *ptr); 1372 } 1373 return (buf); 1374 } 1375 1376 static int 1377 wcoff(const wchar_t *astring, const int off) 1378 { 1379 const wchar_t *s = astring; 1380 int c = 0; 1381 char mb[MB_LEN_MAX]; 1382 1383 while (c < off) { 1384 int n; 1385 if ((n = wctomb(mb, *s)) == 0) 1386 break; 1387 if (n == -1) 1388 n = 1; 1389 c += n; 1390 s++; 1391 } 1392 1393 return (s - astring); 1394 } 1395 1396 int 1397 int_regwcomp(register regex_t *r, const wchar_t *pattern, int uflags) 1398 { 1399 char *mbpattern; 1400 int ret; 1401 1402 if ((mbpattern = wcstombsdup((wchar_t *) pattern)) == NULL) 1403 return (REG_ESPACE); 1404 1405 ret = regcomp(r, mbpattern, uflags); 1406 1407 free(mbpattern); 1408 1409 return (ret); 1410 } 1411 1412 int 1413 int_regwexec(const regex_t *r, /* compiled RE */ 1414 const wchar_t *astring, /* subject string */ 1415 size_t nsub, /* number of subexpressions */ 1416 int_regwmatch_t *sub, /* subexpression pointers */ 1417 int flags) 1418 { 1419 char *mbs; 1420 regmatch_t *mbsub = NULL; 1421 register int i; 1422 1423 if ((mbs = wcstombsdup((wchar_t *) astring)) == NULL) 1424 return (REG_ESPACE); 1425 1426 if (nsub > 0 && sub) { 1427 if ((mbsub = malloc(nsub * sizeof (regmatch_t))) == NULL) 1428 return (REG_ESPACE); 1429 } 1430 1431 i = regexec(r, mbs, nsub, mbsub, flags); 1432 1433 /* Now, adjust the pointers/counts in sub */ 1434 if (i == REG_OK && nsub > 0 && mbsub) { 1435 register int j, k; 1436 1437 for (j = 0; j < nsub; j++) { 1438 regmatch_t *ms = &mbsub[j]; 1439 int_regwmatch_t *ws = &sub[j]; 1440 1441 if ((k = ms->rm_so) >= 0) { 1442 ws->rm_so = wcoff(astring, k); 1443 ws->rm_sp = astring + ws->rm_so; 1444 } 1445 if ((k = ms->rm_eo) >= 0) { 1446 ws->rm_eo = wcoff(astring, k); 1447 ws->rm_ep = astring + ws->rm_eo; 1448 } 1449 } 1450 } 1451 1452 free(mbs); 1453 if (mbsub) 1454 free(mbsub); 1455 return (i); 1456 } 1457 1458 int 1459 int_regwdosuba(register regex_t *rp, /* compiled RE: Pattern */ 1460 const wchar_t *rpl, /* replacement string: /rpl/ */ 1461 const wchar_t *src, /* source string */ 1462 wchar_t **dstp, /* destination string */ 1463 int len, /* destination length */ 1464 int *globp) /* IN: occurence, 0 for all; OUT: substitutions */ 1465 { 1466 wchar_t *dst, *odst; 1467 register const wchar_t *ip, *xp; 1468 register wchar_t *op; 1469 register int i; 1470 register wchar_t c; 1471 int glob, iglob = *globp, oglob = 0; 1472 #define NSUB 10 1473 int_regwmatch_t rm[NSUB], *rmp; 1474 int flags; 1475 wchar_t *end; 1476 int regerr; 1477 1478 /* handle overflow of dst. we need "i" more bytes */ 1479 #ifdef OVERFLOW 1480 #undef OVERFLOW 1481 #define OVERFLOW(i) if (1) { \ 1482 int pos = op - dst; \ 1483 dst = (wchar_t *) realloc(odst = dst, \ 1484 (len += len + i) * sizeof (wchar_t)); \ 1485 if (dst == NULL) \ 1486 goto nospace; \ 1487 op = dst + pos; \ 1488 end = dst + len; \ 1489 } else 1490 #endif 1491 1492 *dstp = dst = (wchar_t *) malloc(len * sizeof (wchar_t)); 1493 if (dst == NULL) 1494 return (REG_ESPACE); 1495 1496 if (rp == NULL || rpl == NULL || src == NULL || dst == NULL) 1497 return (REG_EFATAL); 1498 1499 glob = 0; /* match count */ 1500 ip = src; /* source position */ 1501 op = dst; /* destination position */ 1502 end = dst + len; 1503 1504 flags = 0; 1505 while ((regerr = int_regwexec(rp, ip, NSUB, rm, flags)) == REG_OK) { 1506 /* Copy text preceding match */ 1507 if (op + (i = rm[0].rm_sp - ip) >= end) 1508 OVERFLOW(i); 1509 while (i--) 1510 *op++ = *ip++; 1511 1512 if (iglob == 0 || ++glob == iglob) { 1513 oglob++; 1514 xp = rpl; /* do substitute */ 1515 } else 1516 xp = L"&"; /* preserve text */ 1517 1518 /* Perform replacement of matched substing */ 1519 while ((c = *xp++) != '\0') { 1520 rmp = NULL; 1521 if (c == '&') 1522 rmp = &rm[0]; 1523 else if (c == '\\') { 1524 if ('0' <= *xp && *xp <= '9') 1525 rmp = &rm[*xp++ - '0']; 1526 else if (*xp != '\0') 1527 c = *xp++; 1528 } 1529 1530 if (rmp == NULL) { /* Ordinary character. */ 1531 *op++ = c; 1532 if (op >= end) 1533 OVERFLOW(1); 1534 } else if (rmp->rm_sp != NULL && rmp->rm_ep != NULL) { 1535 ip = rmp->rm_sp; 1536 if (op + (i = rmp->rm_ep - rmp->rm_sp) >= end) 1537 OVERFLOW(i); 1538 while (i--) 1539 *op++ = *ip++; 1540 } 1541 } 1542 1543 ip = rm[0].rm_ep; 1544 if (*ip == '\0') /* If at end break */ 1545 break; 1546 else if (rm[0].rm_sp == rm[0].rm_ep) { 1547 /* If empty match copy next char */ 1548 *op++ = *ip++; 1549 if (op >= end) 1550 OVERFLOW(1); 1551 } 1552 flags = REG_NOTBOL; 1553 } 1554 1555 if (regerr != REG_OK && regerr != REG_NOMATCH) 1556 return (regerr); 1557 1558 /* Copy rest of text */ 1559 if (op + (i = wcslen(ip)) >= end) 1560 OVERFLOW(i); 1561 while (i--) 1562 *op++ = *ip++; 1563 *op++ = '\0'; 1564 1565 if ((*dstp = dst = (wchar_t *) realloc(odst = dst, 1566 sizeof (wchar_t) * (size_t)(op - dst))) == NULL) { 1567 nospace: 1568 free(odst); 1569 return (REG_ESPACE); 1570 } 1571 1572 *globp = oglob; 1573 1574 return ((oglob == 0) ? REG_NOMATCH : REG_OK); 1575 } 1576