1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* 28 * awk -- mainline, yylex, etc. 29 * 30 * Copyright 1986, 1994 by Mortice Kern Systems Inc. All rights reserved. 
31 * 32 * Based on MKS awk(1) ported to be /usr/xpg4/bin/awk with POSIX/XCU4 changes 33 */ 34 35 #pragma ident "%Z%%M% %I% %E% SMI" 36 37 #include "awk.h" 38 #include "y.tab.h" 39 #include <stdarg.h> 40 #include <unistd.h> 41 #include <locale.h> 42 43 static char *progfiles[NPFILE]; /* Programmes files for yylex */ 44 static char **progfilep = &progfiles[0]; /* Pointer to last file */ 45 static wchar_t *progptr; /* In-memory programme */ 46 static int proglen; /* Length of progptr */ 47 static wchar_t context[NCONTEXT]; /* Circular buffer of context */ 48 static wchar_t *conptr = &context[0]; /* context ptr */ 49 static FILE *progfp; /* Stdio stream for programme */ 50 static char *filename; 51 #ifdef DEBUG 52 static int dflag; 53 #endif 54 55 #define AWK_EXEC_MAGIC "<MKS AWKC>" 56 #define LEN_EXEC_MAGIC 10 57 58 static char unbal[] = "unbalanced E char"; 59 60 static void awkarginit(int c, char **av); 61 static int lexid(wint_t c); 62 static int lexnumber(wint_t c); 63 static int lexstring(wint_t endc); 64 static int lexregexp(register wint_t endc); 65 66 static void awkvarinit(void); 67 static wint_t lexgetc(void); 68 static void lexungetc(wint_t c); 69 static size_t lexescape(wint_t endc, int regx, int cmd_line_operand); 70 static void awkierr(int perr, char *fmt, va_list ap); 71 static int usage(void); 72 void strescape(wchar_t *str); 73 static const char *toprint(wint_t); 74 char *_cmdname; 75 static wchar_t *mbconvert(char *str); 76 77 78 /* 79 * mainline for awk 80 */ 81 int 82 main(int argc, char *argv[]) 83 { 84 register wchar_t *ap; 85 register char *cmd; 86 87 cmd = argv[0]; 88 _cmdname = cmd; 89 90 linebuf = emalloc(NLINE * sizeof(wchar_t)); 91 92 /*l 93 * At this point only messaging should be internationalized. 94 * numbers are still scanned as in the Posix locale. 
95 */ 96 (void) setlocale(LC_ALL,""); 97 (void) setlocale(LC_NUMERIC,"C"); 98 #if !defined(TEXT_DOMAIN) 99 #define TEXT_DOMAIN "SYS_TEST" 100 #endif 101 (void) textdomain(TEXT_DOMAIN); 102 103 awkvarinit(); 104 /*running = 1;*/ 105 while (argc>1 && *argv[1]=='-') { 106 void *save_ptr = NULL; 107 ap = mbstowcsdup(&argv[1][1]); 108 if (ap == NULL) 109 break; 110 if (*ap == '\0') { 111 free(ap); 112 break; 113 } 114 save_ptr = (void *) ap; 115 ++argv; 116 --argc; 117 if (*ap=='-' && ap[1]=='\0') 118 break; 119 for ( ; *ap != '\0'; ++ap) { 120 switch (*ap) { 121 #ifdef DEBUG 122 case 'd': 123 dflag = 1; 124 continue; 125 126 #endif 127 case 'f': 128 if (argc < 2) { 129 (void) fprintf(stderr, 130 gettext("Missing script file\n")); 131 return (1); 132 } 133 *progfilep++ = argv[1]; 134 --argc; 135 ++argv; 136 continue; 137 138 case 'F': 139 if (ap[1] == '\0') { 140 if (argc < 2) { 141 (void) fprintf(stderr, 142 gettext("Missing field separator\n")); 143 return (1); 144 } 145 ap = mbstowcsdup(argv[1]); 146 --argc; 147 ++argv; 148 } else 149 ++ap; 150 strescape(ap); 151 strassign(varFS, linebuf, FALLOC, 152 wcslen(linebuf)); 153 break; 154 155 case 'v': { 156 register wchar_t *vp; 157 register wchar_t *arg; 158 159 if (argc < 2) { 160 (void) fprintf(stderr, 161 gettext("Missing variable assignment\n")); 162 return (1); 163 } 164 arg = mbconvert(argv[1]); 165 if ((vp = wcschr(arg, '=')) != NULL) { 166 *vp = '\0'; 167 strescape(vp+1); 168 strassign(vlook(arg), linebuf, 169 FALLOC|FSENSE, wcslen(linebuf)); 170 *vp = '='; 171 } 172 --argc; 173 ++argv; 174 continue; 175 } 176 177 default: 178 (void) fprintf(stderr, 179 gettext("Unknown option \"-%S\"\n"), ap); 180 return (usage()); 181 } 182 break; 183 } 184 if (save_ptr) 185 free(save_ptr); 186 } 187 if (progfilep == &progfiles[0]) { 188 if (argc < 2) 189 return (usage()); 190 filename = "[command line]"; /* BUG: NEEDS TRANSLATION */ 191 progptr = mbstowcsdup(argv[1]); 192 proglen = wcslen(progptr); 193 --argc; 194 ++argv; 195 
} 196 197 argv[0] = cmd; 198 199 awkarginit(argc, argv); 200 201 /*running = 0;*/ 202 (void)yyparse(); 203 204 lineno = 0; 205 /* 206 * Ok, done parsing, so now activate the rest of the nls stuff, set 207 * the radix character. 208 */ 209 (void) setlocale(LC_ALL,""); 210 radixpoint = *localeconv()->decimal_point; 211 awk(); 212 /* NOTREACHED */ 213 return (0); 214 } 215 216 /* 217 * Do initial setup of buffers, etc. 218 * This must be called before most processing 219 * and especially before lexical analysis. 220 * Variables initialised here will be overruled by command 221 * line parameter initialisation. 222 */ 223 static void 224 awkvarinit() 225 { 226 register NODE *np; 227 228 (void) setvbuf(stderr, NULL, _IONBF, 0); 229 230 if ((NIOSTREAM = sysconf(_SC_OPEN_MAX) - 4) <= 0) { 231 (void) fprintf(stderr, 232 gettext("not enough available file descriptors")); 233 exit(1); 234 } 235 ofiles = (OFILE *) emalloc(sizeof(OFILE)*NIOSTREAM); 236 #ifdef A_ZERO_POINTERS 237 (void) memset((wchar_t *) ofiles, 0, sizeof(OFILE) * NIOSTREAM); 238 #else 239 { 240 /* initialize file descriptor table */ 241 OFILE *fp; 242 for (fp = ofiles; fp < &ofiles[NIOSTREAM]; fp += 1) { 243 fp->f_fp = FNULL; 244 fp->f_mode = 0; 245 fp->f_name = (char *)0; 246 } 247 } 248 #endif 249 constant = intnode((INT)0); 250 251 const0 = intnode((INT)0); 252 const1 = intnode((INT)1); 253 constundef = emptynode(CONSTANT, 0); 254 constundef->n_flags = FSTRING|FVINT; 255 constundef->n_string = _null; 256 constundef->n_strlen = 0; 257 inc_oper = emptynode(ADD, 0); 258 inc_oper->n_right = const1; 259 asn_oper = emptynode(ADD, 0); 260 field0 = node(FIELD, const0, NNULL); 261 262 { 263 register RESFUNC near*rp; 264 265 for (rp = &resfuncs[0]; rp->rf_name != (LOCCHARP)NULL; ++rp) { 266 np = finstall(rp->rf_name, rp->rf_func, rp->rf_type); 267 } 268 } 269 { 270 register RESERVED near*rp; 271 272 for (rp = &reserved[0]; rp->r_name != (LOCCHARP)NULL; ++rp) { 273 switch (rp->r_type) { 274 case SVAR: 275 case VAR: 
276 running = 1; 277 np = vlook(rp->r_name); 278 if (rp->r_type == SVAR) 279 np->n_flags |= FSPECIAL; 280 if (rp->r_svalue != NULL) 281 strassign(np, rp->r_svalue, FSTATIC, 282 (size_t)rp->r_ivalue); 283 else { 284 constant->n_int = rp->r_ivalue; 285 (void)assign(np, constant); 286 } 287 running = 0; 288 break; 289 290 case KEYWORD: 291 kinstall(rp->r_name, (int)rp->r_ivalue); 292 break; 293 } 294 } 295 } 296 297 varNR = vlook(s_NR); 298 varFNR = vlook(s_FNR); 299 varNF = vlook(s_NF); 300 varOFMT = vlook(s_OFMT); 301 varCONVFMT = vlook(s_CONVFMT); 302 varOFS = vlook(s_OFS); 303 varORS = vlook(s_ORS); 304 varRS = vlook(s_RS); 305 varFS = vlook(s_FS); 306 varARGC = vlook(s_ARGC); 307 varSUBSEP = vlook(s_SUBSEP); 308 varENVIRON = vlook(s_ENVIRON); 309 varFILENAME = vlook(s_FILENAME); 310 varSYMTAB = vlook(s_SYMTAB); 311 incNR = node(ASG, varNR, node(ADD, varNR, const1)); 312 incFNR = node(ASG, varFNR, node(ADD, varFNR, const1)); 313 clrFNR = node(ASG, varFNR, const0); 314 } 315 316 /* 317 * Initialise awk ARGC, ARGV variables. 318 */ 319 static void 320 awkarginit(int ac, char **av) 321 { 322 register int i; 323 register wchar_t *cp; 324 325 ARGVsubi = node(INDEX, vlook(s_ARGV), constant); 326 running = 1; 327 constant->n_int = ac; 328 (void)assign(varARGC, constant); 329 for (i = 0; i < ac; ++i) { 330 cp = mbstowcsdup(av[i]); 331 constant->n_int = i; 332 strassign(exprreduce(ARGVsubi), cp, 333 FSTATIC|FSENSE, wcslen(cp)); 334 } 335 running = 0; 336 } 337 338 /* 339 * Clean up when done parsing a function. 340 * All formal parameters, because of a deal (funparm) in 341 * yylex, get put into the symbol table in front of any 342 * global variable of the same name. When the entire 343 * function is parsed, remove these formal dummy nodes 344 * from the symbol table but retain the nodes because 345 * the generated tree points at them. 
/*
 * Clean up when done parsing a function: walk the list `np' with
 * getlist() and remove every formal parameter node from the symbol
 * table with delsymtab().
 */
void
uexit(NODE *np)
{
	register NODE *formal;

	while ((formal = getlist(&np)) != NNULL)
		delsymtab(formal, 0);
}

/*
 * The lexical analyzer.
 *
 * Returns the next token; the token's node, if any, is left in
 * yylval.node by the lex*() helpers.  State kept between calls:
 *	savetoken	token to be returned by the next call
 *	lexlast		previous token (before the ctosym[] mapping)
 *	catterm		non-zero when the previous token ended a term, so
 *			that a following term means concatenation (CONCAT)
 *	wasfield	count-down set when a CONSTANT follows '$'
 *	isfuncdef	set after DEFFUNC so the function name's '(' is not
 *			examined
 *	nbrace, nparen, nbracket
 *			nesting depth of {}, () and []
 *
 * When built with DEBUG, yylex() is a wrapper that prints each token
 * returned by the real lexer, which is then named yyhex().
 */
int
yylex()
#ifdef	DEBUG
{
	register int l;

	l = yyhex();
	if (dflag)
		(void) printf("%d\n", l);
	return (l);
}
yyhex()
#endif
{
	register wint_t c, c1;
	int i;
	static int savetoken = 0;
	static int wasfield;
	static int isfuncdef;
	static int nbrace, nparen, nbracket;
	/* single characters and the token each is finally mapped to */
	static struct ctosymstruct {
		wint_t c, sym;
	} ctosym[] = {
		{ '|', BAR },		{ '^', CARAT },
		{ '~', TILDE },		{ '<', LANGLE },
		{ '>', RANGLE },	{ '+', PLUSC },
		{ '-', HYPHEN },	{ '*', STAR },
		{ '/', SLASH },		{ '%', PERCENT },
		{ '!', EXCLAMATION },	{ '$', DOLLAR },
		{ '[', LSQUARE },	{ ']', RSQUARE },
		{ '(', LPAREN },	{ ')', RPAREN },
		{ ';', SEMI },		{ '{', LBRACE },
		{ '}', RBRACE },	{ 0, 0 }
	};

	if (savetoken) {
		/* a token was queued by the previous call */
		c = savetoken;
		savetoken = 0;
	} else if (redelim != '\0') {
		/*
		 * redelim is set (outside this function): read a regular
		 * expression up to that delimiter now and queue the
		 * delimiter itself as the next token.
		 */
		c = redelim;
		redelim = 0;
		catterm = 0;
		savetoken = c;
		return (lexlast = lexregexp(c));
	} else while ((c = lexgetc()) != WEOF) {
		/*
		 * Each pass either produces a token (and breaks out at the
		 * bottom of the loop) or `continue's to skip input.
		 */
		if (iswalpha(c) || c=='_') {
			c = lexid(c);
		} else if (iswdigit(c) || c=='.') {
			c = lexnumber(c);
		} else if (isWblank(c)) {
			continue;
		} else switch (c) {
#if DOS || OS2
		case 032:			/* ^Z */
			continue;
#endif

		case '"':
			c = lexstring(c);
			break;

		case '#':
			/* comment: skip to end of line, keep the newline/EOF */
			while ((c = lexgetc())!='\n' && c!=WEOF)
				;
			lexungetc(c);
			continue;

		case '+':
			if ((c1 = lexgetc()) == '+')
				c = INC;
			else if (c1 == '=')
				c = AADD;
			else
				lexungetc(c1);
			break;

		case '-':
			if ((c1 = lexgetc()) == '-')
				c = DEC;
			else if (c1 == '=')
				c = ASUB;
			else
				lexungetc(c1);
			break;

		case '*':
			/* "*=", "**", "**=" or plain "*" */
			if ((c1 = lexgetc()) == '=')
				c = AMUL;
			else if (c1 == '*') {
				if ((c1 = lexgetc()) == '=')
					c = AEXP;
				else {
					c = EXP;
					lexungetc(c1);
				}
			} else
				lexungetc(c1);
			break;

		case '^':
			if ((c1 = lexgetc()) == '=') {
				c = AEXP;
			} else {
				c = EXP;
				lexungetc(c1);
			}
			break;

		case '/':
			/*
			 * "/=" is only taken as ADIV when the previous token
			 * is not one of those listed here; otherwise the '='
			 * is pushed back and '/' is returned on its own.
			 */
			if ((c1 = lexgetc()) == '='
			 && lexlast!=RE && lexlast!=NRE
			 && lexlast!=';' && lexlast!='\n'
			 && lexlast!=',' && lexlast!='(')
				c = ADIV;
			else
				lexungetc(c1);
			break;

		case '%':
			if ((c1 = lexgetc()) == '=')
				c = AREM;
			else
				lexungetc(c1);
			break;

		case '&':
			if ((c1 = lexgetc()) == '&')
				c = AND;
			else
				lexungetc(c1);
			break;

		case '|':
			/* "||" is OR; a single '|' is PIPE while inprint is set */
			if ((c1 = lexgetc()) == '|')
				c = OR;
			else {
				lexungetc(c1);
				if (inprint)
					c = PIPE;
			}
			break;

		case '>':
			/*
			 * ">=" and ">>"; a single '>' is WRITE while inprint
			 * is set and no parenthesis is open.
			 */
			if ((c1 = lexgetc()) == '=')
				c = GE;
			else if (c1 == '>')
				c = APPEND;
			else {
				lexungetc(c1);
				if (nparen==0 && inprint)
					c = WRITE;
			}
			break;

		case '<':
			if ((c1 = lexgetc()) == '=')
				c = LE;
			else
				lexungetc(c1);
			break;

		case '!':
			if ((c1 = lexgetc()) == '=')
				c = NE;
			else if (c1 == '~')
				c = NRE;
			else
				lexungetc(c1);
			break;

		case '=':
			if ((c1 = lexgetc()) == '=')
				c = EQ;
			else {
				lexungetc(c1);
				c = ASG;
			}
			break;

		case '\n':
			/*
			 * A newline is either skipped (`continue') or turned
			 * into ';', depending on the previous token.
			 */
			switch (lexlast) {
			case ')':
				if (catterm || inprint) {
					c = ';';
					break;
				}
				/* FALLTHROUGH */
			case AND:
			case OR:
			case COMMA:
			case '{':
			case ELSE:
			case ';':
			case DO:
				continue;

			case '}':
				if (nbrace != 0)
					continue;
				/* FALLTHROUGH */

			default:
				c = ';';
				break;
			}
			break;

		case ELSE:
			if (lexlast != ';') {
				savetoken = ELSE;
				c = ';';
			}
			break;

		case '(':
			++nparen;
			break;

		case ')':
			if (--nparen < 0)
				awkerr(unbal, "()");
			break;

		case '{':
			nbrace++;
			break;

		case '}':
			if (--nbrace < 0) {
				char brk[3];

				brk[0] = '{';
				brk[1] = '}';
				brk[2] = '\0';
				awkerr(unbal, brk);
			}
			/* return ';' first and queue the '}' for the next call */
			if (lexlast != ';') {
				savetoken = c;
				c = ';';
			}
			break;

		case '[':
			++nbracket;
			break;

		case ']':
			if (--nbracket < 0) {
				char brk[3];

				brk[0] = '[';
				brk[1] = ']';
				brk[2] = '\0';
				awkerr(unbal, brk);
			}
			break;

		case '\\':
			/* backslash-newline is skipped */
			if ((c1 = lexgetc()) == '\n')
				continue;
			lexungetc(c1);
			break;

		case ',':
			c = COMMA;
			break;

		case '?':
			c = QUEST;
			break;

		case ':':
			c = COLON;
			break;

		default:
			if (!iswprint(c))
				awkerr(
				    gettext("invalid character \"%s\""),
				    toprint(c));
			break;
		}
		break;
	}

	/*
	 * Second pass over the token: maintain catterm and queue the real
	 * token behind a CONCAT when a term directly follows another term.
	 */
	switch (c) {
	case ']':
		++catterm;
		break;

	case VAR:
		if (catterm) {
			savetoken = c;
			c = CONCAT;
			catterm = 0;
		} else if (!isfuncdef) {
			/* a name followed by '(' does not end a term */
			if ((c1=lexgetc()) != '(')
				++catterm;
			lexungetc(c1);
		}
		isfuncdef = 0;
		break;

	case PARM:
	case CONSTANT:
		if (catterm) {
			savetoken = c;
			c = CONCAT;
			catterm = 0;
		} else {
			if (lexlast == '$')
				wasfield = 2;
			++catterm;
		}
		break;

	case INC:
	case DEC:
		if (!catterm || lexlast!=CONSTANT || wasfield)
			break;
		/* FALLTHROUGH */

	case UFUNC:
	case FUNC:
	case GETLINE:
	case '!':
	case '$':
	case '(':
		if (catterm) {
			savetoken = c;
			c = CONCAT;
			catterm = 0;
		}
		break;

	/*{*/case '}':
		if (nbrace == 0)
			savetoken = ';';
		/* FALLTHROUGH */
	case ';':
		inprint = 0;
		/* FALLTHROUGH */
	default:
		if (c == DEFFUNC)
			isfuncdef = 1;
		catterm = 0;
	}
	lexlast = c;
	if (wasfield)
		wasfield--;
	/*
	 * Map character constants to symbolic names.
	 */
	for (i = 0; ctosym[i].c != 0; i++)
		if (c == ctosym[i].c) {
			c = ctosym[i].sym;
			break;
		}
	return ((int)c);
}

/*
 * Read a number for the lexical analyzer.
 * Input is the first character of the number.
 * Return value is the lexical type.
 *
 * The digits are collected in linebuf.  A text without '.' or exponent
 * that wcstol() converts without ERANGE becomes an integer node,
 * anything else a real node; the node is left in yylval.node.  A lone
 * '.' returns DOT instead of CONSTANT.
 */
static int
lexnumber(wint_t c)
{
	register wchar_t *cp;
	register int dotfound = 0;
	register int efound = 0;
	INT number;

	cp = linebuf;
	do {
		if (iswdigit(c))
			;
		else if (c == '.') {
			/* a second '.' ends the number */
			if (dotfound++)
				break;
		} else if (c=='e' || c=='E') {
			/*
			 * Exponent: when a sign follows, store 'e' now and
			 * the sign below; otherwise push the character back
			 * and store just 'e'.
			 */
			if ((c = lexgetc())!='-' && c!='+') {
				lexungetc(c);
				c = 'e';
			} else
				*cp++ = 'e';
			if (efound++)
				break;
		} else
			break;
		*cp++ = c;
	} while ((c = lexgetc()) != WEOF);
	*cp = '\0';
	/*
	 * NOTE(review): on this path the character that ended the scan is
	 * not pushed back (the lexungetc() below is skipped) -- confirm
	 * that this is intended.
	 */
	if (dotfound && cp==linebuf+1)
		return (DOT);
	lexungetc(c);
	errno = 0;
	if (!dotfound
	 && !efound
	 && ((number=wcstol(linebuf, (wchar_t **)0, 10)), errno!=ERANGE))
		yylval.node = intnode(number);
	else
		yylval.node = realnode((REAL)wcstod(linebuf, (wchar_t **)0));
	return (CONSTANT);
}

/*
 * Read an identifier.
 * Input is first character of identifier.
 *
 * The name is collected in linebuf and looked up with vlook(); the node
 * is left in yylval.node.  Returns the keyword's own token for keywords,
 * VAR for variables, arrays and parameters, and the node type for
 * functions and getline.
 */
static int
lexid(wint_t c)
{
	register wchar_t *cp;
	register size_t i;
	register NODE *np;

	cp = linebuf;
	do {
		*cp++ = c;
		c = lexgetc();
	} while (iswalpha(c) || iswdigit(c) || c=='_');
	*cp = '\0';
	lexungetc(c);
	yylval.node = np = vlook(linebuf);

	switch(np->n_type) {
	case KEYWORD:
		switch (np->n_keywtype) {
		case PRINT:
		case PRINTF:
			/* yylex() uses inprint to recognise '>' and '|' */
			++inprint;
			/* FALLTHROUGH */
		default:
			return ((int)np->n_keywtype);
		}
		/* NOTREACHED */

	case ARRAY:
	case VAR:
		/*
		 * If reading the argument list, create a dummy node
		 * for the duration of that function. These variables
		 * can be removed from the symbol table at function end
		 * but they must still exist because the execution tree
		 * knows about them.
		 */
		if (funparm) {
do_funparm:
			np = emptynode(PARM, i=(cp-linebuf));
			np->n_flags = FSTRING;
			np->n_string = _null;
			np->n_strlen = 0;
			/* copy the name including its terminating null */
			(void) memcpy(np->n_name, linebuf,
			    (i+1) * sizeof(wchar_t));
			addsymtab(np);
			yylval.node = np;
		} else if (np == varNF || (np == varFS &&
		    (!doing_begin || begin_getline))) {
			/*
			 * If the user program references NF or sets
			 * FS either outside of a begin block or
			 * in a begin block after a getline then the
			 * input line will be split immediately upon read
			 * rather than when a field is first referenced.
			 */
			needsplit = 1;
		} else if (np == varENVIRON)
			needenviron = 1;
		/* FALLTHROUGH */
	case PARM:
		return (VAR);

	case UFUNC:
		/*
		 * It is ok to redefine functions as parameters
		 */
		if (funparm) goto do_funparm;
		/* FALLTHROUGH */
	case FUNC:
	case GETLINE:
		/*
		 * When a getline is encountered, clear the 'doing_begin' flag.
		 * This will force the 'needsplit' flag to be set, even inside
		 * a begin block, if FS is altered. (See VAR case above)
		 */
		if (doing_begin)
			begin_getline = 1;
		return (np->n_type);
	}
	/* NOTREACHED */
	return (0);
}

/*
 * Read a string for the lexical analyzer.
 * `endc' terminates the string.
 * The text is collected by lexescape() and stored in yylval.node as a
 * string node; always returns CONSTANT.
 */
static int
lexstring(wint_t endc)
{
	register size_t length = lexescape(endc, 0, 0);

	yylval.node = stringnode(linebuf, FALLOC, length);
	return (CONSTANT);
}

/*
 * Read a regular expression.
 * `endc' terminates the expression.  The text is collected by
 * lexescape() in regular expression mode, compiled by renode() and
 * stored in yylval.node; always returns URE.
 */
static int
lexregexp(wint_t endc)
{
	(void) lexescape(endc, 1, 0);
	yylval.node = renode(linebuf);
	return (URE);
}

/*
 * Process a string, converting the escape characters as required by
 * 1003.2. The processed string ends up in the global linebuf[].
This 874 * routine also changes the value of 'progfd' - the program file 875 * descriptor, so it should be used with some care. It is presently used to 876 * process -v (awk1.c) and var=str type arguments (awk2.c, nextrecord()). 877 */ 878 void 879 strescape(wchar_t *str) 880 { 881 progptr = str; 882 proglen = wcslen(str) + 1; /* Include \0 */ 883 (void) lexescape('\0', 0, 1); 884 progptr = NULL; 885 } 886 887 /* 888 * Read a string or regular expression, terminated by ``endc'', 889 * for lexical analyzer, processing escape sequences. 890 * Return string length. 891 */ 892 static size_t 893 lexescape(wint_t endc, int regx, int cmd_line_operand) 894 { 895 static char nlre[256]; 896 static char nlstr[256]; 897 static char eofre[256]; 898 static char eofstr[256]; 899 int first_time = 1; 900 wint_t c; 901 wchar_t *cp; 902 int n, max; 903 904 if (first_time == 1) { 905 (void) strcpy(nlre, gettext("Newline in regular expression\n")); 906 (void) strcpy(nlstr, gettext("Newline in string\n")); 907 (void) strcpy(eofre, gettext("EOF in regular expression\n")); 908 (void) strcpy(eofstr, gettext("EOF in string\n")); 909 first_time = 0; 910 } 911 912 cp = linebuf; 913 while ((c = lexgetc()) != endc) { 914 if (c == '\n') 915 awkerr(regx ? 
nlre : nlstr); 916 if (c == '\\') { 917 switch (c = lexgetc(), c) { 918 case '\\': 919 if (regx) 920 *cp++ = '\\'; 921 break; 922 923 case '/': 924 c = '/'; 925 break; 926 927 case 'n': 928 c = '\n'; 929 break; 930 931 case 'b': 932 c = '\b'; 933 break; 934 935 case 't': 936 c = '\t'; 937 break; 938 939 case 'r': 940 c = '\r'; 941 break; 942 943 case 'f': 944 c = '\f'; 945 break; 946 947 case 'v': 948 c = '\v'; 949 break; 950 951 case 'a': 952 c = (char) 0x07; 953 break; 954 955 case 'x': 956 n = 0; 957 while (iswxdigit(c = lexgetc())) { 958 if (iswdigit(c)) 959 c -= '0'; 960 else if (iswupper(c)) 961 c -= 'A'-10; 962 else 963 c -= 'a'-10; 964 n = (n<<4) + c; 965 } 966 lexungetc(c); 967 c = n; 968 break; 969 970 case '0': 971 case '1': 972 case '2': 973 case '3': 974 case '4': 975 case '5': 976 case '6': 977 case '7': 978 #if 0 979 /* 980 * Posix.2 draft 10 disallows the use of back-referencing - it explicitly 981 * requires processing of the octal escapes both in strings and 982 * regular expressions. The following code is disabled instead of 983 * removed as back-referencing may be reintroduced in a future draft 984 * of the standard. 985 */ 986 /* 987 * For regular expressions, we disallow 988 * \ooo to mean octal character, in favour 989 * of back referencing. 990 */ 991 if (regx) { 992 *cp++ = '\\'; 993 break; 994 } 995 #endif 996 max = 3; 997 n = 0; 998 do { 999 n = (n<<3) + c-'0'; 1000 if ((c = lexgetc())>'7' || c<'0') 1001 break; 1002 } while (--max); 1003 lexungetc(c); 1004 /* 1005 * an octal escape sequence must have at least 1006 * 2 digits after the backslash, otherwise 1007 * it gets passed straight thru for possible 1008 * use in backreferencing. 1009 */ 1010 if (max == 3) { 1011 *cp++ = '\\'; 1012 n += '0'; 1013 } 1014 c = n; 1015 break; 1016 1017 case '\n': 1018 continue; 1019 1020 default: 1021 if (c != endc || cmd_line_operand) { 1022 *cp++ = '\\'; 1023 if (c == endc) 1024 lexungetc(c); 1025 } 1026 } 1027 } 1028 if (c == WEOF) 1029 awkerr(regx ? 
eofre : eofstr); 1030 *cp++ = c; 1031 } 1032 *cp = '\0'; 1033 return (cp - linebuf); 1034 } 1035 1036 /* 1037 * Build a regular expression NODE. 1038 * Argument is the string holding the expression. 1039 */ 1040 NODE * 1041 renode(wchar_t *s) 1042 { 1043 register NODE *np; 1044 int n; 1045 1046 np = emptynode(RE, 0); 1047 np->n_left = np->n_right = NNULL; 1048 np->n_regexp = (REGEXP)emalloc(sizeof(regex_t)); 1049 if ((n = REGWCOMP(np->n_regexp, s, REG_EXTENDED)) != REG_OK) { 1050 int m; 1051 char *p; 1052 1053 m = regerror(n, np->n_regexp, NULL, 0); 1054 p = (char *)emalloc(m); 1055 regerror(n, np->n_regexp, p, m); 1056 awkerr("/%S/: %s", s, p); 1057 } 1058 return (np); 1059 } 1060 /* 1061 * Get a character for the lexical analyser routine. 1062 */ 1063 static wint_t 1064 lexgetc() 1065 { 1066 register wint_t c; 1067 static char **files = &progfiles[0]; 1068 1069 if (progfp!=FNULL && (c = fgetwc(progfp))!=WEOF) 1070 ; 1071 else { 1072 if (progptr != NULL) { 1073 if (proglen-- <= 0) 1074 c = WEOF; 1075 else 1076 c = *progptr++; 1077 } else { 1078 if (progfp != FNULL) 1079 if (progfp != stdin) 1080 (void)fclose(progfp); 1081 else 1082 clearerr(progfp); 1083 progfp = FNULL; 1084 if (files < progfilep) { 1085 filename = *files++; 1086 lineno = 1; 1087 if (filename[0]=='-' && filename[1]=='\0') 1088 progfp = stdin; 1089 else if ((progfp=fopen(filename, r)) == FNULL) { 1090 (void) fprintf(stderr, 1091 gettext("script file \"%s\""), filename); 1092 exit(1); 1093 } 1094 c = fgetwc(progfp); 1095 } 1096 } 1097 } 1098 if (c == '\n') 1099 ++lineno; 1100 if (conptr >= &context[NCONTEXT]) 1101 conptr = &context[0]; 1102 if (c != WEOF) 1103 *conptr++ = c; 1104 return (c); 1105 } 1106 1107 /* 1108 * Return a character for lexical analyser. 1109 * Only one returned character is (not enforced) legitimite. 
1110 */ 1111 static void 1112 lexungetc(wint_t c) 1113 { 1114 if (c == '\n') 1115 --lineno; 1116 if (c != WEOF) { 1117 if (conptr == &context[0]) 1118 conptr = &context[NCONTEXT]; 1119 *--conptr = '\0'; 1120 } 1121 if (progfp != FNULL) { 1122 (void)ungetwc(c, progfp); 1123 return; 1124 } 1125 if (c == WEOF) 1126 return; 1127 *--progptr = c; 1128 proglen++; 1129 } 1130 1131 /* 1132 * Syntax errors during parsing. 1133 */ 1134 void 1135 yyerror(char *s, ...) 1136 { 1137 if (lexlast==FUNC || lexlast==GETLINE || lexlast==KEYWORD) 1138 if (lexlast == KEYWORD) 1139 awkerr(gettext("inadmissible use of reserved keyword")); 1140 else 1141 awkerr(gettext("attempt to redefine builtin function")); 1142 awkerr(s); 1143 } 1144 1145 /* 1146 * Error routine for all awk errors. 1147 */ 1148 /* ARGSUSED */ 1149 void 1150 awkerr(char *fmt, ...) 1151 { 1152 va_list args; 1153 1154 va_start(args, fmt); 1155 awkierr(0, fmt, args); 1156 va_end(args); 1157 } 1158 1159 /* 1160 * Error routine like "awkerr" except that it prints out 1161 * a message that includes an errno-specific indication. 1162 */ 1163 /* ARGSUSED */ 1164 void 1165 awkperr(char *fmt, ...) 1166 { 1167 va_list args; 1168 1169 va_start(args, fmt); 1170 awkierr(1, fmt, args); 1171 va_end(args); 1172 } 1173 1174 /* 1175 * Common internal routine for awkerr, awkperr 1176 */ 1177 static void 1178 awkierr(int perr, char *fmt, va_list ap) 1179 { 1180 static char sep1[] = "\n>>>\t"; 1181 static char sep2[] = "\t<<<"; 1182 int saveerr = errno; 1183 1184 (void) fprintf(stderr, "%s: ", _cmdname); 1185 if (running) { 1186 (void) fprintf(stderr, gettext("line %u ("), 1187 curnode==NNULL ? 0 : curnode->n_lineno); 1188 if (phase == 0) 1189 (void) fprintf(stderr, "NR=%lld): ", (INT)exprint(varNR)); 1190 else 1191 (void) fprintf(stderr, "%s): ", 1192 phase==BEGIN ? 
s_BEGIN : s_END); 1193 } else if (lineno != 0) { 1194 (void) fprintf(stderr, gettext("file \"%s\": "), filename); 1195 (void) fprintf(stderr, gettext("line %u: "), lineno); 1196 } 1197 (void) vfprintf(stderr, gettext(fmt), ap); 1198 if (perr == 1) 1199 (void) fprintf(stderr, ": %s", strerror(saveerr)); 1200 if (perr != 2 && !running) { 1201 register wchar_t *cp; 1202 register int n; 1203 register int c; 1204 1205 (void) fprintf(stderr, gettext(" Context is:%s"), sep1); 1206 cp = conptr; 1207 n = NCONTEXT; 1208 do { 1209 if (cp >= &context[NCONTEXT]) 1210 cp = &context[0]; 1211 if ((c = *cp++) != '\0') 1212 (void)fputs(c=='\n' ? sep1 : toprint(c), 1213 stderr); 1214 } while (--n != 0); 1215 (void)fputs(sep2, stderr); 1216 } 1217 (void) fprintf(stderr, "\n"); 1218 exit(1); 1219 } 1220 1221 wchar_t * 1222 emalloc(unsigned n) 1223 { 1224 wchar_t *cp; 1225 1226 if ((cp = malloc(n)) == NULL) 1227 awkerr(nomem); 1228 return cp; 1229 } 1230 1231 wchar_t * 1232 erealloc(wchar_t *p, unsigned n) 1233 { 1234 wchar_t *cp; 1235 1236 if ((cp = realloc(p, n)) == NULL) 1237 awkerr(nomem); 1238 return cp; 1239 } 1240 1241 1242 /* 1243 * usage message for awk 1244 */ 1245 static int 1246 usage() 1247 { 1248 (void) fprintf(stderr, gettext( 1249 "Usage: awk [-F ERE] [-v var=val] 'program' [var=val ...] [file ...]\n" 1250 " awk [-F ERE] -f progfile ... [-v var=val] [var=val ...] [file ...]\n")); 1251 return (2); 1252 } 1253 1254 1255 static wchar_t * 1256 mbconvert(char *str) 1257 { 1258 static wchar_t *op = 0; 1259 1260 if (op != 0) 1261 free(op); 1262 return (op = mbstowcsdup(str)); 1263 } 1264 1265 char * 1266 mbunconvert(wchar_t *str) 1267 { 1268 static char *op = 0; 1269 1270 if (op != 0) 1271 free(op); 1272 return (op = wcstombsdup(str)); 1273 } 1274 1275 /* 1276 * Solaris port - following functions are typical MKS functions written 1277 * to work for Solaris. 
1278 */ 1279 1280 wchar_t * 1281 mbstowcsdup(s) 1282 char *s; 1283 { 1284 int n; 1285 wchar_t *w; 1286 1287 n = strlen(s) + 1; 1288 if ((w = (wchar_t *)malloc(n * sizeof (wchar_t))) == NULL) 1289 return (NULL); 1290 1291 if (mbstowcs(w, s, n) == -1) 1292 return (NULL); 1293 return (w); 1294 1295 } 1296 1297 char * 1298 wcstombsdup(wchar_t *w) 1299 { 1300 int n; 1301 char *mb; 1302 1303 /* Fetch memory for worst case string length */ 1304 n = wslen(w) + 1; 1305 n *= MB_CUR_MAX; 1306 if ((mb = (char *)malloc(n)) == NULL) { 1307 return (NULL); 1308 } 1309 1310 /* Convert the string */ 1311 if ((n = wcstombs(mb, w, n)) == -1) { 1312 int saverr = errno; 1313 1314 free(mb); 1315 errno = saverr; 1316 return (0); 1317 } 1318 1319 /* Shrink the string down */ 1320 if ((mb = (char *)realloc(mb, strlen(mb)+1)) == NULL) { 1321 return (NULL); 1322 } 1323 return (mb); 1324 } 1325 1326 /* 1327 * The upe_ctrls[] table contains the printable 'control-sequences' for the 1328 * character values 0..31 and 127. The first entry is for value 127, thus the 1329 * entries for the remaining character values are from 1..32. 1330 */ 1331 static const char *const upe_ctrls[] = 1332 { 1333 "^?", 1334 "^@", "^A", "^B", "^C", "^D", "^E", "^F", "^G", 1335 "^H", "^I", "^J", "^K", "^L", "^M", "^N", "^O", 1336 "^P", "^Q", "^R", "^S", "^T", "^U", "^V", "^W", 1337 "^X", "^Y", "^Z", "^[", "^\\", "^]", "^^", "^_" 1338 }; 1339 1340 1341 /* 1342 * Return a printable string corresponding to the given character value. If 1343 * the character is printable, simply return it as the string. If it is in 1344 * the range specified by table 5-101 in the UPE, return the corresponding 1345 * string. Otherwise, return an octal escape sequence. 
1346 */ 1347 static const char * 1348 toprint(c) 1349 wchar_t c; 1350 { 1351 int n, len; 1352 unsigned char *ptr; 1353 static char mbch[MB_LEN_MAX+1]; 1354 static char buf[5 * MB_LEN_MAX + 1]; 1355 1356 if ((n = wctomb(mbch, c)) == -1) { 1357 /* Should never happen */ 1358 (void) sprintf(buf, "\\%x", c); 1359 return (buf); 1360 } 1361 mbch[n] = '\0'; 1362 if (iswprint(c)) { 1363 return (mbch); 1364 } else if (c == 127) { 1365 return (upe_ctrls[0]); 1366 } else if (c < 32) { 1367 /* Print as in Table 5-101 in the UPE */ 1368 return (upe_ctrls[c+1]); 1369 } else { 1370 /* Print as an octal escape sequence */ 1371 for (len = 0, ptr = (unsigned char *) mbch; 0 < n; --n, ++ptr) 1372 len += sprintf(buf+len, "\\%03o", *ptr); 1373 } 1374 return (buf); 1375 } 1376 1377 static int 1378 wcoff(const wchar_t *astring, const int off) 1379 { 1380 const wchar_t *s = astring; 1381 int c = 0; 1382 char mb[MB_LEN_MAX]; 1383 1384 while (c < off) { 1385 int n; 1386 if ((n = wctomb(mb, *s)) == 0) 1387 break; 1388 if (n == -1) 1389 n = 1; 1390 c += n; 1391 s++; 1392 } 1393 1394 return (s - astring); 1395 } 1396 1397 int 1398 int_regwcomp(register regex_t *r, const wchar_t *pattern, int uflags) 1399 { 1400 char *mbpattern; 1401 int ret; 1402 1403 if ((mbpattern = wcstombsdup((wchar_t *) pattern)) == NULL) 1404 return (REG_ESPACE); 1405 1406 ret = regcomp(r, mbpattern, uflags); 1407 1408 free(mbpattern); 1409 1410 return (ret); 1411 } 1412 1413 int 1414 int_regwexec(const regex_t *r, /* compiled RE */ 1415 const wchar_t *astring, /* subject string */ 1416 size_t nsub, /* number of subexpressions */ 1417 int_regwmatch_t *sub, /* subexpression pointers */ 1418 int flags) 1419 { 1420 char *mbs; 1421 regmatch_t *mbsub = NULL; 1422 register int i; 1423 1424 if ((mbs = wcstombsdup((wchar_t *) astring)) == NULL) 1425 return (REG_ESPACE); 1426 1427 if (nsub > 0 && sub) { 1428 if ((mbsub = malloc(nsub * sizeof (regmatch_t))) == NULL) 1429 return (REG_ESPACE); 1430 } 1431 1432 i = regexec(r, mbs, 
nsub, mbsub, flags); 1433 1434 /* Now, adjust the pointers/counts in sub */ 1435 if (i == REG_OK && nsub > 0 && mbsub) { 1436 register int j, k; 1437 1438 for (j = 0; j < nsub; j++) { 1439 regmatch_t *ms = &mbsub[j]; 1440 int_regwmatch_t *ws = &sub[j]; 1441 1442 if ((k = ms->rm_so) >= 0) { 1443 ws->rm_so = wcoff(astring, k); 1444 ws->rm_sp = astring + ws->rm_so; 1445 } 1446 if ((k = ms->rm_eo) >= 0) { 1447 ws->rm_eo = wcoff(astring, k); 1448 ws->rm_ep = astring + ws->rm_eo; 1449 } 1450 } 1451 } 1452 1453 free(mbs); 1454 if (mbsub) 1455 free(mbsub); 1456 return (i); 1457 } 1458 1459 int 1460 int_regwdosuba(register regex_t *rp, /* compiled RE: Pattern */ 1461 const wchar_t *rpl, /* replacement string: /rpl/ */ 1462 const wchar_t *src, /* source string */ 1463 wchar_t **dstp, /* destination string */ 1464 int len, /* destination length */ 1465 int *globp) /* IN: occurence, 0 for all; OUT: substitutions */ 1466 { 1467 wchar_t *dst, *odst; 1468 register const wchar_t *ip, *xp; 1469 register wchar_t *op; 1470 register int i; 1471 register wchar_t c; 1472 int glob, iglob = *globp, oglob = 0; 1473 #define NSUB 10 1474 int_regwmatch_t rm[NSUB], *rmp; 1475 int flags; 1476 wchar_t *end; 1477 int regerr; 1478 1479 /* handle overflow of dst. 
we need "i" more bytes */ 1480 #ifdef OVERFLOW 1481 #undef OVERFLOW 1482 #define OVERFLOW(i) if (1) { \ 1483 int pos = op - dst; \ 1484 dst = (wchar_t *) realloc(odst = dst, \ 1485 (len += len + i) * sizeof (wchar_t)); \ 1486 if (dst == NULL) \ 1487 goto nospace; \ 1488 op = dst + pos; \ 1489 end = dst + len; \ 1490 } else 1491 #endif 1492 1493 *dstp = dst = (wchar_t *) malloc(len * sizeof (wchar_t)); 1494 if (dst == NULL) 1495 return (REG_ESPACE); 1496 1497 if (rp == NULL || rpl == NULL || src == NULL || dst == NULL) 1498 return (REG_EFATAL); 1499 1500 glob = 0; /* match count */ 1501 ip = src; /* source position */ 1502 op = dst; /* destination position */ 1503 end = dst + len; 1504 1505 flags = 0; 1506 while ((regerr = int_regwexec(rp, ip, NSUB, rm, flags)) == REG_OK) { 1507 /* Copy text preceding match */ 1508 if (op + (i = rm[0].rm_sp - ip) >= end) 1509 OVERFLOW(i); 1510 while (i--) 1511 *op++ = *ip++; 1512 1513 if (iglob == 0 || ++glob == iglob) { 1514 oglob++; 1515 xp = rpl; /* do substitute */ 1516 } else 1517 xp = L"&"; /* preserve text */ 1518 1519 /* Perform replacement of matched substing */ 1520 while ((c = *xp++) != '\0') { 1521 rmp = NULL; 1522 if (c == '&') 1523 rmp = &rm[0]; 1524 else if (c == '\\') { 1525 if ('0' <= *xp && *xp <= '9') 1526 rmp = &rm[*xp++ - '0']; 1527 else if (*xp != '\0') 1528 c = *xp++; 1529 } 1530 1531 if (rmp == NULL) { /* Ordinary character. 
*/ 1532 *op++ = c; 1533 if (op >= end) 1534 OVERFLOW(1); 1535 } else if (rmp->rm_sp != NULL && rmp->rm_ep != NULL) { 1536 ip = rmp->rm_sp; 1537 if (op + (i = rmp->rm_ep - rmp->rm_sp) >= end) 1538 OVERFLOW(i); 1539 while (i--) 1540 *op++ = *ip++; 1541 } 1542 } 1543 1544 ip = rm[0].rm_ep; 1545 if (*ip == '\0') /* If at end break */ 1546 break; 1547 else if (rm[0].rm_sp == rm[0].rm_ep) { 1548 /* If empty match copy next char */ 1549 *op++ = *ip++; 1550 if (op >= end) 1551 OVERFLOW(1); 1552 } 1553 flags = REG_NOTBOL; 1554 } 1555 1556 if (regerr != REG_OK && regerr != REG_NOMATCH) 1557 return (regerr); 1558 1559 /* Copy rest of text */ 1560 if (op + (i = wcslen(ip)) >= end) 1561 OVERFLOW(i); 1562 while (i--) 1563 *op++ = *ip++; 1564 *op++ = '\0'; 1565 1566 if ((*dstp = dst = (wchar_t *) realloc(odst = dst, 1567 sizeof (wchar_t) * (size_t)(op - dst))) == NULL) { 1568 nospace: 1569 free(odst); 1570 return (REG_ESPACE); 1571 } 1572 1573 *globp = oglob; 1574 1575 return ((oglob == 0) ? REG_NOMATCH : REG_OK); 1576 } 1577