1 /* 2 * Copyright (C) Lucent Technologies 1997 3 * All Rights Reserved 4 * 5 * Permission to use, copy, modify, and distribute this software and 6 * its documentation for any purpose and without fee is hereby 7 * granted, provided that the above copyright notice appear in all 8 * copies and that both that the copyright notice and this 9 * permission notice and warranty disclaimer appear in supporting 10 * documentation, and that the name Lucent Technologies or any of 11 * its entities not be used in advertising or publicity pertaining 12 * to distribution of the software without specific, written prior 13 * permission. 14 * 15 * LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 16 * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 17 * IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY 18 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 19 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 20 * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 21 * ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF 22 * THIS SOFTWARE. 23 */ 24 25 #include <stdio.h> 26 #include <stdlib.h> 27 #include <string.h> 28 #include <ctype.h> 29 #include "awk.h" 30 #include "y.tab.h" 31 32 extern YYSTYPE yylval; 33 extern int infunc; 34 35 off_t lineno = 1; 36 int bracecnt = 0; 37 int brackcnt = 0; 38 int parencnt = 0; 39 40 typedef struct Keyword { 41 const char *word; 42 int sub; 43 int type; 44 } Keyword; 45 46 Keyword keywords[] = { /* keep sorted: binary searched */ 47 { "BEGIN", XBEGIN, XBEGIN }, 48 { "END", XEND, XEND }, 49 { "NF", VARNF, VARNF }, 50 { "atan2", FATAN, BLTIN }, 51 { "break", BREAK, BREAK }, 52 { "close", CLOSE, CLOSE }, 53 { "continue", CONTINUE, CONTINUE }, 54 { "cos", FCOS, BLTIN }, 55 { "delete", DELETE, DELETE }, 56 { "do", DO, DO }, 57 { "else", ELSE, ELSE }, 58 { "exit", EXIT, EXIT }, 59 { "exp", FEXP, BLTIN }, 60 { "fflush", FFLUSH, BLTIN }, 61 { "for", FOR, FOR }, 62 { "func", FUNC, FUNC }, 63 { "function", FUNC, FUNC }, 64 { "getline", GETLINE, GETLINE }, 65 { "gsub", GSUB, GSUB }, 66 { "if", IF, IF }, 67 { "in", IN, IN }, 68 { "index", INDEX, INDEX }, 69 { "int", FINT, BLTIN }, 70 { "length", FLENGTH, BLTIN }, 71 { "log", FLOG, BLTIN }, 72 { "match", MATCHFCN, MATCHFCN }, 73 { "next", NEXT, NEXT }, 74 { "nextfile", NEXTFILE, NEXTFILE }, 75 { "print", PRINT, PRINT }, 76 { "printf", PRINTF, PRINTF }, 77 { "rand", FRAND, BLTIN }, 78 { "return", RETURN, RETURN }, 79 { "sin", FSIN, BLTIN }, 80 { "split", SPLIT, SPLIT }, 81 { "sprintf", SPRINTF, SPRINTF }, 82 { "sqrt", FSQRT, BLTIN }, 83 { "srand", FSRAND, BLTIN }, 84 { "sub", SUB, SUB }, 85 { "substr", SUBSTR, SUBSTR }, 86 { "system", FSYSTEM, BLTIN }, 87 { "tolower", FTOLOWER, BLTIN }, 88 { "toupper", FTOUPPER, BLTIN }, 89 { "while", WHILE, WHILE }, 90 }; 91 92 #define RET(x) { if (dbg) (void) printf("lex %s\n", tokname(x)); return (x); } 93 94 int 95 peek(void) 96 { 97 int c = input(); 98 unput(c); 99 return (c); 100 } 101 102 int 103 gettok(char **pbuf, size_t *psz) /* get next input token */ 104 { 105 int c, retc; 106 char *buf = *pbuf; 107 size_t sz = *psz; 108 char *bp = buf; 109 110 c = input(); 111 if (c == 0) 112 return (0); 113 buf[0] = c; 114 buf[1] = 0; 115 if (!isalnum(c) && c != '.' && c != '_') 116 return (c); 117 118 *bp++ = c; 119 if (isalpha(c) || c == '_') { /* it's a varname */ 120 for (; (c = input()) != 0; ) { 121 if (bp-buf >= sz && 122 !adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok")) 123 FATAL("out of space for name %.10s...", buf); 124 if (isalnum(c) || c == '_') 125 *bp++ = c; 126 else { 127 *bp = 0; 128 unput(c); 129 break; 130 } 131 } 132 *bp = 0; 133 retc = 'a'; /* alphanumeric */ 134 } else { /* maybe it's a number, but could be . */ 135 char *rem; 136 /* read input until can't be a number */ 137 for (; (c = input()) != 0; ) { 138 if (bp-buf >= sz && 139 !adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok")) 140 FATAL("out of space for number %.10s...", buf); 141 if (isdigit(c) || c == 'e' || c == 'E' || 142 c == '.' || c == '+' || c == '-') 143 *bp++ = c; 144 else { 145 unput(c); 146 break; 147 } 148 } 149 *bp = 0; 150 (void) strtod(buf, &rem); /* parse the number */ 151 if (rem == buf) { /* it wasn't a valid number at all */ 152 buf[1] = 0; /* return one character as token */ 153 retc = buf[0]; /* character is its own type */ 154 unputstr(rem+1); /* put rest back for later */ 155 } else { /* some prefix was a number */ 156 unputstr(rem); /* put rest back for later */ 157 rem[0] = 0; /* truncate buf after number part */ 158 retc = '0'; /* type is number */ 159 } 160 } 161 *pbuf = buf; 162 *psz = sz; 163 return (retc); 164 } 165 166 int word(char *); 167 int string(void); 168 int regexpr(void); 169 int sc = 0; /* 1 => return a } right now */ 170 int reg = 0; /* 1 => return a REGEXPR now */ 171 172 int 173 yylex(void) 174 { 175 int c; 176 static char *buf = NULL; 177 /* BUG: setting this small causes core dump! */ 178 static size_t bufsize = 5; 179 180 if (buf == NULL && (buf = (char *)malloc(bufsize)) == NULL) 181 FATAL("out of space in yylex"); 182 if (sc) { 183 sc = 0; 184 RET('}'); 185 } 186 if (reg) { 187 reg = 0; 188 return (regexpr()); 189 } 190 for (;;) { 191 c = gettok(&buf, &bufsize); 192 if (c == 0) 193 return (0); 194 if (isalpha(c) || c == '_') 195 return (word(buf)); 196 if (isdigit(c)) { 197 yylval.cp = setsymtab( 198 buf, tostring(buf), atof(buf), CON|NUM, symtab); 199 /* should this also have STR set? */ 200 RET(NUMBER); 201 } 202 203 yylval.i = c; 204 switch (c) { 205 case '\n': /* {EOL} */ 206 lineno++; 207 RET(NL); 208 case '\r': /* assume \n is coming */ 209 case ' ': /* {WS}+ */ 210 case '\t': 211 break; 212 case '#': /* #.* strip comments */ 213 while ((c = input()) != '\n' && c != 0) 214 ; 215 unput(c); 216 break; 217 case ';': 218 RET(';'); 219 case '\\': 220 if (peek() == '\n') { 221 (void) input(); 222 lineno++; 223 } else if (peek() == '\r') { 224 (void) input(); 225 (void) input(); /* BUG: check for \n */ 226 lineno++; 227 } else { 228 RET(c); 229 } 230 break; 231 case '&': 232 if (peek() == '&') { 233 (void) input(); 234 RET(AND); 235 } else 236 RET('&'); 237 case '|': 238 if (peek() == '|') { 239 (void) input(); 240 RET(BOR); 241 } else 242 RET('|'); 243 case '!': 244 if (peek() == '=') { 245 (void) input(); 246 yylval.i = NE; 247 RET(NE); 248 } else if (peek() == '~') { 249 (void) input(); 250 yylval.i = NOTMATCH; 251 RET(MATCHOP); 252 } else 253 RET(NOT); 254 case '~': 255 yylval.i = MATCH; 256 RET(MATCHOP); 257 case '<': 258 if (peek() == '=') { 259 (void) input(); 260 yylval.i = LE; 261 RET(LE); 262 } else { 263 yylval.i = LT; 264 RET(LT); 265 } 266 case '=': 267 if (peek() == '=') { 268 (void) input(); 269 yylval.i = EQ; 270 RET(EQ); 271 } else { 272 yylval.i = ASSIGN; 273 RET(ASGNOP); 274 } 275 case '>': 276 if (peek() == '=') { 277 (void) input(); 278 yylval.i = GE; 279 RET(GE); 280 } else if (peek() == '>') { 281 (void) input(); 282 yylval.i = APPEND; 283 RET(APPEND); 284 } else { 285 yylval.i = GT; 286 RET(GT); 287 } 288 case '+': 289 if (peek() == '+') { 290 (void) input(); 291 yylval.i = INCR; 292 RET(INCR); 293 } else if (peek() == '=') { 294 (void) input(); 295 yylval.i = ADDEQ; 296 RET(ASGNOP); 297 } else 298 RET('+'); 299 case '-': 300 if (peek() == '-') { 301 (void) input(); 302 yylval.i = DECR; 303 RET(DECR); 304 } else if (peek() == '=') { 305 (void) input(); 306 yylval.i = SUBEQ; 307 RET(ASGNOP); 308 } else 309 RET('-'); 310 case '*': 311 if (peek() == '=') { /* *= */ 312 (void) input(); 313 yylval.i = MULTEQ; 314 RET(ASGNOP); 315 } else if (peek() == '*') { /* ** or **= */ 316 (void) input(); /* eat 2nd * */ 317 if (peek() == '=') { 318 (void) input(); 319 yylval.i = POWEQ; 320 RET(ASGNOP); 321 } else { 322 RET(POWER); 323 } 324 } else 325 RET('*'); 326 case '/': 327 RET('/'); 328 case '%': 329 if (peek() == '=') { 330 (void) input(); 331 yylval.i = MODEQ; 332 RET(ASGNOP); 333 } else 334 RET('%'); 335 case '^': 336 if (peek() == '=') { 337 (void) input(); 338 yylval.i = POWEQ; 339 RET(ASGNOP); 340 } else 341 RET(POWER); 342 343 case '$': 344 /* BUG: awkward, if not wrong */ 345 c = gettok(&buf, &bufsize); 346 if (isalpha(c)) { 347 if (strcmp(buf, "NF") == 0) { 348 /* very special */ 349 unputstr("(NF)"); 350 RET(INDIRECT); 351 } 352 c = peek(); 353 if (c == '(' || c == '[' || 354 (infunc && isarg(buf) >= 0)) { 355 unputstr(buf); 356 RET(INDIRECT); 357 } 358 yylval.cp = setsymtab( 359 buf, "", 0.0, STR|NUM, symtab); 360 RET(IVAR); 361 } else if (c == 0) { /* */ 362 SYNTAX("unexpected end of input after $"); 363 RET(';'); 364 } else { 365 unputstr(buf); 366 RET(INDIRECT); 367 } 368 369 case '}': 370 if (--bracecnt < 0) 371 SYNTAX("extra }"); 372 sc = 1; 373 RET(';'); 374 case ']': 375 if (--brackcnt < 0) 376 SYNTAX("extra ]"); 377 RET(']'); 378 case ')': 379 if (--parencnt < 0) 380 SYNTAX("extra )"); 381 RET(')'); 382 case '{': 383 bracecnt++; 384 RET('{'); 385 case '[': 386 brackcnt++; 387 RET('['); 388 case '(': 389 parencnt++; 390 RET('('); 391 392 case '"': 393 /* BUG: should be like tran.c ? */ 394 return (string()); 395 396 default: 397 RET(c); 398 } 399 } 400 } 401 402 int 403 string(void) 404 { 405 int c, n; 406 char *s, *bp; 407 static char *buf = NULL; 408 static size_t bufsz = 500; 409 410 if (buf == NULL && (buf = (char *)malloc(bufsz)) == NULL) 411 FATAL("out of space for strings"); 412 for (bp = buf; (c = input()) != '"'; ) { 413 if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string")) 414 FATAL("out of space for string %.10s...", buf); 415 switch (c) { 416 case '\n': 417 case '\r': 418 case 0: 419 *bp = '\0'; 420 SYNTAX("non-terminated string %.10s...", buf); 421 if (c == 0) /* hopeless */ 422 FATAL("giving up"); 423 lineno++; 424 break; 425 case '\\': 426 c = input(); 427 switch (c) { 428 case '"': *bp++ = '"'; break; 429 case 'n': *bp++ = '\n'; break; 430 case 't': *bp++ = '\t'; break; 431 case 'f': *bp++ = '\f'; break; 432 case 'r': *bp++ = '\r'; break; 433 case 'b': *bp++ = '\b'; break; 434 case 'v': *bp++ = '\v'; break; 435 case 'a': *bp++ = '\007'; break; 436 case '\\': *bp++ = '\\'; break; 437 438 case '0': case '1': case '2': /* octal: \d \dd \ddd */ 439 case '3': case '4': case '5': case '6': case '7': 440 n = c - '0'; 441 if ((c = peek()) >= '0' && c < '8') { 442 n = 8 * n + input() - '0'; 443 if ((c = peek()) >= '0' && c < '8') 444 n = 8 * n + input() - '0'; 445 } 446 *bp++ = n; 447 break; 448 449 case 'x': { /* hex \x0-9a-fA-F + */ 450 char xbuf[100], *px; 451 px = xbuf; 452 while ((c = input()) != 0 && px-xbuf < 100-2) { 453 if (isdigit(c) || 454 (c >= 'a' && c <= 'f') || 455 (c >= 'A' && c <= 'F')) 456 *px++ = c; 457 else 458 break; 459 } 460 *px = 0; 461 unput(c); 462 (void) sscanf(xbuf, "%x", (unsigned int *)&n); 463 *bp++ = n; 464 break; 465 } 466 467 default: 468 *bp++ = c; 469 break; 470 } 471 break; 472 default: 473 *bp++ = c; 474 break; 475 } 476 } 477 *bp = 0; 478 s = tostring(buf); 479 *bp++ = ' '; *bp++ = 0; 480 yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab); 481 RET(STRING); 482 } 483 484 485 int 486 binsearch(char *w, Keyword *kp, int n) 487 { 488 int cond, low, mid, high; 489 490 low = 0; 491 high = n - 1; 492 while (low <= high) { 493 mid = (low + high) / 2; 494 if ((cond = strcmp(w, kp[mid].word)) < 0) 495 high = mid - 1; 496 else if (cond > 0) 497 low = mid + 1; 498 else 499 return (mid); 500 } 501 return (-1); 502 } 503 504 int 505 word(char *w) 506 { 507 Keyword *kp; 508 int c, n; 509 510 n = binsearch(w, keywords, sizeof (keywords) / sizeof (keywords[0])); 511 if (n != -1) { /* found in table */ 512 kp = keywords + n; 513 yylval.i = kp->sub; 514 switch (kp->type) { /* special handling */ 515 case BLTIN: 516 if (kp->sub == FSYSTEM && safe) 517 SYNTAX("system is unsafe"); 518 RET(kp->type); 519 case FUNC: 520 if (infunc) 521 SYNTAX("illegal nested function"); 522 RET(kp->type); 523 case RETURN: 524 if (!infunc) 525 SYNTAX("return not in function"); 526 RET(kp->type); 527 case VARNF: 528 yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab); 529 RET(VARNF); 530 default: 531 RET(kp->type); 532 } 533 } 534 c = peek(); /* look for '(' */ 535 if (c != '(' && infunc && (n = isarg(w)) >= 0) { 536 yylval.i = n; 537 RET(ARG); 538 } else { 539 yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab); 540 if (c == '(') { 541 RET(CALL); 542 } else { 543 RET(VAR); 544 } 545 } 546 } 547 548 void 549 startreg(void) /* next call to yylex will return a regular expression */ 550 { 551 reg = 1; 552 } 553 554 int 555 regexpr(void) 556 { 557 int c; 558 static char *buf = NULL; 559 static size_t bufsz = 500; 560 char *bp; 561 562 if (buf == NULL && (buf = (char *)malloc(bufsz)) == NULL) 563 FATAL("out of space for rex expr"); 564 bp = buf; 565 for (; (c = input()) != '/' && c != 0; ) { 566 if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr")) 567 FATAL("out of space for reg expr %.10s...", buf); 568 if (c == '\n') { 569 *bp = '\0'; 570 SYNTAX("newline in regular expression %.10s...", buf); 571 unput('\n'); 572 break; 573 } else if (c == '\\') { 574 *bp++ = '\\'; 575 *bp++ = input(); 576 } else { 577 *bp++ = c; 578 } 579 } 580 *bp = 0; 581 if (c == 0) 582 SYNTAX("non-terminated regular expression %.10s...", buf); 583 yylval.s = tostring(buf); 584 unput('/'); 585 RET(REGEXPR); 586 } 587 588 /* low-level lexical stuff, sort of inherited from lex */ 589 590 char ebuf[300]; 591 char *ep = ebuf; 592 char yysbuf[100]; /* pushback buffer */ 593 char *yysptr = yysbuf; 594 FILE *yyin = NULL; 595 596 int 597 input(void) /* get next lexical input character */ 598 { 599 int c; 600 extern char *lexprog; 601 602 if (yysptr > yysbuf) 603 c = (uschar)*--yysptr; 604 else if (lexprog != NULL) { /* awk '...' */ 605 if ((c = (uschar)*lexprog) != 0) 606 lexprog++; 607 } else /* awk -f ... */ 608 c = pgetc(); 609 if (c == EOF) 610 c = 0; 611 if (ep >= ebuf + sizeof (ebuf)) 612 ep = ebuf; 613 *ep = c; 614 if (c != 0) { 615 ep++; 616 } 617 return (c); 618 } 619 620 void 621 unput(int c) /* put lexical character back on input */ 622 { 623 if (yysptr >= yysbuf + sizeof (yysbuf)) 624 FATAL("pushed back too much: %.20s...", yysbuf); 625 *yysptr++ = c; 626 if (--ep < ebuf) 627 ep = ebuf + sizeof (ebuf) - 1; 628 } 629 630 void 631 unputstr(const char *s) /* put a string back on input */ 632 { 633 int i; 634 635 for (i = strlen(s)-1; i >= 0; i--) 636 unput(s[i]); 637 } 638