1 /**************************************************************** 2 Copyright (C) Lucent Technologies 1997 3 All Rights Reserved 4 5 Permission to use, copy, modify, and distribute this software and 6 its documentation for any purpose and without fee is hereby 7 granted, provided that the above copyright notice appear in all 8 copies and that both that the copyright notice and this 9 permission notice and warranty disclaimer appear in supporting 10 documentation, and that the name Lucent Technologies or any of 11 its entities not be used in advertising or publicity pertaining 12 to distribution of the software without specific, written prior 13 permission. 14 15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY 18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF 22 THIS SOFTWARE. 23 ****************************************************************/ 24 25 #include <stdio.h> 26 #include <stdlib.h> 27 #include <string.h> 28 #include <ctype.h> 29 #include "awk.h" 30 #include "awkgram.tab.h" 31 32 extern YYSTYPE yylval; 33 extern bool infunc; 34 35 int lineno = 1; 36 int bracecnt = 0; 37 int brackcnt = 0; 38 int parencnt = 0; 39 40 typedef struct Keyword { 41 const char *word; 42 int sub; 43 int type; 44 } Keyword; 45 46 const Keyword keywords[] = { /* keep sorted: binary searched */ 47 { "BEGIN", XBEGIN, XBEGIN }, 48 { "END", XEND, XEND }, 49 { "NF", VARNF, VARNF }, 50 { "atan2", FATAN, BLTIN }, 51 { "break", BREAK, BREAK }, 52 { "close", CLOSE, CLOSE }, 53 { "continue", CONTINUE, CONTINUE }, 54 { "cos", FCOS, BLTIN }, 55 { "delete", DELETE, DELETE }, 56 { "do", DO, DO }, 57 { "else", ELSE, ELSE }, 58 { "exit", EXIT, EXIT }, 59 { "exp", FEXP, BLTIN }, 60 { "fflush", FFLUSH, BLTIN }, 61 { "for", FOR, FOR }, 62 { "func", FUNC, FUNC }, 63 { "function", FUNC, FUNC }, 64 { "getline", GETLINE, GETLINE }, 65 { "gsub", GSUB, GSUB }, 66 { "if", IF, IF }, 67 { "in", IN, IN }, 68 { "index", INDEX, INDEX }, 69 { "int", FINT, BLTIN }, 70 { "length", FLENGTH, BLTIN }, 71 { "log", FLOG, BLTIN }, 72 { "match", MATCHFCN, MATCHFCN }, 73 { "next", NEXT, NEXT }, 74 { "nextfile", NEXTFILE, NEXTFILE }, 75 { "print", PRINT, PRINT }, 76 { "printf", PRINTF, PRINTF }, 77 { "rand", FRAND, BLTIN }, 78 { "return", RETURN, RETURN }, 79 { "sin", FSIN, BLTIN }, 80 { "split", SPLIT, SPLIT }, 81 { "sprintf", SPRINTF, SPRINTF }, 82 { "sqrt", FSQRT, BLTIN }, 83 { "srand", FSRAND, BLTIN }, 84 { "sub", SUB, SUB }, 85 { "substr", SUBSTR, SUBSTR }, 86 { "system", FSYSTEM, BLTIN }, 87 { "tolower", FTOLOWER, BLTIN }, 88 { "toupper", FTOUPPER, BLTIN }, 89 { "while", WHILE, WHILE }, 90 }; 91 92 #define RET(x) { if(dbg)printf("lex %s\n", tokname(x)); return(x); } 93 94 static int peek(void) 95 { 96 int c = input(); 97 unput(c); 98 return c; 99 } 100 101 static int gettok(char **pbuf, int *psz) /* get next input token */ 102 { 103 int c, retc; 104 char *buf = *pbuf; 105 int sz = *psz; 106 char *bp = buf; 107 108 c = input(); 109 if (c == 0) 110 return 0; 111 buf[0] = c; 112 buf[1] = 0; 113 if (!isalnum(c) && c != '.' && c != '_') 114 return c; 115 116 *bp++ = c; 117 if (isalpha(c) || c == '_') { /* it's a varname */ 118 for ( ; (c = input()) != 0; ) { 119 if (bp-buf >= sz) 120 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok")) 121 FATAL( "out of space for name %.10s...", buf ); 122 if (isalnum(c) || c == '_') 123 *bp++ = c; 124 else { 125 *bp = 0; 126 unput(c); 127 break; 128 } 129 } 130 *bp = 0; 131 retc = 'a'; /* alphanumeric */ 132 } else { /* maybe it's a number, but could be . */ 133 char *rem; 134 /* read input until can't be a number */ 135 for ( ; (c = input()) != 0; ) { 136 if (bp-buf >= sz) 137 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok")) 138 FATAL( "out of space for number %.10s...", buf ); 139 if (isdigit(c) || c == 'e' || c == 'E' 140 || c == '.' || c == '+' || c == '-') 141 *bp++ = c; 142 else { 143 unput(c); 144 break; 145 } 146 } 147 *bp = 0; 148 strtod(buf, &rem); /* parse the number */ 149 if (rem == buf) { /* it wasn't a valid number at all */ 150 buf[1] = 0; /* return one character as token */ 151 retc = (uschar)buf[0]; /* character is its own type */ 152 unputstr(rem+1); /* put rest back for later */ 153 } else { /* some prefix was a number */ 154 unputstr(rem); /* put rest back for later */ 155 rem[0] = 0; /* truncate buf after number part */ 156 retc = '0'; /* type is number */ 157 } 158 } 159 *pbuf = buf; 160 *psz = sz; 161 return retc; 162 } 163 164 int word(char *); 165 int string(void); 166 int regexpr(void); 167 bool sc = false; /* true => return a } right now */ 168 bool reg = false; /* true => return a REGEXPR now */ 169 170 int yylex(void) 171 { 172 int c; 173 static char *buf = NULL; 174 static int bufsize = 5; /* BUG: setting this small causes core dump! */ 175 176 if (buf == NULL && (buf = (char *) malloc(bufsize)) == NULL) 177 FATAL( "out of space in yylex" ); 178 if (sc) { 179 sc = false; 180 RET('}'); 181 } 182 if (reg) { 183 reg = false; 184 return regexpr(); 185 } 186 for (;;) { 187 c = gettok(&buf, &bufsize); 188 if (c == 0) 189 return 0; 190 if (isalpha(c) || c == '_') 191 return word(buf); 192 if (isdigit(c)) { 193 char *cp = tostring(buf); 194 double result; 195 196 if (is_number(cp, & result)) 197 yylval.cp = setsymtab(buf, cp, result, CON|NUM, symtab); 198 else 199 yylval.cp = setsymtab(buf, cp, 0.0, STR, symtab); 200 free(cp); 201 /* should this also have STR set? */ 202 RET(NUMBER); 203 } 204 205 yylval.i = c; 206 switch (c) { 207 case '\n': /* {EOL} */ 208 lineno++; 209 RET(NL); 210 case '\r': /* assume \n is coming */ 211 case ' ': /* {WS}+ */ 212 case '\t': 213 break; 214 case '#': /* #.* strip comments */ 215 while ((c = input()) != '\n' && c != 0) 216 ; 217 unput(c); 218 /* 219 * Next line is a hack, itcompensates for 220 * unput's treatment of \n. 221 */ 222 lineno++; 223 break; 224 case ';': 225 RET(';'); 226 case '\\': 227 if (peek() == '\n') { 228 input(); 229 lineno++; 230 } else if (peek() == '\r') { 231 input(); input(); /* \n */ 232 lineno++; 233 } else { 234 RET(c); 235 } 236 break; 237 case '&': 238 if (peek() == '&') { 239 input(); RET(AND); 240 } else 241 RET('&'); 242 case '|': 243 if (peek() == '|') { 244 input(); RET(BOR); 245 } else 246 RET('|'); 247 case '!': 248 if (peek() == '=') { 249 input(); yylval.i = NE; RET(NE); 250 } else if (peek() == '~') { 251 input(); yylval.i = NOTMATCH; RET(MATCHOP); 252 } else 253 RET(NOT); 254 case '~': 255 yylval.i = MATCH; 256 RET(MATCHOP); 257 case '<': 258 if (peek() == '=') { 259 input(); yylval.i = LE; RET(LE); 260 } else { 261 yylval.i = LT; RET(LT); 262 } 263 case '=': 264 if (peek() == '=') { 265 input(); yylval.i = EQ; RET(EQ); 266 } else { 267 yylval.i = ASSIGN; RET(ASGNOP); 268 } 269 case '>': 270 if (peek() == '=') { 271 input(); yylval.i = GE; RET(GE); 272 } else if (peek() == '>') { 273 input(); yylval.i = APPEND; RET(APPEND); 274 } else { 275 yylval.i = GT; RET(GT); 276 } 277 case '+': 278 if (peek() == '+') { 279 input(); yylval.i = INCR; RET(INCR); 280 } else if (peek() == '=') { 281 input(); yylval.i = ADDEQ; RET(ASGNOP); 282 } else 283 RET('+'); 284 case '-': 285 if (peek() == '-') { 286 input(); yylval.i = DECR; RET(DECR); 287 } else if (peek() == '=') { 288 input(); yylval.i = SUBEQ; RET(ASGNOP); 289 } else 290 RET('-'); 291 case '*': 292 if (peek() == '=') { /* *= */ 293 input(); yylval.i = MULTEQ; RET(ASGNOP); 294 } else if (peek() == '*') { /* ** or **= */ 295 input(); /* eat 2nd * */ 296 if (peek() == '=') { 297 input(); yylval.i = POWEQ; RET(ASGNOP); 298 } else { 299 RET(POWER); 300 } 301 } else 302 RET('*'); 303 case '/': 304 RET('/'); 305 case '%': 306 if (peek() == '=') { 307 input(); yylval.i = MODEQ; RET(ASGNOP); 308 } else 309 RET('%'); 310 case '^': 311 if (peek() == '=') { 312 input(); yylval.i = POWEQ; RET(ASGNOP); 313 } else 314 RET(POWER); 315 316 case '$': 317 /* BUG: awkward, if not wrong */ 318 c = gettok(&buf, &bufsize); 319 if (isalpha(c)) { 320 if (strcmp(buf, "NF") == 0) { /* very special */ 321 unputstr("(NF)"); 322 RET(INDIRECT); 323 } 324 c = peek(); 325 if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) { 326 unputstr(buf); 327 RET(INDIRECT); 328 } 329 yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab); 330 RET(IVAR); 331 } else if (c == 0) { /* */ 332 SYNTAX( "unexpected end of input after $" ); 333 RET(';'); 334 } else { 335 unputstr(buf); 336 RET(INDIRECT); 337 } 338 339 case '}': 340 if (--bracecnt < 0) 341 SYNTAX( "extra }" ); 342 sc = true; 343 RET(';'); 344 case ']': 345 if (--brackcnt < 0) 346 SYNTAX( "extra ]" ); 347 RET(']'); 348 case ')': 349 if (--parencnt < 0) 350 SYNTAX( "extra )" ); 351 RET(')'); 352 case '{': 353 bracecnt++; 354 RET('{'); 355 case '[': 356 brackcnt++; 357 RET('['); 358 case '(': 359 parencnt++; 360 RET('('); 361 362 case '"': 363 return string(); /* BUG: should be like tran.c ? */ 364 365 default: 366 RET(c); 367 } 368 } 369 } 370 371 extern int runetochar(char *str, int c); 372 373 int string(void) 374 { 375 int c, n; 376 char *s, *bp; 377 static char *buf = NULL; 378 static int bufsz = 500; 379 380 if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL) 381 FATAL("out of space for strings"); 382 for (bp = buf; (c = input()) != '"'; ) { 383 if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string")) 384 FATAL("out of space for string %.10s...", buf); 385 switch (c) { 386 case '\n': 387 case '\r': 388 case 0: 389 *bp = '\0'; 390 SYNTAX( "non-terminated string %.10s...", buf ); 391 if (c == 0) /* hopeless */ 392 FATAL( "giving up" ); 393 lineno++; 394 break; 395 case '\\': 396 c = input(); 397 switch (c) { 398 case '\n': break; 399 case '"': *bp++ = '"'; break; 400 case 'n': *bp++ = '\n'; break; 401 case 't': *bp++ = '\t'; break; 402 case 'f': *bp++ = '\f'; break; 403 case 'r': *bp++ = '\r'; break; 404 case 'b': *bp++ = '\b'; break; 405 case 'v': *bp++ = '\v'; break; 406 case 'a': *bp++ = '\a'; break; 407 case '\\': *bp++ = '\\'; break; 408 409 case '0': case '1': case '2': /* octal: \d \dd \ddd */ 410 case '3': case '4': case '5': case '6': case '7': 411 n = c - '0'; 412 if ((c = peek()) >= '0' && c < '8') { 413 n = 8 * n + input() - '0'; 414 if ((c = peek()) >= '0' && c < '8') 415 n = 8 * n + input() - '0'; 416 } 417 *bp++ = n; 418 break; 419 420 case 'x': /* hex \x0-9a-fA-F (exactly two) */ 421 { 422 int i; 423 424 if (!isxdigit(peek())) { 425 unput(c); 426 break; 427 } 428 n = 0; 429 for (i = 0; i < 2; i++) { 430 c = input(); 431 if (c == 0) 432 break; 433 if (isxdigit(c)) { 434 c = tolower(c); 435 n *= 16; 436 if (isdigit(c)) 437 n += (c - '0'); 438 else 439 n += 10 + (c - 'a'); 440 } else { 441 unput(c); 442 break; 443 } 444 } 445 if (i) 446 *bp++ = n; 447 break; 448 } 449 450 case 'u': /* utf \u0-9a-fA-F (1..8) */ 451 { 452 int i; 453 454 n = 0; 455 for (i = 0; i < 8; i++) { 456 c = input(); 457 if (!isxdigit(c) || c == 0) 458 break; 459 c = tolower(c); 460 n *= 16; 461 if (isdigit(c)) 462 n += (c - '0'); 463 else 464 n += 10 + (c - 'a'); 465 } 466 unput(c); 467 bp += runetochar(bp, n); 468 break; 469 } 470 471 default: 472 *bp++ = c; 473 break; 474 } 475 break; 476 default: 477 *bp++ = c; 478 break; 479 } 480 } 481 *bp = 0; 482 s = tostring(buf); 483 *bp++ = ' '; *bp++ = '\0'; 484 yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab); 485 free(s); 486 RET(STRING); 487 } 488 489 490 static int binsearch(char *w, const Keyword *kp, int n) 491 { 492 int cond, low, mid, high; 493 494 low = 0; 495 high = n - 1; 496 while (low <= high) { 497 mid = (low + high) / 2; 498 if ((cond = strcmp(w, kp[mid].word)) < 0) 499 high = mid - 1; 500 else if (cond > 0) 501 low = mid + 1; 502 else 503 return mid; 504 } 505 return -1; 506 } 507 508 int word(char *w) 509 { 510 const Keyword *kp; 511 int c, n; 512 513 n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0])); 514 if (n != -1) { /* found in table */ 515 kp = keywords + n; 516 yylval.i = kp->sub; 517 switch (kp->type) { /* special handling */ 518 case BLTIN: 519 if (kp->sub == FSYSTEM && safe) 520 SYNTAX( "system is unsafe" ); 521 RET(kp->type); 522 case FUNC: 523 if (infunc) 524 SYNTAX( "illegal nested function" ); 525 RET(kp->type); 526 case RETURN: 527 if (!infunc) 528 SYNTAX( "return not in function" ); 529 RET(kp->type); 530 case VARNF: 531 yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab); 532 RET(VARNF); 533 default: 534 RET(kp->type); 535 } 536 } 537 c = peek(); /* look for '(' */ 538 if (c != '(' && infunc && (n=isarg(w)) >= 0) { 539 yylval.i = n; 540 RET(ARG); 541 } else { 542 yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab); 543 if (c == '(') { 544 RET(CALL); 545 } else { 546 RET(VAR); 547 } 548 } 549 } 550 551 void startreg(void) /* next call to yylex will return a regular expression */ 552 { 553 reg = true; 554 } 555 556 int regexpr(void) 557 { 558 int c; 559 static char *buf = NULL; 560 static int bufsz = 500; 561 char *bp; 562 563 if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL) 564 FATAL("out of space for reg expr"); 565 bp = buf; 566 for ( ; (c = input()) != '/' && c != 0; ) { 567 if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr")) 568 FATAL("out of space for reg expr %.10s...", buf); 569 if (c == '\n') { 570 *bp = '\0'; 571 SYNTAX( "newline in regular expression %.10s...", buf ); 572 unput('\n'); 573 break; 574 } else if (c == '\\') { 575 *bp++ = '\\'; 576 *bp++ = input(); 577 } else { 578 *bp++ = c; 579 } 580 } 581 *bp = 0; 582 if (c == 0) 583 SYNTAX("non-terminated regular expression %.10s...", buf); 584 yylval.s = tostring(buf); 585 unput('/'); 586 RET(REGEXPR); 587 } 588 589 /* low-level lexical stuff, sort of inherited from lex */ 590 591 char ebuf[300]; 592 char *ep = ebuf; 593 char yysbuf[100]; /* pushback buffer */ 594 char *yysptr = yysbuf; 595 FILE *yyin = NULL; 596 597 int input(void) /* get next lexical input character */ 598 { 599 int c; 600 extern char *lexprog; 601 602 if (yysptr > yysbuf) 603 c = (uschar)*--yysptr; 604 else if (lexprog != NULL) { /* awk '...' */ 605 if ((c = (uschar)*lexprog) != 0) 606 lexprog++; 607 } else /* awk -f ... */ 608 c = pgetc(); 609 if (c == EOF) 610 c = 0; 611 if (ep >= ebuf + sizeof ebuf) 612 ep = ebuf; 613 *ep = c; 614 if (c != 0) { 615 ep++; 616 } 617 return (c); 618 } 619 620 void unput(int c) /* put lexical character back on input */ 621 { 622 if (c == '\n') 623 lineno--; 624 if (yysptr >= yysbuf + sizeof(yysbuf)) 625 FATAL("pushed back too much: %.20s...", yysbuf); 626 *yysptr++ = c; 627 if (--ep < ebuf) 628 ep = ebuf + sizeof(ebuf) - 1; 629 } 630 631 void unputstr(const char *s) /* put a string back on input */ 632 { 633 int i; 634 635 for (i = strlen(s)-1; i >= 0; i--) 636 unput(s[i]); 637 } 638