1 /**************************************************************** 2 Copyright (C) Lucent Technologies 1997 3 All Rights Reserved 4 5 Permission to use, copy, modify, and distribute this software and 6 its documentation for any purpose and without fee is hereby 7 granted, provided that the above copyright notice appear in all 8 copies and that both that the copyright notice and this 9 permission notice and warranty disclaimer appear in supporting 10 documentation, and that the name Lucent Technologies or any of 11 its entities not be used in advertising or publicity pertaining 12 to distribution of the software without specific, written prior 13 permission. 14 15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY 18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF 22 THIS SOFTWARE. 23 ****************************************************************/ 24 25 #include <stdio.h> 26 #include <stdlib.h> 27 #include <string.h> 28 #include <ctype.h> 29 #include "awk.h" 30 #include "awkgram.tab.h" 31 32 extern YYSTYPE yylval; 33 extern bool infunc; 34 35 int lineno = 1; 36 int bracecnt = 0; 37 int brackcnt = 0; 38 int parencnt = 0; 39 40 typedef struct Keyword { 41 const char *word; 42 int sub; 43 int type; 44 } Keyword; 45 46 const Keyword keywords[] = { /* keep sorted: binary searched */ 47 { "BEGIN", XBEGIN, XBEGIN }, 48 { "END", XEND, XEND }, 49 { "NF", VARNF, VARNF }, 50 { "and", FAND, BLTIN }, 51 { "atan2", FATAN, BLTIN }, 52 { "break", BREAK, BREAK }, 53 { "close", CLOSE, CLOSE }, 54 { "compl", FCOMPL, BLTIN }, 55 { "continue", CONTINUE, CONTINUE }, 56 { "cos", FCOS, BLTIN }, 57 { "delete", DELETE, DELETE }, 58 { "do", DO, DO }, 59 { "else", ELSE, ELSE }, 60 { "exit", EXIT, EXIT }, 61 { "exp", FEXP, BLTIN }, 62 { "fflush", FFLUSH, BLTIN }, 63 { "for", FOR, FOR }, 64 { "func", FUNC, FUNC }, 65 { "function", FUNC, FUNC }, 66 { "gensub", GENSUB, GENSUB }, 67 { "getline", GETLINE, GETLINE }, 68 { "gsub", GSUB, GSUB }, 69 { "if", IF, IF }, 70 { "in", IN, IN }, 71 { "index", INDEX, INDEX }, 72 { "int", FINT, BLTIN }, 73 { "length", FLENGTH, BLTIN }, 74 { "log", FLOG, BLTIN }, 75 { "lshift", FLSHIFT, BLTIN }, 76 { "match", MATCHFCN, MATCHFCN }, 77 { "next", NEXT, NEXT }, 78 { "nextfile", NEXTFILE, NEXTFILE }, 79 { "or", FFOR, BLTIN }, 80 { "print", PRINT, PRINT }, 81 { "printf", PRINTF, PRINTF }, 82 { "rand", FRAND, BLTIN }, 83 { "return", RETURN, RETURN }, 84 { "rshift", FRSHIFT, BLTIN }, 85 { "sin", FSIN, BLTIN }, 86 { "split", SPLIT, SPLIT }, 87 { "sprintf", SPRINTF, SPRINTF }, 88 { "sqrt", FSQRT, BLTIN }, 89 { "srand", FSRAND, BLTIN }, 90 { "strftime", FSTRFTIME, BLTIN }, 91 { "sub", SUB, SUB }, 92 { "substr", SUBSTR, SUBSTR }, 93 { "system", FSYSTEM, BLTIN }, 94 { "systime", FSYSTIME, BLTIN }, 95 { "tolower", FTOLOWER, BLTIN }, 96 { "toupper", FTOUPPER, BLTIN }, 97 { "while", WHILE, WHILE }, 98 { "xor", FXOR, BLTIN }, 99 }; 100 101 #define RET(x) { if(dbg)printf("lex %s\n", tokname(x)); return(x); } 102 103 static int peek(void) 104 { 105 int c = input(); 106 unput(c); 107 return c; 108 } 109 110 static int gettok(char **pbuf, int *psz) /* get next input token */ 111 { 112 int c, retc; 113 char *buf = *pbuf; 114 int sz = *psz; 115 char *bp = buf; 116 117 c = input(); 118 if (c == 0) 119 return 0; 120 buf[0] = c; 121 buf[1] = 0; 122 if (!isalnum(c) && c != '.' && c != '_') 123 return c; 124 125 *bp++ = c; 126 if (isalpha(c) || c == '_') { /* it's a varname */ 127 for ( ; (c = input()) != 0; ) { 128 if (bp-buf >= sz) 129 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok")) 130 FATAL( "out of space for name %.10s...", buf ); 131 if (isalnum(c) || c == '_') 132 *bp++ = c; 133 else { 134 *bp = 0; 135 unput(c); 136 break; 137 } 138 } 139 *bp = 0; 140 retc = 'a'; /* alphanumeric */ 141 } else { /* maybe it's a number, but could be . */ 142 char *rem; 143 /* read input until can't be a number */ 144 for ( ; (c = input()) != 0; ) { 145 if (bp-buf >= sz) 146 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok")) 147 FATAL( "out of space for number %.10s...", buf ); 148 if (isdigit(c) || c == 'e' || c == 'E' 149 || c == '.' || c == '+' || c == '-') 150 *bp++ = c; 151 else { 152 unput(c); 153 break; 154 } 155 } 156 *bp = 0; 157 strtod(buf, &rem); /* parse the number */ 158 if (rem == buf) { /* it wasn't a valid number at all */ 159 buf[1] = 0; /* return one character as token */ 160 retc = (uschar)buf[0]; /* character is its own type */ 161 unputstr(rem+1); /* put rest back for later */ 162 } else { /* some prefix was a number */ 163 unputstr(rem); /* put rest back for later */ 164 rem[0] = 0; /* truncate buf after number part */ 165 retc = '0'; /* type is number */ 166 } 167 } 168 *pbuf = buf; 169 *psz = sz; 170 return retc; 171 } 172 173 int word(char *); 174 int string(void); 175 int regexpr(void); 176 bool sc = false; /* true => return a } right now */ 177 bool reg = false; /* true => return a REGEXPR now */ 178 179 int yylex(void) 180 { 181 int c; 182 static char *buf = NULL; 183 static int bufsize = 5; /* BUG: setting this small causes core dump! */ 184 185 if (buf == NULL && (buf = (char *) malloc(bufsize)) == NULL) 186 FATAL( "out of space in yylex" ); 187 if (sc) { 188 sc = false; 189 RET('}'); 190 } 191 if (reg) { 192 reg = false; 193 return regexpr(); 194 } 195 for (;;) { 196 c = gettok(&buf, &bufsize); 197 if (c == 0) 198 return 0; 199 if (isalpha(c) || c == '_') 200 return word(buf); 201 if (isdigit(c)) { 202 char *cp = tostring(buf); 203 double result; 204 205 if (is_number(cp, & result)) 206 yylval.cp = setsymtab(buf, cp, result, CON|NUM, symtab); 207 else 208 yylval.cp = setsymtab(buf, cp, 0.0, STR, symtab); 209 free(cp); 210 /* should this also have STR set? */ 211 RET(NUMBER); 212 } 213 214 yylval.i = c; 215 switch (c) { 216 case '\n': /* {EOL} */ 217 lineno++; 218 RET(NL); 219 case '\r': /* assume \n is coming */ 220 case ' ': /* {WS}+ */ 221 case '\t': 222 break; 223 case '#': /* #.* strip comments */ 224 while ((c = input()) != '\n' && c != 0) 225 ; 226 unput(c); 227 /* 228 * Next line is a hack, it compensates for 229 * unput's treatment of \n. 230 */ 231 lineno++; 232 break; 233 case ';': 234 RET(';'); 235 case '\\': 236 if (peek() == '\n') { 237 input(); 238 lineno++; 239 } else if (peek() == '\r') { 240 input(); input(); /* \n */ 241 lineno++; 242 } else { 243 RET(c); 244 } 245 break; 246 case '&': 247 if (peek() == '&') { 248 input(); RET(AND); 249 } else 250 RET('&'); 251 case '|': 252 if (peek() == '|') { 253 input(); RET(BOR); 254 } else 255 RET('|'); 256 case '!': 257 if (peek() == '=') { 258 input(); yylval.i = NE; RET(NE); 259 } else if (peek() == '~') { 260 input(); yylval.i = NOTMATCH; RET(MATCHOP); 261 } else 262 RET(NOT); 263 case '~': 264 yylval.i = MATCH; 265 RET(MATCHOP); 266 case '<': 267 if (peek() == '=') { 268 input(); yylval.i = LE; RET(LE); 269 } else { 270 yylval.i = LT; RET(LT); 271 } 272 case '=': 273 if (peek() == '=') { 274 input(); yylval.i = EQ; RET(EQ); 275 } else { 276 yylval.i = ASSIGN; RET(ASGNOP); 277 } 278 case '>': 279 if (peek() == '=') { 280 input(); yylval.i = GE; RET(GE); 281 } else if (peek() == '>') { 282 input(); yylval.i = APPEND; RET(APPEND); 283 } else { 284 yylval.i = GT; RET(GT); 285 } 286 case '+': 287 if (peek() == '+') { 288 input(); yylval.i = INCR; RET(INCR); 289 } else if (peek() == '=') { 290 input(); yylval.i = ADDEQ; RET(ASGNOP); 291 } else 292 RET('+'); 293 case '-': 294 if (peek() == '-') { 295 input(); yylval.i = DECR; RET(DECR); 296 } else if (peek() == '=') { 297 input(); yylval.i = SUBEQ; RET(ASGNOP); 298 } else 299 RET('-'); 300 case '*': 301 if (peek() == '=') { /* *= */ 302 input(); yylval.i = MULTEQ; RET(ASGNOP); 303 } else if (peek() == '*') { /* ** or **= */ 304 input(); /* eat 2nd * */ 305 if (peek() == '=') { 306 input(); yylval.i = POWEQ; RET(ASGNOP); 307 } else { 308 RET(POWER); 309 } 310 } else 311 RET('*'); 312 case '/': 313 RET('/'); 314 case '%': 315 if (peek() == '=') { 316 input(); yylval.i = MODEQ; RET(ASGNOP); 317 } else 318 RET('%'); 319 case '^': 320 if (peek() == '=') { 321 input(); yylval.i = POWEQ; RET(ASGNOP); 322 } else 323 RET(POWER); 324 325 case '$': 326 /* BUG: awkward, if not wrong */ 327 c = gettok(&buf, &bufsize); 328 if (isalpha(c)) { 329 if (strcmp(buf, "NF") == 0) { /* very special */ 330 unputstr("(NF)"); 331 RET(INDIRECT); 332 } 333 c = peek(); 334 if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) { 335 unputstr(buf); 336 RET(INDIRECT); 337 } 338 yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab); 339 RET(IVAR); 340 } else if (c == 0) { /* */ 341 SYNTAX( "unexpected end of input after $" ); 342 RET(';'); 343 } else { 344 unputstr(buf); 345 RET(INDIRECT); 346 } 347 348 case '}': 349 if (--bracecnt < 0) 350 SYNTAX( "extra }" ); 351 sc = true; 352 RET(';'); 353 case ']': 354 if (--brackcnt < 0) 355 SYNTAX( "extra ]" ); 356 RET(']'); 357 case ')': 358 if (--parencnt < 0) 359 SYNTAX( "extra )" ); 360 RET(')'); 361 case '{': 362 bracecnt++; 363 RET('{'); 364 case '[': 365 brackcnt++; 366 RET('['); 367 case '(': 368 parencnt++; 369 RET('('); 370 371 case '"': 372 return string(); /* BUG: should be like tran.c ? */ 373 374 default: 375 RET(c); 376 } 377 } 378 } 379 380 extern int runetochar(char *str, int c); 381 382 int string(void) 383 { 384 int c, n; 385 char *s, *bp; 386 static char *buf = NULL; 387 static int bufsz = 500; 388 389 if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL) 390 FATAL("out of space for strings"); 391 for (bp = buf; (c = input()) != '"'; ) { 392 if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string")) 393 FATAL("out of space for string %.10s...", buf); 394 switch (c) { 395 case '\n': 396 case '\r': 397 case 0: 398 *bp = '\0'; 399 SYNTAX( "non-terminated string %.10s...", buf ); 400 if (c == 0) /* hopeless */ 401 FATAL( "giving up" ); 402 lineno++; 403 break; 404 case '\\': 405 c = input(); 406 switch (c) { 407 case '\n': break; 408 case '"': *bp++ = '"'; break; 409 case 'n': *bp++ = '\n'; break; 410 case 't': *bp++ = '\t'; break; 411 case 'f': *bp++ = '\f'; break; 412 case 'r': *bp++ = '\r'; break; 413 case 'b': *bp++ = '\b'; break; 414 case 'v': *bp++ = '\v'; break; 415 case 'a': *bp++ = '\a'; break; 416 case '\\': *bp++ = '\\'; break; 417 418 case '0': case '1': case '2': /* octal: \d \dd \ddd */ 419 case '3': case '4': case '5': case '6': case '7': 420 n = c - '0'; 421 if ((c = peek()) >= '0' && c < '8') { 422 n = 8 * n + input() - '0'; 423 if ((c = peek()) >= '0' && c < '8') 424 n = 8 * n + input() - '0'; 425 } 426 *bp++ = n; 427 break; 428 429 case 'x': /* hex \x0-9a-fA-F (exactly two) */ 430 { 431 int i; 432 433 if (!isxdigit(peek())) { 434 unput(c); 435 break; 436 } 437 n = 0; 438 for (i = 0; i < 2; i++) { 439 c = input(); 440 if (c == 0) 441 break; 442 if (isxdigit(c)) { 443 c = tolower(c); 444 n *= 16; 445 if (isdigit(c)) 446 n += (c - '0'); 447 else 448 n += 10 + (c - 'a'); 449 } else { 450 unput(c); 451 break; 452 } 453 } 454 if (i) 455 *bp++ = n; 456 break; 457 } 458 459 case 'u': /* utf \u0-9a-fA-F (1..8) */ 460 { 461 int i; 462 463 n = 0; 464 for (i = 0; i < 8; i++) { 465 c = input(); 466 if (!isxdigit(c) || c == 0) 467 break; 468 c = tolower(c); 469 n *= 16; 470 if (isdigit(c)) 471 n += (c - '0'); 472 else 473 n += 10 + (c - 'a'); 474 } 475 unput(c); 476 bp += runetochar(bp, n); 477 break; 478 } 479 480 default: 481 *bp++ = c; 482 break; 483 } 484 break; 485 default: 486 *bp++ = c; 487 break; 488 } 489 } 490 *bp = 0; 491 s = tostring(buf); 492 *bp++ = ' '; *bp++ = '\0'; 493 yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab); 494 free(s); 495 RET(STRING); 496 } 497 498 499 static int binsearch(char *w, const Keyword *kp, int n) 500 { 501 int cond, low, mid, high; 502 503 low = 0; 504 high = n - 1; 505 while (low <= high) { 506 mid = (low + high) / 2; 507 if ((cond = strcmp(w, kp[mid].word)) < 0) 508 high = mid - 1; 509 else if (cond > 0) 510 low = mid + 1; 511 else 512 return mid; 513 } 514 return -1; 515 } 516 517 int word(char *w) 518 { 519 const Keyword *kp; 520 int c, n; 521 522 n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0])); 523 if (n != -1) { /* found in table */ 524 kp = keywords + n; 525 yylval.i = kp->sub; 526 switch (kp->type) { /* special handling */ 527 case BLTIN: 528 if (kp->sub == FSYSTEM && safe) 529 SYNTAX( "system is unsafe" ); 530 RET(kp->type); 531 case FUNC: 532 if (infunc) 533 SYNTAX( "illegal nested function" ); 534 RET(kp->type); 535 case RETURN: 536 if (!infunc) 537 SYNTAX( "return not in function" ); 538 RET(kp->type); 539 case VARNF: 540 yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab); 541 RET(VARNF); 542 default: 543 RET(kp->type); 544 } 545 } 546 c = peek(); /* look for '(' */ 547 if (c != '(' && infunc && (n=isarg(w)) >= 0) { 548 yylval.i = n; 549 RET(ARG); 550 } else { 551 yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab); 552 if (c == '(') { 553 RET(CALL); 554 } else { 555 RET(VAR); 556 } 557 } 558 } 559 560 void startreg(void) /* next call to yylex will return a regular expression */ 561 { 562 reg = true; 563 } 564 565 int regexpr(void) 566 { 567 int c; 568 static char *buf = NULL; 569 static int bufsz = 500; 570 char *bp; 571 572 if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL) 573 FATAL("out of space for reg expr"); 574 bp = buf; 575 for ( ; (c = input()) != '/' && c != 0; ) { 576 if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr")) 577 FATAL("out of space for reg expr %.10s...", buf); 578 if (c == '\n') { 579 *bp = '\0'; 580 SYNTAX( "newline in regular expression %.10s...", buf ); 581 unput('\n'); 582 break; 583 } else if (c == '\\') { 584 *bp++ = '\\'; 585 *bp++ = input(); 586 } else { 587 *bp++ = c; 588 } 589 } 590 *bp = 0; 591 if (c == 0) 592 SYNTAX("non-terminated regular expression %.10s...", buf); 593 yylval.s = tostring(buf); 594 unput('/'); 595 RET(REGEXPR); 596 } 597 598 /* low-level lexical stuff, sort of inherited from lex */ 599 600 char ebuf[300]; 601 char *ep = ebuf; 602 char yysbuf[100]; /* pushback buffer */ 603 char *yysptr = yysbuf; 604 FILE *yyin = NULL; 605 606 int input(void) /* get next lexical input character */ 607 { 608 int c; 609 extern char *lexprog; 610 611 if (yysptr > yysbuf) 612 c = (uschar)*--yysptr; 613 else if (lexprog != NULL) { /* awk '...' */ 614 if ((c = (uschar)*lexprog) != 0) 615 lexprog++; 616 } else /* awk -f ... */ 617 c = pgetc(); 618 if (c == EOF) 619 c = 0; 620 if (ep >= ebuf + sizeof ebuf) 621 ep = ebuf; 622 *ep = c; 623 if (c != 0) { 624 ep++; 625 } 626 return (c); 627 } 628 629 void unput(int c) /* put lexical character back on input */ 630 { 631 if (c == '\n') 632 lineno--; 633 if (yysptr >= yysbuf + sizeof(yysbuf)) 634 FATAL("pushed back too much: %.20s...", yysbuf); 635 *yysptr++ = c; 636 if (--ep < ebuf) 637 ep = ebuf + sizeof(ebuf) - 1; 638 } 639 640 void unputstr(const char *s) /* put a string back on input */ 641 { 642 int i; 643 644 for (i = strlen(s)-1; i >= 0; i--) 645 unput(s[i]); 646 } 647