1 /**************************************************************** 2 Copyright (C) Lucent Technologies 1997 3 All Rights Reserved 4 5 Permission to use, copy, modify, and distribute this software and 6 its documentation for any purpose and without fee is hereby 7 granted, provided that the above copyright notice appear in all 8 copies and that both that the copyright notice and this 9 permission notice and warranty disclaimer appear in supporting 10 documentation, and that the name Lucent Technologies or any of 11 its entities not be used in advertising or publicity pertaining 12 to distribution of the software without specific, written prior 13 permission. 14 15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY 18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF 22 THIS SOFTWARE. 23 ****************************************************************/ 24 25 #include <stdio.h> 26 #include <stdlib.h> 27 #include <string.h> 28 #include <ctype.h> 29 #include "awk.h" 30 #include "awkgram.tab.h" 31 32 extern YYSTYPE yylval; 33 extern bool infunc; 34 35 int lineno = 1; 36 int bracecnt = 0; 37 int brackcnt = 0; 38 int parencnt = 0; 39 40 typedef struct Keyword { 41 const char *word; 42 int sub; 43 int type; 44 } Keyword; 45 46 const Keyword keywords[] = { /* keep sorted: binary searched */ 47 { "BEGIN", XBEGIN, XBEGIN }, 48 { "END", XEND, XEND }, 49 { "NF", VARNF, VARNF }, 50 { "and", FAND, BLTIN }, 51 { "atan2", FATAN, BLTIN }, 52 { "break", BREAK, BREAK }, 53 { "close", CLOSE, CLOSE }, 54 { "compl", FCOMPL, BLTIN }, 55 { "continue", CONTINUE, CONTINUE }, 56 { "cos", FCOS, BLTIN }, 57 { "delete", DELETE, DELETE }, 58 { "do", DO, DO }, 59 { "else", ELSE, ELSE }, 60 { "exit", EXIT, EXIT }, 61 { "exp", FEXP, BLTIN }, 62 { "fflush", FFLUSH, BLTIN }, 63 { "for", FOR, FOR }, 64 { "func", FUNC, FUNC }, 65 { "function", FUNC, FUNC }, 66 { "gensub", GENSUB, GENSUB }, 67 { "getline", GETLINE, GETLINE }, 68 { "gsub", GSUB, GSUB }, 69 { "if", IF, IF }, 70 { "in", IN, IN }, 71 { "index", INDEX, INDEX }, 72 { "int", FINT, BLTIN }, 73 { "length", FLENGTH, BLTIN }, 74 { "log", FLOG, BLTIN }, 75 { "lshift", FLSHIFT, BLTIN }, 76 { "match", MATCHFCN, MATCHFCN }, 77 { "mktime", FMKTIME, BLTIN }, 78 { "next", NEXT, NEXT }, 79 { "nextfile", NEXTFILE, NEXTFILE }, 80 { "or", FFOR, BLTIN }, 81 { "print", PRINT, PRINT }, 82 { "printf", PRINTF, PRINTF }, 83 { "rand", FRAND, BLTIN }, 84 { "return", RETURN, RETURN }, 85 { "rshift", FRSHIFT, BLTIN }, 86 { "sin", FSIN, BLTIN }, 87 { "split", SPLIT, SPLIT }, 88 { "sprintf", SPRINTF, SPRINTF }, 89 { "sqrt", FSQRT, BLTIN }, 90 { "srand", FSRAND, BLTIN }, 91 { "strftime", FSTRFTIME, BLTIN }, 92 { "sub", SUB, SUB }, 93 { "substr", SUBSTR, SUBSTR }, 94 { "system", FSYSTEM, BLTIN }, 95 { "systime", FSYSTIME, BLTIN }, 96 { "tolower", FTOLOWER, BLTIN }, 97 { "toupper", FTOUPPER, BLTIN }, 98 { "while", WHILE, WHILE }, 99 { "xor", FXOR, BLTIN }, 100 }; 101 102 #define RET(x) { if(dbg)printf("lex %s\n", tokname(x)); return(x); } 103 104 static int peek(void) 105 { 106 int c = input(); 107 unput(c); 108 return c; 109 } 110 111 static int gettok(char **pbuf, int *psz) /* get next input token */ 112 { 113 int c, retc; 114 char *buf = *pbuf; 115 int sz = *psz; 116 char *bp = buf; 117 118 c = input(); 119 if (c == 0) 120 return 0; 121 buf[0] = c; 122 buf[1] = 0; 123 if (!isalnum(c) && c != '.' && c != '_') 124 return c; 125 126 *bp++ = c; 127 if (isalpha(c) || c == '_') { /* it's a varname */ 128 for ( ; (c = input()) != 0; ) { 129 if (bp-buf >= sz) 130 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok")) 131 FATAL( "out of space for name %.10s...", buf ); 132 if (isalnum(c) || c == '_') 133 *bp++ = c; 134 else { 135 *bp = 0; 136 unput(c); 137 break; 138 } 139 } 140 *bp = 0; 141 retc = 'a'; /* alphanumeric */ 142 } else { /* maybe it's a number, but could be . */ 143 char *rem; 144 /* read input until can't be a number */ 145 for ( ; (c = input()) != 0; ) { 146 if (bp-buf >= sz) 147 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok")) 148 FATAL( "out of space for number %.10s...", buf ); 149 if (isdigit(c) || c == 'e' || c == 'E' 150 || c == '.' || c == '+' || c == '-') 151 *bp++ = c; 152 else { 153 unput(c); 154 break; 155 } 156 } 157 *bp = 0; 158 strtod(buf, &rem); /* parse the number */ 159 if (rem == buf) { /* it wasn't a valid number at all */ 160 buf[1] = 0; /* return one character as token */ 161 retc = (uschar)buf[0]; /* character is its own type */ 162 unputstr(rem+1); /* put rest back for later */ 163 } else { /* some prefix was a number */ 164 unputstr(rem); /* put rest back for later */ 165 rem[0] = 0; /* truncate buf after number part */ 166 retc = '0'; /* type is number */ 167 } 168 } 169 *pbuf = buf; 170 *psz = sz; 171 return retc; 172 } 173 174 int word(char *); 175 int string(void); 176 int regexpr(void); 177 bool sc = false; /* true => return a } right now */ 178 bool reg = false; /* true => return a REGEXPR now */ 179 180 int yylex(void) 181 { 182 int c; 183 static char *buf = NULL; 184 static int bufsize = 5; /* BUG: setting this small causes core dump! */ 185 186 if (buf == NULL && (buf = (char *) malloc(bufsize)) == NULL) 187 FATAL( "out of space in yylex" ); 188 if (sc) { 189 sc = false; 190 RET('}'); 191 } 192 if (reg) { 193 reg = false; 194 return regexpr(); 195 } 196 for (;;) { 197 c = gettok(&buf, &bufsize); 198 if (c == 0) 199 return 0; 200 if (isalpha(c) || c == '_') 201 return word(buf); 202 if (isdigit(c)) { 203 char *cp = tostring(buf); 204 double result; 205 206 if (is_number(cp, & result)) 207 yylval.cp = setsymtab(buf, cp, result, CON|NUM, symtab); 208 else 209 yylval.cp = setsymtab(buf, cp, 0.0, STR, symtab); 210 free(cp); 211 /* should this also have STR set? */ 212 RET(NUMBER); 213 } 214 215 yylval.i = c; 216 switch (c) { 217 case '\n': /* {EOL} */ 218 lineno++; 219 RET(NL); 220 case '\r': /* assume \n is coming */ 221 case ' ': /* {WS}+ */ 222 case '\t': 223 break; 224 case '#': /* #.* strip comments */ 225 while ((c = input()) != '\n' && c != 0) 226 ; 227 unput(c); 228 /* 229 * Next line is a hack, it compensates for 230 * unput's treatment of \n. 231 */ 232 lineno++; 233 break; 234 case ';': 235 RET(';'); 236 case '\\': 237 if (peek() == '\n') { 238 input(); 239 lineno++; 240 } else if (peek() == '\r') { 241 input(); input(); /* \n */ 242 lineno++; 243 } else { 244 RET(c); 245 } 246 break; 247 case '&': 248 if (peek() == '&') { 249 input(); RET(AND); 250 } else 251 RET('&'); 252 case '|': 253 if (peek() == '|') { 254 input(); RET(BOR); 255 } else 256 RET('|'); 257 case '!': 258 if (peek() == '=') { 259 input(); yylval.i = NE; RET(NE); 260 } else if (peek() == '~') { 261 input(); yylval.i = NOTMATCH; RET(MATCHOP); 262 } else 263 RET(NOT); 264 case '~': 265 yylval.i = MATCH; 266 RET(MATCHOP); 267 case '<': 268 if (peek() == '=') { 269 input(); yylval.i = LE; RET(LE); 270 } else { 271 yylval.i = LT; RET(LT); 272 } 273 case '=': 274 if (peek() == '=') { 275 input(); yylval.i = EQ; RET(EQ); 276 } else { 277 yylval.i = ASSIGN; RET(ASGNOP); 278 } 279 case '>': 280 if (peek() == '=') { 281 input(); yylval.i = GE; RET(GE); 282 } else if (peek() == '>') { 283 input(); yylval.i = APPEND; RET(APPEND); 284 } else { 285 yylval.i = GT; RET(GT); 286 } 287 case '+': 288 if (peek() == '+') { 289 input(); yylval.i = INCR; RET(INCR); 290 } else if (peek() == '=') { 291 input(); yylval.i = ADDEQ; RET(ASGNOP); 292 } else 293 RET('+'); 294 case '-': 295 if (peek() == '-') { 296 input(); yylval.i = DECR; RET(DECR); 297 } else if (peek() == '=') { 298 input(); yylval.i = SUBEQ; RET(ASGNOP); 299 } else 300 RET('-'); 301 case '*': 302 if (peek() == '=') { /* *= */ 303 input(); yylval.i = MULTEQ; RET(ASGNOP); 304 } else if (peek() == '*') { /* ** or **= */ 305 input(); /* eat 2nd * */ 306 if (peek() == '=') { 307 input(); yylval.i = POWEQ; RET(ASGNOP); 308 } else { 309 RET(POWER); 310 } 311 } else 312 RET('*'); 313 case '/': 314 RET('/'); 315 case '%': 316 if (peek() == '=') { 317 input(); yylval.i = MODEQ; RET(ASGNOP); 318 } else 319 RET('%'); 320 case '^': 321 if (peek() == '=') { 322 input(); yylval.i = POWEQ; RET(ASGNOP); 323 } else 324 RET(POWER); 325 326 case '$': 327 /* BUG: awkward, if not wrong */ 328 c = gettok(&buf, &bufsize); 329 if (isalpha(c)) { 330 if (strcmp(buf, "NF") == 0) { /* very special */ 331 unputstr("(NF)"); 332 RET(INDIRECT); 333 } 334 c = peek(); 335 if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) { 336 unputstr(buf); 337 RET(INDIRECT); 338 } 339 yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab); 340 RET(IVAR); 341 } else if (c == 0) { /* */ 342 SYNTAX( "unexpected end of input after $" ); 343 RET(';'); 344 } else { 345 unputstr(buf); 346 RET(INDIRECT); 347 } 348 349 case '}': 350 if (--bracecnt < 0) 351 SYNTAX( "extra }" ); 352 sc = true; 353 RET(';'); 354 case ']': 355 if (--brackcnt < 0) 356 SYNTAX( "extra ]" ); 357 RET(']'); 358 case ')': 359 if (--parencnt < 0) 360 SYNTAX( "extra )" ); 361 RET(')'); 362 case '{': 363 bracecnt++; 364 RET('{'); 365 case '[': 366 brackcnt++; 367 RET('['); 368 case '(': 369 parencnt++; 370 RET('('); 371 372 case '"': 373 return string(); /* BUG: should be like tran.c ? */ 374 375 default: 376 RET(c); 377 } 378 } 379 } 380 381 extern int runetochar(char *str, int c); 382 383 int string(void) 384 { 385 int c, n; 386 char *s, *bp; 387 static char *buf = NULL; 388 static int bufsz = 500; 389 390 if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL) 391 FATAL("out of space for strings"); 392 for (bp = buf; (c = input()) != '"'; ) { 393 if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string")) 394 FATAL("out of space for string %.10s...", buf); 395 switch (c) { 396 case '\n': 397 case '\r': 398 case 0: 399 *bp = '\0'; 400 SYNTAX( "non-terminated string %.10s...", buf ); 401 if (c == 0) /* hopeless */ 402 FATAL( "giving up" ); 403 lineno++; 404 break; 405 case '\\': 406 c = input(); 407 switch (c) { 408 case '\n': break; 409 case '"': *bp++ = '"'; break; 410 case 'n': *bp++ = '\n'; break; 411 case 't': *bp++ = '\t'; break; 412 case 'f': *bp++ = '\f'; break; 413 case 'r': *bp++ = '\r'; break; 414 case 'b': *bp++ = '\b'; break; 415 case 'v': *bp++ = '\v'; break; 416 case 'a': *bp++ = '\a'; break; 417 case '\\': *bp++ = '\\'; break; 418 419 case '0': case '1': case '2': /* octal: \d \dd \ddd */ 420 case '3': case '4': case '5': case '6': case '7': 421 n = c - '0'; 422 if ((c = peek()) >= '0' && c < '8') { 423 n = 8 * n + input() - '0'; 424 if ((c = peek()) >= '0' && c < '8') 425 n = 8 * n + input() - '0'; 426 } 427 *bp++ = n; 428 break; 429 430 case 'x': /* hex \x0-9a-fA-F (exactly two) */ 431 { 432 int i; 433 434 if (!isxdigit(peek())) { 435 unput(c); 436 break; 437 } 438 n = 0; 439 for (i = 0; i < 2; i++) { 440 c = input(); 441 if (c == 0) 442 break; 443 if (isxdigit(c)) { 444 c = tolower(c); 445 n *= 16; 446 if (isdigit(c)) 447 n += (c - '0'); 448 else 449 n += 10 + (c - 'a'); 450 } else { 451 unput(c); 452 break; 453 } 454 } 455 if (i) 456 *bp++ = n; 457 break; 458 } 459 460 case 'u': /* utf \u0-9a-fA-F (1..8) */ 461 { 462 int i; 463 464 n = 0; 465 for (i = 0; i < 8; i++) { 466 c = input(); 467 if (!isxdigit(c) || c == 0) 468 break; 469 c = tolower(c); 470 n *= 16; 471 if (isdigit(c)) 472 n += (c - '0'); 473 else 474 n += 10 + (c - 'a'); 475 } 476 unput(c); 477 bp += runetochar(bp, n); 478 break; 479 } 480 481 default: 482 *bp++ = c; 483 break; 484 } 485 break; 486 default: 487 *bp++ = c; 488 break; 489 } 490 } 491 *bp = 0; 492 s = tostring(buf); 493 *bp++ = ' '; *bp++ = '\0'; 494 yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab); 495 free(s); 496 RET(STRING); 497 } 498 499 500 static int binsearch(char *w, const Keyword *kp, int n) 501 { 502 int cond, low, mid, high; 503 504 low = 0; 505 high = n - 1; 506 while (low <= high) { 507 mid = (low + high) / 2; 508 if ((cond = strcmp(w, kp[mid].word)) < 0) 509 high = mid - 1; 510 else if (cond > 0) 511 low = mid + 1; 512 else 513 return mid; 514 } 515 return -1; 516 } 517 518 int word(char *w) 519 { 520 const Keyword *kp; 521 int c, n; 522 523 n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0])); 524 if (n != -1) { /* found in table */ 525 kp = keywords + n; 526 yylval.i = kp->sub; 527 switch (kp->type) { /* special handling */ 528 case BLTIN: 529 if (kp->sub == FSYSTEM && safe) 530 SYNTAX( "system is unsafe" ); 531 RET(kp->type); 532 case FUNC: 533 if (infunc) 534 SYNTAX( "illegal nested function" ); 535 RET(kp->type); 536 case RETURN: 537 if (!infunc) 538 SYNTAX( "return not in function" ); 539 RET(kp->type); 540 case VARNF: 541 yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab); 542 RET(VARNF); 543 default: 544 RET(kp->type); 545 } 546 } 547 c = peek(); /* look for '(' */ 548 if (c != '(' && infunc && (n=isarg(w)) >= 0) { 549 yylval.i = n; 550 RET(ARG); 551 } else { 552 yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab); 553 if (c == '(') { 554 RET(CALL); 555 } else { 556 RET(VAR); 557 } 558 } 559 } 560 561 void startreg(void) /* next call to yylex will return a regular expression */ 562 { 563 reg = true; 564 } 565 566 int regexpr(void) 567 { 568 int c; 569 static char *buf = NULL; 570 static int bufsz = 500; 571 char *bp; 572 573 if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL) 574 FATAL("out of space for reg expr"); 575 bp = buf; 576 for ( ; (c = input()) != '/' && c != 0; ) { 577 if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr")) 578 FATAL("out of space for reg expr %.10s...", buf); 579 if (c == '\n') { 580 *bp = '\0'; 581 SYNTAX( "newline in regular expression %.10s...", buf ); 582 unput('\n'); 583 break; 584 } else if (c == '\\') { 585 *bp++ = '\\'; 586 *bp++ = input(); 587 } else { 588 *bp++ = c; 589 } 590 } 591 *bp = 0; 592 if (c == 0) 593 SYNTAX("non-terminated regular expression %.10s...", buf); 594 yylval.s = tostring(buf); 595 unput('/'); 596 RET(REGEXPR); 597 } 598 599 /* low-level lexical stuff, sort of inherited from lex */ 600 601 char ebuf[300]; 602 char *ep = ebuf; 603 char yysbuf[100]; /* pushback buffer */ 604 char *yysptr = yysbuf; 605 FILE *yyin = NULL; 606 607 int input(void) /* get next lexical input character */ 608 { 609 int c; 610 extern char *lexprog; 611 612 if (yysptr > yysbuf) 613 c = (uschar)*--yysptr; 614 else if (lexprog != NULL) { /* awk '...' */ 615 if ((c = (uschar)*lexprog) != 0) 616 lexprog++; 617 } else /* awk -f ... */ 618 c = pgetc(); 619 if (c == EOF) 620 c = 0; 621 if (ep >= ebuf + sizeof ebuf) 622 ep = ebuf; 623 *ep = c; 624 if (c != 0) { 625 ep++; 626 } 627 return (c); 628 } 629 630 void unput(int c) /* put lexical character back on input */ 631 { 632 if (c == '\n') 633 lineno--; 634 if (yysptr >= yysbuf + sizeof(yysbuf)) 635 FATAL("pushed back too much: %.20s...", yysbuf); 636 *yysptr++ = c; 637 if (--ep < ebuf) 638 ep = ebuf + sizeof(ebuf) - 1; 639 } 640 641 void unputstr(const char *s) /* put a string back on input */ 642 { 643 int i; 644 645 for (i = strlen(s)-1; i >= 0; i--) 646 unput(s[i]); 647 } 648