1 /**************************************************************** 2 Copyright (C) Lucent Technologies 1997 3 All Rights Reserved 4 5 Permission to use, copy, modify, and distribute this software and 6 its documentation for any purpose and without fee is hereby 7 granted, provided that the above copyright notice appear in all 8 copies and that both that the copyright notice and this 9 permission notice and warranty disclaimer appear in supporting 10 documentation, and that the name Lucent Technologies or any of 11 its entities not be used in advertising or publicity pertaining 12 to distribution of the software without specific, written prior 13 permission. 14 15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY 18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF 22 THIS SOFTWARE. 23 ****************************************************************/ 24 25 #include <stdio.h> 26 #include <stdlib.h> 27 #include <string.h> 28 #include <ctype.h> 29 #include "awk.h" 30 #include "awkgram.tab.h" 31 32 extern YYSTYPE yylval; 33 extern bool infunc; 34 35 int lineno = 1; 36 int bracecnt = 0; 37 int brackcnt = 0; 38 int parencnt = 0; 39 40 typedef struct Keyword { 41 const char *word; 42 int sub; 43 int type; 44 } Keyword; 45 46 const Keyword keywords[] = { /* keep sorted: binary searched */ 47 { "BEGIN", XBEGIN, XBEGIN }, 48 { "END", XEND, XEND }, 49 { "NF", VARNF, VARNF }, 50 { "and", FAND, BLTIN }, 51 { "atan2", FATAN, BLTIN }, 52 { "break", BREAK, BREAK }, 53 { "close", CLOSE, CLOSE }, 54 { "compl", FCOMPL, BLTIN }, 55 { "continue", CONTINUE, CONTINUE }, 56 { "cos", FCOS, BLTIN }, 57 { "delete", DELETE, DELETE }, 58 { "do", DO, DO }, 59 { "else", ELSE, ELSE }, 60 { "exit", EXIT, EXIT }, 61 { "exp", FEXP, BLTIN }, 62 { "fflush", FFLUSH, BLTIN }, 63 { "for", FOR, FOR }, 64 { "func", FUNC, FUNC }, 65 { "function", FUNC, FUNC }, 66 { "gensub", GENSUB, GENSUB }, 67 { "getline", GETLINE, GETLINE }, 68 { "gsub", GSUB, GSUB }, 69 { "if", IF, IF }, 70 { "in", IN, IN }, 71 { "index", INDEX, INDEX }, 72 { "int", FINT, BLTIN }, 73 { "length", FLENGTH, BLTIN }, 74 { "log", FLOG, BLTIN }, 75 { "lshift", FLSHIFT, BLTIN }, 76 { "match", MATCHFCN, MATCHFCN }, 77 { "next", NEXT, NEXT }, 78 { "nextfile", NEXTFILE, NEXTFILE }, 79 { "or", FFOR, BLTIN }, 80 { "print", PRINT, PRINT }, 81 { "printf", PRINTF, PRINTF }, 82 { "rand", FRAND, BLTIN }, 83 { "return", RETURN, RETURN }, 84 { "rshift", FRSHIFT, BLTIN }, 85 { "sin", FSIN, BLTIN }, 86 { "split", SPLIT, SPLIT }, 87 { "sprintf", SPRINTF, SPRINTF }, 88 { "sqrt", FSQRT, BLTIN }, 89 { "srand", FSRAND, BLTIN }, 90 { "strftime", FSTRFTIME, BLTIN }, 91 { "sub", SUB, SUB }, 92 { "substr", SUBSTR, SUBSTR }, 93 { "system", FSYSTEM, BLTIN }, 94 { "systime", FSYSTIME, BLTIN }, 95 { "tolower", FTOLOWER, BLTIN }, 96 { "toupper", FTOUPPER, BLTIN }, 97 { "while", WHILE, WHILE }, 98 { "xor", FXOR, BLTIN }, 99 }; 100 101 #define RET(x) { if(dbg)printf("lex %s\n", tokname(x)); return(x); } 102 103 static int peek(void) 104 { 105 int c = input(); 106 unput(c); 107 return c; 108 } 109 110 static int gettok(char **pbuf, int *psz) /* get next input token */ 111 { 112 int c, retc; 113 char *buf = *pbuf; 114 int sz = *psz; 115 char *bp = buf; 116 117 c = input(); 118 if (c == 0) 119 return 0; 120 buf[0] = c; 121 buf[1] = 0; 122 if (!isalnum(c) && c != '.' && c != '_') 123 return c; 124 125 *bp++ = c; 126 if (isalpha(c) || c == '_') { /* it's a varname */ 127 for ( ; (c = input()) != 0; ) { 128 if (bp-buf >= sz) 129 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok")) 130 FATAL( "out of space for name %.10s...", buf ); 131 if (isalnum(c) || c == '_') 132 *bp++ = c; 133 else { 134 *bp = 0; 135 unput(c); 136 break; 137 } 138 } 139 *bp = 0; 140 retc = 'a'; /* alphanumeric */ 141 } else { /* maybe it's a number, but could be . */ 142 char *rem; 143 /* read input until can't be a number */ 144 for ( ; (c = input()) != 0; ) { 145 if (bp-buf >= sz) 146 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok")) 147 FATAL( "out of space for number %.10s...", buf ); 148 if (isdigit(c) || c == 'e' || c == 'E' 149 || c == '.' || c == '+' || c == '-') 150 *bp++ = c; 151 else { 152 unput(c); 153 break; 154 } 155 } 156 *bp = 0; 157 strtod(buf, &rem); /* parse the number */ 158 if (rem == buf) { /* it wasn't a valid number at all */ 159 buf[1] = 0; /* return one character as token */ 160 retc = (uschar)buf[0]; /* character is its own type */ 161 unputstr(rem+1); /* put rest back for later */ 162 } else { /* some prefix was a number */ 163 unputstr(rem); /* put rest back for later */ 164 rem[0] = 0; /* truncate buf after number part */ 165 retc = '0'; /* type is number */ 166 } 167 } 168 *pbuf = buf; 169 *psz = sz; 170 return retc; 171 } 172 173 int word(char *); 174 int string(void); 175 int regexpr(void); 176 bool sc = false; /* true => return a } right now */ 177 bool reg = false; /* true => return a REGEXPR now */ 178 179 int yylex(void) 180 { 181 int c; 182 static char *buf = NULL; 183 static int bufsize = 5; /* BUG: setting this small causes core dump! */ 184 185 if (buf == NULL && (buf = (char *) malloc(bufsize)) == NULL) 186 FATAL( "out of space in yylex" ); 187 if (sc) { 188 sc = false; 189 RET('}'); 190 } 191 if (reg) { 192 reg = false; 193 return regexpr(); 194 } 195 for (;;) { 196 c = gettok(&buf, &bufsize); 197 if (c == 0) 198 return 0; 199 if (isalpha(c) || c == '_') 200 return word(buf); 201 if (isdigit(c)) { 202 char *cp = tostring(buf); 203 double result; 204 205 if (is_number(cp, & result)) 206 yylval.cp = setsymtab(buf, cp, result, CON|NUM, symtab); 207 else 208 yylval.cp = setsymtab(buf, cp, 0.0, STR, symtab); 209 free(cp); 210 /* should this also have STR set? */ 211 RET(NUMBER); 212 } 213 214 yylval.i = c; 215 switch (c) { 216 case '\n': /* {EOL} */ 217 lineno++; 218 RET(NL); 219 case '\r': /* assume \n is coming */ 220 case ' ': /* {WS}+ */ 221 case '\t': 222 break; 223 case '#': /* #.* strip comments */ 224 while ((c = input()) != '\n' && c != 0) 225 ; 226 unput(c); 227 /* 228 * Next line is a hack, itcompensates for 229 * unput's treatment of \n. 230 */ 231 lineno++; 232 break; 233 case ';': 234 RET(';'); 235 case '\\': 236 if (peek() == '\n') { 237 input(); 238 lineno++; 239 } else if (peek() == '\r') { 240 input(); input(); /* \n */ 241 lineno++; 242 } else { 243 RET(c); 244 } 245 break; 246 case '&': 247 if (peek() == '&') { 248 input(); RET(AND); 249 } else 250 RET('&'); 251 case '|': 252 if (peek() == '|') { 253 input(); RET(BOR); 254 } else 255 RET('|'); 256 case '!': 257 if (peek() == '=') { 258 input(); yylval.i = NE; RET(NE); 259 } else if (peek() == '~') { 260 input(); yylval.i = NOTMATCH; RET(MATCHOP); 261 } else 262 RET(NOT); 263 case '~': 264 yylval.i = MATCH; 265 RET(MATCHOP); 266 case '<': 267 if (peek() == '=') { 268 input(); yylval.i = LE; RET(LE); 269 } else { 270 yylval.i = LT; RET(LT); 271 } 272 case '=': 273 if (peek() == '=') { 274 input(); yylval.i = EQ; RET(EQ); 275 } else { 276 yylval.i = ASSIGN; RET(ASGNOP); 277 } 278 case '>': 279 if (peek() == '=') { 280 input(); yylval.i = GE; RET(GE); 281 } else if (peek() == '>') { 282 input(); yylval.i = APPEND; RET(APPEND); 283 } else { 284 yylval.i = GT; RET(GT); 285 } 286 case '+': 287 if (peek() == '+') { 288 input(); yylval.i = INCR; RET(INCR); 289 } else if (peek() == '=') { 290 input(); yylval.i = ADDEQ; RET(ASGNOP); 291 } else 292 RET('+'); 293 case '-': 294 if (peek() == '-') { 295 input(); yylval.i = DECR; RET(DECR); 296 } else if (peek() == '=') { 297 input(); yylval.i = SUBEQ; RET(ASGNOP); 298 } else 299 RET('-'); 300 case '*': 301 if (peek() == '=') { /* *= */ 302 input(); yylval.i = MULTEQ; RET(ASGNOP); 303 } else if (peek() == '*') { /* ** or **= */ 304 input(); /* eat 2nd * */ 305 if (peek() == '=') { 306 input(); yylval.i = POWEQ; RET(ASGNOP); 307 } else { 308 RET(POWER); 309 } 310 } else 311 RET('*'); 312 case '/': 313 RET('/'); 314 case '%': 315 if (peek() == '=') { 316 input(); yylval.i = MODEQ; RET(ASGNOP); 317 } else 318 RET('%'); 319 case '^': 320 if (peek() == '=') { 321 input(); yylval.i = POWEQ; RET(ASGNOP); 322 } else 323 RET(POWER); 324 325 case '$': 326 /* BUG: awkward, if not wrong */ 327 c = gettok(&buf, &bufsize); 328 if (isalpha(c)) { 329 if (strcmp(buf, "NF") == 0) { /* very special */ 330 unputstr("(NF)"); 331 RET(INDIRECT); 332 } 333 c = peek(); 334 if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) { 335 unputstr(buf); 336 RET(INDIRECT); 337 } 338 yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab); 339 RET(IVAR); 340 } else if (c == 0) { /* */ 341 SYNTAX( "unexpected end of input after $" ); 342 RET(';'); 343 } else { 344 unputstr(buf); 345 RET(INDIRECT); 346 } 347 348 case '}': 349 if (--bracecnt < 0) 350 SYNTAX( "extra }" ); 351 sc = true; 352 RET(';'); 353 case ']': 354 if (--brackcnt < 0) 355 SYNTAX( "extra ]" ); 356 RET(']'); 357 case ')': 358 if (--parencnt < 0) 359 SYNTAX( "extra )" ); 360 RET(')'); 361 case '{': 362 bracecnt++; 363 RET('{'); 364 case '[': 365 brackcnt++; 366 RET('['); 367 case '(': 368 parencnt++; 369 RET('('); 370 371 case '"': 372 return string(); /* BUG: should be like tran.c ? */ 373 374 default: 375 RET(c); 376 } 377 } 378 } 379 380 extern int runetochar(char *str, int c); 381 382 int string(void) 383 { 384 int c, n; 385 char *s, *bp; 386 static char *buf = NULL; 387 static int bufsz = 500; 388 389 if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL) 390 FATAL("out of space for strings"); 391 for (bp = buf; (c = input()) != '"'; ) { 392 if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string")) 393 FATAL("out of space for string %.10s...", buf); 394 switch (c) { 395 case '\n': 396 case '\r': 397 case 0: 398 *bp = '\0'; 399 SYNTAX( "non-terminated string %.10s...", buf ); 400 if (c == 0) /* hopeless */ 401 FATAL( "giving up" ); 402 lineno++; 403 break; 404 case '\\': 405 c = input(); 406 switch (c) { 407 case '\n': break; 408 case '"': *bp++ = '"'; break; 409 case 'n': *bp++ = '\n'; break; 410 case 't': *bp++ = '\t'; break; 411 case 'f': *bp++ = '\f'; break; 412 case 'r': *bp++ = '\r'; break; 413 case 'b': *bp++ = '\b'; break; 414 case 'v': *bp++ = '\v'; break; 415 case 'a': *bp++ = '\a'; break; 416 case '\\': *bp++ = '\\'; break; 417 418 case '0': case '1': case '2': /* octal: \d \dd \ddd */ 419 case '3': case '4': case '5': case '6': case '7': 420 n = c - '0'; 421 if ((c = peek()) >= '0' && c < '8') { 422 n = 8 * n + input() - '0'; 423 if ((c = peek()) >= '0' && c < '8') 424 n = 8 * n + input() - '0'; 425 } 426 *bp++ = n; 427 break; 428 429 case 'x': /* hex \x0-9a-fA-F (exactly two) */ 430 { 431 int i; 432 433 n = 0; 434 for (i = 1; i <= 2; i++) { 435 c = input(); 436 if (c == 0) 437 break; 438 if (isxdigit(c)) { 439 c = tolower(c); 440 n *= 16; 441 if (isdigit(c)) 442 n += (c - '0'); 443 else 444 n += 10 + (c - 'a'); 445 } else 446 break; 447 } 448 if (n) 449 *bp++ = n; 450 else 451 unput(c); 452 break; 453 } 454 455 case 'u': /* utf \u0-9a-fA-F (1..8) */ 456 { 457 int i; 458 459 n = 0; 460 for (i = 0; i < 8; i++) { 461 c = input(); 462 if (!isxdigit(c) || c == 0) 463 break; 464 c = tolower(c); 465 n *= 16; 466 if (isdigit(c)) 467 n += (c - '0'); 468 else 469 n += 10 + (c - 'a'); 470 } 471 unput(c); 472 bp += runetochar(bp, n); 473 break; 474 } 475 476 default: 477 *bp++ = c; 478 break; 479 } 480 break; 481 default: 482 *bp++ = c; 483 break; 484 } 485 } 486 *bp = 0; 487 s = tostring(buf); 488 *bp++ = ' '; *bp++ = '\0'; 489 yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab); 490 free(s); 491 RET(STRING); 492 } 493 494 495 static int binsearch(char *w, const Keyword *kp, int n) 496 { 497 int cond, low, mid, high; 498 499 low = 0; 500 high = n - 1; 501 while (low <= high) { 502 mid = (low + high) / 2; 503 if ((cond = strcmp(w, kp[mid].word)) < 0) 504 high = mid - 1; 505 else if (cond > 0) 506 low = mid + 1; 507 else 508 return mid; 509 } 510 return -1; 511 } 512 513 int word(char *w) 514 { 515 const Keyword *kp; 516 int c, n; 517 518 n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0])); 519 if (n != -1) { /* found in table */ 520 kp = keywords + n; 521 yylval.i = kp->sub; 522 switch (kp->type) { /* special handling */ 523 case BLTIN: 524 if (kp->sub == FSYSTEM && safe) 525 SYNTAX( "system is unsafe" ); 526 RET(kp->type); 527 case FUNC: 528 if (infunc) 529 SYNTAX( "illegal nested function" ); 530 RET(kp->type); 531 case RETURN: 532 if (!infunc) 533 SYNTAX( "return not in function" ); 534 RET(kp->type); 535 case VARNF: 536 yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab); 537 RET(VARNF); 538 default: 539 RET(kp->type); 540 } 541 } 542 c = peek(); /* look for '(' */ 543 if (c != '(' && infunc && (n=isarg(w)) >= 0) { 544 yylval.i = n; 545 RET(ARG); 546 } else { 547 yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab); 548 if (c == '(') { 549 RET(CALL); 550 } else { 551 RET(VAR); 552 } 553 } 554 } 555 556 void startreg(void) /* next call to yylex will return a regular expression */ 557 { 558 reg = true; 559 } 560 561 int regexpr(void) 562 { 563 int c; 564 static char *buf = NULL; 565 static int bufsz = 500; 566 char *bp; 567 568 if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL) 569 FATAL("out of space for reg expr"); 570 bp = buf; 571 for ( ; (c = input()) != '/' && c != 0; ) { 572 if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr")) 573 FATAL("out of space for reg expr %.10s...", buf); 574 if (c == '\n') { 575 *bp = '\0'; 576 SYNTAX( "newline in regular expression %.10s...", buf ); 577 unput('\n'); 578 break; 579 } else if (c == '\\') { 580 *bp++ = '\\'; 581 *bp++ = input(); 582 } else { 583 *bp++ = c; 584 } 585 } 586 *bp = 0; 587 if (c == 0) 588 SYNTAX("non-terminated regular expression %.10s...", buf); 589 yylval.s = tostring(buf); 590 unput('/'); 591 RET(REGEXPR); 592 } 593 594 /* low-level lexical stuff, sort of inherited from lex */ 595 596 char ebuf[300]; 597 char *ep = ebuf; 598 char yysbuf[100]; /* pushback buffer */ 599 char *yysptr = yysbuf; 600 FILE *yyin = NULL; 601 602 int input(void) /* get next lexical input character */ 603 { 604 int c; 605 extern char *lexprog; 606 607 if (yysptr > yysbuf) 608 c = (uschar)*--yysptr; 609 else if (lexprog != NULL) { /* awk '...' */ 610 if ((c = (uschar)*lexprog) != 0) 611 lexprog++; 612 } else /* awk -f ... */ 613 c = pgetc(); 614 if (c == EOF) 615 c = 0; 616 if (ep >= ebuf + sizeof ebuf) 617 ep = ebuf; 618 *ep = c; 619 if (c != 0) { 620 ep++; 621 } 622 return (c); 623 } 624 625 void unput(int c) /* put lexical character back on input */ 626 { 627 if (c == '\n') 628 lineno--; 629 if (yysptr >= yysbuf + sizeof(yysbuf)) 630 FATAL("pushed back too much: %.20s...", yysbuf); 631 *yysptr++ = c; 632 if (--ep < ebuf) 633 ep = ebuf + sizeof(ebuf) - 1; 634 } 635 636 void unputstr(const char *s) /* put a string back on input */ 637 { 638 int i; 639 640 for (i = strlen(s)-1; i >= 0; i--) 641 unput(s[i]); 642 } 643