1 /**************************************************************** 2 Copyright (C) Lucent Technologies 1997 3 All Rights Reserved 4 5 Permission to use, copy, modify, and distribute this software and 6 its documentation for any purpose and without fee is hereby 7 granted, provided that the above copyright notice appear in all 8 copies and that both that the copyright notice and this 9 permission notice and warranty disclaimer appear in supporting 10 documentation, and that the name Lucent Technologies or any of 11 its entities not be used in advertising or publicity pertaining 12 to distribution of the software without specific, written prior 13 permission. 14 15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY 18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF 22 THIS SOFTWARE. 23 ****************************************************************/ 24 25 #include <stdio.h> 26 #include <stdlib.h> 27 #include <string.h> 28 #include <ctype.h> 29 #include "awk.h" 30 #include "awkgram.tab.h" 31 32 extern YYSTYPE yylval; 33 extern bool infunc; 34 35 int lineno = 1; 36 int bracecnt = 0; 37 int brackcnt = 0; 38 int parencnt = 0; 39 40 typedef struct Keyword { 41 const char *word; 42 int sub; 43 int type; 44 } Keyword; 45 46 const Keyword keywords[] = { /* keep sorted: binary searched */ 47 { "BEGIN", XBEGIN, XBEGIN }, 48 { "END", XEND, XEND }, 49 { "NF", VARNF, VARNF }, 50 { "and", FAND, BLTIN }, 51 { "atan2", FATAN, BLTIN }, 52 { "break", BREAK, BREAK }, 53 { "close", CLOSE, CLOSE }, 54 { "compl", FCOMPL, BLTIN }, 55 { "continue", CONTINUE, CONTINUE }, 56 { "cos", FCOS, BLTIN }, 57 { "delete", DELETE, DELETE }, 58 { "do", DO, DO }, 59 { "else", ELSE, ELSE }, 60 { "exit", EXIT, EXIT }, 61 { "exp", FEXP, BLTIN }, 62 { "fflush", FFLUSH, BLTIN }, 63 { "for", FOR, FOR }, 64 { "func", FUNC, FUNC }, 65 { "function", FUNC, FUNC }, 66 { "gensub", GENSUB, GENSUB }, 67 { "getline", GETLINE, GETLINE }, 68 { "gsub", GSUB, GSUB }, 69 { "if", IF, IF }, 70 { "in", IN, IN }, 71 { "index", INDEX, INDEX }, 72 { "int", FINT, BLTIN }, 73 { "length", FLENGTH, BLTIN }, 74 { "log", FLOG, BLTIN }, 75 { "lshift", FLSHIFT, BLTIN }, 76 { "match", MATCHFCN, MATCHFCN }, 77 { "mktime", FMKTIME, BLTIN }, 78 { "next", NEXT, NEXT }, 79 { "nextfile", NEXTFILE, NEXTFILE }, 80 { "or", FFOR, BLTIN }, 81 { "print", PRINT, PRINT }, 82 { "printf", PRINTF, PRINTF }, 83 { "rand", FRAND, BLTIN }, 84 { "return", RETURN, RETURN }, 85 { "rshift", FRSHIFT, BLTIN }, 86 { "sin", FSIN, BLTIN }, 87 { "split", SPLIT, SPLIT }, 88 { "sprintf", SPRINTF, SPRINTF }, 89 { "sqrt", FSQRT, BLTIN }, 90 { "srand", FSRAND, BLTIN }, 91 { "strftime", FSTRFTIME, BLTIN }, 92 { "sub", SUB, SUB }, 93 { "substr", SUBSTR, SUBSTR }, 94 { "system", FSYSTEM, BLTIN }, 95 { "systime", FSYSTIME, BLTIN }, 96 { "tolower", FTOLOWER, BLTIN }, 97 { "toupper", FTOUPPER, BLTIN }, 98 { "while", WHILE, WHILE }, 99 { "xor", FXOR, BLTIN }, 100 }; 101 102 #define RET(x) { if(dbg)printf("lex %s\n", tokname(x)); return(x); } 103 104 static int peek(void) 105 { 106 int c = input(); 107 unput(c); 108 return c; 109 } 110 111 static int gettok(char **pbuf, int *psz) /* get next input token */ 112 { 113 int c, retc; 114 char *buf = *pbuf; 115 int sz = *psz; 116 char *bp = buf; 117 118 c = input(); 119 if (c == 0) 120 return 0; 121 buf[0] = c; 122 buf[1] = 0; 123 if (!isalnum(c) && c != '.' && c != '_') 124 return c; 125 126 *bp++ = c; 127 if (isalpha(c) || c == '_') { /* it's a varname */ 128 for ( ; (c = input()) != 0; ) { 129 if (bp-buf >= sz) 130 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok")) 131 FATAL( "out of space for name %.10s...", buf ); 132 if (isalnum(c) || c == '_') 133 *bp++ = c; 134 else { 135 *bp = 0; 136 unput(c); 137 break; 138 } 139 } 140 *bp = 0; 141 retc = 'a'; /* alphanumeric */ 142 } else { /* maybe it's a number, but could be . */ 143 char *rem; 144 /* read input until can't be a number */ 145 for ( ; (c = input()) != 0; ) { 146 if (bp-buf >= sz) 147 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok")) 148 FATAL( "out of space for number %.10s...", buf ); 149 if (isdigit(c) || c == 'e' || c == 'E' 150 || c == '.' || c == '+' || c == '-') 151 *bp++ = c; 152 else { 153 unput(c); 154 break; 155 } 156 } 157 *bp = 0; 158 strtod(buf, &rem); /* parse the number */ 159 if (rem == buf) { /* it wasn't a valid number at all */ 160 buf[1] = 0; /* return one character as token */ 161 retc = (uschar)buf[0]; /* character is its own type */ 162 unputstr(rem+1); /* put rest back for later */ 163 } else { /* some prefix was a number */ 164 unputstr(rem); /* put rest back for later */ 165 rem[0] = 0; /* truncate buf after number part */ 166 retc = '0'; /* type is number */ 167 } 168 } 169 *pbuf = buf; 170 *psz = sz; 171 return retc; 172 } 173 174 int word(char *); 175 int string(void); 176 int regexpr(void); 177 bool sc = false; /* true => return a } right now */ 178 bool reg = false; /* true => return a REGEXPR now */ 179 180 int yylex(void) 181 { 182 int c; 183 static char *buf = NULL; 184 static int bufsize = 5; /* BUG: setting this small causes core dump! */ 185 186 if (buf == NULL && (buf = (char *) malloc(bufsize)) == NULL) 187 FATAL( "out of space in yylex" ); 188 if (sc) { 189 sc = false; 190 RET('}'); 191 } 192 if (reg) { 193 reg = false; 194 return regexpr(); 195 } 196 for (;;) { 197 c = gettok(&buf, &bufsize); 198 if (c == 0) 199 return 0; 200 if (isalpha(c) || c == '_') 201 return word(buf); 202 if (isdigit(c)) { 203 char *cp = tostring(buf); 204 double result; 205 206 if (is_number(cp, & result)) 207 yylval.cp = setsymtab(buf, cp, result, CON|NUM, symtab); 208 else 209 yylval.cp = setsymtab(buf, cp, 0.0, STR, symtab); 210 free(cp); 211 /* should this also have STR set? */ 212 RET(NUMBER); 213 } 214 215 yylval.i = c; 216 switch (c) { 217 case '\n': /* {EOL} */ 218 lineno++; 219 RET(NL); 220 case '\r': /* assume \n is coming */ 221 case ' ': /* {WS}+ */ 222 case '\t': 223 break; 224 case '#': /* #.* strip comments */ 225 while ((c = input()) != '\n' && c != 0) 226 ; 227 unput(c); 228 break; 229 case ';': 230 RET(';'); 231 case '\\': 232 if (peek() == '\n') { 233 input(); 234 lineno++; 235 } else if (peek() == '\r') { 236 input(); input(); /* \n */ 237 lineno++; 238 } else { 239 RET(c); 240 } 241 break; 242 case '&': 243 if (peek() == '&') { 244 input(); RET(AND); 245 } else 246 RET('&'); 247 case '|': 248 if (peek() == '|') { 249 input(); RET(BOR); 250 } else 251 RET('|'); 252 case '!': 253 if (peek() == '=') { 254 input(); yylval.i = NE; RET(NE); 255 } else if (peek() == '~') { 256 input(); yylval.i = NOTMATCH; RET(MATCHOP); 257 } else 258 RET(NOT); 259 case '~': 260 yylval.i = MATCH; 261 RET(MATCHOP); 262 case '<': 263 if (peek() == '=') { 264 input(); yylval.i = LE; RET(LE); 265 } else { 266 yylval.i = LT; RET(LT); 267 } 268 case '=': 269 if (peek() == '=') { 270 input(); yylval.i = EQ; RET(EQ); 271 } else { 272 yylval.i = ASSIGN; RET(ASGNOP); 273 } 274 case '>': 275 if (peek() == '=') { 276 input(); yylval.i = GE; RET(GE); 277 } else if (peek() == '>') { 278 input(); yylval.i = APPEND; RET(APPEND); 279 } else { 280 yylval.i = GT; RET(GT); 281 } 282 case '+': 283 if (peek() == '+') { 284 input(); yylval.i = INCR; RET(INCR); 285 } else if (peek() == '=') { 286 input(); yylval.i = ADDEQ; RET(ASGNOP); 287 } else 288 RET('+'); 289 case '-': 290 if (peek() == '-') { 291 input(); yylval.i = DECR; RET(DECR); 292 } else if (peek() == '=') { 293 input(); yylval.i = SUBEQ; RET(ASGNOP); 294 } else 295 RET('-'); 296 case '*': 297 if (peek() == '=') { /* *= */ 298 input(); yylval.i = MULTEQ; RET(ASGNOP); 299 } else if (peek() == '*') { /* ** or **= */ 300 input(); /* eat 2nd * */ 301 if (peek() == '=') { 302 input(); yylval.i = POWEQ; RET(ASGNOP); 303 } else { 304 RET(POWER); 305 } 306 } else 307 RET('*'); 308 case '/': 309 RET('/'); 310 case '%': 311 if (peek() == '=') { 312 input(); yylval.i = MODEQ; RET(ASGNOP); 313 } else 314 RET('%'); 315 case '^': 316 if (peek() == '=') { 317 input(); yylval.i = POWEQ; RET(ASGNOP); 318 } else 319 RET(POWER); 320 321 case '$': 322 /* BUG: awkward, if not wrong */ 323 c = gettok(&buf, &bufsize); 324 if (isalpha(c)) { 325 if (strcmp(buf, "NF") == 0) { /* very special */ 326 unputstr("(NF)"); 327 RET(INDIRECT); 328 } 329 c = peek(); 330 if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) { 331 unputstr(buf); 332 RET(INDIRECT); 333 } 334 yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab); 335 RET(IVAR); 336 } else if (c == 0) { /* */ 337 SYNTAX( "unexpected end of input after $" ); 338 RET(';'); 339 } else { 340 unputstr(buf); 341 RET(INDIRECT); 342 } 343 344 case '}': 345 if (--bracecnt < 0) 346 SYNTAX( "extra }" ); 347 sc = true; 348 RET(';'); 349 case ']': 350 if (--brackcnt < 0) 351 SYNTAX( "extra ]" ); 352 RET(']'); 353 case ')': 354 if (--parencnt < 0) 355 SYNTAX( "extra )" ); 356 RET(')'); 357 case '{': 358 bracecnt++; 359 RET('{'); 360 case '[': 361 brackcnt++; 362 RET('['); 363 case '(': 364 parencnt++; 365 RET('('); 366 367 case '"': 368 return string(); /* BUG: should be like tran.c ? */ 369 370 default: 371 RET(c); 372 } 373 } 374 } 375 376 extern int runetochar(char *str, int c); 377 378 int string(void) 379 { 380 int c, n; 381 char *s, *bp; 382 static char *buf = NULL; 383 static int bufsz = 500; 384 385 if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL) 386 FATAL("out of space for strings"); 387 for (bp = buf; (c = input()) != '"'; ) { 388 if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string")) 389 FATAL("out of space for string %.10s...", buf); 390 switch (c) { 391 case '\n': 392 case '\r': 393 case 0: 394 *bp = '\0'; 395 SYNTAX( "non-terminated string %.10s...", buf ); 396 if (c == 0) /* hopeless */ 397 FATAL( "giving up" ); 398 lineno++; 399 break; 400 case '\\': 401 c = input(); 402 switch (c) { 403 case '\n': break; 404 case '"': *bp++ = '"'; break; 405 case 'n': *bp++ = '\n'; break; 406 case 't': *bp++ = '\t'; break; 407 case 'f': *bp++ = '\f'; break; 408 case 'r': *bp++ = '\r'; break; 409 case 'b': *bp++ = '\b'; break; 410 case 'v': *bp++ = '\v'; break; 411 case 'a': *bp++ = '\a'; break; 412 case '\\': *bp++ = '\\'; break; 413 414 case '0': case '1': case '2': /* octal: \d \dd \ddd */ 415 case '3': case '4': case '5': case '6': case '7': 416 n = c - '0'; 417 if ((c = peek()) >= '0' && c < '8') { 418 n = 8 * n + input() - '0'; 419 if ((c = peek()) >= '0' && c < '8') 420 n = 8 * n + input() - '0'; 421 } 422 *bp++ = n; 423 break; 424 425 case 'x': /* hex \x0-9a-fA-F (exactly two) */ 426 { 427 int i; 428 429 if (!isxdigit(peek())) { 430 unput(c); 431 break; 432 } 433 n = 0; 434 for (i = 0; i < 2; i++) { 435 c = input(); 436 if (c == 0) 437 break; 438 if (isxdigit(c)) { 439 c = tolower(c); 440 n *= 16; 441 if (isdigit(c)) 442 n += (c - '0'); 443 else 444 n += 10 + (c - 'a'); 445 } else { 446 unput(c); 447 break; 448 } 449 } 450 if (i) 451 *bp++ = n; 452 break; 453 } 454 455 case 'u': /* utf \u0-9a-fA-F (1..8) */ 456 { 457 int i; 458 459 n = 0; 460 for (i = 0; i < 8; i++) { 461 c = input(); 462 if (!isxdigit(c) || c == 0) 463 break; 464 c = tolower(c); 465 n *= 16; 466 if (isdigit(c)) 467 n += (c - '0'); 468 else 469 n += 10 + (c - 'a'); 470 } 471 unput(c); 472 bp += runetochar(bp, n); 473 break; 474 } 475 476 default: 477 *bp++ = c; 478 break; 479 } 480 break; 481 default: 482 *bp++ = c; 483 break; 484 } 485 } 486 *bp = 0; 487 s = tostring(buf); 488 *bp++ = ' '; *bp++ = '\0'; 489 yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab); 490 free(s); 491 RET(STRING); 492 } 493 494 495 static int binsearch(char *w, const Keyword *kp, int n) 496 { 497 int cond, low, mid, high; 498 499 low = 0; 500 high = n - 1; 501 while (low <= high) { 502 mid = (low + high) / 2; 503 if ((cond = strcmp(w, kp[mid].word)) < 0) 504 high = mid - 1; 505 else if (cond > 0) 506 low = mid + 1; 507 else 508 return mid; 509 } 510 return -1; 511 } 512 513 int word(char *w) 514 { 515 const Keyword *kp; 516 int c, n; 517 518 n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0])); 519 if (n != -1) { /* found in table */ 520 kp = keywords + n; 521 yylval.i = kp->sub; 522 switch (kp->type) { /* special handling */ 523 case BLTIN: 524 if (kp->sub == FSYSTEM && safe) 525 SYNTAX( "system is unsafe" ); 526 RET(kp->type); 527 case FUNC: 528 if (infunc) 529 SYNTAX( "illegal nested function" ); 530 RET(kp->type); 531 case RETURN: 532 if (!infunc) 533 SYNTAX( "return not in function" ); 534 RET(kp->type); 535 case VARNF: 536 yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab); 537 RET(VARNF); 538 default: 539 RET(kp->type); 540 } 541 } 542 c = peek(); /* look for '(' */ 543 if (c != '(' && infunc && (n=isarg(w)) >= 0) { 544 yylval.i = n; 545 RET(ARG); 546 } else { 547 yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab); 548 if (c == '(') { 549 RET(CALL); 550 } else { 551 RET(VAR); 552 } 553 } 554 } 555 556 void startreg(void) /* next call to yylex will return a regular expression */ 557 { 558 reg = true; 559 } 560 561 int regexpr(void) 562 { 563 int c; 564 static char *buf = NULL; 565 static int bufsz = 500; 566 char *bp; 567 568 if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL) 569 FATAL("out of space for reg expr"); 570 bp = buf; 571 for ( ; (c = input()) != '/' && c != 0; ) { 572 if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr")) 573 FATAL("out of space for reg expr %.10s...", buf); 574 if (c == '\n') { 575 *bp = '\0'; 576 SYNTAX( "newline in regular expression %.10s...", buf ); 577 unput('\n'); 578 break; 579 } else if (c == '\\') { 580 *bp++ = '\\'; 581 *bp++ = input(); 582 } else { 583 *bp++ = c; 584 } 585 } 586 *bp = 0; 587 if (c == 0) 588 SYNTAX("non-terminated regular expression %.10s...", buf); 589 yylval.s = tostring(buf); 590 unput('/'); 591 RET(REGEXPR); 592 } 593 594 /* low-level lexical stuff, sort of inherited from lex */ 595 596 char ebuf[300]; 597 char *ep = ebuf; 598 char yysbuf[100]; /* pushback buffer */ 599 char *yysptr = yysbuf; 600 FILE *yyin = NULL; 601 602 int input(void) /* get next lexical input character */ 603 { 604 int c; 605 extern char *lexprog; 606 607 if (yysptr > yysbuf) 608 c = (uschar)*--yysptr; 609 else if (lexprog != NULL) { /* awk '...' */ 610 if ((c = (uschar)*lexprog) != 0) 611 lexprog++; 612 } else /* awk -f ... */ 613 c = pgetc(); 614 if (c == EOF) 615 c = 0; 616 if (ep >= ebuf + sizeof ebuf) 617 ep = ebuf; 618 *ep = c; 619 if (c != 0) { 620 ep++; 621 } 622 return (c); 623 } 624 625 void unput(int c) /* put lexical character back on input */ 626 { 627 if (yysptr >= yysbuf + sizeof(yysbuf)) 628 FATAL("pushed back too much: %.20s...", yysbuf); 629 *yysptr++ = c; 630 if (--ep < ebuf) 631 ep = ebuf + sizeof(ebuf) - 1; 632 } 633 634 void unputstr(const char *s) /* put a string back on input */ 635 { 636 int i; 637 638 for (i = strlen(s)-1; i >= 0; i--) 639 unput(s[i]); 640 } 641