1 /**************************************************************** 2 Copyright (C) Lucent Technologies 1997 3 All Rights Reserved 4 5 Permission to use, copy, modify, and distribute this software and 6 its documentation for any purpose and without fee is hereby 7 granted, provided that the above copyright notice appear in all 8 copies and that both that the copyright notice and this 9 permission notice and warranty disclaimer appear in supporting 10 documentation, and that the name Lucent Technologies or any of 11 its entities not be used in advertising or publicity pertaining 12 to distribution of the software without specific, written prior 13 permission. 14 15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY 18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF 22 THIS SOFTWARE. 23 ****************************************************************/ 24 25 #include <stdio.h> 26 #include <stdlib.h> 27 #include <string.h> 28 #include <ctype.h> 29 #include "awk.h" 30 #include "ytab.h" 31 32 extern YYSTYPE yylval; 33 extern int infunc; 34 35 int lineno = 1; 36 int bracecnt = 0; 37 int brackcnt = 0; 38 int parencnt = 0; 39 40 typedef struct Keyword { 41 const char *word; 42 int sub; 43 int type; 44 } Keyword; 45 46 Keyword keywords[] ={ /* keep sorted: binary searched */ 47 { "BEGIN", XBEGIN, XBEGIN }, 48 { "END", XEND, XEND }, 49 { "NF", VARNF, VARNF }, 50 { "and", FAND, BLTIN }, 51 { "atan2", FATAN, BLTIN }, 52 { "break", BREAK, BREAK }, 53 { "close", CLOSE, CLOSE }, 54 { "compl", FCOMPL, BLTIN }, 55 { "continue", CONTINUE, CONTINUE }, 56 { "cos", FCOS, BLTIN }, 57 { "delete", DELETE, DELETE }, 58 { "do", DO, DO }, 59 { "else", ELSE, ELSE }, 60 { "exit", EXIT, EXIT }, 61 { "exp", FEXP, BLTIN }, 62 { "fflush", FFLUSH, BLTIN }, 63 { "for", FOR, FOR }, 64 { "func", FUNC, FUNC }, 65 { "function", FUNC, FUNC }, 66 { "getline", GETLINE, GETLINE }, 67 { "gsub", GSUB, GSUB }, 68 { "if", IF, IF }, 69 { "in", IN, IN }, 70 { "index", INDEX, INDEX }, 71 { "int", FINT, BLTIN }, 72 { "length", FLENGTH, BLTIN }, 73 { "log", FLOG, BLTIN }, 74 { "lshift", FLSHIFT, BLTIN }, 75 { "match", MATCHFCN, MATCHFCN }, 76 { "next", NEXT, NEXT }, 77 { "nextfile", NEXTFILE, NEXTFILE }, 78 { "or", FFOR, BLTIN }, 79 { "print", PRINT, PRINT }, 80 { "printf", PRINTF, PRINTF }, 81 { "rand", FRAND, BLTIN }, 82 { "return", RETURN, RETURN }, 83 { "rshift", FRSHIFT, BLTIN }, 84 { "sin", FSIN, BLTIN }, 85 { "split", SPLIT, SPLIT }, 86 { "sprintf", SPRINTF, SPRINTF }, 87 { "sqrt", FSQRT, BLTIN }, 88 { "srand", FSRAND, BLTIN }, 89 { "sub", SUB, SUB }, 90 { "substr", SUBSTR, SUBSTR }, 91 { "system", FSYSTEM, BLTIN }, 92 { "tolower", FTOLOWER, BLTIN }, 93 { "toupper", FTOUPPER, BLTIN }, 94 { "while", WHILE, WHILE }, 95 { "xor", FXOR, BLTIN }, 96 }; 97 98 #define RET(x) { if(dbg)printf("lex %s\n", tokname(x)); return(x); } 99 100 int peek(void) 101 { 102 int c = input(); 103 unput(c); 104 return c; 105 } 106 107 int gettok(char **pbuf, int *psz) /* get next input token */ 108 { 109 int c, retc; 110 char *buf = *pbuf; 111 int sz = *psz; 112 char *bp = buf; 113 114 c = input(); 115 if (c == 0) 116 return 0; 117 buf[0] = c; 118 buf[1] = 0; 119 if (!isalnum(c) && c != '.' && c != '_') 120 return c; 121 122 *bp++ = c; 123 if (isalpha(c) || c == '_') { /* it's a varname */ 124 for ( ; (c = input()) != 0; ) { 125 if (bp-buf >= sz) 126 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok")) 127 FATAL( "out of space for name %.10s...", buf ); 128 if (isalnum(c) || c == '_') 129 *bp++ = c; 130 else { 131 *bp = 0; 132 unput(c); 133 break; 134 } 135 } 136 *bp = 0; 137 retc = 'a'; /* alphanumeric */ 138 } else { /* maybe it's a number, but could be . */ 139 char *rem; 140 /* read input until can't be a number */ 141 for ( ; (c = input()) != 0; ) { 142 if (bp-buf >= sz) 143 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok")) 144 FATAL( "out of space for number %.10s...", buf ); 145 if (isdigit(c) || c == 'e' || c == 'E' 146 || c == '.' || c == '+' || c == '-') 147 *bp++ = c; 148 else { 149 unput(c); 150 break; 151 } 152 } 153 *bp = 0; 154 strtod(buf, &rem); /* parse the number */ 155 if (rem == buf) { /* it wasn't a valid number at all */ 156 buf[1] = 0; /* return one character as token */ 157 retc = buf[0]; /* character is its own type */ 158 unputstr(rem+1); /* put rest back for later */ 159 } else { /* some prefix was a number */ 160 unputstr(rem); /* put rest back for later */ 161 rem[0] = 0; /* truncate buf after number part */ 162 retc = '0'; /* type is number */ 163 } 164 } 165 *pbuf = buf; 166 *psz = sz; 167 return retc; 168 } 169 170 int word(char *); 171 int string(void); 172 int regexpr(void); 173 int sc = 0; /* 1 => return a } right now */ 174 int reg = 0; /* 1 => return a REGEXPR now */ 175 176 int yylex(void) 177 { 178 int c; 179 static char *buf = NULL; 180 static int bufsize = 5; /* BUG: setting this small causes core dump! */ 181 182 if (buf == NULL && (buf = (char *) malloc(bufsize)) == NULL) 183 FATAL( "out of space in yylex" ); 184 if (sc) { 185 sc = 0; 186 RET('}'); 187 } 188 if (reg) { 189 reg = 0; 190 return regexpr(); 191 } 192 for (;;) { 193 c = gettok(&buf, &bufsize); 194 if (c == 0) 195 return 0; 196 if (isalpha(c) || c == '_') 197 return word(buf); 198 if (isdigit(c)) { 199 yylval.cp = setsymtab(buf, tostring(buf), atof(buf), CON|NUM, symtab); 200 /* should this also have STR set? */ 201 RET(NUMBER); 202 } 203 204 yylval.i = c; 205 switch (c) { 206 case '\n': /* {EOL} */ 207 lineno++; 208 RET(NL); 209 case '\r': /* assume \n is coming */ 210 case ' ': /* {WS}+ */ 211 case '\t': 212 break; 213 case '#': /* #.* strip comments */ 214 while ((c = input()) != '\n' && c != 0) 215 ; 216 unput(c); 217 break; 218 case ';': 219 RET(';'); 220 case '\\': 221 if (peek() == '\n') { 222 input(); 223 lineno++; 224 } else if (peek() == '\r') { 225 input(); input(); /* \n */ 226 lineno++; 227 } else { 228 RET(c); 229 } 230 break; 231 case '&': 232 if (peek() == '&') { 233 input(); RET(AND); 234 } else 235 RET('&'); 236 case '|': 237 if (peek() == '|') { 238 input(); RET(BOR); 239 } else 240 RET('|'); 241 case '!': 242 if (peek() == '=') { 243 input(); yylval.i = NE; RET(NE); 244 } else if (peek() == '~') { 245 input(); yylval.i = NOTMATCH; RET(MATCHOP); 246 } else 247 RET(NOT); 248 case '~': 249 yylval.i = MATCH; 250 RET(MATCHOP); 251 case '<': 252 if (peek() == '=') { 253 input(); yylval.i = LE; RET(LE); 254 } else { 255 yylval.i = LT; RET(LT); 256 } 257 case '=': 258 if (peek() == '=') { 259 input(); yylval.i = EQ; RET(EQ); 260 } else { 261 yylval.i = ASSIGN; RET(ASGNOP); 262 } 263 case '>': 264 if (peek() == '=') { 265 input(); yylval.i = GE; RET(GE); 266 } else if (peek() == '>') { 267 input(); yylval.i = APPEND; RET(APPEND); 268 } else { 269 yylval.i = GT; RET(GT); 270 } 271 case '+': 272 if (peek() == '+') { 273 input(); yylval.i = INCR; RET(INCR); 274 } else if (peek() == '=') { 275 input(); yylval.i = ADDEQ; RET(ASGNOP); 276 } else 277 RET('+'); 278 case '-': 279 if (peek() == '-') { 280 input(); yylval.i = DECR; RET(DECR); 281 } else if (peek() == '=') { 282 input(); yylval.i = SUBEQ; RET(ASGNOP); 283 } else 284 RET('-'); 285 case '*': 286 if (peek() == '=') { /* *= */ 287 input(); yylval.i = MULTEQ; RET(ASGNOP); 288 } else if (peek() == '*') { /* ** or **= */ 289 input(); /* eat 2nd * */ 290 if (peek() == '=') { 291 input(); yylval.i = POWEQ; RET(ASGNOP); 292 } else { 293 RET(POWER); 294 } 295 } else 296 RET('*'); 297 case '/': 298 RET('/'); 299 case '%': 300 if (peek() == '=') { 301 input(); yylval.i = MODEQ; RET(ASGNOP); 302 } else 303 RET('%'); 304 case '^': 305 if (peek() == '=') { 306 input(); yylval.i = POWEQ; RET(ASGNOP); 307 } else 308 RET(POWER); 309 310 case '$': 311 /* BUG: awkward, if not wrong */ 312 c = gettok(&buf, &bufsize); 313 if (isalpha(c)) { 314 if (strcmp(buf, "NF") == 0) { /* very special */ 315 unputstr("(NF)"); 316 RET(INDIRECT); 317 } 318 c = peek(); 319 if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) { 320 unputstr(buf); 321 RET(INDIRECT); 322 } 323 yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab); 324 RET(IVAR); 325 } else if (c == 0) { /* */ 326 SYNTAX( "unexpected end of input after $" ); 327 RET(';'); 328 } else { 329 unputstr(buf); 330 RET(INDIRECT); 331 } 332 333 case '}': 334 if (--bracecnt < 0) 335 SYNTAX( "extra }" ); 336 sc = 1; 337 RET(';'); 338 case ']': 339 if (--brackcnt < 0) 340 SYNTAX( "extra ]" ); 341 RET(']'); 342 case ')': 343 if (--parencnt < 0) 344 SYNTAX( "extra )" ); 345 RET(')'); 346 case '{': 347 bracecnt++; 348 RET('{'); 349 case '[': 350 brackcnt++; 351 RET('['); 352 case '(': 353 parencnt++; 354 RET('('); 355 356 case '"': 357 return string(); /* BUG: should be like tran.c ? */ 358 359 default: 360 RET(c); 361 } 362 } 363 } 364 365 int string(void) 366 { 367 int c, n; 368 char *s, *bp; 369 static char *buf = NULL; 370 static int bufsz = 500; 371 372 if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL) 373 FATAL("out of space for strings"); 374 for (bp = buf; (c = input()) != '"'; ) { 375 if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string")) 376 FATAL("out of space for string %.10s...", buf); 377 switch (c) { 378 case '\n': 379 case '\r': 380 case 0: 381 *bp = '\0'; 382 SYNTAX( "non-terminated string %.10s...", buf ); 383 if (c == 0) /* hopeless */ 384 FATAL( "giving up" ); 385 lineno++; 386 break; 387 case '\\': 388 c = input(); 389 switch (c) { 390 case '"': *bp++ = '"'; break; 391 case 'n': *bp++ = '\n'; break; 392 case 't': *bp++ = '\t'; break; 393 case 'f': *bp++ = '\f'; break; 394 case 'r': *bp++ = '\r'; break; 395 case 'b': *bp++ = '\b'; break; 396 case 'v': *bp++ = '\v'; break; 397 case 'a': *bp++ = '\007'; break; 398 case '\\': *bp++ = '\\'; break; 399 400 case '0': case '1': case '2': /* octal: \d \dd \ddd */ 401 case '3': case '4': case '5': case '6': case '7': 402 n = c - '0'; 403 if ((c = peek()) >= '0' && c < '8') { 404 n = 8 * n + input() - '0'; 405 if ((c = peek()) >= '0' && c < '8') 406 n = 8 * n + input() - '0'; 407 } 408 *bp++ = n; 409 break; 410 411 case 'x': /* hex \x0-9a-fA-F + */ 412 { char xbuf[100], *px; 413 for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) { 414 if (isdigit(c) 415 || (c >= 'a' && c <= 'f') 416 || (c >= 'A' && c <= 'F')) 417 *px++ = c; 418 else 419 break; 420 } 421 *px = 0; 422 unput(c); 423 sscanf(xbuf, "%x", (unsigned int *) &n); 424 *bp++ = n; 425 break; 426 } 427 428 default: 429 *bp++ = c; 430 break; 431 } 432 break; 433 default: 434 *bp++ = c; 435 break; 436 } 437 } 438 *bp = 0; 439 s = tostring(buf); 440 *bp++ = ' '; *bp++ = 0; 441 yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab); 442 RET(STRING); 443 } 444 445 446 int binsearch(char *w, Keyword *kp, int n) 447 { 448 int cond, low, mid, high; 449 450 low = 0; 451 high = n - 1; 452 while (low <= high) { 453 mid = (low + high) / 2; 454 if ((cond = strcmp(w, kp[mid].word)) < 0) 455 high = mid - 1; 456 else if (cond > 0) 457 low = mid + 1; 458 else 459 return mid; 460 } 461 return -1; 462 } 463 464 int word(char *w) 465 { 466 Keyword *kp; 467 int c, n; 468 469 n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0])); 470 /* BUG: this ought to be inside the if; in theory could fault (daniel barrett) */ 471 kp = keywords + n; 472 if (n != -1) { /* found in table */ 473 yylval.i = kp->sub; 474 switch (kp->type) { /* special handling */ 475 case BLTIN: 476 if (kp->sub == FSYSTEM && safe) 477 SYNTAX( "system is unsafe" ); 478 RET(kp->type); 479 case FUNC: 480 if (infunc) 481 SYNTAX( "illegal nested function" ); 482 RET(kp->type); 483 case RETURN: 484 if (!infunc) 485 SYNTAX( "return not in function" ); 486 RET(kp->type); 487 case VARNF: 488 yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab); 489 RET(VARNF); 490 default: 491 RET(kp->type); 492 } 493 } 494 c = peek(); /* look for '(' */ 495 if (c != '(' && infunc && (n=isarg(w)) >= 0) { 496 yylval.i = n; 497 RET(ARG); 498 } else { 499 yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab); 500 if (c == '(') { 501 RET(CALL); 502 } else { 503 RET(VAR); 504 } 505 } 506 } 507 508 void startreg(void) /* next call to yylex will return a regular expression */ 509 { 510 reg = 1; 511 } 512 513 int regexpr(void) 514 { 515 int c; 516 static char *buf = NULL; 517 static int bufsz = 500; 518 char *bp; 519 520 if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL) 521 FATAL("out of space for rex expr"); 522 bp = buf; 523 for ( ; (c = input()) != '/' && c != 0; ) { 524 if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr")) 525 FATAL("out of space for reg expr %.10s...", buf); 526 if (c == '\n') { 527 *bp = '\0'; 528 SYNTAX( "newline in regular expression %.10s...", buf ); 529 unput('\n'); 530 break; 531 } else if (c == '\\') { 532 *bp++ = '\\'; 533 *bp++ = input(); 534 } else { 535 *bp++ = c; 536 } 537 } 538 *bp = 0; 539 if (c == 0) 540 SYNTAX("non-terminated regular expression %.10s...", buf); 541 yylval.s = tostring(buf); 542 unput('/'); 543 RET(REGEXPR); 544 } 545 546 /* low-level lexical stuff, sort of inherited from lex */ 547 548 char ebuf[300]; 549 char *ep = ebuf; 550 char yysbuf[100]; /* pushback buffer */ 551 char *yysptr = yysbuf; 552 FILE *yyin = NULL; 553 554 int input(void) /* get next lexical input character */ 555 { 556 int c; 557 extern char *lexprog; 558 559 if (yysptr > yysbuf) 560 c = (uschar)*--yysptr; 561 else if (lexprog != NULL) { /* awk '...' */ 562 if ((c = (uschar)*lexprog) != 0) 563 lexprog++; 564 } else /* awk -f ... */ 565 c = pgetc(); 566 if (c == EOF) 567 c = 0; 568 if (ep >= ebuf + sizeof ebuf) 569 ep = ebuf; 570 *ep = c; 571 if (c != 0) { 572 ep++; 573 } 574 return (c); 575 } 576 577 void unput(int c) /* put lexical character back on input */ 578 { 579 if (yysptr >= yysbuf + sizeof(yysbuf)) 580 FATAL("pushed back too much: %.20s...", yysbuf); 581 *yysptr++ = c; 582 if (--ep < ebuf) 583 ep = ebuf + sizeof(ebuf) - 1; 584 } 585 586 void unputstr(const char *s) /* put a string back on input */ 587 { 588 int i; 589 590 for (i = strlen(s)-1; i >= 0; i--) 591 unput(s[i]); 592 } 593