1 /**************************************************************** 2 Copyright (C) Lucent Technologies 1997 3 All Rights Reserved 4 5 Permission to use, copy, modify, and distribute this software and 6 its documentation for any purpose and without fee is hereby 7 granted, provided that the above copyright notice appear in all 8 copies and that both that the copyright notice and this 9 permission notice and warranty disclaimer appear in supporting 10 documentation, and that the name Lucent Technologies or any of 11 its entities not be used in advertising or publicity pertaining 12 to distribution of the software without specific, written prior 13 permission. 14 15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY 18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF 22 THIS SOFTWARE. 23 ****************************************************************/ 24 25 #include <stdio.h> 26 #include <stdlib.h> 27 #include <string.h> 28 #include <ctype.h> 29 #include "awk.h" 30 #include "ytab.h" 31 32 extern YYSTYPE yylval; 33 extern int infunc; 34 35 int lineno = 1; 36 int bracecnt = 0; 37 int brackcnt = 0; 38 int parencnt = 0; 39 40 typedef struct Keyword { 41 const char *word; 42 int sub; 43 int type; 44 } Keyword; 45 46 Keyword keywords[] ={ /* keep sorted: binary searched */ 47 { "BEGIN", XBEGIN, XBEGIN }, 48 { "END", XEND, XEND }, 49 { "NF", VARNF, VARNF }, 50 { "and", FAND, BLTIN }, 51 { "atan2", FATAN, BLTIN }, 52 { "break", BREAK, BREAK }, 53 { "close", CLOSE, CLOSE }, 54 { "compl", FCOMPL, BLTIN }, 55 { "continue", CONTINUE, CONTINUE }, 56 { "cos", FCOS, BLTIN }, 57 { "delete", DELETE, DELETE }, 58 { "do", DO, DO }, 59 { "else", ELSE, ELSE }, 60 { "exit", EXIT, EXIT }, 61 { "exp", FEXP, BLTIN }, 62 { "fflush", FFLUSH, BLTIN }, 63 { "for", FOR, FOR }, 64 { "func", FUNC, FUNC }, 65 { "function", FUNC, FUNC }, 66 { "getline", GETLINE, GETLINE }, 67 { "gsub", GSUB, GSUB }, 68 { "if", IF, IF }, 69 { "in", IN, IN }, 70 { "index", INDEX, INDEX }, 71 { "int", FINT, BLTIN }, 72 { "length", FLENGTH, BLTIN }, 73 { "log", FLOG, BLTIN }, 74 { "lshift", FLSHIFT, BLTIN }, 75 { "match", MATCHFCN, MATCHFCN }, 76 { "next", NEXT, NEXT }, 77 { "nextfile", NEXTFILE, NEXTFILE }, 78 { "or", FFOR, BLTIN }, 79 { "print", PRINT, PRINT }, 80 { "printf", PRINTF, PRINTF }, 81 { "rand", FRAND, BLTIN }, 82 { "return", RETURN, RETURN }, 83 { "rshift", FRSHIFT, BLTIN }, 84 { "sin", FSIN, BLTIN }, 85 { "split", SPLIT, SPLIT }, 86 { "sprintf", SPRINTF, SPRINTF }, 87 { "sqrt", FSQRT, BLTIN }, 88 { "srand", FSRAND, BLTIN }, 89 { "sub", SUB, SUB }, 90 { "substr", SUBSTR, SUBSTR }, 91 { "system", FSYSTEM, BLTIN }, 92 { "tolower", FTOLOWER, BLTIN }, 93 { "toupper", FTOUPPER, BLTIN }, 94 { "while", WHILE, WHILE }, 95 { "xor", FXOR, BLTIN }, 96 }; 97 98 #define RET(x) { if(dbg)printf("lex %s\n", tokname(x)); return(x); } 99 100 int peek(void) 101 { 102 int c = input(); 103 unput(c); 104 return c; 105 } 106 107 int gettok(char **pbuf, int *psz) /* get next input token */ 108 { 109 int c, retc; 110 char *buf = *pbuf; 111 int sz = *psz; 112 char *bp = buf; 113 114 c = input(); 115 if (c == 0) 116 return 0; 117 buf[0] = c; 118 buf[1] = 0; 119 if (!isalnum(c) && c != '.' && c != '_') 120 return c; 121 122 *bp++ = c; 123 if (isalpha(c) || c == '_') { /* it's a varname */ 124 for ( ; (c = input()) != 0; ) { 125 if (bp-buf >= sz) 126 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok")) 127 FATAL( "out of space for name %.10s...", buf ); 128 if (isalnum(c) || c == '_') 129 *bp++ = c; 130 else { 131 *bp = 0; 132 unput(c); 133 break; 134 } 135 } 136 *bp = 0; 137 retc = 'a'; /* alphanumeric */ 138 } else { /* maybe it's a number, but could be . */ 139 char *rem; 140 /* read input until can't be a number */ 141 for ( ; (c = input()) != 0; ) { 142 if (bp-buf >= sz) 143 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok")) 144 FATAL( "out of space for number %.10s...", buf ); 145 if (isdigit(c) || c == 'e' || c == 'E' 146 || c == '.' || c == '+' || c == '-') 147 *bp++ = c; 148 else { 149 unput(c); 150 break; 151 } 152 } 153 *bp = 0; 154 strtod(buf, &rem); /* parse the number */ 155 if (rem == buf) { /* it wasn't a valid number at all */ 156 buf[1] = 0; /* return one character as token */ 157 retc = buf[0]; /* character is its own type */ 158 unputstr(rem+1); /* put rest back for later */ 159 } else { /* some prefix was a number */ 160 unputstr(rem); /* put rest back for later */ 161 rem[0] = 0; /* truncate buf after number part */ 162 retc = '0'; /* type is number */ 163 } 164 } 165 *pbuf = buf; 166 *psz = sz; 167 return retc; 168 } 169 170 int word(char *); 171 int string(void); 172 int regexpr(void); 173 int sc = 0; /* 1 => return a } right now */ 174 int reg = 0; /* 1 => return a REGEXPR now */ 175 176 int yylex(void) 177 { 178 int c; 179 static char *buf = NULL; 180 static int bufsize = 5; /* BUG: setting this small causes core dump! */ 181 182 if (buf == NULL && (buf = (char *) malloc(bufsize)) == NULL) 183 FATAL( "out of space in yylex" ); 184 if (sc) { 185 sc = 0; 186 RET('}'); 187 } 188 if (reg) { 189 reg = 0; 190 return regexpr(); 191 } 192 for (;;) { 193 c = gettok(&buf, &bufsize); 194 if (c == 0) 195 return 0; 196 if (isalpha(c) || c == '_') 197 return word(buf); 198 if (isdigit(c)) { 199 yylval.cp = setsymtab(buf, tostring(buf), atof(buf), CON|NUM, symtab); 200 /* should this also have STR set? */ 201 RET(NUMBER); 202 } 203 204 yylval.i = c; 205 switch (c) { 206 case '\n': /* {EOL} */ 207 RET(NL); 208 case '\r': /* assume \n is coming */ 209 case ' ': /* {WS}+ */ 210 case '\t': 211 break; 212 case '#': /* #.* strip comments */ 213 while ((c = input()) != '\n' && c != 0) 214 ; 215 unput(c); 216 break; 217 case ';': 218 RET(';'); 219 case '\\': 220 if (peek() == '\n') { 221 input(); 222 } else if (peek() == '\r') { 223 input(); input(); /* \n */ 224 lineno++; 225 } else { 226 RET(c); 227 } 228 break; 229 case '&': 230 if (peek() == '&') { 231 input(); RET(AND); 232 } else 233 RET('&'); 234 case '|': 235 if (peek() == '|') { 236 input(); RET(BOR); 237 } else 238 RET('|'); 239 case '!': 240 if (peek() == '=') { 241 input(); yylval.i = NE; RET(NE); 242 } else if (peek() == '~') { 243 input(); yylval.i = NOTMATCH; RET(MATCHOP); 244 } else 245 RET(NOT); 246 case '~': 247 yylval.i = MATCH; 248 RET(MATCHOP); 249 case '<': 250 if (peek() == '=') { 251 input(); yylval.i = LE; RET(LE); 252 } else { 253 yylval.i = LT; RET(LT); 254 } 255 case '=': 256 if (peek() == '=') { 257 input(); yylval.i = EQ; RET(EQ); 258 } else { 259 yylval.i = ASSIGN; RET(ASGNOP); 260 } 261 case '>': 262 if (peek() == '=') { 263 input(); yylval.i = GE; RET(GE); 264 } else if (peek() == '>') { 265 input(); yylval.i = APPEND; RET(APPEND); 266 } else { 267 yylval.i = GT; RET(GT); 268 } 269 case '+': 270 if (peek() == '+') { 271 input(); yylval.i = INCR; RET(INCR); 272 } else if (peek() == '=') { 273 input(); yylval.i = ADDEQ; RET(ASGNOP); 274 } else 275 RET('+'); 276 case '-': 277 if (peek() == '-') { 278 input(); yylval.i = DECR; RET(DECR); 279 } else if (peek() == '=') { 280 input(); yylval.i = SUBEQ; RET(ASGNOP); 281 } else 282 RET('-'); 283 case '*': 284 if (peek() == '=') { /* *= */ 285 input(); yylval.i = MULTEQ; RET(ASGNOP); 286 } else if (peek() == '*') { /* ** or **= */ 287 input(); /* eat 2nd * */ 288 if (peek() == '=') { 289 input(); yylval.i = POWEQ; RET(ASGNOP); 290 } else { 291 RET(POWER); 292 } 293 } else 294 RET('*'); 295 case '/': 296 RET('/'); 297 case '%': 298 if (peek() == '=') { 299 input(); yylval.i = MODEQ; RET(ASGNOP); 300 } else 301 RET('%'); 302 case '^': 303 if (peek() == '=') { 304 input(); yylval.i = POWEQ; RET(ASGNOP); 305 } else 306 RET(POWER); 307 308 case '$': 309 /* BUG: awkward, if not wrong */ 310 c = gettok(&buf, &bufsize); 311 if (isalpha(c)) { 312 if (strcmp(buf, "NF") == 0) { /* very special */ 313 unputstr("(NF)"); 314 RET(INDIRECT); 315 } 316 c = peek(); 317 if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) { 318 unputstr(buf); 319 RET(INDIRECT); 320 } 321 yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab); 322 RET(IVAR); 323 } else if (c == 0) { /* */ 324 SYNTAX( "unexpected end of input after $" ); 325 RET(';'); 326 } else { 327 unputstr(buf); 328 RET(INDIRECT); 329 } 330 331 case '}': 332 if (--bracecnt < 0) 333 SYNTAX( "extra }" ); 334 sc = 1; 335 RET(';'); 336 case ']': 337 if (--brackcnt < 0) 338 SYNTAX( "extra ]" ); 339 RET(']'); 340 case ')': 341 if (--parencnt < 0) 342 SYNTAX( "extra )" ); 343 RET(')'); 344 case '{': 345 bracecnt++; 346 RET('{'); 347 case '[': 348 brackcnt++; 349 RET('['); 350 case '(': 351 parencnt++; 352 RET('('); 353 354 case '"': 355 return string(); /* BUG: should be like tran.c ? */ 356 357 default: 358 RET(c); 359 } 360 } 361 } 362 363 int string(void) 364 { 365 int c, n; 366 char *s, *bp; 367 static char *buf = NULL; 368 static int bufsz = 500; 369 370 if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL) 371 FATAL("out of space for strings"); 372 for (bp = buf; (c = input()) != '"'; ) { 373 if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string")) 374 FATAL("out of space for string %.10s...", buf); 375 switch (c) { 376 case '\n': 377 case '\r': 378 case 0: 379 SYNTAX( "non-terminated string %.10s...", buf ); 380 lineno++; 381 if (c == 0) /* hopeless */ 382 FATAL( "giving up" ); 383 break; 384 case '\\': 385 c = input(); 386 switch (c) { 387 case '"': *bp++ = '"'; break; 388 case 'n': *bp++ = '\n'; break; 389 case 't': *bp++ = '\t'; break; 390 case 'f': *bp++ = '\f'; break; 391 case 'r': *bp++ = '\r'; break; 392 case 'b': *bp++ = '\b'; break; 393 case 'v': *bp++ = '\v'; break; 394 case 'a': *bp++ = '\007'; break; 395 case '\\': *bp++ = '\\'; break; 396 397 case '0': case '1': case '2': /* octal: \d \dd \ddd */ 398 case '3': case '4': case '5': case '6': case '7': 399 n = c - '0'; 400 if ((c = peek()) >= '0' && c < '8') { 401 n = 8 * n + input() - '0'; 402 if ((c = peek()) >= '0' && c < '8') 403 n = 8 * n + input() - '0'; 404 } 405 *bp++ = n; 406 break; 407 408 case 'x': /* hex \x0-9a-fA-F + */ 409 { char xbuf[100], *px; 410 for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) { 411 if (isdigit(c) 412 || (c >= 'a' && c <= 'f') 413 || (c >= 'A' && c <= 'F')) 414 *px++ = c; 415 else 416 break; 417 } 418 *px = 0; 419 unput(c); 420 sscanf(xbuf, "%x", (unsigned int *) &n); 421 *bp++ = n; 422 break; 423 } 424 425 default: 426 *bp++ = c; 427 break; 428 } 429 break; 430 default: 431 *bp++ = c; 432 break; 433 } 434 } 435 *bp = 0; 436 s = tostring(buf); 437 *bp++ = ' '; *bp++ = 0; 438 yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab); 439 RET(STRING); 440 } 441 442 443 int binsearch(char *w, Keyword *kp, int n) 444 { 445 int cond, low, mid, high; 446 447 low = 0; 448 high = n - 1; 449 while (low <= high) { 450 mid = (low + high) / 2; 451 if ((cond = strcmp(w, kp[mid].word)) < 0) 452 high = mid - 1; 453 else if (cond > 0) 454 low = mid + 1; 455 else 456 return mid; 457 } 458 return -1; 459 } 460 461 int word(char *w) 462 { 463 Keyword *kp; 464 int c, n; 465 466 n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0])); 467 /* BUG: this ought to be inside the if; in theory could fault (daniel barrett) */ 468 kp = keywords + n; 469 if (n != -1) { /* found in table */ 470 yylval.i = kp->sub; 471 switch (kp->type) { /* special handling */ 472 case BLTIN: 473 if (kp->sub == FSYSTEM && safe) 474 SYNTAX( "system is unsafe" ); 475 RET(kp->type); 476 case FUNC: 477 if (infunc) 478 SYNTAX( "illegal nested function" ); 479 RET(kp->type); 480 case RETURN: 481 if (!infunc) 482 SYNTAX( "return not in function" ); 483 RET(kp->type); 484 case VARNF: 485 yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab); 486 RET(VARNF); 487 default: 488 RET(kp->type); 489 } 490 } 491 c = peek(); /* look for '(' */ 492 if (c != '(' && infunc && (n=isarg(w)) >= 0) { 493 yylval.i = n; 494 RET(ARG); 495 } else { 496 yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab); 497 if (c == '(') { 498 RET(CALL); 499 } else { 500 RET(VAR); 501 } 502 } 503 } 504 505 void startreg(void) /* next call to yylex will return a regular expression */ 506 { 507 reg = 1; 508 } 509 510 int regexpr(void) 511 { 512 int c; 513 static char *buf = NULL; 514 static int bufsz = 500; 515 char *bp; 516 517 if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL) 518 FATAL("out of space for rex expr"); 519 bp = buf; 520 for ( ; (c = input()) != '/' && c != 0; ) { 521 if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr")) 522 FATAL("out of space for reg expr %.10s...", buf); 523 if (c == '\n') { 524 SYNTAX( "newline in regular expression %.10s...", buf ); 525 unput('\n'); 526 break; 527 } else if (c == '\\') { 528 *bp++ = '\\'; 529 *bp++ = input(); 530 } else { 531 *bp++ = c; 532 } 533 } 534 *bp = 0; 535 if (c == 0) 536 SYNTAX("non-terminated regular expression %.10s...", buf); 537 yylval.s = tostring(buf); 538 unput('/'); 539 RET(REGEXPR); 540 } 541 542 /* low-level lexical stuff, sort of inherited from lex */ 543 544 char ebuf[300]; 545 char *ep = ebuf; 546 char yysbuf[100]; /* pushback buffer */ 547 char *yysptr = yysbuf; 548 FILE *yyin = NULL; 549 550 int input(void) /* get next lexical input character */ 551 { 552 int c; 553 extern char *lexprog; 554 555 if (yysptr > yysbuf) 556 c = (uschar)*--yysptr; 557 else if (lexprog != NULL) { /* awk '...' */ 558 if ((c = (uschar)*lexprog) != 0) 559 lexprog++; 560 } else /* awk -f ... */ 561 c = pgetc(); 562 if (c == '\n') 563 lineno++; 564 else if (c == EOF) 565 c = 0; 566 if (ep >= ebuf + sizeof ebuf) 567 ep = ebuf; 568 return *ep++ = c; 569 } 570 571 void unput(int c) /* put lexical character back on input */ 572 { 573 if (c == '\n') 574 lineno--; 575 if (yysptr >= yysbuf + sizeof(yysbuf)) 576 FATAL("pushed back too much: %.20s...", yysbuf); 577 *yysptr++ = c; 578 if (--ep < ebuf) 579 ep = ebuf + sizeof(ebuf) - 1; 580 } 581 582 void unputstr(const char *s) /* put a string back on input */ 583 { 584 int i; 585 586 for (i = strlen(s)-1; i >= 0; i--) 587 unput(s[i]); 588 } 589