1 /**************************************************************** 2 Copyright (C) Lucent Technologies 1997 3 All Rights Reserved 4 5 Permission to use, copy, modify, and distribute this software and 6 its documentation for any purpose and without fee is hereby 7 granted, provided that the above copyright notice appear in all 8 copies and that both that the copyright notice and this 9 permission notice and warranty disclaimer appear in supporting 10 documentation, and that the name Lucent Technologies or any of 11 its entities not be used in advertising or publicity pertaining 12 to distribution of the software without specific, written prior 13 permission. 14 15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY 18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF 22 THIS SOFTWARE. 23 ****************************************************************/ 24 25 #include <stdio.h> 26 #include <stdlib.h> 27 #include <string.h> 28 #include <ctype.h> 29 #include "awk.h" 30 #include "ytab.h" 31 32 extern YYSTYPE yylval; 33 extern int infunc; 34 35 int lineno = 1; 36 int bracecnt = 0; 37 int brackcnt = 0; 38 int parencnt = 0; 39 40 typedef struct Keyword { 41 const char *word; 42 int sub; 43 int type; 44 } Keyword; 45 46 Keyword keywords[] ={ /* keep sorted: binary searched */ 47 { "BEGIN", XBEGIN, XBEGIN }, 48 { "END", XEND, XEND }, 49 { "NF", VARNF, VARNF }, 50 { "and", FAND, BLTIN }, 51 { "atan2", FATAN, BLTIN }, 52 { "break", BREAK, BREAK }, 53 { "close", CLOSE, CLOSE }, 54 { "compl", FCOMPL, BLTIN }, 55 { "continue", CONTINUE, CONTINUE }, 56 { "cos", FCOS, BLTIN }, 57 { "delete", DELETE, DELETE }, 58 { "do", DO, DO }, 59 { "else", ELSE, ELSE }, 60 { "exit", EXIT, EXIT }, 61 { "exp", FEXP, BLTIN }, 62 { "fflush", FFLUSH, BLTIN }, 63 { "for", FOR, FOR }, 64 { "func", FUNC, FUNC }, 65 { "function", FUNC, FUNC }, 66 { "getline", GETLINE, GETLINE }, 67 { "gsub", GSUB, GSUB }, 68 { "if", IF, IF }, 69 { "in", IN, IN }, 70 { "index", INDEX, INDEX }, 71 { "int", FINT, BLTIN }, 72 { "length", FLENGTH, BLTIN }, 73 { "log", FLOG, BLTIN }, 74 { "lshift", FLSHIFT, BLTIN }, 75 { "match", MATCHFCN, MATCHFCN }, 76 { "next", NEXT, NEXT }, 77 { "nextfile", NEXTFILE, NEXTFILE }, 78 { "or", FFOR, BLTIN }, 79 { "print", PRINT, PRINT }, 80 { "printf", PRINTF, PRINTF }, 81 { "rand", FRAND, BLTIN }, 82 { "return", RETURN, RETURN }, 83 { "rshift", FRSHIFT, BLTIN }, 84 { "sin", FSIN, BLTIN }, 85 { "split", SPLIT, SPLIT }, 86 { "sprintf", SPRINTF, SPRINTF }, 87 { "sqrt", FSQRT, BLTIN }, 88 { "srand", FSRAND, BLTIN }, 89 { "sub", SUB, SUB }, 90 { "substr", SUBSTR, SUBSTR }, 91 { "system", FSYSTEM, BLTIN }, 92 { "tolower", FTOLOWER, BLTIN }, 93 { "toupper", FTOUPPER, BLTIN }, 94 { "while", WHILE, WHILE }, 95 { "xor", FXOR, BLTIN }, 96 }; 97 98 #define RET(x) { if(dbg)printf("lex %s\n", tokname(x)); return(x); } 99 100 int peek(void) 101 { 102 int c = input(); 103 unput(c); 104 return c; 105 } 106 107 int gettok(char **pbuf, int *psz) /* get next input token */ 108 { 109 int c, retc; 110 char *buf = *pbuf; 111 int sz = *psz; 112 char *bp = buf; 113 114 c = input(); 115 if (c == 0) 116 return 0; 117 buf[0] = c; 118 buf[1] = 0; 119 if (!isalnum(c) && c != '.' && c != '_') 120 return c; 121 122 *bp++ = c; 123 if (isalpha(c) || c == '_') { /* it's a varname */ 124 for ( ; (c = input()) != 0; ) { 125 if (bp-buf >= sz) 126 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok")) 127 FATAL( "out of space for name %.10s...", buf ); 128 if (isalnum(c) || c == '_') 129 *bp++ = c; 130 else { 131 *bp = 0; 132 unput(c); 133 break; 134 } 135 } 136 *bp = 0; 137 retc = 'a'; /* alphanumeric */ 138 } else { /* maybe it's a number, but could be . */ 139 char *rem; 140 /* read input until can't be a number */ 141 for ( ; (c = input()) != 0; ) { 142 if (bp-buf >= sz) 143 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok")) 144 FATAL( "out of space for number %.10s...", buf ); 145 if (isdigit(c) || c == 'e' || c == 'E' 146 || c == '.' || c == '+' || c == '-') 147 *bp++ = c; 148 else { 149 unput(c); 150 break; 151 } 152 } 153 *bp = 0; 154 strtod(buf, &rem); /* parse the number */ 155 if (rem == buf) { /* it wasn't a valid number at all */ 156 buf[1] = 0; /* return one character as token */ 157 retc = buf[0]; /* character is its own type */ 158 unputstr(rem+1); /* put rest back for later */ 159 } else { /* some prefix was a number */ 160 unputstr(rem); /* put rest back for later */ 161 rem[0] = 0; /* truncate buf after number part */ 162 retc = '0'; /* type is number */ 163 } 164 } 165 *pbuf = buf; 166 *psz = sz; 167 return retc; 168 } 169 170 int word(char *); 171 int string(void); 172 int regexpr(void); 173 int sc = 0; /* 1 => return a } right now */ 174 int reg = 0; /* 1 => return a REGEXPR now */ 175 176 int yylex(void) 177 { 178 int c; 179 static char *buf = NULL; 180 static int bufsize = 5; /* BUG: setting this small causes core dump! */ 181 182 if (buf == NULL && (buf = (char *) malloc(bufsize)) == NULL) 183 FATAL( "out of space in yylex" ); 184 if (sc) { 185 sc = 0; 186 RET('}'); 187 } 188 if (reg) { 189 reg = 0; 190 return regexpr(); 191 } 192 for (;;) { 193 c = gettok(&buf, &bufsize); 194 if (c == 0) 195 return 0; 196 if (isalpha(c) || c == '_') 197 return word(buf); 198 if (isdigit(c)) { 199 yylval.cp = setsymtab(buf, tostring(buf), atof(buf), CON|NUM, symtab); 200 /* should this also have STR set? */ 201 RET(NUMBER); 202 } 203 204 yylval.i = c; 205 switch (c) { 206 case '\n': /* {EOL} */ 207 lineno++; 208 RET(NL); 209 case '\r': /* assume \n is coming */ 210 case ' ': /* {WS}+ */ 211 case '\t': 212 break; 213 case '#': /* #.* strip comments */ 214 while ((c = input()) != '\n' && c != 0) 215 ; 216 unput(c); 217 break; 218 case ';': 219 RET(';'); 220 case '\\': 221 if (peek() == '\n') { 222 input(); 223 lineno++; 224 } else if (peek() == '\r') { 225 input(); input(); /* \n */ 226 lineno++; 227 } else { 228 RET(c); 229 } 230 break; 231 case '&': 232 if (peek() == '&') { 233 input(); RET(AND); 234 } else 235 RET('&'); 236 case '|': 237 if (peek() == '|') { 238 input(); RET(BOR); 239 } else 240 RET('|'); 241 case '!': 242 if (peek() == '=') { 243 input(); yylval.i = NE; RET(NE); 244 } else if (peek() == '~') { 245 input(); yylval.i = NOTMATCH; RET(MATCHOP); 246 } else 247 RET(NOT); 248 case '~': 249 yylval.i = MATCH; 250 RET(MATCHOP); 251 case '<': 252 if (peek() == '=') { 253 input(); yylval.i = LE; RET(LE); 254 } else { 255 yylval.i = LT; RET(LT); 256 } 257 case '=': 258 if (peek() == '=') { 259 input(); yylval.i = EQ; RET(EQ); 260 } else { 261 yylval.i = ASSIGN; RET(ASGNOP); 262 } 263 case '>': 264 if (peek() == '=') { 265 input(); yylval.i = GE; RET(GE); 266 } else if (peek() == '>') { 267 input(); yylval.i = APPEND; RET(APPEND); 268 } else { 269 yylval.i = GT; RET(GT); 270 } 271 case '+': 272 if (peek() == '+') { 273 input(); yylval.i = INCR; RET(INCR); 274 } else if (peek() == '=') { 275 input(); yylval.i = ADDEQ; RET(ASGNOP); 276 } else 277 RET('+'); 278 case '-': 279 if (peek() == '-') { 280 input(); yylval.i = DECR; RET(DECR); 281 } else if (peek() == '=') { 282 input(); yylval.i = SUBEQ; RET(ASGNOP); 283 } else 284 RET('-'); 285 case '*': 286 if (peek() == '=') { /* *= */ 287 input(); yylval.i = MULTEQ; RET(ASGNOP); 288 } else if (peek() == '*') { /* ** or **= */ 289 input(); /* eat 2nd * */ 290 if (peek() == '=') { 291 input(); yylval.i = POWEQ; RET(ASGNOP); 292 } else { 293 RET(POWER); 294 } 295 } else 296 RET('*'); 297 case '/': 298 RET('/'); 299 case '%': 300 if (peek() == '=') { 301 input(); yylval.i = MODEQ; RET(ASGNOP); 302 } else 303 RET('%'); 304 case '^': 305 if (peek() == '=') { 306 input(); yylval.i = POWEQ; RET(ASGNOP); 307 } else 308 RET(POWER); 309 310 case '$': 311 /* BUG: awkward, if not wrong */ 312 c = gettok(&buf, &bufsize); 313 if (isalpha(c)) { 314 if (strcmp(buf, "NF") == 0) { /* very special */ 315 unputstr("(NF)"); 316 RET(INDIRECT); 317 } 318 c = peek(); 319 if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) { 320 unputstr(buf); 321 RET(INDIRECT); 322 } 323 yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab); 324 RET(IVAR); 325 } else if (c == 0) { /* */ 326 SYNTAX( "unexpected end of input after $" ); 327 RET(';'); 328 } else { 329 unputstr(buf); 330 RET(INDIRECT); 331 } 332 333 case '}': 334 if (--bracecnt < 0) 335 SYNTAX( "extra }" ); 336 sc = 1; 337 RET(';'); 338 case ']': 339 if (--brackcnt < 0) 340 SYNTAX( "extra ]" ); 341 RET(']'); 342 case ')': 343 if (--parencnt < 0) 344 SYNTAX( "extra )" ); 345 RET(')'); 346 case '{': 347 bracecnt++; 348 RET('{'); 349 case '[': 350 brackcnt++; 351 RET('['); 352 case '(': 353 parencnt++; 354 RET('('); 355 356 case '"': 357 return string(); /* BUG: should be like tran.c ? */ 358 359 default: 360 RET(c); 361 } 362 } 363 } 364 365 int string(void) 366 { 367 int c, n; 368 char *s, *bp; 369 static char *buf = NULL; 370 static int bufsz = 500; 371 372 if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL) 373 FATAL("out of space for strings"); 374 for (bp = buf; (c = input()) != '"'; ) { 375 if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string")) 376 FATAL("out of space for string %.10s...", buf); 377 switch (c) { 378 case '\n': 379 case '\r': 380 case 0: 381 *bp = '\0'; 382 SYNTAX( "non-terminated string %.10s...", buf ); 383 if (c == 0) /* hopeless */ 384 FATAL( "giving up" ); 385 lineno++; 386 break; 387 case '\\': 388 c = input(); 389 switch (c) { 390 case '"': *bp++ = '"'; break; 391 case 'n': *bp++ = '\n'; break; 392 case 't': *bp++ = '\t'; break; 393 case 'f': *bp++ = '\f'; break; 394 case 'r': *bp++ = '\r'; break; 395 case 'b': *bp++ = '\b'; break; 396 case 'v': *bp++ = '\v'; break; 397 case 'a': *bp++ = '\007'; break; 398 case '\\': *bp++ = '\\'; break; 399 400 case '0': case '1': case '2': /* octal: \d \dd \ddd */ 401 case '3': case '4': case '5': case '6': case '7': 402 n = c - '0'; 403 if ((c = peek()) >= '0' && c < '8') { 404 n = 8 * n + input() - '0'; 405 if ((c = peek()) >= '0' && c < '8') 406 n = 8 * n + input() - '0'; 407 } 408 *bp++ = n; 409 break; 410 411 case 'x': /* hex \x0-9a-fA-F + */ 412 { char xbuf[100], *px; 413 for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) { 414 if (isdigit(c) 415 || (c >= 'a' && c <= 'f') 416 || (c >= 'A' && c <= 'F')) 417 *px++ = c; 418 else 419 break; 420 } 421 *px = 0; 422 unput(c); 423 sscanf(xbuf, "%x", (unsigned int *) &n); 424 *bp++ = n; 425 break; 426 } 427 428 default: 429 *bp++ = c; 430 break; 431 } 432 break; 433 default: 434 *bp++ = c; 435 break; 436 } 437 } 438 *bp = 0; 439 s = tostring(buf); 440 *bp++ = ' '; *bp++ = 0; 441 yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab); 442 RET(STRING); 443 } 444 445 446 int binsearch(char *w, Keyword *kp, int n) 447 { 448 int cond, low, mid, high; 449 450 low = 0; 451 high = n - 1; 452 while (low <= high) { 453 mid = (low + high) / 2; 454 if ((cond = strcmp(w, kp[mid].word)) < 0) 455 high = mid - 1; 456 else if (cond > 0) 457 low = mid + 1; 458 else 459 return mid; 460 } 461 return -1; 462 } 463 464 int word(char *w) 465 { 466 Keyword *kp; 467 int c, n; 468 469 n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0])); 470 if (n != -1) { /* found in table */ 471 kp = keywords + n; 472 yylval.i = kp->sub; 473 switch (kp->type) { /* special handling */ 474 case BLTIN: 475 if (kp->sub == FSYSTEM && safe) 476 SYNTAX( "system is unsafe" ); 477 RET(kp->type); 478 case FUNC: 479 if (infunc) 480 SYNTAX( "illegal nested function" ); 481 RET(kp->type); 482 case RETURN: 483 if (!infunc) 484 SYNTAX( "return not in function" ); 485 RET(kp->type); 486 case VARNF: 487 yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab); 488 RET(VARNF); 489 default: 490 RET(kp->type); 491 } 492 } 493 c = peek(); /* look for '(' */ 494 if (c != '(' && infunc && (n=isarg(w)) >= 0) { 495 yylval.i = n; 496 RET(ARG); 497 } else { 498 yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab); 499 if (c == '(') { 500 RET(CALL); 501 } else { 502 RET(VAR); 503 } 504 } 505 } 506 507 void startreg(void) /* next call to yylex will return a regular expression */ 508 { 509 reg = 1; 510 } 511 512 int regexpr(void) 513 { 514 int c; 515 static char *buf = NULL; 516 static int bufsz = 500; 517 char *bp; 518 519 if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL) 520 FATAL("out of space for rex expr"); 521 bp = buf; 522 for ( ; (c = input()) != '/' && c != 0; ) { 523 if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr")) 524 FATAL("out of space for reg expr %.10s...", buf); 525 if (c == '\n') { 526 *bp = '\0'; 527 SYNTAX( "newline in regular expression %.10s...", buf ); 528 unput('\n'); 529 break; 530 } else if (c == '\\') { 531 *bp++ = '\\'; 532 *bp++ = input(); 533 } else { 534 *bp++ = c; 535 } 536 } 537 *bp = 0; 538 if (c == 0) 539 SYNTAX("non-terminated regular expression %.10s...", buf); 540 yylval.s = tostring(buf); 541 unput('/'); 542 RET(REGEXPR); 543 } 544 545 /* low-level lexical stuff, sort of inherited from lex */ 546 547 char ebuf[300]; 548 char *ep = ebuf; 549 char yysbuf[100]; /* pushback buffer */ 550 char *yysptr = yysbuf; 551 FILE *yyin = NULL; 552 553 int input(void) /* get next lexical input character */ 554 { 555 int c; 556 extern char *lexprog; 557 558 if (yysptr > yysbuf) 559 c = (uschar)*--yysptr; 560 else if (lexprog != NULL) { /* awk '...' */ 561 if ((c = (uschar)*lexprog) != 0) 562 lexprog++; 563 } else /* awk -f ... */ 564 c = pgetc(); 565 if (c == EOF) 566 c = 0; 567 if (ep >= ebuf + sizeof ebuf) 568 ep = ebuf; 569 *ep = c; 570 if (c != 0) { 571 ep++; 572 } 573 return (c); 574 } 575 576 void unput(int c) /* put lexical character back on input */ 577 { 578 if (yysptr >= yysbuf + sizeof(yysbuf)) 579 FATAL("pushed back too much: %.20s...", yysbuf); 580 *yysptr++ = c; 581 if (--ep < ebuf) 582 ep = ebuf + sizeof(ebuf) - 1; 583 } 584 585 void unputstr(const char *s) /* put a string back on input */ 586 { 587 int i; 588 589 for (i = strlen(s)-1; i >= 0; i--) 590 unput(s[i]); 591 } 592