1 /**************************************************************** 2 Copyright (C) Lucent Technologies 1997 3 All Rights Reserved 4 5 Permission to use, copy, modify, and distribute this software and 6 its documentation for any purpose and without fee is hereby 7 granted, provided that the above copyright notice appear in all 8 copies and that both that the copyright notice and this 9 permission notice and warranty disclaimer appear in supporting 10 documentation, and that the name Lucent Technologies or any of 11 its entities not be used in advertising or publicity pertaining 12 to distribution of the software without specific, written prior 13 permission. 14 15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY 18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF 22 THIS SOFTWARE. 23 ****************************************************************/ 24 25 #include <stdio.h> 26 #include <stdlib.h> 27 #include <string.h> 28 #include <ctype.h> 29 #include "awk.h" 30 #include "ytab.h" 31 32 extern YYSTYPE yylval; 33 extern int infunc; 34 35 int lineno = 1; 36 int bracecnt = 0; 37 int brackcnt = 0; 38 int parencnt = 0; 39 40 typedef struct Keyword { 41 const char *word; 42 int sub; 43 int type; 44 } Keyword; 45 46 Keyword keywords[] ={ /* keep sorted: binary searched */ 47 { "BEGIN", XBEGIN, XBEGIN }, 48 { "END", XEND, XEND }, 49 { "NF", VARNF, VARNF }, 50 { "atan2", FATAN, BLTIN }, 51 { "break", BREAK, BREAK }, 52 { "close", CLOSE, CLOSE }, 53 { "continue", CONTINUE, CONTINUE }, 54 { "cos", FCOS, BLTIN }, 55 { "delete", DELETE, DELETE }, 56 { "do", DO, DO }, 57 { "else", ELSE, ELSE }, 58 { "exit", EXIT, EXIT }, 59 { "exp", FEXP, BLTIN }, 60 { "fflush", FFLUSH, BLTIN }, 61 { "for", FOR, FOR }, 62 { "func", FUNC, FUNC }, 63 { "function", FUNC, FUNC }, 64 { "getline", GETLINE, GETLINE }, 65 { "gsub", GSUB, GSUB }, 66 { "if", IF, IF }, 67 { "in", IN, IN }, 68 { "index", INDEX, INDEX }, 69 { "int", FINT, BLTIN }, 70 { "length", FLENGTH, BLTIN }, 71 { "log", FLOG, BLTIN }, 72 { "match", MATCHFCN, MATCHFCN }, 73 { "next", NEXT, NEXT }, 74 { "nextfile", NEXTFILE, NEXTFILE }, 75 { "print", PRINT, PRINT }, 76 { "printf", PRINTF, PRINTF }, 77 { "rand", FRAND, BLTIN }, 78 { "return", RETURN, RETURN }, 79 { "sin", FSIN, BLTIN }, 80 { "split", SPLIT, SPLIT }, 81 { "sprintf", SPRINTF, SPRINTF }, 82 { "sqrt", FSQRT, BLTIN }, 83 { "srand", FSRAND, BLTIN }, 84 { "sub", SUB, SUB }, 85 { "substr", SUBSTR, SUBSTR }, 86 { "system", FSYSTEM, BLTIN }, 87 { "tolower", FTOLOWER, BLTIN }, 88 { "toupper", FTOUPPER, BLTIN }, 89 { "while", WHILE, WHILE }, 90 }; 91 92 #define DEBUG 93 #ifdef DEBUG 94 #define RET(x) { if(dbg)printf("lex %s\n", tokname(x)); return(x); } 95 #else 96 #define RET(x) return(x) 97 #endif 98 99 int peek(void) 100 { 101 int c = input(); 102 unput(c); 103 return c; 104 } 105 106 int gettok(char **pbuf, int *psz) /* get next input token */ 107 { 108 int c, retc; 109 char *buf = *pbuf; 110 int sz = *psz; 111 char *bp = buf; 112 113 c = input(); 114 if (c == 0) 115 return 0; 116 buf[0] = c; 117 buf[1] = 0; 118 if (!isalnum(c) && c != '.' && c != '_') 119 return c; 120 121 *bp++ = c; 122 if (isalpha(c) || c == '_') { /* it's a varname */ 123 for ( ; (c = input()) != 0; ) { 124 if (bp-buf >= sz) 125 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0)) 126 FATAL( "out of space for name %.10s...", buf ); 127 if (isalnum(c) || c == '_') 128 *bp++ = c; 129 else { 130 *bp = 0; 131 unput(c); 132 break; 133 } 134 } 135 *bp = 0; 136 retc = 'a'; /* alphanumeric */ 137 } else { /* maybe it's a number, but could be . */ 138 char *rem; 139 /* read input until can't be a number */ 140 for ( ; (c = input()) != 0; ) { 141 if (bp-buf >= sz) 142 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0)) 143 FATAL( "out of space for number %.10s...", buf ); 144 if (isdigit(c) || c == 'e' || c == 'E' 145 || c == '.' || c == '+' || c == '-') 146 *bp++ = c; 147 else { 148 unput(c); 149 break; 150 } 151 } 152 *bp = 0; 153 strtod(buf, &rem); /* parse the number */ 154 if (rem == buf) { /* it wasn't a valid number at all */ 155 buf[1] = 0; /* return one character as token */ 156 retc = buf[0]; /* character is its own type */ 157 unputstr(rem+1); /* put rest back for later */ 158 } else { /* some prefix was a number */ 159 unputstr(rem); /* put rest back for later */ 160 rem[0] = 0; /* truncate buf after number part */ 161 retc = '0'; /* type is number */ 162 } 163 } 164 *pbuf = buf; 165 *psz = sz; 166 return retc; 167 } 168 169 int word(char *); 170 int string(void); 171 int regexpr(void); 172 int sc = 0; /* 1 => return a } right now */ 173 int reg = 0; /* 1 => return a REGEXPR now */ 174 175 int yylex(void) 176 { 177 int c; 178 static char *buf = 0; 179 static int bufsize = 500; 180 181 if (buf == 0 && (buf = (char *) malloc(bufsize)) == NULL) 182 FATAL( "out of space in yylex" ); 183 if (sc) { 184 sc = 0; 185 RET('}'); 186 } 187 if (reg) { 188 reg = 0; 189 return regexpr(); 190 } 191 /* printf("top\n"); */ 192 for (;;) { 193 c = gettok(&buf, &bufsize); 194 /* printf("gettok [%s]\n", buf); */ 195 if (c == 0) 196 return 0; 197 if (isalpha(c) || c == '_') 198 return word(buf); 199 if (isdigit(c)) { 200 yylval.cp = setsymtab(buf, tostring(buf), atof(buf), CON|NUM, symtab); 201 /* should this also have STR set? */ 202 RET(NUMBER); 203 } 204 205 yylval.i = c; 206 switch (c) { 207 case '\n': /* {EOL} */ 208 RET(NL); 209 case '\r': /* assume \n is coming */ 210 case ' ': /* {WS}+ */ 211 case '\t': 212 break; 213 case '#': /* #.* strip comments */ 214 while ((c = input()) != '\n' && c != 0) 215 ; 216 unput(c); 217 break; 218 case ';': 219 RET(';'); 220 case '\\': 221 if (peek() == '\n') { 222 input(); 223 } else if (peek() == '\r') { 224 input(); input(); /* \n */ 225 lineno++; 226 } else { 227 RET(c); 228 } 229 break; 230 case '&': 231 if (peek() == '&') { 232 input(); RET(AND); 233 } else 234 RET('&'); 235 case '|': 236 if (peek() == '|') { 237 input(); RET(BOR); 238 } else 239 RET('|'); 240 case '!': 241 if (peek() == '=') { 242 input(); yylval.i = NE; RET(NE); 243 } else if (peek() == '~') { 244 input(); yylval.i = NOTMATCH; RET(MATCHOP); 245 } else 246 RET(NOT); 247 case '~': 248 yylval.i = MATCH; 249 RET(MATCHOP); 250 case '<': 251 if (peek() == '=') { 252 input(); yylval.i = LE; RET(LE); 253 } else { 254 yylval.i = LT; RET(LT); 255 } 256 case '=': 257 if (peek() == '=') { 258 input(); yylval.i = EQ; RET(EQ); 259 } else { 260 yylval.i = ASSIGN; RET(ASGNOP); 261 } 262 case '>': 263 if (peek() == '=') { 264 input(); yylval.i = GE; RET(GE); 265 } else if (peek() == '>') { 266 input(); yylval.i = APPEND; RET(APPEND); 267 } else { 268 yylval.i = GT; RET(GT); 269 } 270 case '+': 271 if (peek() == '+') { 272 input(); yylval.i = INCR; RET(INCR); 273 } else if (peek() == '=') { 274 input(); yylval.i = ADDEQ; RET(ASGNOP); 275 } else 276 RET('+'); 277 case '-': 278 if (peek() == '-') { 279 input(); yylval.i = DECR; RET(DECR); 280 } else if (peek() == '=') { 281 input(); yylval.i = SUBEQ; RET(ASGNOP); 282 } else 283 RET('-'); 284 case '*': 285 if (peek() == '=') { /* *= */ 286 input(); yylval.i = MULTEQ; RET(ASGNOP); 287 } else if (peek() == '*') { /* ** or **= */ 288 input(); /* eat 2nd * */ 289 if (peek() == '=') { 290 input(); yylval.i = POWEQ; RET(ASGNOP); 291 } else { 292 RET(POWER); 293 } 294 } else 295 RET('*'); 296 case '/': 297 RET('/'); 298 case '%': 299 if (peek() == '=') { 300 input(); yylval.i = MODEQ; RET(ASGNOP); 301 } else 302 RET('%'); 303 case '^': 304 if (peek() == '=') { 305 input(); yylval.i = POWEQ; RET(ASGNOP); 306 } else 307 RET(POWER); 308 309 case '$': 310 /* BUG: awkward, if not wrong */ 311 c = gettok(&buf, &bufsize); 312 if (isalpha(c)) { 313 if (strcmp(buf, "NF") == 0) { /* very special */ 314 unputstr("(NF)"); 315 RET(INDIRECT); 316 } 317 c = peek(); 318 if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) { 319 unputstr(buf); 320 RET(INDIRECT); 321 } 322 yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab); 323 RET(IVAR); 324 } else if (c == 0) { /* */ 325 SYNTAX( "unexpected end of input after $" ); 326 RET(';'); 327 } else { 328 unputstr(buf); 329 RET(INDIRECT); 330 } 331 332 case '}': 333 if (--bracecnt < 0) 334 SYNTAX( "extra }" ); 335 sc = 1; 336 RET(';'); 337 case ']': 338 if (--brackcnt < 0) 339 SYNTAX( "extra ]" ); 340 RET(']'); 341 case ')': 342 if (--parencnt < 0) 343 SYNTAX( "extra )" ); 344 RET(')'); 345 case '{': 346 bracecnt++; 347 RET('{'); 348 case '[': 349 brackcnt++; 350 RET('['); 351 case '(': 352 parencnt++; 353 RET('('); 354 355 case '"': 356 return string(); /* BUG: should be like tran.c ? */ 357 358 default: 359 RET(c); 360 } 361 } 362 } 363 364 int string(void) 365 { 366 int c, n; 367 char *s, *bp; 368 static char *buf = 0; 369 static int bufsz = 500; 370 371 if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL) 372 FATAL("out of space for strings"); 373 for (bp = buf; (c = input()) != '"'; ) { 374 if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, 0)) 375 FATAL("out of space for string %.10s...", buf); 376 switch (c) { 377 case '\n': 378 case '\r': 379 case 0: 380 SYNTAX( "non-terminated string %.10s...", buf ); 381 lineno++; 382 if (c == 0) /* hopeless */ 383 FATAL( "giving up" ); 384 break; 385 case '\\': 386 c = input(); 387 switch (c) { 388 case '"': *bp++ = '"'; break; 389 case 'n': *bp++ = '\n'; break; 390 case 't': *bp++ = '\t'; break; 391 case 'f': *bp++ = '\f'; break; 392 case 'r': *bp++ = '\r'; break; 393 case 'b': *bp++ = '\b'; break; 394 case 'v': *bp++ = '\v'; break; 395 case 'a': *bp++ = '\007'; break; 396 case '\\': *bp++ = '\\'; break; 397 398 case '0': case '1': case '2': /* octal: \d \dd \ddd */ 399 case '3': case '4': case '5': case '6': case '7': 400 n = c - '0'; 401 if ((c = peek()) >= '0' && c < '8') { 402 n = 8 * n + input() - '0'; 403 if ((c = peek()) >= '0' && c < '8') 404 n = 8 * n + input() - '0'; 405 } 406 *bp++ = n; 407 break; 408 409 case 'x': /* hex \x0-9a-fA-F + */ 410 { char xbuf[100], *px; 411 for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) { 412 if (isdigit(c) 413 || (c >= 'a' && c <= 'f') 414 || (c >= 'A' && c <= 'F')) 415 *px++ = c; 416 else 417 break; 418 } 419 *px = 0; 420 unput(c); 421 sscanf(xbuf, "%x", &n); 422 *bp++ = n; 423 break; 424 } 425 426 default: 427 *bp++ = c; 428 break; 429 } 430 break; 431 default: 432 *bp++ = c; 433 break; 434 } 435 } 436 *bp = 0; 437 s = tostring(buf); 438 *bp++ = ' '; *bp++ = 0; 439 yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab); 440 RET(STRING); 441 } 442 443 444 int binsearch(char *w, Keyword *kp, int n) 445 { 446 int cond, low, mid, high; 447 448 low = 0; 449 high = n - 1; 450 while (low <= high) { 451 mid = (low + high) / 2; 452 if ((cond = strcmp(w, kp[mid].word)) < 0) 453 high = mid - 1; 454 else if (cond > 0) 455 low = mid + 1; 456 else 457 return mid; 458 } 459 return -1; 460 } 461 462 int word(char *w) 463 { 464 Keyword *kp; 465 int c, n; 466 467 n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0])); 468 kp = keywords + n; 469 if (n != -1) { /* found in table */ 470 yylval.i = kp->sub; 471 switch (kp->type) { /* special handling */ 472 case FSYSTEM: 473 if (safe) 474 SYNTAX( "system is unsafe" ); 475 RET(kp->type); 476 case FUNC: 477 if (infunc) 478 SYNTAX( "illegal nested function" ); 479 RET(kp->type); 480 case RETURN: 481 if (!infunc) 482 SYNTAX( "return not in function" ); 483 RET(kp->type); 484 case VARNF: 485 yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab); 486 RET(VARNF); 487 default: 488 RET(kp->type); 489 } 490 } 491 c = peek(); /* look for '(' */ 492 if (c != '(' && infunc && (n=isarg(w)) >= 0) { 493 yylval.i = n; 494 RET(ARG); 495 } else { 496 yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab); 497 if (c == '(') { 498 RET(CALL); 499 } else { 500 RET(VAR); 501 } 502 } 503 } 504 505 void startreg(void) /* next call to yylex will return a regular expression */ 506 { 507 reg = 1; 508 } 509 510 int regexpr(void) 511 { 512 int c; 513 static char *buf = 0; 514 static int bufsz = 500; 515 char *bp; 516 517 if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL) 518 FATAL("out of space for rex expr"); 519 bp = buf; 520 for ( ; (c = input()) != '/' && c != 0; ) { 521 if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, 0)) 522 FATAL("out of space for reg expr %.10s...", buf); 523 if (c == '\n') { 524 SYNTAX( "newline in regular expression %.10s...", buf ); 525 unput('\n'); 526 break; 527 } else if (c == '\\') { 528 *bp++ = '\\'; 529 *bp++ = input(); 530 } else { 531 *bp++ = c; 532 } 533 } 534 *bp = 0; 535 if (c == 0) 536 SYNTAX("non-terminated regular expression %.10s...", buf); 537 yylval.s = tostring(buf); 538 unput('/'); 539 RET(REGEXPR); 540 } 541 542 /* low-level lexical stuff, sort of inherited from lex */ 543 544 char ebuf[300]; 545 char *ep = ebuf; 546 char yysbuf[100]; /* pushback buffer */ 547 char *yysptr = yysbuf; 548 FILE *yyin = 0; 549 550 int input(void) /* get next lexical input character */ 551 { 552 int c; 553 extern char *lexprog; 554 555 if (yysptr > yysbuf) 556 c = (uschar)*--yysptr; 557 else if (lexprog != NULL) { /* awk '...' */ 558 if ((c = (uschar)*lexprog) != 0) 559 lexprog++; 560 } else /* awk -f ... */ 561 c = pgetc(); 562 if (c == '\n') 563 lineno++; 564 else if (c == EOF) 565 c = 0; 566 if (ep >= ebuf + sizeof ebuf) 567 ep = ebuf; 568 return *ep++ = c; 569 } 570 571 void unput(int c) /* put lexical character back on input */ 572 { 573 if (c == '\n') 574 lineno--; 575 if (yysptr >= yysbuf + sizeof(yysbuf)) 576 FATAL("pushed back too much: %.20s...", yysbuf); 577 *yysptr++ = c; 578 if (--ep < ebuf) 579 ep = ebuf + sizeof(ebuf) - 1; 580 } 581 582 void unputstr(const char *s) /* put a string back on input */ 583 { 584 int i; 585 586 for (i = strlen(s)-1; i >= 0; i--) 587 unput(s[i]); 588 } 589