1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <ctype.h>
29 #include "awk.h"
30 #include "awkgram.tab.h"
31
32 extern YYSTYPE yylval;
33 extern bool infunc;
34
/* Lexer bookkeeping shared with the rest of the program. */
int	lineno   = 1;	/* current source line number, starts at 1 */
int	bracecnt = 0;	/* '{' seen so far minus '}' seen so far */
int	brackcnt = 0;	/* '[' seen so far minus ']' seen so far */
int	parencnt = 0;	/* '(' seen so far minus ')' seen so far */
39
/* One entry of the reserved-word table that word() searches. */
typedef struct Keyword {
	const char	*word;	/* spelling as written in the awk program */
	int		sub;	/* value word() stores in yylval.i on a match */
	int		type;	/* token code word() returns on a match */
} Keyword;
45
46 const Keyword keywords[] = { /* keep sorted: binary searched */
47 { "BEGIN", XBEGIN, XBEGIN },
48 { "END", XEND, XEND },
49 { "NF", VARNF, VARNF },
50 { "and", FAND, BLTIN },
51 { "atan2", FATAN, BLTIN },
52 { "break", BREAK, BREAK },
53 { "close", CLOSE, CLOSE },
54 { "compl", FCOMPL, BLTIN },
55 { "continue", CONTINUE, CONTINUE },
56 { "cos", FCOS, BLTIN },
57 { "delete", DELETE, DELETE },
58 { "do", DO, DO },
59 { "else", ELSE, ELSE },
60 { "exit", EXIT, EXIT },
61 { "exp", FEXP, BLTIN },
62 { "fflush", FFLUSH, BLTIN },
63 { "for", FOR, FOR },
64 { "func", FUNC, FUNC },
65 { "function", FUNC, FUNC },
66 { "gensub", GENSUB, GENSUB },
67 { "getline", GETLINE, GETLINE },
68 { "gsub", GSUB, GSUB },
69 { "if", IF, IF },
70 { "in", IN, IN },
71 { "index", INDEX, INDEX },
72 { "int", FINT, BLTIN },
73 { "length", FLENGTH, BLTIN },
74 { "log", FLOG, BLTIN },
75 { "lshift", FLSHIFT, BLTIN },
76 { "match", MATCHFCN, MATCHFCN },
77 { "mktime", FMKTIME, BLTIN },
78 { "next", NEXT, NEXT },
79 { "nextfile", NEXTFILE, NEXTFILE },
80 { "or", FFOR, BLTIN },
81 { "print", PRINT, PRINT },
82 { "printf", PRINTF, PRINTF },
83 { "rand", FRAND, BLTIN },
84 { "return", RETURN, RETURN },
85 { "rshift", FRSHIFT, BLTIN },
86 { "sin", FSIN, BLTIN },
87 { "split", SPLIT, SPLIT },
88 { "sprintf", SPRINTF, SPRINTF },
89 { "sqrt", FSQRT, BLTIN },
90 { "srand", FSRAND, BLTIN },
91 { "strftime", FSTRFTIME, BLTIN },
92 { "sub", SUB, SUB },
93 { "substr", SUBSTR, SUBSTR },
94 { "system", FSYSTEM, BLTIN },
95 { "systime", FSYSTIME, BLTIN },
96 { "tolower", FTOLOWER, BLTIN },
97 { "toupper", FTOUPPER, BLTIN },
98 { "while", WHILE, WHILE },
99 { "xor", FXOR, BLTIN },
100 };
101
/*
 * RET(x) - return token x from the enclosing function, printing its name
 * first when the dbg flag is non-zero.
 *
 * The body is wrapped in do { } while (0) so that "RET(x);" is exactly
 * one statement and therefore safe as the unbraced branch of an if/else.
 * Note that x is evaluated twice when dbg is set.
 */
#define	RET(x)	do { if (dbg) printf("lex %s\n", tokname(x)); return (x); } while (0)
103
/*
 * peek - look at the next input character without consuming it.
 * The character is read with input() and immediately handed back
 * through unput(); its value (0 at end of input) is returned.
 */
static int peek(void)
{
	int next;

	next = input();
	unput(next);
	return next;
}
110
gettok(char ** pbuf,int * psz)111 static int gettok(char **pbuf, int *psz) /* get next input token */
112 {
113 int c, retc;
114 char *buf = *pbuf;
115 int sz = *psz;
116 char *bp = buf;
117
118 c = input();
119 if (c == 0)
120 return 0;
121 buf[0] = c;
122 buf[1] = 0;
123 if (!isalnum(c) && c != '.' && c != '_')
124 return c;
125
126 *bp++ = c;
127 if (isalpha(c) || c == '_') { /* it's a varname */
128 for ( ; (c = input()) != 0; ) {
129 if (bp-buf >= sz)
130 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
131 FATAL( "out of space for name %.10s...", buf );
132 if (isalnum(c) || c == '_')
133 *bp++ = c;
134 else {
135 *bp = 0;
136 unput(c);
137 break;
138 }
139 }
140 *bp = 0;
141 retc = 'a'; /* alphanumeric */
142 } else { /* maybe it's a number, but could be . */
143 char *rem;
144 /* read input until can't be a number */
145 for ( ; (c = input()) != 0; ) {
146 if (bp-buf >= sz)
147 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
148 FATAL( "out of space for number %.10s...", buf );
149 if (isdigit(c) || c == 'e' || c == 'E'
150 || c == '.' || c == '+' || c == '-')
151 *bp++ = c;
152 else {
153 unput(c);
154 break;
155 }
156 }
157 *bp = 0;
158 strtod(buf, &rem); /* parse the number */
159 if (rem == buf) { /* it wasn't a valid number at all */
160 buf[1] = 0; /* return one character as token */
161 retc = (uschar)buf[0]; /* character is its own type */
162 unputstr(rem+1); /* put rest back for later */
163 } else { /* some prefix was a number */
164 unputstr(rem); /* put rest back for later */
165 rem[0] = 0; /* truncate buf after number part */
166 retc = '0'; /* type is number */
167 }
168 }
169 *pbuf = buf;
170 *psz = sz;
171 return retc;
172 }
173
/* Scanners defined later in this file but called from yylex(). */
int	word(char *w);
int	string(void);
int	regexpr(void);

/* One-shot flags examined at the top of yylex(). */
bool	sc  = false;	/* true => return a } right now */
bool	reg = false;	/* true => return a REGEXPR now */
179
yylex(void)180 int yylex(void)
181 {
182 int c;
183 static char *buf = NULL;
184 static int bufsize = 5; /* BUG: setting this small causes core dump! */
185
186 if (buf == NULL && (buf = (char *) malloc(bufsize)) == NULL)
187 FATAL( "out of space in yylex" );
188 if (sc) {
189 sc = false;
190 RET('}');
191 }
192 if (reg) {
193 reg = false;
194 return regexpr();
195 }
196 for (;;) {
197 c = gettok(&buf, &bufsize);
198 if (c == 0)
199 return 0;
200 if (isalpha(c) || c == '_')
201 return word(buf);
202 if (isdigit(c)) {
203 char *cp = tostring(buf);
204 double result;
205
206 if (is_number(cp, & result))
207 yylval.cp = setsymtab(buf, cp, result, CON|NUM, symtab);
208 else
209 yylval.cp = setsymtab(buf, cp, 0.0, STR, symtab);
210 free(cp);
211 /* should this also have STR set? */
212 RET(NUMBER);
213 }
214
215 yylval.i = c;
216 switch (c) {
217 case '\n': /* {EOL} */
218 lineno++;
219 RET(NL);
220 case '\r': /* assume \n is coming */
221 case ' ': /* {WS}+ */
222 case '\t':
223 break;
224 case '#': /* #.* strip comments */
225 while ((c = input()) != '\n' && c != 0)
226 ;
227 unput(c);
228 break;
229 case ';':
230 RET(';');
231 case '\\':
232 if (peek() == '\n') {
233 input();
234 lineno++;
235 } else if (peek() == '\r') {
236 input(); input(); /* \n */
237 lineno++;
238 } else {
239 RET(c);
240 }
241 break;
242 case '&':
243 if (peek() == '&') {
244 input(); RET(AND);
245 } else
246 RET('&');
247 case '|':
248 if (peek() == '|') {
249 input(); RET(BOR);
250 } else
251 RET('|');
252 case '!':
253 if (peek() == '=') {
254 input(); yylval.i = NE; RET(NE);
255 } else if (peek() == '~') {
256 input(); yylval.i = NOTMATCH; RET(MATCHOP);
257 } else
258 RET(NOT);
259 case '~':
260 yylval.i = MATCH;
261 RET(MATCHOP);
262 case '<':
263 if (peek() == '=') {
264 input(); yylval.i = LE; RET(LE);
265 } else {
266 yylval.i = LT; RET(LT);
267 }
268 case '=':
269 if (peek() == '=') {
270 input(); yylval.i = EQ; RET(EQ);
271 } else {
272 yylval.i = ASSIGN; RET(ASGNOP);
273 }
274 case '>':
275 if (peek() == '=') {
276 input(); yylval.i = GE; RET(GE);
277 } else if (peek() == '>') {
278 input(); yylval.i = APPEND; RET(APPEND);
279 } else {
280 yylval.i = GT; RET(GT);
281 }
282 case '+':
283 if (peek() == '+') {
284 input(); yylval.i = INCR; RET(INCR);
285 } else if (peek() == '=') {
286 input(); yylval.i = ADDEQ; RET(ASGNOP);
287 } else
288 RET('+');
289 case '-':
290 if (peek() == '-') {
291 input(); yylval.i = DECR; RET(DECR);
292 } else if (peek() == '=') {
293 input(); yylval.i = SUBEQ; RET(ASGNOP);
294 } else
295 RET('-');
296 case '*':
297 if (peek() == '=') { /* *= */
298 input(); yylval.i = MULTEQ; RET(ASGNOP);
299 } else if (peek() == '*') { /* ** or **= */
300 input(); /* eat 2nd * */
301 if (peek() == '=') {
302 input(); yylval.i = POWEQ; RET(ASGNOP);
303 } else {
304 RET(POWER);
305 }
306 } else
307 RET('*');
308 case '/':
309 RET('/');
310 case '%':
311 if (peek() == '=') {
312 input(); yylval.i = MODEQ; RET(ASGNOP);
313 } else
314 RET('%');
315 case '^':
316 if (peek() == '=') {
317 input(); yylval.i = POWEQ; RET(ASGNOP);
318 } else
319 RET(POWER);
320
321 case '$':
322 /* BUG: awkward, if not wrong */
323 c = gettok(&buf, &bufsize);
324 if (isalpha(c)) {
325 if (strcmp(buf, "NF") == 0) { /* very special */
326 unputstr("(NF)");
327 RET(INDIRECT);
328 }
329 c = peek();
330 if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
331 unputstr(buf);
332 RET(INDIRECT);
333 }
334 yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
335 RET(IVAR);
336 } else if (c == 0) { /* */
337 SYNTAX( "unexpected end of input after $" );
338 RET(';');
339 } else {
340 unputstr(buf);
341 RET(INDIRECT);
342 }
343
344 case '}':
345 if (--bracecnt < 0)
346 SYNTAX( "extra }" );
347 sc = true;
348 RET(';');
349 case ']':
350 if (--brackcnt < 0)
351 SYNTAX( "extra ]" );
352 RET(']');
353 case ')':
354 if (--parencnt < 0)
355 SYNTAX( "extra )" );
356 RET(')');
357 case '{':
358 bracecnt++;
359 RET('{');
360 case '[':
361 brackcnt++;
362 RET('[');
363 case '(':
364 parencnt++;
365 RET('(');
366
367 case '"':
368 return string(); /* BUG: should be like tran.c ? */
369
370 default:
371 RET(c);
372 }
373 }
374 }
375
376 extern int runetochar(char *str, int c);
377
string(void)378 int string(void)
379 {
380 int c, n;
381 char *s, *bp;
382 static char *buf = NULL;
383 static int bufsz = 500;
384
385 if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL)
386 FATAL("out of space for strings");
387 for (bp = buf; (c = input()) != '"'; ) {
388 if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string"))
389 FATAL("out of space for string %.10s...", buf);
390 switch (c) {
391 case '\n':
392 case '\r':
393 case 0:
394 *bp = '\0';
395 SYNTAX( "non-terminated string %.10s...", buf );
396 if (c == 0) /* hopeless */
397 FATAL( "giving up" );
398 lineno++;
399 break;
400 case '\\':
401 c = input();
402 switch (c) {
403 case '\n': break;
404 case '"': *bp++ = '"'; break;
405 case 'n': *bp++ = '\n'; break;
406 case 't': *bp++ = '\t'; break;
407 case 'f': *bp++ = '\f'; break;
408 case 'r': *bp++ = '\r'; break;
409 case 'b': *bp++ = '\b'; break;
410 case 'v': *bp++ = '\v'; break;
411 case 'a': *bp++ = '\a'; break;
412 case '\\': *bp++ = '\\'; break;
413
414 case '0': case '1': case '2': /* octal: \d \dd \ddd */
415 case '3': case '4': case '5': case '6': case '7':
416 n = c - '0';
417 if ((c = peek()) >= '0' && c < '8') {
418 n = 8 * n + input() - '0';
419 if ((c = peek()) >= '0' && c < '8')
420 n = 8 * n + input() - '0';
421 }
422 *bp++ = n;
423 break;
424
425 case 'x': /* hex \x0-9a-fA-F (exactly two) */
426 {
427 int i;
428
429 if (!isxdigit(peek())) {
430 unput(c);
431 break;
432 }
433 n = 0;
434 for (i = 0; i < 2; i++) {
435 c = input();
436 if (c == 0)
437 break;
438 if (isxdigit(c)) {
439 c = tolower(c);
440 n *= 16;
441 if (isdigit(c))
442 n += (c - '0');
443 else
444 n += 10 + (c - 'a');
445 } else {
446 unput(c);
447 break;
448 }
449 }
450 if (i)
451 *bp++ = n;
452 break;
453 }
454
455 case 'u': /* utf \u0-9a-fA-F (1..8) */
456 {
457 int i;
458
459 n = 0;
460 for (i = 0; i < 8; i++) {
461 c = input();
462 if (!isxdigit(c) || c == 0)
463 break;
464 c = tolower(c);
465 n *= 16;
466 if (isdigit(c))
467 n += (c - '0');
468 else
469 n += 10 + (c - 'a');
470 }
471 unput(c);
472 bp += runetochar(bp, n);
473 break;
474 }
475
476 default:
477 *bp++ = c;
478 break;
479 }
480 break;
481 default:
482 *bp++ = c;
483 break;
484 }
485 }
486 *bp = 0;
487 s = tostring(buf);
488 *bp++ = ' '; *bp++ = '\0';
489 yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
490 free(s);
491 RET(STRING);
492 }
493
494
binsearch(char * w,const Keyword * kp,int n)495 static int binsearch(char *w, const Keyword *kp, int n)
496 {
497 int cond, low, mid, high;
498
499 low = 0;
500 high = n - 1;
501 while (low <= high) {
502 mid = (low + high) / 2;
503 if ((cond = strcmp(w, kp[mid].word)) < 0)
504 high = mid - 1;
505 else if (cond > 0)
506 low = mid + 1;
507 else
508 return mid;
509 }
510 return -1;
511 }
512
word(char * w)513 int word(char *w)
514 {
515 const Keyword *kp;
516 int c, n;
517
518 n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
519 if (n != -1) { /* found in table */
520 kp = keywords + n;
521 yylval.i = kp->sub;
522 switch (kp->type) { /* special handling */
523 case BLTIN:
524 if (kp->sub == FSYSTEM && safe)
525 SYNTAX( "system is unsafe" );
526 RET(kp->type);
527 case FUNC:
528 if (infunc)
529 SYNTAX( "illegal nested function" );
530 RET(kp->type);
531 case RETURN:
532 if (!infunc)
533 SYNTAX( "return not in function" );
534 RET(kp->type);
535 case VARNF:
536 yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
537 RET(VARNF);
538 default:
539 RET(kp->type);
540 }
541 }
542 c = peek(); /* look for '(' */
543 if (c != '(' && infunc && (n=isarg(w)) >= 0) {
544 yylval.i = n;
545 RET(ARG);
546 } else {
547 yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
548 if (c == '(') {
549 RET(CALL);
550 } else {
551 RET(VAR);
552 }
553 }
554 }
555
startreg(void)556 void startreg(void) /* next call to yylex will return a regular expression */
557 {
558 reg = true;
559 }
560
regexpr(void)561 int regexpr(void)
562 {
563 int c;
564 static char *buf = NULL;
565 static int bufsz = 500;
566 char *bp;
567
568 if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL)
569 FATAL("out of space for reg expr");
570 bp = buf;
571 for ( ; (c = input()) != '/' && c != 0; ) {
572 if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
573 FATAL("out of space for reg expr %.10s...", buf);
574 if (c == '\n') {
575 *bp = '\0';
576 SYNTAX( "newline in regular expression %.10s...", buf );
577 unput('\n');
578 break;
579 } else if (c == '\\') {
580 *bp++ = '\\';
581 *bp++ = input();
582 } else {
583 *bp++ = c;
584 }
585 }
586 *bp = 0;
587 if (c == 0)
588 SYNTAX("non-terminated regular expression %.10s...", buf);
589 yylval.s = tostring(buf);
590 unput('/');
591 RET(REGEXPR);
592 }
593
594 /* low-level lexical stuff, sort of inherited from lex */
595
/* Record of recently read characters; input() and unput() wrap around in it. */
char	ebuf[300];
char	*ep = ebuf;		/* slot input() writes next */

/* Pushback buffer: unput() stores here and input() drains it first. */
char	yysbuf[100];
char	*yysptr = yysbuf;	/* one past the most recently pushed-back character */

FILE	*yyin = NULL;		/* not referenced by the routines in this part of the file */
601
input(void)602 int input(void) /* get next lexical input character */
603 {
604 int c;
605 extern char *lexprog;
606
607 if (yysptr > yysbuf)
608 c = (uschar)*--yysptr;
609 else if (lexprog != NULL) { /* awk '...' */
610 if ((c = (uschar)*lexprog) != 0)
611 lexprog++;
612 } else /* awk -f ... */
613 c = pgetc();
614 if (c == EOF)
615 c = 0;
616 if (ep >= ebuf + sizeof ebuf)
617 ep = ebuf;
618 *ep = c;
619 if (c != 0) {
620 ep++;
621 }
622 return (c);
623 }
624
unput(int c)625 void unput(int c) /* put lexical character back on input */
626 {
627 if (yysptr >= yysbuf + sizeof(yysbuf))
628 FATAL("pushed back too much: %.20s...", yysbuf);
629 *yysptr++ = c;
630 if (--ep < ebuf)
631 ep = ebuf + sizeof(ebuf) - 1;
632 }
633
/*
 * unputstr - push the whole string s back onto the input.
 * Characters are handed to unput() last-to-first, so that later reads
 * see them in their original order.
 */
void unputstr(const char *s)
{
	const char *p = s + strlen(s);

	while (p > s)
		unput(*--p);
}
641