xref: /freebsd/usr.bin/indent/lexi.c (revision ab00ac327a66a53edaac95b536b209db3ae2cd9f)
1 /*-
2  * Copyright (c) 1985 Sun Microsystems, Inc.
3  * Copyright (c) 1980, 1993
4  *	The Regents of the University of California.  All rights reserved.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. All advertising materials mentioning features or use of this software
16  *    must display the following acknowledgement:
17  *	This product includes software developed by the University of
18  *	California, Berkeley and its contributors.
19  * 4. Neither the name of the University nor the names of its contributors
20  *    may be used to endorse or promote products derived from this software
21  *    without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 
36 #if 0
37 #ifndef lint
38 static char sccsid[] = "@(#)lexi.c	8.1 (Berkeley) 6/6/93";
39 #endif /* not lint */
40 #endif
41 #include <sys/cdefs.h>
42 __FBSDID("$FreeBSD$");
43 
44 /*
45  * Here we have the token scanner for indent.  It scans off one token and puts
46  * it in the global variable "token".  It returns a code, indicating the type
47  * of token scanned.
48  */
49 
50 #include <err.h>
51 #include <stdio.h>
52 #include <ctype.h>
53 #include <stdlib.h>
54 #include <string.h>
55 #include "indent_globs.h"
56 #include "indent_codes.h"
57 #include "indent.h"
58 
/* chartype[] class of characters that can be part of a name or a number */
#define alphanum 1
#if defined(undef)
/* chartype[] class of operator characters; only referenced by disabled code */
#define opchar 3
#endif
63 
/*
 * One reserved word known to the scanner.  The string has to stay the first
 * member: the table of these entries is searched with a comparison function
 * that reads each element as a pointer to a string.
 */
struct templ {
    const char *rwd;		/* spelling of the reserved word */
    int         rwcode;		/* class code acted upon by lexi() */
};
68 
69 /*
70  * This table has to be sorted alphabetically, because it'll be used in binary
71  * search. For the same reason, string must be the first thing in struct templ.
72  */
73 struct templ specials[] =
74 {
75     {"auto", 10},
76     {"break", 9},
77     {"case", 8},
78     {"char", 4},
79     {"const", 4},
80     {"default", 8},
81     {"do", 6},
82     {"double", 4},
83     {"else", 6},
84     {"enum", 3},
85     {"extern", 10},
86     {"float", 4},
87     {"for", 5},
88     {"global", 4},
89     {"goto", 9},
90     {"if", 5},
91     {"int", 4},
92     {"long", 4},
93     {"offsetof", 1},
94     {"register", 10},
95     {"return", 9},
96     {"short", 4},
97     {"sizeof", 2},
98     {"static", 10},
99     {"struct", 3},
100     {"switch", 7},
101     {"typedef", 10},
102     {"union", 3},
103     {"unsigned", 4},
104     {"void", 4},
105     {"volatile", 4},
106     {"while", 5}
107 };
108 
/* sorted table of additional type names, searched by lexi() with bsearch() */
const char **typenames;
/* number of slots currently allocated for typenames[] */
int         typename_count;
/* index of the last slot in use; -1 as long as the table holds no name */
int         typename_top = -1;
112 
/*
 * Character class table, indexed by 7-bit character code:
 *	0	neither of the classes below
 *	1	alphanum: letters, digits, '_' and '$'
 *	3	opchar: characters that operators are made of
 */
char        chartype[128] =
{
    0, 0, 0, 0, 0, 0, 0, 0,	/* 000 - 007: control characters */
    0, 0, 0, 0, 0, 0, 0, 0,	/* 010 - 017: control characters */
    0, 0, 0, 0, 0, 0, 0, 0,	/* 020 - 027: control characters */
    0, 0, 0, 0, 0, 0, 0, 0,	/* 030 - 037: control characters */
    0, 3, 0, 0, 1, 3, 3, 0,	/* space ! " # $ % & ' */
    0, 0, 3, 3, 0, 3, 0, 3,	/* ( ) * + , - . / */
    1, 1, 1, 1, 1, 1, 1, 1,	/* 0 1 2 3 4 5 6 7 */
    1, 1, 0, 0, 3, 3, 3, 3,	/* 8 9 : ; < = > ? */
    0, 1, 1, 1, 1, 1, 1, 1,	/* @ A B C D E F G */
    1, 1, 1, 1, 1, 1, 1, 1,	/* H I J K L M N O */
    1, 1, 1, 1, 1, 1, 1, 1,	/* P Q R S T U V W */
    1, 1, 1, 0, 0, 0, 3, 1,	/* X Y Z [ \ ] ^ _ */
    0, 1, 1, 1, 1, 1, 1, 1,	/* ` a b c d e f g */
    1, 1, 1, 1, 1, 1, 1, 1,	/* h i j k l m n o */
    1, 1, 1, 1, 1, 1, 1, 1,	/* p q r s t u v w */
    1, 1, 1, 0, 3, 0, 3, 0	/* x y z { | } ~ DEL */
};
134 
/*
 * bsearch() comparison callback.  e1 is the key, a NUL-terminated string;
 * e2 points at a table element whose first (or only) member is a pointer to
 * a string.  The result has the sign of strcmp(key, element string).
 */
static int
strcmp_type(const void *e1, const void *e2)
{
    const char *key = e1;
    const char *const *element = e2;

    return (strcmp(key, *element));
}
140 
141 int
142 lexi(void)
143 {
144     int         unary_delim;	/* this is set to 1 if the current token
145 				 * forces a following operator to be unary */
146     static int  last_code;	/* the last token type returned */
147     static int  l_struct;	/* set to 1 if the last token was 'struct' */
148     int         code;		/* internal code to be returned */
149     char        qchar;		/* the delimiter character for a string */
150 
151     e_token = s_token;		/* point to start of place to save token */
152     unary_delim = false;
153     ps.col_1 = ps.last_nl;	/* tell world that this token started in
154 				 * column 1 iff the last thing scanned was nl */
155     ps.last_nl = false;
156 
157     while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
158 	ps.col_1 = false;	/* leading blanks imply token is not in column
159 				 * 1 */
160 	if (++buf_ptr >= buf_end)
161 	    fill_buffer();
162     }
163 
164     /* Scan an alphanumeric token */
165     if (chartype[(int)*buf_ptr] == alphanum || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
166 	/*
167 	 * we have a character or number
168 	 */
169 	struct templ *p;
170 
171 	if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
172 	    enum base {
173 		BASE_2, BASE_8, BASE_10, BASE_16
174 	    };
175 	    int         seendot = 0,
176 	                seenexp = 0,
177 			seensfx = 0;
178 	    enum base	in_base = BASE_10;
179 
180 	    if (*buf_ptr == '0') {
181 		if (buf_ptr[1] == 'b' || buf_ptr[1] == 'B')
182 		    in_base = BASE_2;
183 		else if (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')
184 		    in_base = BASE_16;
185 		else if (isdigit(buf_ptr[1]))
186 		    in_base = BASE_8;
187 	    }
188 	    switch (in_base) {
189 	    case BASE_2:
190 		*e_token++ = *buf_ptr++;
191 		*e_token++ = *buf_ptr++;
192 		while (*buf_ptr == '0' || *buf_ptr == '1') {
193 		    CHECK_SIZE_TOKEN;
194 		    *e_token++ = *buf_ptr++;
195 		}
196 		break;
197 	    case BASE_8:
198 		*e_token++ = *buf_ptr++;
199 		while (*buf_ptr >= '0' && *buf_ptr <= '8') {
200 		    CHECK_SIZE_TOKEN;
201 		    *e_token++ = *buf_ptr++;
202 		}
203 		break;
204 	    case BASE_16:
205 		*e_token++ = *buf_ptr++;
206 		*e_token++ = *buf_ptr++;
207 		while (isxdigit(*buf_ptr)) {
208 		    CHECK_SIZE_TOKEN;
209 		    *e_token++ = *buf_ptr++;
210 		}
211 		break;
212 	    case BASE_10:
213 		while (1) {
214 		    if (*buf_ptr == '.') {
215 			if (seendot)
216 			    break;
217 			else
218 			    seendot++;
219 		    }
220 		    CHECK_SIZE_TOKEN;
221 		    *e_token++ = *buf_ptr++;
222 		    if (!isdigit(*buf_ptr) && *buf_ptr != '.') {
223 			if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
224 			    break;
225 			else {
226 			    seenexp++;
227 			    seendot++;
228 			    CHECK_SIZE_TOKEN;
229 			    *e_token++ = *buf_ptr++;
230 			    if (*buf_ptr == '+' || *buf_ptr == '-')
231 				*e_token++ = *buf_ptr++;
232 			}
233 		    }
234 		}
235 		break;
236 	    }
237 	    while (1) {
238 		if (!(seensfx & 1) && (*buf_ptr == 'U' || *buf_ptr == 'u')) {
239 		    CHECK_SIZE_TOKEN;
240 		    *e_token++ = *buf_ptr++;
241 		    seensfx |= 1;
242 		    continue;
243 		}
244 		if (!(seensfx & 2) && (strchr("fFlL", *buf_ptr) != NULL)) {
245 		    CHECK_SIZE_TOKEN;
246 		    if (buf_ptr[1] == buf_ptr[0])
247 		        *e_token++ = *buf_ptr++;
248 		    *e_token++ = *buf_ptr++;
249 		    seensfx |= 2;
250 		    continue;
251 		}
252 		break;
253 	    }
254 	}
255 	else
256 	    while (chartype[(int)*buf_ptr] == alphanum || *buf_ptr == BACKSLASH) {
257 		/* fill_buffer() terminates buffer with newline */
258 		if (*buf_ptr == BACKSLASH) {
259 		    if (*(buf_ptr + 1) == '\n') {
260 			buf_ptr += 2;
261 			if (buf_ptr >= buf_end)
262 			    fill_buffer();
263 			} else
264 			    break;
265 		}
266 		CHECK_SIZE_TOKEN;
267 		/* copy it over */
268 		*e_token++ = *buf_ptr++;
269 		if (buf_ptr >= buf_end)
270 		    fill_buffer();
271 	    }
272 	*e_token++ = '\0';
273 
274 	if (s_token[0] == 'L' && s_token[1] == '\0' &&
275 	      (*buf_ptr == '"' || *buf_ptr == '\''))
276 	    return (strpfx);
277 
278 	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
279 	    if (++buf_ptr >= buf_end)
280 		fill_buffer();
281 	}
282 	ps.keyword = 0;
283 	if (l_struct && !ps.p_l_follow) {
284 				/* if last token was 'struct' and we're not
285 				 * in parentheses, then this token
286 				 * should be treated as a declaration */
287 	    l_struct = false;
288 	    last_code = ident;
289 	    ps.last_u_d = true;
290 	    return (decl);
291 	}
292 	ps.last_u_d = l_struct;	/* Operator after identifier is binary
293 				 * unless last token was 'struct' */
294 	l_struct = false;
295 	last_code = ident;	/* Remember that this is the code we will
296 				 * return */
297 
298 	p = bsearch(s_token,
299 	    specials,
300 	    sizeof(specials) / sizeof(specials[0]),
301 	    sizeof(specials[0]),
302 	    strcmp_type);
303 	if (p == NULL) {	/* not a special keyword... */
304 	    char *u;
305 
306 	    /* ... so maybe a type_t or a typedef */
307 	    if ((auto_typedefs && ((u = strrchr(s_token, '_')) != NULL) &&
308 	        strcmp(u, "_t") == 0) || (typename_top >= 0 &&
309 		  bsearch(s_token, typenames, typename_top + 1,
310 		    sizeof(typenames[0]), strcmp_type))) {
311 		ps.keyword = 4;	/* a type name */
312 		ps.last_u_d = true;
313 	        goto found_typename;
314 	    }
315 	} else {			/* we have a keyword */
316 	    ps.keyword = p->rwcode;
317 	    ps.last_u_d = true;
318 	    switch (p->rwcode) {
319 	    case 7:		/* it is a switch */
320 		return (swstmt);
321 	    case 8:		/* a case or default */
322 		return (casestmt);
323 
324 	    case 3:		/* a "struct" */
325 		/*
326 		 * Next time around, we will want to know that we have had a
327 		 * 'struct'
328 		 */
329 		l_struct = true;
330 		/* FALLTHROUGH */
331 
332 	    case 4:		/* one of the declaration keywords */
333 	    found_typename:
334 		if (ps.p_l_follow) {
335 		    /* inside parens: cast, param list, offsetof or sizeof */
336 		    ps.cast_mask |= (1 << ps.p_l_follow) & ~ps.not_cast_mask;
337 		    break;
338 		}
339 		last_code = decl;
340 		return (decl);
341 
342 	    case 5:		/* if, while, for */
343 		return (sp_paren);
344 
345 	    case 6:		/* do, else */
346 		return (sp_nparen);
347 
348 	    case 10:		/* storage class specifier */
349 		return (storage);
350 
351 	    default:		/* all others are treated like any other
352 				 * identifier */
353 		return (ident);
354 	    }			/* end of switch */
355 	}			/* end of if (found_it) */
356 	if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
357 	    char *tp = buf_ptr;
358 	    while (tp < buf_end)
359 		if (*tp++ == ')' && (*tp == ';' || *tp == ','))
360 		    goto not_proc;
361 	    strncpy(ps.procname, token, sizeof ps.procname - 1);
362 	    if (ps.in_decl)
363 		ps.in_parameter_declaration = 1;
364 	    rparen_count = 1;
365     not_proc:;
366 	}
367 	/*
368 	 * The following hack attempts to guess whether or not the current
369 	 * token is in fact a declaration keyword -- one that has been
370 	 * typedefd
371 	 */
372 	if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
373 		&& !ps.p_l_follow
374 	        && !ps.block_init
375 		&& (ps.last_token == rparen || ps.last_token == semicolon ||
376 		    ps.last_token == decl ||
377 		    ps.last_token == lbrace || ps.last_token == rbrace)) {
378 	    ps.keyword = 4;	/* a type name */
379 	    ps.last_u_d = true;
380 	    last_code = decl;
381 	    return decl;
382 	}
383 	if (last_code == decl)	/* if this is a declared variable, then
384 				 * following sign is unary */
385 	    ps.last_u_d = true;	/* will make "int a -1" work */
386 	last_code = ident;
387 	return (ident);		/* the ident is not in the list */
388     }				/* end of procesing for alpanum character */
389 
390     /* Scan a non-alphanumeric token */
391 
392     *e_token++ = *buf_ptr;		/* if it is only a one-character token, it is
393 				 * moved here */
394     *e_token = '\0';
395     if (++buf_ptr >= buf_end)
396 	fill_buffer();
397 
398     switch (*token) {
399     case '\n':
400 	unary_delim = ps.last_u_d;
401 	ps.last_nl = true;	/* remember that we just had a newline */
402 	code = (had_eof ? 0 : newline);
403 
404 	/*
405 	 * if data has been exhausted, the newline is a dummy, and we should
406 	 * return code to stop
407 	 */
408 	break;
409 
410     case '\'':			/* start of quoted character */
411     case '"':			/* start of string */
412 	qchar = *token;
413 	if (troff) {
414 	    e_token[-1] = '`';
415 	    if (qchar == '"')
416 		*e_token++ = '`';
417 	    e_token = chfont(&bodyf, &stringf, e_token);
418 	}
419 	do {			/* copy the string */
420 	    while (1) {		/* move one character or [/<char>]<char> */
421 		if (*buf_ptr == '\n') {
422 		    diag2(1, "Unterminated literal");
423 		    goto stop_lit;
424 		}
425 		CHECK_SIZE_TOKEN;	/* Only have to do this once in this loop,
426 					 * since CHECK_SIZE guarantees that there
427 					 * are at least 5 entries left */
428 		*e_token = *buf_ptr++;
429 		if (buf_ptr >= buf_end)
430 		    fill_buffer();
431 		if (*e_token == BACKSLASH) {	/* if escape, copy extra char */
432 		    if (*buf_ptr == '\n')	/* check for escaped newline */
433 			++line_no;
434 		    if (troff) {
435 			*++e_token = BACKSLASH;
436 			if (*buf_ptr == BACKSLASH)
437 			    *++e_token = BACKSLASH;
438 		    }
439 		    *++e_token = *buf_ptr++;
440 		    ++e_token;	/* we must increment this again because we
441 				 * copied two chars */
442 		    if (buf_ptr >= buf_end)
443 			fill_buffer();
444 		}
445 		else
446 		    break;	/* we copied one character */
447 	    }			/* end of while (1) */
448 	} while (*e_token++ != qchar);
449 	if (troff) {
450 	    e_token = chfont(&stringf, &bodyf, e_token - 1);
451 	    if (qchar == '"')
452 		*e_token++ = '\'';
453 	}
454 stop_lit:
455 	code = ident;
456 	break;
457 
458     case ('('):
459     case ('['):
460 	unary_delim = true;
461 	code = lparen;
462 	break;
463 
464     case (')'):
465     case (']'):
466 	code = rparen;
467 	break;
468 
469     case '#':
470 	unary_delim = ps.last_u_d;
471 	code = preesc;
472 	break;
473 
474     case '?':
475 	unary_delim = true;
476 	code = question;
477 	break;
478 
479     case (':'):
480 	code = colon;
481 	unary_delim = true;
482 	break;
483 
484     case (';'):
485 	unary_delim = true;
486 	code = semicolon;
487 	break;
488 
489     case ('{'):
490 	unary_delim = true;
491 
492 	/*
493 	 * if (ps.in_or_st) ps.block_init = 1;
494 	 */
495 	/* ?	code = ps.block_init ? lparen : lbrace; */
496 	code = lbrace;
497 	break;
498 
499     case ('}'):
500 	unary_delim = true;
501 	/* ?	code = ps.block_init ? rparen : rbrace; */
502 	code = rbrace;
503 	break;
504 
505     case 014:			/* a form feed */
506 	unary_delim = ps.last_u_d;
507 	ps.last_nl = true;	/* remember this so we can set 'ps.col_1'
508 				 * right */
509 	code = form_feed;
510 	break;
511 
512     case (','):
513 	unary_delim = true;
514 	code = comma;
515 	break;
516 
517     case '.':
518 	unary_delim = false;
519 	code = period;
520 	break;
521 
522     case '-':
523     case '+':			/* check for -, +, --, ++ */
524 	code = (ps.last_u_d ? unary_op : binary_op);
525 	unary_delim = true;
526 
527 	if (*buf_ptr == token[0]) {
528 	    /* check for doubled character */
529 	    *e_token++ = *buf_ptr++;
530 	    /* buffer overflow will be checked at end of loop */
531 	    if (last_code == ident || last_code == rparen) {
532 		code = (ps.last_u_d ? unary_op : postop);
533 		/* check for following ++ or -- */
534 		unary_delim = false;
535 	    }
536 	}
537 	else if (*buf_ptr == '=')
538 	    /* check for operator += */
539 	    *e_token++ = *buf_ptr++;
540 	else if (*buf_ptr == '>') {
541 	    /* check for operator -> */
542 	    *e_token++ = *buf_ptr++;
543 	    if (!pointer_as_binop) {
544 		unary_delim = false;
545 		code = unary_op;
546 		ps.want_blank = false;
547 	    }
548 	}
549 	break;			/* buffer overflow will be checked at end of
550 				 * switch */
551 
552     case '=':
553 	if (ps.in_or_st)
554 	    ps.block_init = 1;
555 #ifdef undef
556 	if (chartype[*buf_ptr] == opchar) {	/* we have two char assignment */
557 	    e_token[-1] = *buf_ptr++;
558 	    if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
559 		*e_token++ = *buf_ptr++;
560 	    *e_token++ = '=';	/* Flip =+ to += */
561 	    *e_token = 0;
562 	}
563 #else
564 	if (*buf_ptr == '=') {/* == */
565 	    *e_token++ = '=';	/* Flip =+ to += */
566 	    buf_ptr++;
567 	    *e_token = 0;
568 	}
569 #endif
570 	code = binary_op;
571 	unary_delim = true;
572 	break;
573 	/* can drop thru!!! */
574 
575     case '>':
576     case '<':
577     case '!':			/* ops like <, <<, <=, !=, etc */
578 	if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
579 	    *e_token++ = *buf_ptr;
580 	    if (++buf_ptr >= buf_end)
581 		fill_buffer();
582 	}
583 	if (*buf_ptr == '=')
584 	    *e_token++ = *buf_ptr++;
585 	code = (ps.last_u_d ? unary_op : binary_op);
586 	unary_delim = true;
587 	break;
588 
589     default:
590 	if (token[0] == '/' && *buf_ptr == '*') {
591 	    /* it is start of comment */
592 	    *e_token++ = '*';
593 
594 	    if (++buf_ptr >= buf_end)
595 		fill_buffer();
596 
597 	    code = comment;
598 	    unary_delim = ps.last_u_d;
599 	    break;
600 	}
601 	while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
602 	    /*
603 	     * handle ||, &&, etc, and also things as in int *****i
604 	     */
605 	    *e_token++ = *buf_ptr;
606 	    if (++buf_ptr >= buf_end)
607 		fill_buffer();
608 	}
609 	code = (ps.last_u_d ? unary_op : binary_op);
610 	unary_delim = true;
611 
612 
613     }				/* end of switch */
614     if (code != newline) {
615 	l_struct = false;
616 	last_code = code;
617     }
618     if (buf_ptr >= buf_end)	/* check for input buffer empty */
619 	fill_buffer();
620     ps.last_u_d = unary_delim;
621     *e_token = '\0';		/* null terminate the token */
622     return (code);
623 }
624 
625 void
626 alloc_typenames(void)
627 {
628 
629     typenames = (const char **)malloc(sizeof(typenames[0]) *
630         (typename_count = 16));
631     if (typenames == NULL)
632 	err(1, NULL);
633 }
634 
635 void
636 add_typename(const char *key)
637 {
638     int comparison;
639     const char *copy;
640 
641     if (typename_top + 1 >= typename_count) {
642 	typenames = realloc((void *)typenames,
643 	    sizeof(typenames[0]) * (typename_count *= 2));
644 	if (typenames == NULL)
645 	    err(1, NULL);
646     }
647     if (typename_top == -1)
648 	typenames[++typename_top] = copy = strdup(key);
649     else if ((comparison = strcmp(key, typenames[typename_top])) >= 0) {
650 	/* take advantage of sorted input */
651 	if (comparison == 0)	/* remove duplicates */
652 	    return;
653 	typenames[++typename_top] = copy = strdup(key);
654     }
655     else {
656 	int p;
657 
658 	for (p = 0; (comparison = strcmp(key, typenames[p])) > 0; p++)
659 	    /* find place for the new key */;
660 	if (comparison == 0)	/* remove duplicates */
661 	    return;
662 	memmove(&typenames[p + 1], &typenames[p],
663 	    sizeof(typenames[0]) * (++typename_top - p));
664 	typenames[p] = copy = strdup(key);
665     }
666 
667     if (copy == NULL)
668 	err(1, NULL);
669 }
670