xref: /freebsd/usr.bin/indent/lexi.c (revision ca987d4641cdcd7f27e153db17c5bf064934faf5)
1 /*-
2  * Copyright (c) 1985 Sun Microsystems, Inc.
3  * Copyright (c) 1980, 1993
4  *	The Regents of the University of California.  All rights reserved.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. All advertising materials mentioning features or use of this software
16  *    must display the following acknowledgement:
17  *	This product includes software developed by the University of
18  *	California, Berkeley and its contributors.
19  * 4. Neither the name of the University nor the names of its contributors
20  *    may be used to endorse or promote products derived from this software
21  *    without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 
36 #if 0
37 #ifndef lint
38 static char sccsid[] = "@(#)lexi.c	8.1 (Berkeley) 6/6/93";
39 #endif /* not lint */
40 #endif
41 #include <sys/cdefs.h>
42 __FBSDID("$FreeBSD$");
43 
44 /*
45  * Here we have the token scanner for indent.  It scans off one token and puts
46  * it in the global variable "token".  It returns a code, indicating the type
47  * of token scanned.
48  */
49 
50 #include <err.h>
51 #include <stdio.h>
52 #include <ctype.h>
53 #include <stdlib.h>
54 #include <string.h>
55 #include "indent_globs.h"
56 #include "indent_codes.h"
57 #include "indent.h"
58 
/* chartype[] class given to letters, digits, '_' and '$' */
59 #define alphanum 1
/*
 * chartype[] class given to operator characters.  The name is defined only
 * when "undef" is defined, which is also the only configuration in which
 * lexi() refers to it.
 */
60 #ifdef undef
61 #define opchar 3
62 #endif
63 
/* One entry of the reserved-word table: a keyword and its class code. */
64 struct templ {
65     const char *rwd;		/* keyword text; strcmp_type() reads it through a pointer to the entry, so it must stay the first member */
66     int         rwcode;	/* keyword class; lexi() copies it to ps.keyword and switches on it */
67 };
68 
69 /*
70  * This table has to be sorted alphabetically, because it'll be used in binary
71  * search. For the same reason, string must be the first thing in struct templ.
 *
 * The second field is the keyword class that lexi() stores in ps.keyword and
 * then switches on:
 *    3  struct, union, enum (also arms the "last token was struct" flag)
 *    4  type and qualifier keywords (decl outside parentheses)
 *    5  if, for, while (sp_paren)
 *    6  do, else (sp_nparen)
 *    7  switch (swstmt)
 *    8  case, default (casestmt)
 *   10  storage-class keywords and typedef (storage)
 * Every other class used here (1, 2 and 9) is returned as a plain ident.
72  */
73 struct templ specials[] =
74 {
75     {"auto", 10},
76     {"break", 9},
77     {"case", 8},
78     {"char", 4},
79     {"const", 4},
80     {"default", 8},
81     {"do", 6},
82     {"double", 4},
83     {"else", 6},
84     {"enum", 3},
85     {"extern", 10},
86     {"float", 4},
87     {"for", 5},
88     {"global", 4},
89     {"goto", 9},
90     {"if", 5},
91     {"int", 4},
92     {"long", 4},
93     {"offsetof", 1},
94     {"register", 10},
95     {"return", 9},
96     {"short", 4},
97     {"sizeof", 2},
98     {"static", 10},
99     {"struct", 3},
100     {"switch", 7},
101     {"typedef", 10},
102     {"union", 3},
103     {"unsigned", 4},
104     {"void", 4},
105     {"volatile", 4},
106     {"while", 5}
107 };
108 
/*
 * Sorted table of additional type names: allocated by alloc_typenames(),
 * filled by add_typename() and searched with bsearch() in lexi().
 */
109 const char **typenames;
/* number of slots currently allocated in typenames */
110 int         typename_count;
/* index of the last used slot in typenames; -1 while the table is empty */
111 int         typename_top = -1;
112 
/*
 * Character class table indexed by 7-bit ASCII code (8 codes per row).
 * 1 (alphanum) marks letters, digits, '_' and '$'; 3 marks the operator
 * characters ! % & * + - / < = > ? ^ | ~; 0 marks everything else.
 * The table has only 128 entries, so it must not be indexed with a byte
 * value outside 0..127.
 */
113 char        chartype[128] =
114 {				/* this is used to facilitate the decision of
115 				 * what type (alphanumeric, operator) each
116 				 * character is */
117     0, 0, 0, 0, 0, 0, 0, 0,
118     0, 0, 0, 0, 0, 0, 0, 0,
119     0, 0, 0, 0, 0, 0, 0, 0,
120     0, 0, 0, 0, 0, 0, 0, 0,
121     0, 3, 0, 0, 1, 3, 3, 0,
122     0, 0, 3, 3, 0, 3, 0, 3,
123     1, 1, 1, 1, 1, 1, 1, 1,
124     1, 1, 0, 0, 3, 3, 3, 3,
125     0, 1, 1, 1, 1, 1, 1, 1,
126     1, 1, 1, 1, 1, 1, 1, 1,
127     1, 1, 1, 1, 1, 1, 1, 1,
128     1, 1, 1, 0, 0, 0, 3, 1,
129     0, 1, 1, 1, 1, 1, 1, 1,
130     1, 1, 1, 1, 1, 1, 1, 1,
131     1, 1, 1, 1, 1, 1, 1, 1,
132     1, 1, 1, 0, 3, 0, 3, 0
133 };
134 
/*
 * bsearch(3) comparison callback.  "e1" is the search key, a NUL-terminated
 * string; "e2" points at a table element whose first member is a pointer to
 * the string to compare against.  Returns the strcmp() ordering of the two.
 */
static int
strcmp_type(const void *e1, const void *e2)
{
    const char *key = e1;
    const char *const *entry = e2;

    return (strcmp(key, *entry));
}
140 
141 int
142 lexi(void)
143 {
144     int         unary_delim;	/* this is set to 1 if the current token
145 				 * forces a following operator to be unary */
146     static int  last_code;	/* the last token type returned */
147     static int  l_struct;	/* set to 1 if the last token was 'struct' */
148     int         code;		/* internal code to be returned */
149     char        qchar;		/* the delimiter character for a string */
150 
151     e_token = s_token;		/* point to start of place to save token */
152     unary_delim = false;
153     ps.col_1 = ps.last_nl;	/* tell world that this token started in
154 				 * column 1 iff the last thing scanned was nl */
155     ps.last_nl = false;
156 
157     while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
158 	ps.col_1 = false;	/* leading blanks imply token is not in column
159 				 * 1 */
160 	if (++buf_ptr >= buf_end)
161 	    fill_buffer();
162     }
163 
164     /* Scan an alphanumeric token */
165     if (chartype[(int)*buf_ptr] == alphanum || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
166 	/*
167 	 * we have a character or number
168 	 */
169 	struct templ *p;
170 
171 	if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
172 	    enum base {
173 		BASE_2, BASE_8, BASE_10, BASE_16
174 	    };
175 	    int         seendot = 0,
176 	                seenexp = 0,
177 			seensfx = 0;
178 	    enum base	in_base = BASE_10;
179 
180 	    if (*buf_ptr == '0') {
181 		if (buf_ptr[1] == 'b' || buf_ptr[1] == 'B')
182 		    in_base = BASE_2;
183 		else if (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')
184 		    in_base = BASE_16;
185 		else if (isdigit(buf_ptr[1]))
186 		    in_base = BASE_8;
187 	    }
188 	    switch (in_base) {
189 	    case BASE_2:
190 		*e_token++ = *buf_ptr++;
191 		*e_token++ = *buf_ptr++;
192 		while (*buf_ptr == '0' || *buf_ptr == '1') {
193 		    CHECK_SIZE_TOKEN;
194 		    *e_token++ = *buf_ptr++;
195 		}
196 		break;
197 	    case BASE_8:
198 		*e_token++ = *buf_ptr++;
199 		while (*buf_ptr >= '0' && *buf_ptr <= '8') {
200 		    CHECK_SIZE_TOKEN;
201 		    *e_token++ = *buf_ptr++;
202 		}
203 		break;
204 	    case BASE_16:
205 		*e_token++ = *buf_ptr++;
206 		*e_token++ = *buf_ptr++;
207 		while (isxdigit(*buf_ptr)) {
208 		    CHECK_SIZE_TOKEN;
209 		    *e_token++ = *buf_ptr++;
210 		}
211 		break;
212 	    case BASE_10:
213 		while (1) {
214 		    if (*buf_ptr == '.') {
215 			if (seendot)
216 			    break;
217 			else
218 			    seendot++;
219 		    }
220 		    CHECK_SIZE_TOKEN;
221 		    *e_token++ = *buf_ptr++;
222 		    if (!isdigit(*buf_ptr) && *buf_ptr != '.') {
223 			if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
224 			    break;
225 			else {
226 			    seenexp++;
227 			    seendot++;
228 			    CHECK_SIZE_TOKEN;
229 			    *e_token++ = *buf_ptr++;
230 			    if (*buf_ptr == '+' || *buf_ptr == '-')
231 				*e_token++ = *buf_ptr++;
232 			}
233 		    }
234 		}
235 		break;
236 	    }
237 	    while (1) {
238 		if (!(seensfx & 1) && (*buf_ptr == 'U' || *buf_ptr == 'u')) {
239 		    CHECK_SIZE_TOKEN;
240 		    *e_token++ = *buf_ptr++;
241 		    seensfx |= 1;
242 		    continue;
243 		}
244 		if (!(seensfx & 2) && (strchr("fFlL", *buf_ptr) != NULL)) {
245 		    CHECK_SIZE_TOKEN;
246 		    if (buf_ptr[1] == buf_ptr[0])
247 		        *e_token++ = *buf_ptr++;
248 		    *e_token++ = *buf_ptr++;
249 		    seensfx |= 2;
250 		    continue;
251 		}
252 		break;
253 	    }
254 	}
255 	else
256 	    while (chartype[(int)*buf_ptr] == alphanum || *buf_ptr == BACKSLASH) {
257 		/* fill_buffer() terminates buffer with newline */
258 		if (*buf_ptr == BACKSLASH) {
259 		    if (*(buf_ptr + 1) == '\n') {
260 			buf_ptr += 2;
261 			if (buf_ptr >= buf_end)
262 			    fill_buffer();
263 			} else
264 			    break;
265 		}
266 		CHECK_SIZE_TOKEN;
267 		/* copy it over */
268 		*e_token++ = *buf_ptr++;
269 		if (buf_ptr >= buf_end)
270 		    fill_buffer();
271 	    }
272 	*e_token++ = '\0';
273 
274 	if (s_token[0] == 'L' && s_token[1] == '\0' &&
275 	      (*buf_ptr == '"' || *buf_ptr == '\''))
276 	    return (strpfx);
277 
278 	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
279 	    if (++buf_ptr >= buf_end)
280 		fill_buffer();
281 	}
282 	ps.keyword = 0;
283 	if (l_struct && !ps.p_l_follow) {
284 				/* if last token was 'struct' and we're not
285 				 * in parentheses, then this token
286 				 * should be treated as a declaration */
287 	    l_struct = false;
288 	    last_code = ident;
289 	    ps.last_u_d = true;
290 	    return (decl);
291 	}
292 	ps.last_u_d = l_struct;	/* Operator after identifier is binary
293 				 * unless last token was 'struct' */
294 	l_struct = false;
295 	last_code = ident;	/* Remember that this is the code we will
296 				 * return */
297 
298 	p = bsearch(s_token,
299 	    specials,
300 	    sizeof(specials) / sizeof(specials[0]),
301 	    sizeof(specials[0]),
302 	    strcmp_type);
303 	if (p == NULL) {	/* not a special keyword... */
304 	    char *u;
305 
306 	    /* ... so maybe a type_t or a typedef */
307 	    if ((auto_typedefs && ((u = strrchr(s_token, '_')) != NULL) &&
308 	        strcmp(u, "_t") == 0) || (typename_top >= 0 &&
309 		  bsearch(s_token, typenames, typename_top + 1,
310 		    sizeof(typenames[0]), strcmp_type))) {
311 		ps.keyword = 4;	/* a type name */
312 		ps.last_u_d = true;
313 	        goto found_typename;
314 	    }
315 	} else {			/* we have a keyword */
316 	    ps.keyword = p->rwcode;
317 	    ps.last_u_d = true;
318 	    switch (p->rwcode) {
319 	    case 7:		/* it is a switch */
320 		return (swstmt);
321 	    case 8:		/* a case or default */
322 		return (casestmt);
323 
324 	    case 3:		/* a "struct" */
325 		/*
326 		 * Next time around, we will want to know that we have had a
327 		 * 'struct'
328 		 */
329 		l_struct = true;
330 		/* FALLTHROUGH */
331 
332 	    case 4:		/* one of the declaration keywords */
333 	    found_typename:
334 		if (ps.p_l_follow) {
335 		    /* inside parens: cast, param list, offsetof or sizeof */
336 		    ps.cast_mask |= (1 << ps.p_l_follow) & ~ps.not_cast_mask;
337 		    break;
338 		}
339 		last_code = decl;
340 		return (decl);
341 
342 	    case 5:		/* if, while, for */
343 		return (sp_paren);
344 
345 	    case 6:		/* do, else */
346 		return (sp_nparen);
347 
348 	    case 10:		/* storage class specifier */
349 		return (storage);
350 
351 	    default:		/* all others are treated like any other
352 				 * identifier */
353 		return (ident);
354 	    }			/* end of switch */
355 	}			/* end of if (found_it) */
356 	if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0 &&
357 	    ps.in_parameter_declaration == 0 && ps.block_init == 0) {
358 	    char *tp = buf_ptr;
359 	    while (tp < buf_end)
360 		if (*tp++ == ')' && (*tp == ';' || *tp == ','))
361 		    goto not_proc;
362 	    strncpy(ps.procname, token, sizeof ps.procname - 1);
363 	    if (ps.in_decl)
364 		ps.in_parameter_declaration = 1;
365 	    return (last_code = funcname);
366     not_proc:;
367 	}
368 	/*
369 	 * The following hack attempts to guess whether or not the current
370 	 * token is in fact a declaration keyword -- one that has been
371 	 * typedefd
372 	 */
373 	if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
374 		&& !ps.p_l_follow
375 	        && !ps.block_init
376 		&& (ps.last_token == rparen || ps.last_token == semicolon ||
377 		    ps.last_token == decl ||
378 		    ps.last_token == lbrace || ps.last_token == rbrace)) {
379 	    ps.keyword = 4;	/* a type name */
380 	    ps.last_u_d = true;
381 	    last_code = decl;
382 	    return decl;
383 	}
384 	if (last_code == decl)	/* if this is a declared variable, then
385 				 * following sign is unary */
386 	    ps.last_u_d = true;	/* will make "int a -1" work */
387 	last_code = ident;
388 	return (ident);		/* the ident is not in the list */
389     }				/* end of procesing for alpanum character */
390 
391     /* Scan a non-alphanumeric token */
392 
393     *e_token++ = *buf_ptr;		/* if it is only a one-character token, it is
394 				 * moved here */
395     *e_token = '\0';
396     if (++buf_ptr >= buf_end)
397 	fill_buffer();
398 
399     switch (*token) {
400     case '\n':
401 	unary_delim = ps.last_u_d;
402 	ps.last_nl = true;	/* remember that we just had a newline */
403 	code = (had_eof ? 0 : newline);
404 
405 	/*
406 	 * if data has been exhausted, the newline is a dummy, and we should
407 	 * return code to stop
408 	 */
409 	break;
410 
411     case '\'':			/* start of quoted character */
412     case '"':			/* start of string */
413 	qchar = *token;
414 	if (troff) {
415 	    e_token[-1] = '`';
416 	    if (qchar == '"')
417 		*e_token++ = '`';
418 	    e_token = chfont(&bodyf, &stringf, e_token);
419 	}
420 	do {			/* copy the string */
421 	    while (1) {		/* move one character or [/<char>]<char> */
422 		if (*buf_ptr == '\n') {
423 		    diag2(1, "Unterminated literal");
424 		    goto stop_lit;
425 		}
426 		CHECK_SIZE_TOKEN;	/* Only have to do this once in this loop,
427 					 * since CHECK_SIZE guarantees that there
428 					 * are at least 5 entries left */
429 		*e_token = *buf_ptr++;
430 		if (buf_ptr >= buf_end)
431 		    fill_buffer();
432 		if (*e_token == BACKSLASH) {	/* if escape, copy extra char */
433 		    if (*buf_ptr == '\n')	/* check for escaped newline */
434 			++line_no;
435 		    if (troff) {
436 			*++e_token = BACKSLASH;
437 			if (*buf_ptr == BACKSLASH)
438 			    *++e_token = BACKSLASH;
439 		    }
440 		    *++e_token = *buf_ptr++;
441 		    ++e_token;	/* we must increment this again because we
442 				 * copied two chars */
443 		    if (buf_ptr >= buf_end)
444 			fill_buffer();
445 		}
446 		else
447 		    break;	/* we copied one character */
448 	    }			/* end of while (1) */
449 	} while (*e_token++ != qchar);
450 	if (troff) {
451 	    e_token = chfont(&stringf, &bodyf, e_token - 1);
452 	    if (qchar == '"')
453 		*e_token++ = '\'';
454 	}
455 stop_lit:
456 	code = ident;
457 	break;
458 
459     case ('('):
460     case ('['):
461 	unary_delim = true;
462 	code = lparen;
463 	break;
464 
465     case (')'):
466     case (']'):
467 	code = rparen;
468 	break;
469 
470     case '#':
471 	unary_delim = ps.last_u_d;
472 	code = preesc;
473 	break;
474 
475     case '?':
476 	unary_delim = true;
477 	code = question;
478 	break;
479 
480     case (':'):
481 	code = colon;
482 	unary_delim = true;
483 	break;
484 
485     case (';'):
486 	unary_delim = true;
487 	code = semicolon;
488 	break;
489 
490     case ('{'):
491 	unary_delim = true;
492 
493 	/*
494 	 * if (ps.in_or_st) ps.block_init = 1;
495 	 */
496 	/* ?	code = ps.block_init ? lparen : lbrace; */
497 	code = lbrace;
498 	break;
499 
500     case ('}'):
501 	unary_delim = true;
502 	/* ?	code = ps.block_init ? rparen : rbrace; */
503 	code = rbrace;
504 	break;
505 
506     case 014:			/* a form feed */
507 	unary_delim = ps.last_u_d;
508 	ps.last_nl = true;	/* remember this so we can set 'ps.col_1'
509 				 * right */
510 	code = form_feed;
511 	break;
512 
513     case (','):
514 	unary_delim = true;
515 	code = comma;
516 	break;
517 
518     case '.':
519 	unary_delim = false;
520 	code = period;
521 	break;
522 
523     case '-':
524     case '+':			/* check for -, +, --, ++ */
525 	code = (ps.last_u_d ? unary_op : binary_op);
526 	unary_delim = true;
527 
528 	if (*buf_ptr == token[0]) {
529 	    /* check for doubled character */
530 	    *e_token++ = *buf_ptr++;
531 	    /* buffer overflow will be checked at end of loop */
532 	    if (last_code == ident || last_code == rparen) {
533 		code = (ps.last_u_d ? unary_op : postop);
534 		/* check for following ++ or -- */
535 		unary_delim = false;
536 	    }
537 	}
538 	else if (*buf_ptr == '=')
539 	    /* check for operator += */
540 	    *e_token++ = *buf_ptr++;
541 	else if (*buf_ptr == '>') {
542 	    /* check for operator -> */
543 	    *e_token++ = *buf_ptr++;
544 	    if (!pointer_as_binop) {
545 		unary_delim = false;
546 		code = unary_op;
547 		ps.want_blank = false;
548 	    }
549 	}
550 	break;			/* buffer overflow will be checked at end of
551 				 * switch */
552 
553     case '=':
554 	if (ps.in_or_st)
555 	    ps.block_init = 1;
556 #ifdef undef
557 	if (chartype[*buf_ptr] == opchar) {	/* we have two char assignment */
558 	    e_token[-1] = *buf_ptr++;
559 	    if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
560 		*e_token++ = *buf_ptr++;
561 	    *e_token++ = '=';	/* Flip =+ to += */
562 	    *e_token = 0;
563 	}
564 #else
565 	if (*buf_ptr == '=') {/* == */
566 	    *e_token++ = '=';	/* Flip =+ to += */
567 	    buf_ptr++;
568 	    *e_token = 0;
569 	}
570 #endif
571 	code = binary_op;
572 	unary_delim = true;
573 	break;
574 	/* can drop thru!!! */
575 
576     case '>':
577     case '<':
578     case '!':			/* ops like <, <<, <=, !=, etc */
579 	if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
580 	    *e_token++ = *buf_ptr;
581 	    if (++buf_ptr >= buf_end)
582 		fill_buffer();
583 	}
584 	if (*buf_ptr == '=')
585 	    *e_token++ = *buf_ptr++;
586 	code = (ps.last_u_d ? unary_op : binary_op);
587 	unary_delim = true;
588 	break;
589 
590     default:
591 	if (token[0] == '/' && *buf_ptr == '*') {
592 	    /* it is start of comment */
593 	    *e_token++ = '*';
594 
595 	    if (++buf_ptr >= buf_end)
596 		fill_buffer();
597 
598 	    code = comment;
599 	    unary_delim = ps.last_u_d;
600 	    break;
601 	}
602 	while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
603 	    /*
604 	     * handle ||, &&, etc, and also things as in int *****i
605 	     */
606 	    *e_token++ = *buf_ptr;
607 	    if (++buf_ptr >= buf_end)
608 		fill_buffer();
609 	}
610 	code = (ps.last_u_d ? unary_op : binary_op);
611 	unary_delim = true;
612 
613 
614     }				/* end of switch */
615     if (code != newline) {
616 	l_struct = false;
617 	last_code = code;
618     }
619     if (buf_ptr >= buf_end)	/* check for input buffer empty */
620 	fill_buffer();
621     ps.last_u_d = unary_delim;
622     *e_token = '\0';		/* null terminate the token */
623     return (code);
624 }
625 
626 void
627 alloc_typenames(void)
628 {
629 
630     typenames = (const char **)malloc(sizeof(typenames[0]) *
631         (typename_count = 16));
632     if (typenames == NULL)
633 	err(1, NULL);
634 }
635 
636 void
637 add_typename(const char *key)
638 {
639     int comparison;
640     const char *copy;
641 
642     if (typename_top + 1 >= typename_count) {
643 	typenames = realloc((void *)typenames,
644 	    sizeof(typenames[0]) * (typename_count *= 2));
645 	if (typenames == NULL)
646 	    err(1, NULL);
647     }
648     if (typename_top == -1)
649 	typenames[++typename_top] = copy = strdup(key);
650     else if ((comparison = strcmp(key, typenames[typename_top])) >= 0) {
651 	/* take advantage of sorted input */
652 	if (comparison == 0)	/* remove duplicates */
653 	    return;
654 	typenames[++typename_top] = copy = strdup(key);
655     }
656     else {
657 	int p;
658 
659 	for (p = 0; (comparison = strcmp(key, typenames[p])) > 0; p++)
660 	    /* find place for the new key */;
661 	if (comparison == 0)	/* remove duplicates */
662 	    return;
663 	memmove(&typenames[p + 1], &typenames[p],
664 	    sizeof(typenames[0]) * (++typename_top - p));
665 	typenames[p] = copy = strdup(key);
666     }
667 
668     if (copy == NULL)
669 	err(1, NULL);
670 }
671