xref: /freebsd/usr.bin/indent/lexi.c (revision 63c3f226969b89226e223bca86761b285d302ed6)
1 /*-
2  * SPDX-License-Identifier: BSD-4-Clause
3  *
4  * Copyright (c) 1985 Sun Microsystems, Inc.
5  * Copyright (c) 1980, 1993
6  *	The Regents of the University of California.  All rights reserved.
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  * 3. All advertising materials mentioning features or use of this software
18  *    must display the following acknowledgement:
19  *	This product includes software developed by the University of
20  *	California, Berkeley and its contributors.
21  * 4. Neither the name of the University nor the names of its contributors
22  *    may be used to endorse or promote products derived from this software
23  *    without specific prior written permission.
24  *
25  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35  * SUCH DAMAGE.
36  */
37 
38 #if 0
39 #ifndef lint
40 static char sccsid[] = "@(#)lexi.c	8.1 (Berkeley) 6/6/93";
41 #endif /* not lint */
42 #endif
43 #include <sys/cdefs.h>
44 __FBSDID("$FreeBSD$");
45 
46 /*
47  * Here we have the token scanner for indent.  It scans off one token and puts
48  * it in the global variable "token".  It returns a code, indicating the type
49  * of token scanned.
50  */
51 
52 #include <err.h>
53 #include <stdio.h>
54 #include <ctype.h>
55 #include <stdlib.h>
56 #include <string.h>
57 #include "indent_globs.h"
58 #include "indent_codes.h"
59 #include "indent.h"
60 
61 #define alphanum 1
62 #ifdef undef
63 #define opchar 3
64 #endif
65 
/*
 * One reserved word together with the class code lexi() acts on.
 *
 * The keyword string must remain the first member: the bsearch() comparator
 * used on the table reads each element as if it began with a "const char *".
 */
struct templ {
    const char *rwd;		/* spelling of the reserved word */
    int         rwcode;		/* keyword class, see the legend below */
};

/*
 * Reserved words known to lexi().  Entries must stay in strcmp() order
 * because the table is looked up with bsearch().
 *
 * Class codes as dispatched in lexi():
 *    3  struct / union / enum	    7  switch
 *    4  declaration keyword	    8  case / default
 *    5  if / while / for	   10  storage class specifier
 *    6  do / else		   11  typedef
 * Every other code (1, 2, 9) ends up being returned as a plain identifier.
 */
struct templ specials[] =
{
    { .rwd = "auto",     .rwcode = 10 },
    { .rwd = "break",    .rwcode = 9 },
    { .rwd = "case",     .rwcode = 8 },
    { .rwd = "char",     .rwcode = 4 },
    { .rwd = "const",    .rwcode = 4 },
    { .rwd = "default",  .rwcode = 8 },
    { .rwd = "do",       .rwcode = 6 },
    { .rwd = "double",   .rwcode = 4 },
    { .rwd = "else",     .rwcode = 6 },
    { .rwd = "enum",     .rwcode = 3 },
    { .rwd = "extern",   .rwcode = 10 },
    { .rwd = "float",    .rwcode = 4 },
    { .rwd = "for",      .rwcode = 5 },
    { .rwd = "global",   .rwcode = 4 },
    { .rwd = "goto",     .rwcode = 9 },
    { .rwd = "if",       .rwcode = 5 },
    { .rwd = "int",      .rwcode = 4 },
    { .rwd = "long",     .rwcode = 4 },
    { .rwd = "offsetof", .rwcode = 1 },
    { .rwd = "register", .rwcode = 10 },
    { .rwd = "return",   .rwcode = 9 },
    { .rwd = "short",    .rwcode = 4 },
    { .rwd = "sizeof",   .rwcode = 2 },
    { .rwd = "static",   .rwcode = 10 },
    { .rwd = "struct",   .rwcode = 3 },
    { .rwd = "switch",   .rwcode = 7 },
    { .rwd = "typedef",  .rwcode = 11 },
    { .rwd = "union",    .rwcode = 3 },
    { .rwd = "unsigned", .rwcode = 4 },
    { .rwd = "void",     .rwcode = 4 },
    { .rwd = "volatile", .rwcode = 4 },
    { .rwd = "while",    .rwcode = 5 }
};
110 
/*
 * Table of user-supplied type names.  add_typename() keeps the entries in
 * strcmp() order and lexi() looks identifiers up in it with bsearch().
 */
const char **typenames;		/* the array itself, sized by
				 * alloc_typenames() and grown by
				 * add_typename() */
int         typename_count;	/* number of slots currently allocated */
int         typename_top = -1;	/* index of the last slot in use; -1 while the
				 * table is empty */
114 
/*
 * Character-class table, one entry per 7-bit ASCII code.  A value of 1 is the
 * "alphanum" class tested by lexi(); 3 marks operator characters (the matching
 * "opchar" macro is only defined under "#ifdef undef"); 0 is everything else.
 * Note that '$' and '_' are classed as alphanumeric.
 */
char        chartype[128] =
{				/* this is used to facilitate the decision of
				 * what type (alphanumeric, operator) each
				 * character is */
    0, 0, 0, 0, 0, 0, 0, 0,	/* 0x00-0x07: control characters */
    0, 0, 0, 0, 0, 0, 0, 0,	/* 0x08-0x0f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0,	/* 0x10-0x17: control characters */
    0, 0, 0, 0, 0, 0, 0, 0,	/* 0x18-0x1f: control characters */
    0, 3, 0, 0, 1, 3, 3, 0,	/* 0x20-0x27: space ! " # $ % & ' */
    0, 0, 3, 3, 0, 3, 0, 3,	/* 0x28-0x2f: ( ) * + , - . / */
    1, 1, 1, 1, 1, 1, 1, 1,	/* 0x30-0x37: 0 1 2 3 4 5 6 7 */
    1, 1, 0, 0, 3, 3, 3, 3,	/* 0x38-0x3f: 8 9 : ; < = > ? */
    0, 1, 1, 1, 1, 1, 1, 1,	/* 0x40-0x47: @ A B C D E F G */
    1, 1, 1, 1, 1, 1, 1, 1,	/* 0x48-0x4f: H I J K L M N O */
    1, 1, 1, 1, 1, 1, 1, 1,	/* 0x50-0x57: P Q R S T U V W */
    1, 1, 1, 0, 0, 0, 3, 1,	/* 0x58-0x5f: X Y Z [ \ ] ^ _ */
    0, 1, 1, 1, 1, 1, 1, 1,	/* 0x60-0x67: ` a b c d e f g */
    1, 1, 1, 1, 1, 1, 1, 1,	/* 0x68-0x6f: h i j k l m n o */
    1, 1, 1, 1, 1, 1, 1, 1,	/* 0x70-0x77: p q r s t u v w */
    1, 1, 1, 0, 3, 0, 3, 0	/* 0x78-0x7f: x y z { | } ~ DEL */
};
136 
/*
 * bsearch() comparator: e1 is the key, a NUL-terminated string; e2 points at
 * a table element whose first bytes are a "const char *" (a typenames[] slot,
 * or a struct templ, whose string pointer is its first member).  Returns the
 * strcmp() ordering of the key against that string.
 */
static int
strcmp_type(const void *e1, const void *e2)
{
    const char *key = e1;
    const char *const *slot = e2;

    return (strcmp(key, *slot));
}
142 
143 int
144 lexi(struct parser_state *state)
145 {
146     int         unary_delim;	/* this is set to 1 if the current token
147 				 * forces a following operator to be unary */
148     static int  last_code;	/* the last token type returned */
149     static int  l_struct;	/* set to 1 if the last token was 'struct' */
150     int         code;		/* internal code to be returned */
151     char        qchar;		/* the delimiter character for a string */
152 
153     e_token = s_token;		/* point to start of place to save token */
154     unary_delim = false;
155     state->col_1 = state->last_nl;	/* tell world that this token started
156 					 * in column 1 iff the last thing
157 					 * scanned was a newline */
158     state->last_nl = false;
159 
160     while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
161 	state->col_1 = false;	/* leading blanks imply token is not in column
162 				 * 1 */
163 	if (++buf_ptr >= buf_end)
164 	    fill_buffer();
165     }
166 
167     /* Scan an alphanumeric token */
168     if (chartype[(int)*buf_ptr] == alphanum || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
169 	/*
170 	 * we have a character or number
171 	 */
172 	struct templ *p;
173 
174 	if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
175 	    enum base {
176 		BASE_2, BASE_8, BASE_10, BASE_16
177 	    };
178 	    int         seendot = 0,
179 	                seenexp = 0,
180 			seensfx = 0;
181 	    enum base	in_base = BASE_10;
182 
183 	    if (*buf_ptr == '0') {
184 		if (buf_ptr[1] == 'b' || buf_ptr[1] == 'B')
185 		    in_base = BASE_2;
186 		else if (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')
187 		    in_base = BASE_16;
188 		else if (isdigit(buf_ptr[1]))
189 		    in_base = BASE_8;
190 	    }
191 	    switch (in_base) {
192 	    case BASE_2:
193 		*e_token++ = *buf_ptr++;
194 		*e_token++ = *buf_ptr++;
195 		while (*buf_ptr == '0' || *buf_ptr == '1') {
196 		    CHECK_SIZE_TOKEN;
197 		    *e_token++ = *buf_ptr++;
198 		}
199 		break;
200 	    case BASE_8:
201 		*e_token++ = *buf_ptr++;
202 		while (*buf_ptr >= '0' && *buf_ptr <= '8') {
203 		    CHECK_SIZE_TOKEN;
204 		    *e_token++ = *buf_ptr++;
205 		}
206 		break;
207 	    case BASE_16:
208 		*e_token++ = *buf_ptr++;
209 		*e_token++ = *buf_ptr++;
210 		while (isxdigit(*buf_ptr)) {
211 		    CHECK_SIZE_TOKEN;
212 		    *e_token++ = *buf_ptr++;
213 		}
214 		break;
215 	    case BASE_10:
216 		while (1) {
217 		    if (*buf_ptr == '.') {
218 			if (seendot)
219 			    break;
220 			else
221 			    seendot++;
222 		    }
223 		    CHECK_SIZE_TOKEN;
224 		    *e_token++ = *buf_ptr++;
225 		    if (!isdigit(*buf_ptr) && *buf_ptr != '.') {
226 			if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
227 			    break;
228 			else {
229 			    seenexp++;
230 			    seendot++;
231 			    CHECK_SIZE_TOKEN;
232 			    *e_token++ = *buf_ptr++;
233 			    if (*buf_ptr == '+' || *buf_ptr == '-')
234 				*e_token++ = *buf_ptr++;
235 			}
236 		    }
237 		}
238 		break;
239 	    }
240 	    while (1) {
241 		if (!(seensfx & 1) && (*buf_ptr == 'U' || *buf_ptr == 'u')) {
242 		    CHECK_SIZE_TOKEN;
243 		    *e_token++ = *buf_ptr++;
244 		    seensfx |= 1;
245 		    continue;
246 		}
247 		if (!(seensfx & 2) && (strchr("fFlL", *buf_ptr) != NULL)) {
248 		    CHECK_SIZE_TOKEN;
249 		    if (buf_ptr[1] == buf_ptr[0])
250 		        *e_token++ = *buf_ptr++;
251 		    *e_token++ = *buf_ptr++;
252 		    seensfx |= 2;
253 		    continue;
254 		}
255 		break;
256 	    }
257 	}
258 	else
259 	    while (chartype[(int)*buf_ptr] == alphanum || *buf_ptr == BACKSLASH) {
260 		/* fill_buffer() terminates buffer with newline */
261 		if (*buf_ptr == BACKSLASH) {
262 		    if (*(buf_ptr + 1) == '\n') {
263 			buf_ptr += 2;
264 			if (buf_ptr >= buf_end)
265 			    fill_buffer();
266 			} else
267 			    break;
268 		}
269 		CHECK_SIZE_TOKEN;
270 		/* copy it over */
271 		*e_token++ = *buf_ptr++;
272 		if (buf_ptr >= buf_end)
273 		    fill_buffer();
274 	    }
275 	*e_token++ = '\0';
276 
277 	if (s_token[0] == 'L' && s_token[1] == '\0' &&
278 	      (*buf_ptr == '"' || *buf_ptr == '\''))
279 	    return (strpfx);
280 
281 	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
282 	    if (++buf_ptr >= buf_end)
283 		fill_buffer();
284 	}
285 	state->keyword = 0;
286 	if (l_struct && !state->p_l_follow) {
287 				/* if last token was 'struct' and we're not
288 				 * in parentheses, then this token
289 				 * should be treated as a declaration */
290 	    l_struct = false;
291 	    last_code = ident;
292 	    state->last_u_d = true;
293 	    return (decl);
294 	}
295 	state->last_u_d = l_struct;	/* Operator after identifier is
296 					 * binary unless last token was
297 					 * 'struct' */
298 	l_struct = false;
299 	last_code = ident;	/* Remember that this is the code we will
300 				 * return */
301 
302 	p = bsearch(s_token,
303 	    specials,
304 	    sizeof(specials) / sizeof(specials[0]),
305 	    sizeof(specials[0]),
306 	    strcmp_type);
307 	if (p == NULL) {	/* not a special keyword... */
308 	    char *u;
309 
310 	    /* ... so maybe a type_t or a typedef */
311 	    if ((auto_typedefs && ((u = strrchr(s_token, '_')) != NULL) &&
312 	        strcmp(u, "_t") == 0) || (typename_top >= 0 &&
313 		  bsearch(s_token, typenames, typename_top + 1,
314 		    sizeof(typenames[0]), strcmp_type))) {
315 		state->keyword = 4;	/* a type name */
316 		state->last_u_d = true;
317 	        goto found_typename;
318 	    }
319 	} else {			/* we have a keyword */
320 	    state->keyword = p->rwcode;
321 	    state->last_u_d = true;
322 	    switch (p->rwcode) {
323 	    case 7:		/* it is a switch */
324 		return (swstmt);
325 	    case 8:		/* a case or default */
326 		return (casestmt);
327 
328 	    case 3:		/* a "struct" */
329 		/*
330 		 * Next time around, we will want to know that we have had a
331 		 * 'struct'
332 		 */
333 		l_struct = true;
334 		/* FALLTHROUGH */
335 
336 	    case 4:		/* one of the declaration keywords */
337 	    found_typename:
338 		if (state->p_l_follow) {
339 		    /* inside parens: cast, param list, offsetof or sizeof */
340 		    state->cast_mask |= (1 << state->p_l_follow) & ~state->not_cast_mask;
341 		    break;
342 		}
343 		last_code = decl;
344 		return (decl);
345 
346 	    case 5:		/* if, while, for */
347 		return (sp_paren);
348 
349 	    case 6:		/* do, else */
350 		return (sp_nparen);
351 
352 	    case 10:		/* storage class specifier */
353 		return (storage);
354 
355 	    case 11:		/* typedef */
356 		return (type_def);
357 
358 	    default:		/* all others are treated like any other
359 				 * identifier */
360 		return (ident);
361 	    }			/* end of switch */
362 	}			/* end of if (found_it) */
363 	if (*buf_ptr == '(' && state->tos <= 1 && state->ind_level == 0 &&
364 	    state->in_parameter_declaration == 0 && state->block_init == 0) {
365 	    char *tp = buf_ptr;
366 	    while (tp < buf_end)
367 		if (*tp++ == ')' && (*tp == ';' || *tp == ','))
368 		    goto not_proc;
369 	    strncpy(state->procname, token, sizeof state->procname - 1);
370 	    if (state->in_decl)
371 		state->in_parameter_declaration = 1;
372 	    return (last_code = funcname);
373     not_proc:;
374 	}
375 	/*
376 	 * The following hack attempts to guess whether or not the current
377 	 * token is in fact a declaration keyword -- one that has been
378 	 * typedefd
379 	 */
380 	if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
381 		&& !state->p_l_follow
382 	        && !state->block_init
383 		&& (state->last_token == rparen || state->last_token == semicolon ||
384 		    state->last_token == decl ||
385 		    state->last_token == lbrace || state->last_token == rbrace)) {
386 	    state->keyword = 4;	/* a type name */
387 	    state->last_u_d = true;
388 	    last_code = decl;
389 	    return decl;
390 	}
391 	if (last_code == decl)	/* if this is a declared variable, then
392 				 * following sign is unary */
393 	    state->last_u_d = true;	/* will make "int a -1" work */
394 	last_code = ident;
395 	return (ident);		/* the ident is not in the list */
396     }				/* end of procesing for alpanum character */
397 
398     /* Scan a non-alphanumeric token */
399 
400     *e_token++ = *buf_ptr;		/* if it is only a one-character token, it is
401 				 * moved here */
402     *e_token = '\0';
403     if (++buf_ptr >= buf_end)
404 	fill_buffer();
405 
406     switch (*token) {
407     case '\n':
408 	unary_delim = state->last_u_d;
409 	state->last_nl = true;	/* remember that we just had a newline */
410 	code = (had_eof ? 0 : newline);
411 
412 	/*
413 	 * if data has been exhausted, the newline is a dummy, and we should
414 	 * return code to stop
415 	 */
416 	break;
417 
418     case '\'':			/* start of quoted character */
419     case '"':			/* start of string */
420 	qchar = *token;
421 	if (troff) {
422 	    e_token[-1] = '`';
423 	    if (qchar == '"')
424 		*e_token++ = '`';
425 	    e_token = chfont(&bodyf, &stringf, e_token);
426 	}
427 	do {			/* copy the string */
428 	    while (1) {		/* move one character or [/<char>]<char> */
429 		if (*buf_ptr == '\n') {
430 		    diag2(1, "Unterminated literal");
431 		    goto stop_lit;
432 		}
433 		CHECK_SIZE_TOKEN;	/* Only have to do this once in this loop,
434 					 * since CHECK_SIZE guarantees that there
435 					 * are at least 5 entries left */
436 		*e_token = *buf_ptr++;
437 		if (buf_ptr >= buf_end)
438 		    fill_buffer();
439 		if (*e_token == BACKSLASH) {	/* if escape, copy extra char */
440 		    if (*buf_ptr == '\n')	/* check for escaped newline */
441 			++line_no;
442 		    if (troff) {
443 			*++e_token = BACKSLASH;
444 			if (*buf_ptr == BACKSLASH)
445 			    *++e_token = BACKSLASH;
446 		    }
447 		    *++e_token = *buf_ptr++;
448 		    ++e_token;	/* we must increment this again because we
449 				 * copied two chars */
450 		    if (buf_ptr >= buf_end)
451 			fill_buffer();
452 		}
453 		else
454 		    break;	/* we copied one character */
455 	    }			/* end of while (1) */
456 	} while (*e_token++ != qchar);
457 	if (troff) {
458 	    e_token = chfont(&stringf, &bodyf, e_token - 1);
459 	    if (qchar == '"')
460 		*e_token++ = '\'';
461 	}
462 stop_lit:
463 	code = ident;
464 	break;
465 
466     case ('('):
467     case ('['):
468 	unary_delim = true;
469 	code = lparen;
470 	break;
471 
472     case (')'):
473     case (']'):
474 	code = rparen;
475 	break;
476 
477     case '#':
478 	unary_delim = state->last_u_d;
479 	code = preesc;
480 	break;
481 
482     case '?':
483 	unary_delim = true;
484 	code = question;
485 	break;
486 
487     case (':'):
488 	code = colon;
489 	unary_delim = true;
490 	break;
491 
492     case (';'):
493 	unary_delim = true;
494 	code = semicolon;
495 	break;
496 
497     case ('{'):
498 	unary_delim = true;
499 
500 	/*
501 	 * if (state->in_or_st) state->block_init = 1;
502 	 */
503 	/* ?	code = state->block_init ? lparen : lbrace; */
504 	code = lbrace;
505 	break;
506 
507     case ('}'):
508 	unary_delim = true;
509 	/* ?	code = state->block_init ? rparen : rbrace; */
510 	code = rbrace;
511 	break;
512 
513     case 014:			/* a form feed */
514 	unary_delim = state->last_u_d;
515 	state->last_nl = true;	/* remember this so we can set 'state->col_1'
516 				 * right */
517 	code = form_feed;
518 	break;
519 
520     case (','):
521 	unary_delim = true;
522 	code = comma;
523 	break;
524 
525     case '.':
526 	unary_delim = false;
527 	code = period;
528 	break;
529 
530     case '-':
531     case '+':			/* check for -, +, --, ++ */
532 	code = (state->last_u_d ? unary_op : binary_op);
533 	unary_delim = true;
534 
535 	if (*buf_ptr == token[0]) {
536 	    /* check for doubled character */
537 	    *e_token++ = *buf_ptr++;
538 	    /* buffer overflow will be checked at end of loop */
539 	    if (last_code == ident || last_code == rparen) {
540 		code = (state->last_u_d ? unary_op : postop);
541 		/* check for following ++ or -- */
542 		unary_delim = false;
543 	    }
544 	}
545 	else if (*buf_ptr == '=')
546 	    /* check for operator += */
547 	    *e_token++ = *buf_ptr++;
548 	else if (*buf_ptr == '>') {
549 	    /* check for operator -> */
550 	    *e_token++ = *buf_ptr++;
551 	    unary_delim = false;
552 	    code = unary_op;
553 	    state->want_blank = false;
554 	}
555 	break;			/* buffer overflow will be checked at end of
556 				 * switch */
557 
558     case '=':
559 	if (state->in_or_st)
560 	    state->block_init = 1;
561 #ifdef undef
562 	if (chartype[*buf_ptr] == opchar) {	/* we have two char assignment */
563 	    e_token[-1] = *buf_ptr++;
564 	    if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
565 		*e_token++ = *buf_ptr++;
566 	    *e_token++ = '=';	/* Flip =+ to += */
567 	    *e_token = 0;
568 	}
569 #else
570 	if (*buf_ptr == '=') {/* == */
571 	    *e_token++ = '=';	/* Flip =+ to += */
572 	    buf_ptr++;
573 	    *e_token = 0;
574 	}
575 #endif
576 	code = binary_op;
577 	unary_delim = true;
578 	break;
579 	/* can drop thru!!! */
580 
581     case '>':
582     case '<':
583     case '!':			/* ops like <, <<, <=, !=, etc */
584 	if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
585 	    *e_token++ = *buf_ptr;
586 	    if (++buf_ptr >= buf_end)
587 		fill_buffer();
588 	}
589 	if (*buf_ptr == '=')
590 	    *e_token++ = *buf_ptr++;
591 	code = (state->last_u_d ? unary_op : binary_op);
592 	unary_delim = true;
593 	break;
594 
595     default:
596 	if (token[0] == '/' && *buf_ptr == '*') {
597 	    /* it is start of comment */
598 	    *e_token++ = '*';
599 
600 	    if (++buf_ptr >= buf_end)
601 		fill_buffer();
602 
603 	    code = comment;
604 	    unary_delim = state->last_u_d;
605 	    break;
606 	}
607 	while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
608 	    /*
609 	     * handle ||, &&, etc, and also things as in int *****i
610 	     */
611 	    *e_token++ = *buf_ptr;
612 	    if (++buf_ptr >= buf_end)
613 		fill_buffer();
614 	}
615 	code = (state->last_u_d ? unary_op : binary_op);
616 	unary_delim = true;
617 
618 
619     }				/* end of switch */
620     if (code != newline) {
621 	l_struct = false;
622 	last_code = code;
623     }
624     if (buf_ptr >= buf_end)	/* check for input buffer empty */
625 	fill_buffer();
626     state->last_u_d = unary_delim;
627     *e_token = '\0';		/* null terminate the token */
628     return (code);
629 }
630 
631 void
632 alloc_typenames(void)
633 {
634 
635     typenames = (const char **)malloc(sizeof(typenames[0]) *
636         (typename_count = 16));
637     if (typenames == NULL)
638 	err(1, NULL);
639 }
640 
641 void
642 add_typename(const char *key)
643 {
644     int comparison;
645     const char *copy;
646 
647     if (typename_top + 1 >= typename_count) {
648 	typenames = realloc((void *)typenames,
649 	    sizeof(typenames[0]) * (typename_count *= 2));
650 	if (typenames == NULL)
651 	    err(1, NULL);
652     }
653     if (typename_top == -1)
654 	typenames[++typename_top] = copy = strdup(key);
655     else if ((comparison = strcmp(key, typenames[typename_top])) >= 0) {
656 	/* take advantage of sorted input */
657 	if (comparison == 0)	/* remove duplicates */
658 	    return;
659 	typenames[++typename_top] = copy = strdup(key);
660     }
661     else {
662 	int p;
663 
664 	for (p = 0; (comparison = strcmp(key, typenames[p])) > 0; p++)
665 	    /* find place for the new key */;
666 	if (comparison == 0)	/* remove duplicates */
667 	    return;
668 	memmove(&typenames[p + 1], &typenames[p],
669 	    sizeof(typenames[0]) * (++typename_top - p));
670 	typenames[p] = copy = strdup(key);
671     }
672 
673     if (copy == NULL)
674 	err(1, NULL);
675 }
676