xref: /freebsd/usr.bin/indent/lexi.c (revision 9de29bfb5a3a756a306a37d11462c6656f1953c8)
1 /*-
2  * SPDX-License-Identifier: BSD-4-Clause
3  *
4  * Copyright (c) 1985 Sun Microsystems, Inc.
5  * Copyright (c) 1980, 1993
6  *	The Regents of the University of California.  All rights reserved.
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  * 3. All advertising materials mentioning features or use of this software
18  *    must display the following acknowledgement:
19  *	This product includes software developed by the University of
20  *	California, Berkeley and its contributors.
21  * 4. Neither the name of the University nor the names of its contributors
22  *    may be used to endorse or promote products derived from this software
23  *    without specific prior written permission.
24  *
25  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35  * SUCH DAMAGE.
36  */
37 
38 #if 0
39 #ifndef lint
40 static char sccsid[] = "@(#)lexi.c	8.1 (Berkeley) 6/6/93";
41 #endif /* not lint */
42 #endif
43 #include <sys/cdefs.h>
44 __FBSDID("$FreeBSD$");
45 
46 /*
47  * Here we have the token scanner for indent.  It scans off one token and puts
48  * it in the global variable "token".  It returns a code, indicating the type
49  * of token scanned.
50  */
51 
52 #include <err.h>
53 #include <stdio.h>
54 #include <ctype.h>
55 #include <stdlib.h>
56 #include <string.h>
57 #include "indent_globs.h"
58 #include "indent_codes.h"
59 #include "indent.h"
60 
61 #define alphanum 1
62 #ifdef undef
63 #define opchar 3
64 #endif
65 
/*
 * One reserved-word table entry: the spelling of the word and the small
 * integer class that lexi() switches on.  The string pointer has to remain
 * the first member, since the bsearch() comparator reads each element as if
 * it began with a "const char *".
 */
struct templ {
    const char *rwd;		/* spelling of the reserved word */
    int         rwcode;		/* class code stored in state->keyword */
};
70 
71 /*
72  * This table has to be sorted alphabetically, because it'll be used in binary
73  * search. For the same reason, string must be the first thing in struct templ.
74  */
75 struct templ specials[] =
76 {
77     {"_Bool", 4},
78     {"_Complex", 4},
79     {"_Imaginary", 4},
80     {"auto", 10},
81     {"bool", 4},
82     {"break", 9},
83     {"case", 8},
84     {"char", 4},
85     {"complex", 4},
86     {"const", 4},
87     {"continue", 12},
88     {"default", 8},
89     {"do", 6},
90     {"double", 4},
91     {"else", 6},
92     {"enum", 3},
93     {"extern", 10},
94     {"float", 4},
95     {"for", 5},
96     {"global", 4},
97     {"goto", 9},
98     {"if", 5},
99     {"imaginary", 4},
100     {"inline", 12},
101     {"int", 4},
102     {"long", 4},
103     {"offsetof", 1},
104     {"register", 10},
105     {"restrict", 12},
106     {"return", 9},
107     {"short", 4},
108     {"signed", 4},
109     {"sizeof", 2},
110     {"static", 10},
111     {"struct", 3},
112     {"switch", 7},
113     {"typedef", 11},
114     {"union", 3},
115     {"unsigned", 4},
116     {"void", 4},
117     {"volatile", 4},
118     {"while", 5}
119 };
120 
/*
 * Sorted table of user-supplied type names, searched by lexi() with
 * bsearch() and maintained by alloc_typenames() / add_typename().
 */
const char **typenames;		/* the table itself */
int         typename_count;	/* number of slots allocated in typenames */
int         typename_top = -1;	/* index of the last used slot; -1 when the
				 * table is empty */
124 
/*
 * Character classification table, indexed by (character & 127):
 *   0 - anything else
 *   1 - alphanumeric (letters, digits, '_' and '$')
 *   3 - operator character
 */
char        chartype[128] =
{
    0, 0, 0, 0, 0, 0, 0, 0,	/* 0x00 - 0x07: control characters */
    0, 0, 0, 0, 0, 0, 0, 0,	/* 0x08 - 0x0f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0,	/* 0x10 - 0x17: control characters */
    0, 0, 0, 0, 0, 0, 0, 0,	/* 0x18 - 0x1f: control characters */
    0, 3, 0, 0, 1, 3, 3, 0,	/* space ! " # $ % & ' */
    0, 0, 3, 3, 0, 3, 0, 3,	/* ( ) star + , - . slash */
    1, 1, 1, 1, 1, 1, 1, 1,	/* 0 1 2 3 4 5 6 7 */
    1, 1, 0, 0, 3, 3, 3, 3,	/* 8 9 : ; < = > ? */
    0, 1, 1, 1, 1, 1, 1, 1,	/* @ A B C D E F G */
    1, 1, 1, 1, 1, 1, 1, 1,	/* H I J K L M N O */
    1, 1, 1, 1, 1, 1, 1, 1,	/* P Q R S T U V W */
    1, 1, 1, 0, 0, 0, 3, 1,	/* X Y Z [ backslash ] ^ _ */
    0, 1, 1, 1, 1, 1, 1, 1,	/* ` a b c d e f g */
    1, 1, 1, 1, 1, 1, 1, 1,	/* h i j k l m n o */
    1, 1, 1, 1, 1, 1, 1, 1,	/* p q r s t u v w */
    1, 1, 1, 0, 3, 0, 3, 0	/* x y z { | } ~ DEL */
};
146 
/*
 * bsearch() comparator: e1 is the key, a plain C string; e2 points at a
 * table element whose first member is a "const char *" holding the string
 * to compare against.
 */
static int
strcmp_type(const void *e1, const void *e2)
{
    const char *key = e1;
    const char *const *element = e2;

    return (strcmp(key, *element));
}
152 
153 int
154 lexi(struct parser_state *state)
155 {
156     int         unary_delim;	/* this is set to 1 if the current token
157 				 * forces a following operator to be unary */
158     int         code;		/* internal code to be returned */
159     char        qchar;		/* the delimiter character for a string */
160 
161     e_token = s_token;		/* point to start of place to save token */
162     unary_delim = false;
163     state->col_1 = state->last_nl;	/* tell world that this token started
164 					 * in column 1 iff the last thing
165 					 * scanned was a newline */
166     state->last_nl = false;
167 
168     while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
169 	state->col_1 = false;	/* leading blanks imply token is not in column
170 				 * 1 */
171 	if (++buf_ptr >= buf_end)
172 	    fill_buffer();
173     }
174 
175     /* Scan an alphanumeric token */
176     if (chartype[*buf_ptr & 127] == alphanum ||
177 	(buf_ptr[0] == '.' && isdigit((unsigned char)buf_ptr[1]))) {
178 	/*
179 	 * we have a character or number
180 	 */
181 	struct templ *p;
182 
183 	if (isdigit((unsigned char)*buf_ptr) ||
184 	    (buf_ptr[0] == '.' && isdigit((unsigned char)buf_ptr[1]))) {
185 	    int         seendot = 0,
186 	                seenexp = 0,
187 			seensfx = 0;
188 
189 	    /*
190 	     * base 2, base 8, base 16:
191 	     */
192 	    if (buf_ptr[0] == '0' && buf_ptr[1] != '.') {
193 		int len;
194 
195 		if (buf_ptr[1] == 'b' || buf_ptr[1] == 'B')
196 		    len = strspn(buf_ptr + 2, "01") + 2;
197 		else if (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')
198 		    len = strspn(buf_ptr + 2, "0123456789ABCDEFabcdef") + 2;
199 		else
200 		    len = strspn(buf_ptr + 1, "012345678") + 1;
201 		if (len > 0) {
202 		    CHECK_SIZE_TOKEN(len);
203 		    memcpy(e_token, buf_ptr, len);
204 		    e_token += len;
205 		    buf_ptr += len;
206 		}
207 		else
208 		    diag2(1, "Unterminated literal");
209 	    }
210 	    else		/* base 10: */
211 		while (1) {
212 		    if (*buf_ptr == '.') {
213 			if (seendot)
214 			    break;
215 			else
216 			    seendot++;
217 		    }
218 		    CHECK_SIZE_TOKEN(3);
219 		    *e_token++ = *buf_ptr++;
220 		    if (!isdigit((unsigned char)*buf_ptr) && *buf_ptr != '.') {
221 			if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
222 			    break;
223 			else {
224 			    seenexp++;
225 			    seendot++;
226 			    *e_token++ = *buf_ptr++;
227 			    if (*buf_ptr == '+' || *buf_ptr == '-')
228 				*e_token++ = *buf_ptr++;
229 			}
230 		    }
231 		}
232 
233 	    while (1) {
234 		CHECK_SIZE_TOKEN(2);
235 		if (!(seensfx & 1) && (*buf_ptr == 'U' || *buf_ptr == 'u')) {
236 		    *e_token++ = *buf_ptr++;
237 		    seensfx |= 1;
238 		    continue;
239 		}
240 		if (!(seensfx & 2) && (strchr("fFlL", *buf_ptr) != NULL)) {
241 		    if (buf_ptr[1] == buf_ptr[0])
242 		        *e_token++ = *buf_ptr++;
243 		    *e_token++ = *buf_ptr++;
244 		    seensfx |= 2;
245 		    continue;
246 		}
247 		break;
248 	    }
249 	}
250 	else
251 	    while (chartype[*buf_ptr & 127] == alphanum || *buf_ptr == BACKSLASH) {
252 		/* fill_buffer() terminates buffer with newline */
253 		if (*buf_ptr == BACKSLASH) {
254 		    if (*(buf_ptr + 1) == '\n') {
255 			buf_ptr += 2;
256 			if (buf_ptr >= buf_end)
257 			    fill_buffer();
258 			} else
259 			    break;
260 		}
261 		CHECK_SIZE_TOKEN(1);
262 		/* copy it over */
263 		*e_token++ = *buf_ptr++;
264 		if (buf_ptr >= buf_end)
265 		    fill_buffer();
266 	    }
267 	*e_token = '\0';
268 
269 	if (s_token[0] == 'L' && s_token[1] == '\0' &&
270 	      (*buf_ptr == '"' || *buf_ptr == '\''))
271 	    return (strpfx);
272 
273 	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
274 	    if (++buf_ptr >= buf_end)
275 		fill_buffer();
276 	}
277 	state->keyword = 0;
278 	if (state->last_token == structure && !state->p_l_follow) {
279 				/* if last token was 'struct' and we're not
280 				 * in parentheses, then this token
281 				 * should be treated as a declaration */
282 	    state->last_u_d = true;
283 	    return (decl);
284 	}
285 	/*
286 	 * Operator after identifier is binary unless last token was 'struct'
287 	 */
288 	state->last_u_d = (state->last_token == structure);
289 
290 	p = bsearch(s_token,
291 	    specials,
292 	    sizeof(specials) / sizeof(specials[0]),
293 	    sizeof(specials[0]),
294 	    strcmp_type);
295 	if (p == NULL) {	/* not a special keyword... */
296 	    char *u;
297 
298 	    /* ... so maybe a type_t or a typedef */
299 	    if ((auto_typedefs && ((u = strrchr(s_token, '_')) != NULL) &&
300 	        strcmp(u, "_t") == 0) || (typename_top >= 0 &&
301 		  bsearch(s_token, typenames, typename_top + 1,
302 		    sizeof(typenames[0]), strcmp_type))) {
303 		state->keyword = 4;	/* a type name */
304 		state->last_u_d = true;
305 	        goto found_typename;
306 	    }
307 	} else {			/* we have a keyword */
308 	    state->keyword = p->rwcode;
309 	    state->last_u_d = true;
310 	    switch (p->rwcode) {
311 	    case 7:		/* it is a switch */
312 		return (swstmt);
313 	    case 8:		/* a case or default */
314 		return (casestmt);
315 
316 	    case 3:		/* a "struct" */
317 		/* FALLTHROUGH */
318 	    case 4:		/* one of the declaration keywords */
319 	    found_typename:
320 		if (state->p_l_follow) {
321 		    /* inside parens: cast, param list, offsetof or sizeof */
322 		    state->cast_mask |= (1 << state->p_l_follow) & ~state->not_cast_mask;
323 		}
324 		if (p != NULL && p->rwcode == 3)
325 		    return (structure);
326 		if (state->p_l_follow)
327 		    break;
328 		return (decl);
329 
330 	    case 5:		/* if, while, for */
331 		return (sp_paren);
332 
333 	    case 6:		/* do, else */
334 		return (sp_nparen);
335 
336 	    case 10:		/* storage class specifier */
337 		return (storage);
338 
339 	    case 11:		/* typedef */
340 		return (type_def);
341 
342 	    default:		/* all others are treated like any other
343 				 * identifier */
344 		return (ident);
345 	    }			/* end of switch */
346 	}			/* end of if (found_it) */
347 	if (*buf_ptr == '(' && state->tos <= 1 && state->ind_level == 0 &&
348 	    state->in_parameter_declaration == 0 && state->block_init == 0) {
349 	    char *tp = buf_ptr;
350 	    while (tp < buf_end)
351 		if (*tp++ == ')' && (*tp == ';' || *tp == ','))
352 		    goto not_proc;
353 	    strncpy(state->procname, token, sizeof state->procname - 1);
354 	    if (state->in_decl)
355 		state->in_parameter_declaration = 1;
356 	    return (funcname);
357     not_proc:;
358 	}
359 	/*
360 	 * The following hack attempts to guess whether or not the current
361 	 * token is in fact a declaration keyword -- one that has been
362 	 * typedefd
363 	 */
364 	else if (!state->p_l_follow && !state->block_init &&
365 	    !state->in_stmt &&
366 	    ((*buf_ptr == '*' && buf_ptr[1] != '=') ||
367 		isalpha((unsigned char)*buf_ptr)) &&
368 	    (state->last_token == semicolon || state->last_token == lbrace ||
369 		state->last_token == rbrace)) {
370 	    state->keyword = 4;	/* a type name */
371 	    state->last_u_d = true;
372 	    return decl;
373 	}
374 	if (state->last_token == decl)	/* if this is a declared variable,
375 					 * then following sign is unary */
376 	    state->last_u_d = true;	/* will make "int a -1" work */
377 	return (ident);		/* the ident is not in the list */
378     }				/* end of procesing for alpanum character */
379 
380     /* Scan a non-alphanumeric token */
381 
382     CHECK_SIZE_TOKEN(3);		/* things like "<<=" */
383     *e_token++ = *buf_ptr;		/* if it is only a one-character token, it is
384 				 * moved here */
385     *e_token = '\0';
386     if (++buf_ptr >= buf_end)
387 	fill_buffer();
388 
389     switch (*token) {
390     case '\n':
391 	unary_delim = state->last_u_d;
392 	state->last_nl = true;	/* remember that we just had a newline */
393 	code = (had_eof ? 0 : newline);
394 
395 	/*
396 	 * if data has been exhausted, the newline is a dummy, and we should
397 	 * return code to stop
398 	 */
399 	break;
400 
401     case '\'':			/* start of quoted character */
402     case '"':			/* start of string */
403 	qchar = *token;
404 	do {			/* copy the string */
405 	    while (1) {		/* move one character or [/<char>]<char> */
406 		if (*buf_ptr == '\n') {
407 		    diag2(1, "Unterminated literal");
408 		    goto stop_lit;
409 		}
410 		CHECK_SIZE_TOKEN(2);
411 		*e_token = *buf_ptr++;
412 		if (buf_ptr >= buf_end)
413 		    fill_buffer();
414 		if (*e_token == BACKSLASH) {	/* if escape, copy extra char */
415 		    if (*buf_ptr == '\n')	/* check for escaped newline */
416 			++line_no;
417 		    *++e_token = *buf_ptr++;
418 		    ++e_token;	/* we must increment this again because we
419 				 * copied two chars */
420 		    if (buf_ptr >= buf_end)
421 			fill_buffer();
422 		}
423 		else
424 		    break;	/* we copied one character */
425 	    }			/* end of while (1) */
426 	} while (*e_token++ != qchar);
427 stop_lit:
428 	code = ident;
429 	break;
430 
431     case ('('):
432     case ('['):
433 	unary_delim = true;
434 	code = lparen;
435 	break;
436 
437     case (')'):
438     case (']'):
439 	code = rparen;
440 	break;
441 
442     case '#':
443 	unary_delim = state->last_u_d;
444 	code = preesc;
445 	break;
446 
447     case '?':
448 	unary_delim = true;
449 	code = question;
450 	break;
451 
452     case (':'):
453 	code = colon;
454 	unary_delim = true;
455 	break;
456 
457     case (';'):
458 	unary_delim = true;
459 	code = semicolon;
460 	break;
461 
462     case ('{'):
463 	unary_delim = true;
464 
465 	/*
466 	 * if (state->in_or_st) state->block_init = 1;
467 	 */
468 	/* ?	code = state->block_init ? lparen : lbrace; */
469 	code = lbrace;
470 	break;
471 
472     case ('}'):
473 	unary_delim = true;
474 	/* ?	code = state->block_init ? rparen : rbrace; */
475 	code = rbrace;
476 	break;
477 
478     case 014:			/* a form feed */
479 	unary_delim = state->last_u_d;
480 	state->last_nl = true;	/* remember this so we can set 'state->col_1'
481 				 * right */
482 	code = form_feed;
483 	break;
484 
485     case (','):
486 	unary_delim = true;
487 	code = comma;
488 	break;
489 
490     case '.':
491 	unary_delim = false;
492 	code = period;
493 	break;
494 
495     case '-':
496     case '+':			/* check for -, +, --, ++ */
497 	code = (state->last_u_d ? unary_op : binary_op);
498 	unary_delim = true;
499 
500 	if (*buf_ptr == token[0]) {
501 	    /* check for doubled character */
502 	    *e_token++ = *buf_ptr++;
503 	    /* buffer overflow will be checked at end of loop */
504 	    if (state->last_token == ident || state->last_token == rparen) {
505 		code = (state->last_u_d ? unary_op : postop);
506 		/* check for following ++ or -- */
507 		unary_delim = false;
508 	    }
509 	}
510 	else if (*buf_ptr == '=')
511 	    /* check for operator += */
512 	    *e_token++ = *buf_ptr++;
513 	else if (*buf_ptr == '>') {
514 	    /* check for operator -> */
515 	    *e_token++ = *buf_ptr++;
516 	    unary_delim = false;
517 	    code = unary_op;
518 	    state->want_blank = false;
519 	}
520 	break;			/* buffer overflow will be checked at end of
521 				 * switch */
522 
523     case '=':
524 	if (state->in_or_st)
525 	    state->block_init = 1;
526 #ifdef undef
527 	if (chartype[*buf_ptr & 127] == opchar) {	/* we have two char assignment */
528 	    e_token[-1] = *buf_ptr++;
529 	    if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
530 		*e_token++ = *buf_ptr++;
531 	    *e_token++ = '=';	/* Flip =+ to += */
532 	    *e_token = 0;
533 	}
534 #else
535 	if (*buf_ptr == '=') {/* == */
536 	    *e_token++ = '=';	/* Flip =+ to += */
537 	    buf_ptr++;
538 	    *e_token = 0;
539 	}
540 #endif
541 	code = binary_op;
542 	unary_delim = true;
543 	break;
544 	/* can drop thru!!! */
545 
546     case '>':
547     case '<':
548     case '!':			/* ops like <, <<, <=, !=, etc */
549 	if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
550 	    *e_token++ = *buf_ptr;
551 	    if (++buf_ptr >= buf_end)
552 		fill_buffer();
553 	}
554 	if (*buf_ptr == '=')
555 	    *e_token++ = *buf_ptr++;
556 	code = (state->last_u_d ? unary_op : binary_op);
557 	unary_delim = true;
558 	break;
559 
560     case '*':
561 	unary_delim = true;
562 	if (!state->last_u_d) {
563 	    if (*buf_ptr == '=')
564 		*e_token++ = *buf_ptr++;
565 	    code = binary_op;
566 	    break;
567 	}
568 	while (*buf_ptr == '*' || isspace((unsigned char)*buf_ptr)) {
569 	    if (*buf_ptr == '*') {
570 		CHECK_SIZE_TOKEN(1);
571 		*e_token++ = *buf_ptr;
572 	    }
573 	    if (++buf_ptr >= buf_end)
574 		fill_buffer();
575 	}
576 	if (ps.in_decl) {
577 	    char *tp = buf_ptr;
578 
579 	    while (isalpha((unsigned char)*tp) ||
580 		   isspace((unsigned char)*tp)) {
581 		if (++tp >= buf_end)
582 		    fill_buffer();
583 	    }
584 	    if (*tp == '(')
585 		ps.procname[0] = ' ';
586 	}
587 	code = unary_op;
588 	break;
589 
590     default:
591 	if (token[0] == '/' && *buf_ptr == '*') {
592 	    /* it is start of comment */
593 	    *e_token++ = '*';
594 
595 	    if (++buf_ptr >= buf_end)
596 		fill_buffer();
597 
598 	    code = comment;
599 	    unary_delim = state->last_u_d;
600 	    break;
601 	}
602 	while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
603 	    /*
604 	     * handle ||, &&, etc, and also things as in int *****i
605 	     */
606 	    CHECK_SIZE_TOKEN(1);
607 	    *e_token++ = *buf_ptr;
608 	    if (++buf_ptr >= buf_end)
609 		fill_buffer();
610 	}
611 	code = (state->last_u_d ? unary_op : binary_op);
612 	unary_delim = true;
613 
614 
615     }				/* end of switch */
616     if (buf_ptr >= buf_end)	/* check for input buffer empty */
617 	fill_buffer();
618     state->last_u_d = unary_delim;
619     CHECK_SIZE_TOKEN(1);
620     *e_token = '\0';		/* null terminate the token */
621     return (code);
622 }
623 
624 void
625 alloc_typenames(void)
626 {
627 
628     typenames = (const char **)malloc(sizeof(typenames[0]) *
629         (typename_count = 16));
630     if (typenames == NULL)
631 	err(1, NULL);
632 }
633 
634 void
635 add_typename(const char *key)
636 {
637     int comparison;
638     const char *copy;
639 
640     if (typename_top + 1 >= typename_count) {
641 	typenames = realloc((void *)typenames,
642 	    sizeof(typenames[0]) * (typename_count *= 2));
643 	if (typenames == NULL)
644 	    err(1, NULL);
645     }
646     if (typename_top == -1)
647 	typenames[++typename_top] = copy = strdup(key);
648     else if ((comparison = strcmp(key, typenames[typename_top])) >= 0) {
649 	/* take advantage of sorted input */
650 	if (comparison == 0)	/* remove duplicates */
651 	    return;
652 	typenames[++typename_top] = copy = strdup(key);
653     }
654     else {
655 	int p;
656 
657 	for (p = 0; (comparison = strcmp(key, typenames[p])) > 0; p++)
658 	    /* find place for the new key */;
659 	if (comparison == 0)	/* remove duplicates */
660 	    return;
661 	memmove(&typenames[p + 1], &typenames[p],
662 	    sizeof(typenames[0]) * (++typename_top - p));
663 	typenames[p] = copy = strdup(key);
664     }
665 
666     if (copy == NULL)
667 	err(1, NULL);
668 }
669