xref: /freebsd/usr.bin/indent/lexi.c (revision 5bdd8509689f8c3b3d2abef4159b7b2b36980470)
1 /*-
2  * SPDX-License-Identifier: BSD-4-Clause
3  *
4  * Copyright (c) 1985 Sun Microsystems, Inc.
5  * Copyright (c) 1980, 1993
6  *	The Regents of the University of California.  All rights reserved.
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  * 3. All advertising materials mentioning features or use of this software
18  *    must display the following acknowledgement:
19  *	This product includes software developed by the University of
20  *	California, Berkeley and its contributors.
21  * 4. Neither the name of the University nor the names of its contributors
22  *    may be used to endorse or promote products derived from this software
23  *    without specific prior written permission.
24  *
25  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35  * SUCH DAMAGE.
36  */
37 
38 #if 0
39 #ifndef lint
40 static char sccsid[] = "@(#)lexi.c	8.1 (Berkeley) 6/6/93";
41 #endif /* not lint */
42 #endif
43 #include <sys/cdefs.h>
44 __FBSDID("$FreeBSD$");
45 
46 /*
47  * Here we have the token scanner for indent.  It scans off one token and puts
48  * it in the global variable "token".  It returns a code, indicating the type
49  * of token scanned.
50  */
51 
52 #include <err.h>
53 #include <stdio.h>
54 #include <ctype.h>
55 #include <stdlib.h>
56 #include <string.h>
57 #include "indent_globs.h"
58 #include "indent_codes.h"
59 #include "indent.h"
60 
/* Class value stored in chartype[] for characters that may appear in a word */
#define alphanum 1
/*
 * Class value stored in chartype[] for operator characters.  The name is
 * only defined when "undef" is defined, so the code that refers to it is
 * normally compiled out.
 */
#if defined(undef)
#define opchar 3
#endif
65 
/*
 * One reserved word together with the code lexi() uses to classify it.
 * strcmp_type() reads a table element as a pointer to a string, so rwd
 * has to remain the first member.
 */
struct templ {
    const char *rwd;		/* spelling of the reserved word */
    int         rwcode;		/* class of the word, see the list below */
};

/*
 * Reserved words known to lexi().  The table is searched with bsearch(),
 * so the entries must stay in ascending strcmp() order.
 *
 * Meaning of rwcode as handled by the switch in lexi():
 *	 3	struct, union, enum
 *	 4	declaration keyword
 *	 5	if, while, for
 *	 6	do, else
 *	 7	switch
 *	 8	case, default
 *	10	storage class specifier
 *	11	typedef
 * Every other value (1, 2 and 9 below) is returned as a plain identifier.
 */
struct templ specials[] =
{
    { .rwd = "auto",     .rwcode = 10 },
    { .rwd = "break",    .rwcode = 9 },
    { .rwd = "case",     .rwcode = 8 },
    { .rwd = "char",     .rwcode = 4 },
    { .rwd = "const",    .rwcode = 4 },
    { .rwd = "default",  .rwcode = 8 },
    { .rwd = "do",       .rwcode = 6 },
    { .rwd = "double",   .rwcode = 4 },
    { .rwd = "else",     .rwcode = 6 },
    { .rwd = "enum",     .rwcode = 3 },
    { .rwd = "extern",   .rwcode = 10 },
    { .rwd = "float",    .rwcode = 4 },
    { .rwd = "for",      .rwcode = 5 },
    { .rwd = "global",   .rwcode = 4 },
    { .rwd = "goto",     .rwcode = 9 },
    { .rwd = "if",       .rwcode = 5 },
    { .rwd = "int",      .rwcode = 4 },
    { .rwd = "long",     .rwcode = 4 },
    { .rwd = "offsetof", .rwcode = 1 },
    { .rwd = "register", .rwcode = 10 },
    { .rwd = "return",   .rwcode = 9 },
    { .rwd = "short",    .rwcode = 4 },
    { .rwd = "sizeof",   .rwcode = 2 },
    { .rwd = "static",   .rwcode = 10 },
    { .rwd = "struct",   .rwcode = 3 },
    { .rwd = "switch",   .rwcode = 7 },
    { .rwd = "typedef",  .rwcode = 11 },
    { .rwd = "union",    .rwcode = 3 },
    { .rwd = "unsigned", .rwcode = 4 },
    { .rwd = "void",     .rwcode = 4 },
    { .rwd = "volatile", .rwcode = 4 },
    { .rwd = "while",    .rwcode = 5 }
};
110 
/*
 * Table of user-supplied type names.  add_typename() keeps it sorted so
 * that lexi() can search it with bsearch().
 */
const char **typenames;		/* the array of names itself */
int         typename_top = -1;	/* index of the last used slot, -1 while
				 * the table is empty */
int         typename_count;	/* number of slots currently allocated */
114 
/*
 * Character classes for the codes 0..127, used to decide what kind of
 * token a character can start or continue:
 *	0	neither of the classes below
 *	1	alphanumeric: digits, letters, '_' and '$'
 *	3	operator character
 * Each row below covers sixteen consecutive character codes.
 */
char        chartype[128] =
{
    /* 0x00 - 0x0f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 - 0x1f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 - 0x2f */ 0, 3, 0, 0, 1, 3, 3, 0, 0, 0, 3, 3, 0, 3, 0, 3,
    /* 0x30 - 0x3f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 3, 3, 3,
    /* 0x40 - 0x4f */ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 - 0x5f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 1,
    /* 0x60 - 0x6f */ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 - 0x7f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 0, 3, 0
};
136 
/*
 * Comparison callback for bsearch(): e1 is the string being looked up and
 * e2 points at a table element whose first (or only) member is a pointer
 * to the string to compare against.  Returns the strcmp() result.
 */
static int
strcmp_type(const void *e1, const void *e2)
{
    const char *wanted = e1;
    const char *const *element = e2;

    return (strcmp(wanted, *element));
}
142 
143 int
144 lexi(void)
145 {
146     int         unary_delim;	/* this is set to 1 if the current token
147 				 * forces a following operator to be unary */
148     static int  last_code;	/* the last token type returned */
149     static int  l_struct;	/* set to 1 if the last token was 'struct' */
150     int         code;		/* internal code to be returned */
151     char        qchar;		/* the delimiter character for a string */
152 
153     e_token = s_token;		/* point to start of place to save token */
154     unary_delim = false;
155     ps.col_1 = ps.last_nl;	/* tell world that this token started in
156 				 * column 1 iff the last thing scanned was nl */
157     ps.last_nl = false;
158 
159     while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
160 	ps.col_1 = false;	/* leading blanks imply token is not in column
161 				 * 1 */
162 	if (++buf_ptr >= buf_end)
163 	    fill_buffer();
164     }
165 
166     /* Scan an alphanumeric token */
167     if (chartype[(int)*buf_ptr] == alphanum || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
168 	/*
169 	 * we have a character or number
170 	 */
171 	struct templ *p;
172 
173 	if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
174 	    enum base {
175 		BASE_2, BASE_8, BASE_10, BASE_16
176 	    };
177 	    int         seendot = 0,
178 	                seenexp = 0,
179 			seensfx = 0;
180 	    enum base	in_base = BASE_10;
181 
182 	    if (*buf_ptr == '0') {
183 		if (buf_ptr[1] == 'b' || buf_ptr[1] == 'B')
184 		    in_base = BASE_2;
185 		else if (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')
186 		    in_base = BASE_16;
187 		else if (isdigit(buf_ptr[1]))
188 		    in_base = BASE_8;
189 	    }
190 	    switch (in_base) {
191 	    case BASE_2:
192 		*e_token++ = *buf_ptr++;
193 		*e_token++ = *buf_ptr++;
194 		while (*buf_ptr == '0' || *buf_ptr == '1') {
195 		    CHECK_SIZE_TOKEN;
196 		    *e_token++ = *buf_ptr++;
197 		}
198 		break;
199 	    case BASE_8:
200 		*e_token++ = *buf_ptr++;
201 		while (*buf_ptr >= '0' && *buf_ptr <= '8') {
202 		    CHECK_SIZE_TOKEN;
203 		    *e_token++ = *buf_ptr++;
204 		}
205 		break;
206 	    case BASE_16:
207 		*e_token++ = *buf_ptr++;
208 		*e_token++ = *buf_ptr++;
209 		while (isxdigit(*buf_ptr)) {
210 		    CHECK_SIZE_TOKEN;
211 		    *e_token++ = *buf_ptr++;
212 		}
213 		break;
214 	    case BASE_10:
215 		while (1) {
216 		    if (*buf_ptr == '.') {
217 			if (seendot)
218 			    break;
219 			else
220 			    seendot++;
221 		    }
222 		    CHECK_SIZE_TOKEN;
223 		    *e_token++ = *buf_ptr++;
224 		    if (!isdigit(*buf_ptr) && *buf_ptr != '.') {
225 			if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
226 			    break;
227 			else {
228 			    seenexp++;
229 			    seendot++;
230 			    CHECK_SIZE_TOKEN;
231 			    *e_token++ = *buf_ptr++;
232 			    if (*buf_ptr == '+' || *buf_ptr == '-')
233 				*e_token++ = *buf_ptr++;
234 			}
235 		    }
236 		}
237 		break;
238 	    }
239 	    while (1) {
240 		if (!(seensfx & 1) && (*buf_ptr == 'U' || *buf_ptr == 'u')) {
241 		    CHECK_SIZE_TOKEN;
242 		    *e_token++ = *buf_ptr++;
243 		    seensfx |= 1;
244 		    continue;
245 		}
246 		if (!(seensfx & 2) && (strchr("fFlL", *buf_ptr) != NULL)) {
247 		    CHECK_SIZE_TOKEN;
248 		    if (buf_ptr[1] == buf_ptr[0])
249 		        *e_token++ = *buf_ptr++;
250 		    *e_token++ = *buf_ptr++;
251 		    seensfx |= 2;
252 		    continue;
253 		}
254 		break;
255 	    }
256 	}
257 	else
258 	    while (chartype[(int)*buf_ptr] == alphanum || *buf_ptr == BACKSLASH) {
259 		/* fill_buffer() terminates buffer with newline */
260 		if (*buf_ptr == BACKSLASH) {
261 		    if (*(buf_ptr + 1) == '\n') {
262 			buf_ptr += 2;
263 			if (buf_ptr >= buf_end)
264 			    fill_buffer();
265 			} else
266 			    break;
267 		}
268 		CHECK_SIZE_TOKEN;
269 		/* copy it over */
270 		*e_token++ = *buf_ptr++;
271 		if (buf_ptr >= buf_end)
272 		    fill_buffer();
273 	    }
274 	*e_token++ = '\0';
275 
276 	if (s_token[0] == 'L' && s_token[1] == '\0' &&
277 	      (*buf_ptr == '"' || *buf_ptr == '\''))
278 	    return (strpfx);
279 
280 	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
281 	    if (++buf_ptr >= buf_end)
282 		fill_buffer();
283 	}
284 	ps.keyword = 0;
285 	if (l_struct && !ps.p_l_follow) {
286 				/* if last token was 'struct' and we're not
287 				 * in parentheses, then this token
288 				 * should be treated as a declaration */
289 	    l_struct = false;
290 	    last_code = ident;
291 	    ps.last_u_d = true;
292 	    return (decl);
293 	}
294 	ps.last_u_d = l_struct;	/* Operator after identifier is binary
295 				 * unless last token was 'struct' */
296 	l_struct = false;
297 	last_code = ident;	/* Remember that this is the code we will
298 				 * return */
299 
300 	p = bsearch(s_token,
301 	    specials,
302 	    sizeof(specials) / sizeof(specials[0]),
303 	    sizeof(specials[0]),
304 	    strcmp_type);
305 	if (p == NULL) {	/* not a special keyword... */
306 	    char *u;
307 
308 	    /* ... so maybe a type_t or a typedef */
309 	    if ((auto_typedefs && ((u = strrchr(s_token, '_')) != NULL) &&
310 	        strcmp(u, "_t") == 0) || (typename_top >= 0 &&
311 		  bsearch(s_token, typenames, typename_top + 1,
312 		    sizeof(typenames[0]), strcmp_type))) {
313 		ps.keyword = 4;	/* a type name */
314 		ps.last_u_d = true;
315 	        goto found_typename;
316 	    }
317 	} else {			/* we have a keyword */
318 	    ps.keyword = p->rwcode;
319 	    ps.last_u_d = true;
320 	    switch (p->rwcode) {
321 	    case 7:		/* it is a switch */
322 		return (swstmt);
323 	    case 8:		/* a case or default */
324 		return (casestmt);
325 
326 	    case 3:		/* a "struct" */
327 		/*
328 		 * Next time around, we will want to know that we have had a
329 		 * 'struct'
330 		 */
331 		l_struct = true;
332 		/* FALLTHROUGH */
333 
334 	    case 4:		/* one of the declaration keywords */
335 	    found_typename:
336 		if (ps.p_l_follow) {
337 		    /* inside parens: cast, param list, offsetof or sizeof */
338 		    ps.cast_mask |= (1 << ps.p_l_follow) & ~ps.not_cast_mask;
339 		    break;
340 		}
341 		last_code = decl;
342 		return (decl);
343 
344 	    case 5:		/* if, while, for */
345 		return (sp_paren);
346 
347 	    case 6:		/* do, else */
348 		return (sp_nparen);
349 
350 	    case 10:		/* storage class specifier */
351 		return (storage);
352 
353 	    case 11:		/* typedef */
354 		return (type_def);
355 
356 	    default:		/* all others are treated like any other
357 				 * identifier */
358 		return (ident);
359 	    }			/* end of switch */
360 	}			/* end of if (found_it) */
361 	if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0 &&
362 	    ps.in_parameter_declaration == 0 && ps.block_init == 0) {
363 	    char *tp = buf_ptr;
364 	    while (tp < buf_end)
365 		if (*tp++ == ')' && (*tp == ';' || *tp == ','))
366 		    goto not_proc;
367 	    strncpy(ps.procname, token, sizeof ps.procname - 1);
368 	    if (ps.in_decl)
369 		ps.in_parameter_declaration = 1;
370 	    return (last_code = funcname);
371     not_proc:;
372 	}
373 	/*
374 	 * The following hack attempts to guess whether or not the current
375 	 * token is in fact a declaration keyword -- one that has been
376 	 * typedefd
377 	 */
378 	if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
379 		&& !ps.p_l_follow
380 	        && !ps.block_init
381 		&& (ps.last_token == rparen || ps.last_token == semicolon ||
382 		    ps.last_token == decl ||
383 		    ps.last_token == lbrace || ps.last_token == rbrace)) {
384 	    ps.keyword = 4;	/* a type name */
385 	    ps.last_u_d = true;
386 	    last_code = decl;
387 	    return decl;
388 	}
389 	if (last_code == decl)	/* if this is a declared variable, then
390 				 * following sign is unary */
391 	    ps.last_u_d = true;	/* will make "int a -1" work */
392 	last_code = ident;
393 	return (ident);		/* the ident is not in the list */
394     }				/* end of procesing for alpanum character */
395 
396     /* Scan a non-alphanumeric token */
397 
398     *e_token++ = *buf_ptr;		/* if it is only a one-character token, it is
399 				 * moved here */
400     *e_token = '\0';
401     if (++buf_ptr >= buf_end)
402 	fill_buffer();
403 
404     switch (*token) {
405     case '\n':
406 	unary_delim = ps.last_u_d;
407 	ps.last_nl = true;	/* remember that we just had a newline */
408 	code = (had_eof ? 0 : newline);
409 
410 	/*
411 	 * if data has been exhausted, the newline is a dummy, and we should
412 	 * return code to stop
413 	 */
414 	break;
415 
416     case '\'':			/* start of quoted character */
417     case '"':			/* start of string */
418 	qchar = *token;
419 	if (troff) {
420 	    e_token[-1] = '`';
421 	    if (qchar == '"')
422 		*e_token++ = '`';
423 	    e_token = chfont(&bodyf, &stringf, e_token);
424 	}
425 	do {			/* copy the string */
426 	    while (1) {		/* move one character or [/<char>]<char> */
427 		if (*buf_ptr == '\n') {
428 		    diag2(1, "Unterminated literal");
429 		    goto stop_lit;
430 		}
431 		CHECK_SIZE_TOKEN;	/* Only have to do this once in this loop,
432 					 * since CHECK_SIZE guarantees that there
433 					 * are at least 5 entries left */
434 		*e_token = *buf_ptr++;
435 		if (buf_ptr >= buf_end)
436 		    fill_buffer();
437 		if (*e_token == BACKSLASH) {	/* if escape, copy extra char */
438 		    if (*buf_ptr == '\n')	/* check for escaped newline */
439 			++line_no;
440 		    if (troff) {
441 			*++e_token = BACKSLASH;
442 			if (*buf_ptr == BACKSLASH)
443 			    *++e_token = BACKSLASH;
444 		    }
445 		    *++e_token = *buf_ptr++;
446 		    ++e_token;	/* we must increment this again because we
447 				 * copied two chars */
448 		    if (buf_ptr >= buf_end)
449 			fill_buffer();
450 		}
451 		else
452 		    break;	/* we copied one character */
453 	    }			/* end of while (1) */
454 	} while (*e_token++ != qchar);
455 	if (troff) {
456 	    e_token = chfont(&stringf, &bodyf, e_token - 1);
457 	    if (qchar == '"')
458 		*e_token++ = '\'';
459 	}
460 stop_lit:
461 	code = ident;
462 	break;
463 
464     case ('('):
465     case ('['):
466 	unary_delim = true;
467 	code = lparen;
468 	break;
469 
470     case (')'):
471     case (']'):
472 	code = rparen;
473 	break;
474 
475     case '#':
476 	unary_delim = ps.last_u_d;
477 	code = preesc;
478 	break;
479 
480     case '?':
481 	unary_delim = true;
482 	code = question;
483 	break;
484 
485     case (':'):
486 	code = colon;
487 	unary_delim = true;
488 	break;
489 
490     case (';'):
491 	unary_delim = true;
492 	code = semicolon;
493 	break;
494 
495     case ('{'):
496 	unary_delim = true;
497 
498 	/*
499 	 * if (ps.in_or_st) ps.block_init = 1;
500 	 */
501 	/* ?	code = ps.block_init ? lparen : lbrace; */
502 	code = lbrace;
503 	break;
504 
505     case ('}'):
506 	unary_delim = true;
507 	/* ?	code = ps.block_init ? rparen : rbrace; */
508 	code = rbrace;
509 	break;
510 
511     case 014:			/* a form feed */
512 	unary_delim = ps.last_u_d;
513 	ps.last_nl = true;	/* remember this so we can set 'ps.col_1'
514 				 * right */
515 	code = form_feed;
516 	break;
517 
518     case (','):
519 	unary_delim = true;
520 	code = comma;
521 	break;
522 
523     case '.':
524 	unary_delim = false;
525 	code = period;
526 	break;
527 
528     case '-':
529     case '+':			/* check for -, +, --, ++ */
530 	code = (ps.last_u_d ? unary_op : binary_op);
531 	unary_delim = true;
532 
533 	if (*buf_ptr == token[0]) {
534 	    /* check for doubled character */
535 	    *e_token++ = *buf_ptr++;
536 	    /* buffer overflow will be checked at end of loop */
537 	    if (last_code == ident || last_code == rparen) {
538 		code = (ps.last_u_d ? unary_op : postop);
539 		/* check for following ++ or -- */
540 		unary_delim = false;
541 	    }
542 	}
543 	else if (*buf_ptr == '=')
544 	    /* check for operator += */
545 	    *e_token++ = *buf_ptr++;
546 	else if (*buf_ptr == '>') {
547 	    /* check for operator -> */
548 	    *e_token++ = *buf_ptr++;
549 	    unary_delim = false;
550 	    code = unary_op;
551 	    ps.want_blank = false;
552 	}
553 	break;			/* buffer overflow will be checked at end of
554 				 * switch */
555 
556     case '=':
557 	if (ps.in_or_st)
558 	    ps.block_init = 1;
559 #ifdef undef
560 	if (chartype[*buf_ptr] == opchar) {	/* we have two char assignment */
561 	    e_token[-1] = *buf_ptr++;
562 	    if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
563 		*e_token++ = *buf_ptr++;
564 	    *e_token++ = '=';	/* Flip =+ to += */
565 	    *e_token = 0;
566 	}
567 #else
568 	if (*buf_ptr == '=') {/* == */
569 	    *e_token++ = '=';	/* Flip =+ to += */
570 	    buf_ptr++;
571 	    *e_token = 0;
572 	}
573 #endif
574 	code = binary_op;
575 	unary_delim = true;
576 	break;
577 	/* can drop thru!!! */
578 
579     case '>':
580     case '<':
581     case '!':			/* ops like <, <<, <=, !=, etc */
582 	if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
583 	    *e_token++ = *buf_ptr;
584 	    if (++buf_ptr >= buf_end)
585 		fill_buffer();
586 	}
587 	if (*buf_ptr == '=')
588 	    *e_token++ = *buf_ptr++;
589 	code = (ps.last_u_d ? unary_op : binary_op);
590 	unary_delim = true;
591 	break;
592 
593     default:
594 	if (token[0] == '/' && *buf_ptr == '*') {
595 	    /* it is start of comment */
596 	    *e_token++ = '*';
597 
598 	    if (++buf_ptr >= buf_end)
599 		fill_buffer();
600 
601 	    code = comment;
602 	    unary_delim = ps.last_u_d;
603 	    break;
604 	}
605 	while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
606 	    /*
607 	     * handle ||, &&, etc, and also things as in int *****i
608 	     */
609 	    *e_token++ = *buf_ptr;
610 	    if (++buf_ptr >= buf_end)
611 		fill_buffer();
612 	}
613 	code = (ps.last_u_d ? unary_op : binary_op);
614 	unary_delim = true;
615 
616 
617     }				/* end of switch */
618     if (code != newline) {
619 	l_struct = false;
620 	last_code = code;
621     }
622     if (buf_ptr >= buf_end)	/* check for input buffer empty */
623 	fill_buffer();
624     ps.last_u_d = unary_delim;
625     *e_token = '\0';		/* null terminate the token */
626     return (code);
627 }
628 
629 void
630 alloc_typenames(void)
631 {
632 
633     typenames = (const char **)malloc(sizeof(typenames[0]) *
634         (typename_count = 16));
635     if (typenames == NULL)
636 	err(1, NULL);
637 }
638 
639 void
640 add_typename(const char *key)
641 {
642     int comparison;
643     const char *copy;
644 
645     if (typename_top + 1 >= typename_count) {
646 	typenames = realloc((void *)typenames,
647 	    sizeof(typenames[0]) * (typename_count *= 2));
648 	if (typenames == NULL)
649 	    err(1, NULL);
650     }
651     if (typename_top == -1)
652 	typenames[++typename_top] = copy = strdup(key);
653     else if ((comparison = strcmp(key, typenames[typename_top])) >= 0) {
654 	/* take advantage of sorted input */
655 	if (comparison == 0)	/* remove duplicates */
656 	    return;
657 	typenames[++typename_top] = copy = strdup(key);
658     }
659     else {
660 	int p;
661 
662 	for (p = 0; (comparison = strcmp(key, typenames[p])) > 0; p++)
663 	    /* find place for the new key */;
664 	if (comparison == 0)	/* remove duplicates */
665 	    return;
666 	memmove(&typenames[p + 1], &typenames[p],
667 	    sizeof(typenames[0]) * (++typename_top - p));
668 	typenames[p] = copy = strdup(key);
669     }
670 
671     if (copy == NULL)
672 	err(1, NULL);
673 }
674