xref: /freebsd/usr.bin/indent/lexi.c (revision 5bf5ca772c6de2d53344a78cf461447cc322ccea)
1 /*-
2  * SPDX-License-Identifier: BSD-4-Clause
3  *
4  * Copyright (c) 1985 Sun Microsystems, Inc.
5  * Copyright (c) 1980, 1993
6  *	The Regents of the University of California.  All rights reserved.
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  * 3. All advertising materials mentioning features or use of this software
18  *    must display the following acknowledgement:
19  *	This product includes software developed by the University of
20  *	California, Berkeley and its contributors.
21  * 4. Neither the name of the University nor the names of its contributors
22  *    may be used to endorse or promote products derived from this software
23  *    without specific prior written permission.
24  *
25  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35  * SUCH DAMAGE.
36  */
37 
38 #if 0
39 #ifndef lint
40 static char sccsid[] = "@(#)lexi.c	8.1 (Berkeley) 6/6/93";
41 #endif /* not lint */
42 #endif
43 #include <sys/cdefs.h>
44 __FBSDID("$FreeBSD$");
45 
46 /*
47  * Here we have the token scanner for indent.  It scans off one token and puts
48  * it in the global variable "token".  It returns a code, indicating the type
49  * of token scanned.
50  */
51 
52 #include <err.h>
53 #include <stdio.h>
54 #include <ctype.h>
55 #include <stdlib.h>
56 #include <string.h>
57 #include "indent_globs.h"
58 #include "indent_codes.h"
59 #include "indent.h"
60 
/* chartype[] class value marking characters that lexi() scans as part of an
 * identifier or number */
#define alphanum 1
#ifdef undef
/* chartype[] class value for operator characters; only referenced by code
 * that is itself compiled out under "#ifdef undef" */
#define opchar 3
#endif
65 
/*
 * Keyword classes stored in templ.rwcode.  lexi() copies the class into
 * ps.keyword and switches on its numeric value, so the values below must
 * stay exactly as they are.
 */
enum rwcodes {
    RW_OFFSETOF = 1,		/* offsetof; lexi() returns it as a plain
				 * identifier */
    RW_SIZEOF = 2,		/* sizeof; lexi() returns it as a plain
				 * identifier */
    RW_STRUCT_LIKE = 3,		/* enum, struct, union */
    RW_TYPE = 4,		/* declaration keywords (char, int, ...) */
    RW_PAREN_STMT = 5,		/* for, if, while */
    RW_NPAREN_STMT = 6,		/* do, else */
    RW_SWITCH = 7,		/* switch */
    RW_CASE = 8,		/* case, default */
    RW_JUMP = 9,		/* break, goto, return; lexi() returns them
				 * as plain identifiers */
    RW_STORAGE = 10		/* storage class specifiers and typedef */
};

/*
 * One reserved word and its class.  The string must remain the first member:
 * the bsearch() comparison callback reads each element as a pointer to a
 * "const char *".
 */
struct templ {
    const char *rwd;		/* the reserved word itself */
    int         rwcode;		/* its class, one of enum rwcodes */
};

/*
 * This table has to be sorted alphabetically (strcmp() order), because it is
 * searched with bsearch().
 */
struct templ specials[] =
{
    {"auto", RW_STORAGE},
    {"break", RW_JUMP},
    {"case", RW_CASE},
    {"char", RW_TYPE},
    {"const", RW_TYPE},
    {"default", RW_CASE},
    {"do", RW_NPAREN_STMT},
    {"double", RW_TYPE},
    {"else", RW_NPAREN_STMT},
    {"enum", RW_STRUCT_LIKE},
    {"extern", RW_STORAGE},
    {"float", RW_TYPE},
    {"for", RW_PAREN_STMT},
    {"global", RW_TYPE},
    {"goto", RW_JUMP},
    {"if", RW_PAREN_STMT},
    {"int", RW_TYPE},
    {"long", RW_TYPE},
    {"offsetof", RW_OFFSETOF},
    {"register", RW_STORAGE},
    {"return", RW_JUMP},
    {"short", RW_TYPE},
    {"sizeof", RW_SIZEOF},
    {"static", RW_STORAGE},
    {"struct", RW_STRUCT_LIKE},
    {"switch", RW_SWITCH},
    {"typedef", RW_STORAGE},
    {"union", RW_STRUCT_LIKE},
    {"unsigned", RW_TYPE},
    {"void", RW_TYPE},
    {"volatile", RW_TYPE},
    {"while", RW_PAREN_STMT}
};
110 
/* Sorted array of the type names registered through add_typename(). */
const char **typenames;

/* Number of slots currently allocated for typenames[]. */
int typename_count;

/* Index of the last slot in use; -1 while the array holds no names. */
int typename_top = -1;
114 
/*
 * Character classification table, indexed by ASCII code.  It is used to
 * decide what type each character is:
 *   0 - neither of the classes below
 *   1 - alphanumeric (letters, digits, '_' and '$')
 *   3 - operator character
 */
char chartype[128] = {
    /* 0x00 - 0x1f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    /* ' ' !  "  #  $  %  &  '  */
    0, 3, 0, 0, 1, 3, 3, 0,
    /* (  )  *  +  ,  -  .  /  */
    0, 0, 3, 3, 0, 3, 0, 3,
    /* 0  1  2  3  4  5  6  7  */
    1, 1, 1, 1, 1, 1, 1, 1,
    /* 8  9  :  ;  <  =  >  ?  */
    1, 1, 0, 0, 3, 3, 3, 3,
    /* @  A  B  C  D  E  F  G  */
    0, 1, 1, 1, 1, 1, 1, 1,
    /* H  I  J  K  L  M  N  O  */
    1, 1, 1, 1, 1, 1, 1, 1,
    /* P  Q  R  S  T  U  V  W  */
    1, 1, 1, 1, 1, 1, 1, 1,
    /* X  Y  Z  [  \  ]  ^  _  */
    1, 1, 1, 0, 0, 0, 3, 1,
    /* `  a  b  c  d  e  f  g  */
    0, 1, 1, 1, 1, 1, 1, 1,
    /* h  i  j  k  l  m  n  o  */
    1, 1, 1, 1, 1, 1, 1, 1,
    /* p  q  r  s  t  u  v  w  */
    1, 1, 1, 1, 1, 1, 1, 1,
    /* x  y  z  {  |  }  ~ DEL */
    1, 1, 1, 0, 3, 0, 3, 0
};
136 
/*
 * bsearch() comparison callback.  'e1' is the key (a NUL-terminated string);
 * 'e2' points at a table element whose first bytes are a "const char *"
 * naming that element.  Returns the strcmp() ordering of the two strings.
 */
static int
strcmp_type(const void *e1, const void *e2)
{
    const char *wanted = e1;
    const char *const *slot = e2;

    return (strcmp(wanted, *slot));
}
142 
143 int
144 lexi(void)
145 {
146     int         unary_delim;	/* this is set to 1 if the current token
147 				 * forces a following operator to be unary */
148     static int  last_code;	/* the last token type returned */
149     static int  l_struct;	/* set to 1 if the last token was 'struct' */
150     int         code;		/* internal code to be returned */
151     char        qchar;		/* the delimiter character for a string */
152 
153     e_token = s_token;		/* point to start of place to save token */
154     unary_delim = false;
155     ps.col_1 = ps.last_nl;	/* tell world that this token started in
156 				 * column 1 iff the last thing scanned was nl */
157     ps.last_nl = false;
158 
159     while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
160 	ps.col_1 = false;	/* leading blanks imply token is not in column
161 				 * 1 */
162 	if (++buf_ptr >= buf_end)
163 	    fill_buffer();
164     }
165 
166     /* Scan an alphanumeric token */
167     if (chartype[(int)*buf_ptr] == alphanum || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
168 	/*
169 	 * we have a character or number
170 	 */
171 	struct templ *p;
172 
173 	if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
174 	    enum base {
175 		BASE_2, BASE_8, BASE_10, BASE_16
176 	    };
177 	    int         seendot = 0,
178 	                seenexp = 0,
179 			seensfx = 0;
180 	    enum base	in_base = BASE_10;
181 
182 	    if (*buf_ptr == '0') {
183 		if (buf_ptr[1] == 'b' || buf_ptr[1] == 'B')
184 		    in_base = BASE_2;
185 		else if (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')
186 		    in_base = BASE_16;
187 		else if (isdigit(buf_ptr[1]))
188 		    in_base = BASE_8;
189 	    }
190 	    switch (in_base) {
191 	    case BASE_2:
192 		*e_token++ = *buf_ptr++;
193 		*e_token++ = *buf_ptr++;
194 		while (*buf_ptr == '0' || *buf_ptr == '1') {
195 		    CHECK_SIZE_TOKEN;
196 		    *e_token++ = *buf_ptr++;
197 		}
198 		break;
199 	    case BASE_8:
200 		*e_token++ = *buf_ptr++;
201 		while (*buf_ptr >= '0' && *buf_ptr <= '8') {
202 		    CHECK_SIZE_TOKEN;
203 		    *e_token++ = *buf_ptr++;
204 		}
205 		break;
206 	    case BASE_16:
207 		*e_token++ = *buf_ptr++;
208 		*e_token++ = *buf_ptr++;
209 		while (isxdigit(*buf_ptr)) {
210 		    CHECK_SIZE_TOKEN;
211 		    *e_token++ = *buf_ptr++;
212 		}
213 		break;
214 	    case BASE_10:
215 		while (1) {
216 		    if (*buf_ptr == '.') {
217 			if (seendot)
218 			    break;
219 			else
220 			    seendot++;
221 		    }
222 		    CHECK_SIZE_TOKEN;
223 		    *e_token++ = *buf_ptr++;
224 		    if (!isdigit(*buf_ptr) && *buf_ptr != '.') {
225 			if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
226 			    break;
227 			else {
228 			    seenexp++;
229 			    seendot++;
230 			    CHECK_SIZE_TOKEN;
231 			    *e_token++ = *buf_ptr++;
232 			    if (*buf_ptr == '+' || *buf_ptr == '-')
233 				*e_token++ = *buf_ptr++;
234 			}
235 		    }
236 		}
237 		break;
238 	    }
239 	    while (1) {
240 		if (!(seensfx & 1) && (*buf_ptr == 'U' || *buf_ptr == 'u')) {
241 		    CHECK_SIZE_TOKEN;
242 		    *e_token++ = *buf_ptr++;
243 		    seensfx |= 1;
244 		    continue;
245 		}
246 		if (!(seensfx & 2) && (strchr("fFlL", *buf_ptr) != NULL)) {
247 		    CHECK_SIZE_TOKEN;
248 		    if (buf_ptr[1] == buf_ptr[0])
249 		        *e_token++ = *buf_ptr++;
250 		    *e_token++ = *buf_ptr++;
251 		    seensfx |= 2;
252 		    continue;
253 		}
254 		break;
255 	    }
256 	}
257 	else
258 	    while (chartype[(int)*buf_ptr] == alphanum || *buf_ptr == BACKSLASH) {
259 		/* fill_buffer() terminates buffer with newline */
260 		if (*buf_ptr == BACKSLASH) {
261 		    if (*(buf_ptr + 1) == '\n') {
262 			buf_ptr += 2;
263 			if (buf_ptr >= buf_end)
264 			    fill_buffer();
265 			} else
266 			    break;
267 		}
268 		CHECK_SIZE_TOKEN;
269 		/* copy it over */
270 		*e_token++ = *buf_ptr++;
271 		if (buf_ptr >= buf_end)
272 		    fill_buffer();
273 	    }
274 	*e_token++ = '\0';
275 
276 	if (s_token[0] == 'L' && s_token[1] == '\0' &&
277 	      (*buf_ptr == '"' || *buf_ptr == '\''))
278 	    return (strpfx);
279 
280 	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
281 	    if (++buf_ptr >= buf_end)
282 		fill_buffer();
283 	}
284 	ps.keyword = 0;
285 	if (l_struct && !ps.p_l_follow) {
286 				/* if last token was 'struct' and we're not
287 				 * in parentheses, then this token
288 				 * should be treated as a declaration */
289 	    l_struct = false;
290 	    last_code = ident;
291 	    ps.last_u_d = true;
292 	    return (decl);
293 	}
294 	ps.last_u_d = l_struct;	/* Operator after identifier is binary
295 				 * unless last token was 'struct' */
296 	l_struct = false;
297 	last_code = ident;	/* Remember that this is the code we will
298 				 * return */
299 
300 	p = bsearch(s_token,
301 	    specials,
302 	    sizeof(specials) / sizeof(specials[0]),
303 	    sizeof(specials[0]),
304 	    strcmp_type);
305 	if (p == NULL) {	/* not a special keyword... */
306 	    char *u;
307 
308 	    /* ... so maybe a type_t or a typedef */
309 	    if ((auto_typedefs && ((u = strrchr(s_token, '_')) != NULL) &&
310 	        strcmp(u, "_t") == 0) || (typename_top >= 0 &&
311 		  bsearch(s_token, typenames, typename_top + 1,
312 		    sizeof(typenames[0]), strcmp_type))) {
313 		ps.keyword = 4;	/* a type name */
314 		ps.last_u_d = true;
315 	        goto found_typename;
316 	    }
317 	} else {			/* we have a keyword */
318 	    ps.keyword = p->rwcode;
319 	    ps.last_u_d = true;
320 	    switch (p->rwcode) {
321 	    case 7:		/* it is a switch */
322 		return (swstmt);
323 	    case 8:		/* a case or default */
324 		return (casestmt);
325 
326 	    case 3:		/* a "struct" */
327 		/*
328 		 * Next time around, we will want to know that we have had a
329 		 * 'struct'
330 		 */
331 		l_struct = true;
332 		/* FALLTHROUGH */
333 
334 	    case 4:		/* one of the declaration keywords */
335 	    found_typename:
336 		if (ps.p_l_follow) {
337 		    /* inside parens: cast, param list, offsetof or sizeof */
338 		    ps.cast_mask |= (1 << ps.p_l_follow) & ~ps.not_cast_mask;
339 		    break;
340 		}
341 		last_code = decl;
342 		return (decl);
343 
344 	    case 5:		/* if, while, for */
345 		return (sp_paren);
346 
347 	    case 6:		/* do, else */
348 		return (sp_nparen);
349 
350 	    case 10:		/* storage class specifier */
351 		return (storage);
352 
353 	    default:		/* all others are treated like any other
354 				 * identifier */
355 		return (ident);
356 	    }			/* end of switch */
357 	}			/* end of if (found_it) */
358 	if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0 &&
359 	    ps.in_parameter_declaration == 0 && ps.block_init == 0) {
360 	    char *tp = buf_ptr;
361 	    while (tp < buf_end)
362 		if (*tp++ == ')' && (*tp == ';' || *tp == ','))
363 		    goto not_proc;
364 	    strncpy(ps.procname, token, sizeof ps.procname - 1);
365 	    if (ps.in_decl)
366 		ps.in_parameter_declaration = 1;
367 	    return (last_code = funcname);
368     not_proc:;
369 	}
370 	/*
371 	 * The following hack attempts to guess whether or not the current
372 	 * token is in fact a declaration keyword -- one that has been
373 	 * typedefd
374 	 */
375 	if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
376 		&& !ps.p_l_follow
377 	        && !ps.block_init
378 		&& (ps.last_token == rparen || ps.last_token == semicolon ||
379 		    ps.last_token == decl ||
380 		    ps.last_token == lbrace || ps.last_token == rbrace)) {
381 	    ps.keyword = 4;	/* a type name */
382 	    ps.last_u_d = true;
383 	    last_code = decl;
384 	    return decl;
385 	}
386 	if (last_code == decl)	/* if this is a declared variable, then
387 				 * following sign is unary */
388 	    ps.last_u_d = true;	/* will make "int a -1" work */
389 	last_code = ident;
390 	return (ident);		/* the ident is not in the list */
391     }				/* end of procesing for alpanum character */
392 
393     /* Scan a non-alphanumeric token */
394 
395     *e_token++ = *buf_ptr;		/* if it is only a one-character token, it is
396 				 * moved here */
397     *e_token = '\0';
398     if (++buf_ptr >= buf_end)
399 	fill_buffer();
400 
401     switch (*token) {
402     case '\n':
403 	unary_delim = ps.last_u_d;
404 	ps.last_nl = true;	/* remember that we just had a newline */
405 	code = (had_eof ? 0 : newline);
406 
407 	/*
408 	 * if data has been exhausted, the newline is a dummy, and we should
409 	 * return code to stop
410 	 */
411 	break;
412 
413     case '\'':			/* start of quoted character */
414     case '"':			/* start of string */
415 	qchar = *token;
416 	if (troff) {
417 	    e_token[-1] = '`';
418 	    if (qchar == '"')
419 		*e_token++ = '`';
420 	    e_token = chfont(&bodyf, &stringf, e_token);
421 	}
422 	do {			/* copy the string */
423 	    while (1) {		/* move one character or [/<char>]<char> */
424 		if (*buf_ptr == '\n') {
425 		    diag2(1, "Unterminated literal");
426 		    goto stop_lit;
427 		}
428 		CHECK_SIZE_TOKEN;	/* Only have to do this once in this loop,
429 					 * since CHECK_SIZE guarantees that there
430 					 * are at least 5 entries left */
431 		*e_token = *buf_ptr++;
432 		if (buf_ptr >= buf_end)
433 		    fill_buffer();
434 		if (*e_token == BACKSLASH) {	/* if escape, copy extra char */
435 		    if (*buf_ptr == '\n')	/* check for escaped newline */
436 			++line_no;
437 		    if (troff) {
438 			*++e_token = BACKSLASH;
439 			if (*buf_ptr == BACKSLASH)
440 			    *++e_token = BACKSLASH;
441 		    }
442 		    *++e_token = *buf_ptr++;
443 		    ++e_token;	/* we must increment this again because we
444 				 * copied two chars */
445 		    if (buf_ptr >= buf_end)
446 			fill_buffer();
447 		}
448 		else
449 		    break;	/* we copied one character */
450 	    }			/* end of while (1) */
451 	} while (*e_token++ != qchar);
452 	if (troff) {
453 	    e_token = chfont(&stringf, &bodyf, e_token - 1);
454 	    if (qchar == '"')
455 		*e_token++ = '\'';
456 	}
457 stop_lit:
458 	code = ident;
459 	break;
460 
461     case ('('):
462     case ('['):
463 	unary_delim = true;
464 	code = lparen;
465 	break;
466 
467     case (')'):
468     case (']'):
469 	code = rparen;
470 	break;
471 
472     case '#':
473 	unary_delim = ps.last_u_d;
474 	code = preesc;
475 	break;
476 
477     case '?':
478 	unary_delim = true;
479 	code = question;
480 	break;
481 
482     case (':'):
483 	code = colon;
484 	unary_delim = true;
485 	break;
486 
487     case (';'):
488 	unary_delim = true;
489 	code = semicolon;
490 	break;
491 
492     case ('{'):
493 	unary_delim = true;
494 
495 	/*
496 	 * if (ps.in_or_st) ps.block_init = 1;
497 	 */
498 	/* ?	code = ps.block_init ? lparen : lbrace; */
499 	code = lbrace;
500 	break;
501 
502     case ('}'):
503 	unary_delim = true;
504 	/* ?	code = ps.block_init ? rparen : rbrace; */
505 	code = rbrace;
506 	break;
507 
508     case 014:			/* a form feed */
509 	unary_delim = ps.last_u_d;
510 	ps.last_nl = true;	/* remember this so we can set 'ps.col_1'
511 				 * right */
512 	code = form_feed;
513 	break;
514 
515     case (','):
516 	unary_delim = true;
517 	code = comma;
518 	break;
519 
520     case '.':
521 	unary_delim = false;
522 	code = period;
523 	break;
524 
525     case '-':
526     case '+':			/* check for -, +, --, ++ */
527 	code = (ps.last_u_d ? unary_op : binary_op);
528 	unary_delim = true;
529 
530 	if (*buf_ptr == token[0]) {
531 	    /* check for doubled character */
532 	    *e_token++ = *buf_ptr++;
533 	    /* buffer overflow will be checked at end of loop */
534 	    if (last_code == ident || last_code == rparen) {
535 		code = (ps.last_u_d ? unary_op : postop);
536 		/* check for following ++ or -- */
537 		unary_delim = false;
538 	    }
539 	}
540 	else if (*buf_ptr == '=')
541 	    /* check for operator += */
542 	    *e_token++ = *buf_ptr++;
543 	else if (*buf_ptr == '>') {
544 	    /* check for operator -> */
545 	    *e_token++ = *buf_ptr++;
546 	    if (!pointer_as_binop) {
547 		unary_delim = false;
548 		code = unary_op;
549 		ps.want_blank = false;
550 	    }
551 	}
552 	break;			/* buffer overflow will be checked at end of
553 				 * switch */
554 
555     case '=':
556 	if (ps.in_or_st)
557 	    ps.block_init = 1;
558 #ifdef undef
559 	if (chartype[*buf_ptr] == opchar) {	/* we have two char assignment */
560 	    e_token[-1] = *buf_ptr++;
561 	    if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
562 		*e_token++ = *buf_ptr++;
563 	    *e_token++ = '=';	/* Flip =+ to += */
564 	    *e_token = 0;
565 	}
566 #else
567 	if (*buf_ptr == '=') {/* == */
568 	    *e_token++ = '=';	/* Flip =+ to += */
569 	    buf_ptr++;
570 	    *e_token = 0;
571 	}
572 #endif
573 	code = binary_op;
574 	unary_delim = true;
575 	break;
576 	/* can drop thru!!! */
577 
578     case '>':
579     case '<':
580     case '!':			/* ops like <, <<, <=, !=, etc */
581 	if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
582 	    *e_token++ = *buf_ptr;
583 	    if (++buf_ptr >= buf_end)
584 		fill_buffer();
585 	}
586 	if (*buf_ptr == '=')
587 	    *e_token++ = *buf_ptr++;
588 	code = (ps.last_u_d ? unary_op : binary_op);
589 	unary_delim = true;
590 	break;
591 
592     default:
593 	if (token[0] == '/' && *buf_ptr == '*') {
594 	    /* it is start of comment */
595 	    *e_token++ = '*';
596 
597 	    if (++buf_ptr >= buf_end)
598 		fill_buffer();
599 
600 	    code = comment;
601 	    unary_delim = ps.last_u_d;
602 	    break;
603 	}
604 	while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
605 	    /*
606 	     * handle ||, &&, etc, and also things as in int *****i
607 	     */
608 	    *e_token++ = *buf_ptr;
609 	    if (++buf_ptr >= buf_end)
610 		fill_buffer();
611 	}
612 	code = (ps.last_u_d ? unary_op : binary_op);
613 	unary_delim = true;
614 
615 
616     }				/* end of switch */
617     if (code != newline) {
618 	l_struct = false;
619 	last_code = code;
620     }
621     if (buf_ptr >= buf_end)	/* check for input buffer empty */
622 	fill_buffer();
623     ps.last_u_d = unary_delim;
624     *e_token = '\0';		/* null terminate the token */
625     return (code);
626 }
627 
628 void
629 alloc_typenames(void)
630 {
631 
632     typenames = (const char **)malloc(sizeof(typenames[0]) *
633         (typename_count = 16));
634     if (typenames == NULL)
635 	err(1, NULL);
636 }
637 
638 void
639 add_typename(const char *key)
640 {
641     int comparison;
642     const char *copy;
643 
644     if (typename_top + 1 >= typename_count) {
645 	typenames = realloc((void *)typenames,
646 	    sizeof(typenames[0]) * (typename_count *= 2));
647 	if (typenames == NULL)
648 	    err(1, NULL);
649     }
650     if (typename_top == -1)
651 	typenames[++typename_top] = copy = strdup(key);
652     else if ((comparison = strcmp(key, typenames[typename_top])) >= 0) {
653 	/* take advantage of sorted input */
654 	if (comparison == 0)	/* remove duplicates */
655 	    return;
656 	typenames[++typename_top] = copy = strdup(key);
657     }
658     else {
659 	int p;
660 
661 	for (p = 0; (comparison = strcmp(key, typenames[p])) > 0; p++)
662 	    /* find place for the new key */;
663 	if (comparison == 0)	/* remove duplicates */
664 	    return;
665 	memmove(&typenames[p + 1], &typenames[p],
666 	    sizeof(typenames[0]) * (++typename_top - p));
667 	typenames[p] = copy = strdup(key);
668     }
669 
670     if (copy == NULL)
671 	err(1, NULL);
672 }
673