xref: /freebsd/usr.bin/indent/lexi.c (revision 3bbaa755f34eaf2073f2f0c1fd537e96e26c81e8)
1 /*-
2  * SPDX-License-Identifier: BSD-4-Clause
3  *
4  * Copyright (c) 1985 Sun Microsystems, Inc.
5  * Copyright (c) 1980, 1993
6  *	The Regents of the University of California.  All rights reserved.
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  * 3. All advertising materials mentioning features or use of this software
18  *    must display the following acknowledgement:
19  *	This product includes software developed by the University of
20  *	California, Berkeley and its contributors.
21  * 4. Neither the name of the University nor the names of its contributors
22  *    may be used to endorse or promote products derived from this software
23  *    without specific prior written permission.
24  *
25  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35  * SUCH DAMAGE.
36  */
37 
38 #if 0
39 #ifndef lint
40 static char sccsid[] = "@(#)lexi.c	8.1 (Berkeley) 6/6/93";
41 #endif /* not lint */
42 #endif
43 #include <sys/cdefs.h>
44 __FBSDID("$FreeBSD$");
45 
46 /*
47  * Here we have the token scanner for indent.  It scans off one token and puts
48  * it in the global variable "token".  It returns a code, indicating the type
49  * of token scanned.
50  */
51 
52 #include <err.h>
53 #include <stdio.h>
54 #include <ctype.h>
55 #include <stdlib.h>
56 #include <string.h>
57 #include "indent_globs.h"
58 #include "indent_codes.h"
59 #include "indent.h"
60 
61 #define alphanum 1
62 #ifdef undef
63 #define opchar 3
64 #endif
65 
/*
 * One reserved word together with the keyword class that lexi() assigns to
 * it.
 */
struct templ {
    const char *rwd;		/* spelling of the reserved word */
    int         rwcode;		/* keyword class, copied into ps.keyword */
};

/*
 * Reserved-word table.
 *
 * The entries are looked up with bsearch(), so they must stay in strcmp()
 * order.  The comparison callback reads each element as a pointer to a
 * string, which is why rwd has to remain the first member of struct templ.
 *
 * Keyword classes as dispatched by lexi():
 *	 3  struct, union, enum		 7  switch
 *	 4  declaration keyword		 8  case, default
 *	 5  if, while, for		10  storage class specifier
 *	 6  do, else			11  typedef
 * Every other class (1, 2, 9) is returned as a plain identifier.
 */
struct templ specials[] =
{
    { .rwd = "auto",     .rwcode = 10 },
    { .rwd = "break",    .rwcode = 9 },
    { .rwd = "case",     .rwcode = 8 },
    { .rwd = "char",     .rwcode = 4 },
    { .rwd = "const",    .rwcode = 4 },
    { .rwd = "default",  .rwcode = 8 },
    { .rwd = "do",       .rwcode = 6 },
    { .rwd = "double",   .rwcode = 4 },
    { .rwd = "else",     .rwcode = 6 },
    { .rwd = "enum",     .rwcode = 3 },
    { .rwd = "extern",   .rwcode = 10 },
    { .rwd = "float",    .rwcode = 4 },
    { .rwd = "for",      .rwcode = 5 },
    { .rwd = "global",   .rwcode = 4 },
    { .rwd = "goto",     .rwcode = 9 },
    { .rwd = "if",       .rwcode = 5 },
    { .rwd = "int",      .rwcode = 4 },
    { .rwd = "long",     .rwcode = 4 },
    { .rwd = "offsetof", .rwcode = 1 },
    { .rwd = "register", .rwcode = 10 },
    { .rwd = "return",   .rwcode = 9 },
    { .rwd = "short",    .rwcode = 4 },
    { .rwd = "sizeof",   .rwcode = 2 },
    { .rwd = "static",   .rwcode = 10 },
    { .rwd = "struct",   .rwcode = 3 },
    { .rwd = "switch",   .rwcode = 7 },
    { .rwd = "typedef",  .rwcode = 11 },
    { .rwd = "union",    .rwcode = 3 },
    { .rwd = "unsigned", .rwcode = 4 },
    { .rwd = "void",     .rwcode = 4 },
    { .rwd = "volatile", .rwcode = 4 },
    { .rwd = "while",    .rwcode = 5 }
};
110 
/*
 * Sorted table of user-supplied type names; it is filled by add_typename()
 * and searched by lexi().
 */
const char **typenames = NULL;	/* the table itself */
int         typename_count = 0;	/* number of slots allocated in typenames */
int         typename_top = -1;	/* index of the last used slot, -1 if none */
114 
/*
 * Character classification table, indexed by 7-bit character code:
 *	0  neither of the classes below
 *	1  may appear in an identifier or a number (alphanum)
 *	3  operator character
 */
char        chartype[128] =
{
    0, 0, 0, 0, 0, 0, 0, 0,	/* 0x00 - 0x07: control characters */
    0, 0, 0, 0, 0, 0, 0, 0,	/* 0x08 - 0x0f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0,	/* 0x10 - 0x17: control characters */
    0, 0, 0, 0, 0, 0, 0, 0,	/* 0x18 - 0x1f: control characters */
    0, 3, 0, 0, 1, 3, 3, 0,	/* 0x20 - 0x27: space ! " # $ % & ' */
    0, 0, 3, 3, 0, 3, 0, 3,	/* 0x28 - 0x2f: ( ) * + , - . / */
    1, 1, 1, 1, 1, 1, 1, 1,	/* 0x30 - 0x37: 0 1 2 3 4 5 6 7 */
    1, 1, 0, 0, 3, 3, 3, 3,	/* 0x38 - 0x3f: 8 9 : ; < = > ? */
    0, 1, 1, 1, 1, 1, 1, 1,	/* 0x40 - 0x47: @ A B C D E F G */
    1, 1, 1, 1, 1, 1, 1, 1,	/* 0x48 - 0x4f: H I J K L M N O */
    1, 1, 1, 1, 1, 1, 1, 1,	/* 0x50 - 0x57: P Q R S T U V W */
    1, 1, 1, 0, 0, 0, 3, 1,	/* 0x58 - 0x5f: X Y Z [ backslash ] ^ _ */
    0, 1, 1, 1, 1, 1, 1, 1,	/* 0x60 - 0x67: ` a b c d e f g */
    1, 1, 1, 1, 1, 1, 1, 1,	/* 0x68 - 0x6f: h i j k l m n o */
    1, 1, 1, 1, 1, 1, 1, 1,	/* 0x70 - 0x77: p q r s t u v w */
    1, 1, 1, 0, 3, 0, 3, 0	/* 0x78 - 0x7f: x y z { | } ~ DEL */
};
136 
/*
 * bsearch() comparison callback: e1 is the key string itself, e2 points at a
 * table element whose first bytes are a pointer to a string.
 */
static int
strcmp_type(const void *e1, const void *e2)
{
    const char *key = e1;
    const char *const *slot = e2;

    return (strcmp(key, *slot));
}
142 
143 int
144 lexi(void)
145 {
146     int         unary_delim;	/* this is set to 1 if the current token
147 				 * forces a following operator to be unary */
148     static int  last_code;	/* the last token type returned */
149     static int  l_struct;	/* set to 1 if the last token was 'struct' */
150     int         code;		/* internal code to be returned */
151     char        qchar;		/* the delimiter character for a string */
152 
153     e_token = s_token;		/* point to start of place to save token */
154     unary_delim = false;
155     ps.col_1 = ps.last_nl;	/* tell world that this token started in
156 				 * column 1 iff the last thing scanned was nl */
157     ps.last_nl = false;
158 
159     while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
160 	ps.col_1 = false;	/* leading blanks imply token is not in column
161 				 * 1 */
162 	if (++buf_ptr >= buf_end)
163 	    fill_buffer();
164     }
165 
166     /* Scan an alphanumeric token */
167     if (chartype[(int)*buf_ptr] == alphanum || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
168 	/*
169 	 * we have a character or number
170 	 */
171 	struct templ *p;
172 
173 	if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
174 	    enum base {
175 		BASE_2, BASE_8, BASE_10, BASE_16
176 	    };
177 	    int         seendot = 0,
178 	                seenexp = 0,
179 			seensfx = 0;
180 	    enum base	in_base = BASE_10;
181 
182 	    if (*buf_ptr == '0') {
183 		if (buf_ptr[1] == 'b' || buf_ptr[1] == 'B')
184 		    in_base = BASE_2;
185 		else if (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')
186 		    in_base = BASE_16;
187 		else if (isdigit(buf_ptr[1]))
188 		    in_base = BASE_8;
189 	    }
190 	    switch (in_base) {
191 	    case BASE_2:
192 		*e_token++ = *buf_ptr++;
193 		*e_token++ = *buf_ptr++;
194 		while (*buf_ptr == '0' || *buf_ptr == '1') {
195 		    CHECK_SIZE_TOKEN;
196 		    *e_token++ = *buf_ptr++;
197 		}
198 		break;
199 	    case BASE_8:
200 		*e_token++ = *buf_ptr++;
201 		while (*buf_ptr >= '0' && *buf_ptr <= '8') {
202 		    CHECK_SIZE_TOKEN;
203 		    *e_token++ = *buf_ptr++;
204 		}
205 		break;
206 	    case BASE_16:
207 		*e_token++ = *buf_ptr++;
208 		*e_token++ = *buf_ptr++;
209 		while (isxdigit(*buf_ptr)) {
210 		    CHECK_SIZE_TOKEN;
211 		    *e_token++ = *buf_ptr++;
212 		}
213 		break;
214 	    case BASE_10:
215 		while (1) {
216 		    if (*buf_ptr == '.') {
217 			if (seendot)
218 			    break;
219 			else
220 			    seendot++;
221 		    }
222 		    CHECK_SIZE_TOKEN;
223 		    *e_token++ = *buf_ptr++;
224 		    if (!isdigit(*buf_ptr) && *buf_ptr != '.') {
225 			if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
226 			    break;
227 			else {
228 			    seenexp++;
229 			    seendot++;
230 			    CHECK_SIZE_TOKEN;
231 			    *e_token++ = *buf_ptr++;
232 			    if (*buf_ptr == '+' || *buf_ptr == '-')
233 				*e_token++ = *buf_ptr++;
234 			}
235 		    }
236 		}
237 		break;
238 	    }
239 	    while (1) {
240 		if (!(seensfx & 1) && (*buf_ptr == 'U' || *buf_ptr == 'u')) {
241 		    CHECK_SIZE_TOKEN;
242 		    *e_token++ = *buf_ptr++;
243 		    seensfx |= 1;
244 		    continue;
245 		}
246 		if (!(seensfx & 2) && (strchr("fFlL", *buf_ptr) != NULL)) {
247 		    CHECK_SIZE_TOKEN;
248 		    if (buf_ptr[1] == buf_ptr[0])
249 		        *e_token++ = *buf_ptr++;
250 		    *e_token++ = *buf_ptr++;
251 		    seensfx |= 2;
252 		    continue;
253 		}
254 		break;
255 	    }
256 	}
257 	else
258 	    while (chartype[(int)*buf_ptr] == alphanum || *buf_ptr == BACKSLASH) {
259 		/* fill_buffer() terminates buffer with newline */
260 		if (*buf_ptr == BACKSLASH) {
261 		    if (*(buf_ptr + 1) == '\n') {
262 			buf_ptr += 2;
263 			if (buf_ptr >= buf_end)
264 			    fill_buffer();
265 			} else
266 			    break;
267 		}
268 		CHECK_SIZE_TOKEN;
269 		/* copy it over */
270 		*e_token++ = *buf_ptr++;
271 		if (buf_ptr >= buf_end)
272 		    fill_buffer();
273 	    }
274 	*e_token++ = '\0';
275 
276 	if (s_token[0] == 'L' && s_token[1] == '\0' &&
277 	      (*buf_ptr == '"' || *buf_ptr == '\''))
278 	    return (strpfx);
279 
280 	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
281 	    if (++buf_ptr >= buf_end)
282 		fill_buffer();
283 	}
284 	ps.keyword = 0;
285 	if (l_struct && !ps.p_l_follow) {
286 				/* if last token was 'struct' and we're not
287 				 * in parentheses, then this token
288 				 * should be treated as a declaration */
289 	    l_struct = false;
290 	    last_code = ident;
291 	    ps.last_u_d = true;
292 	    return (decl);
293 	}
294 	ps.last_u_d = l_struct;	/* Operator after identifier is binary
295 				 * unless last token was 'struct' */
296 	l_struct = false;
297 	last_code = ident;	/* Remember that this is the code we will
298 				 * return */
299 
300 	p = bsearch(s_token,
301 	    specials,
302 	    sizeof(specials) / sizeof(specials[0]),
303 	    sizeof(specials[0]),
304 	    strcmp_type);
305 	if (p == NULL) {	/* not a special keyword... */
306 	    char *u;
307 
308 	    /* ... so maybe a type_t or a typedef */
309 	    if ((auto_typedefs && ((u = strrchr(s_token, '_')) != NULL) &&
310 	        strcmp(u, "_t") == 0) || (typename_top >= 0 &&
311 		  bsearch(s_token, typenames, typename_top + 1,
312 		    sizeof(typenames[0]), strcmp_type))) {
313 		ps.keyword = 4;	/* a type name */
314 		ps.last_u_d = true;
315 	        goto found_typename;
316 	    }
317 	} else {			/* we have a keyword */
318 	    ps.keyword = p->rwcode;
319 	    ps.last_u_d = true;
320 	    switch (p->rwcode) {
321 	    case 7:		/* it is a switch */
322 		return (swstmt);
323 	    case 8:		/* a case or default */
324 		return (casestmt);
325 
326 	    case 3:		/* a "struct" */
327 		/*
328 		 * Next time around, we will want to know that we have had a
329 		 * 'struct'
330 		 */
331 		l_struct = true;
332 		/* FALLTHROUGH */
333 
334 	    case 4:		/* one of the declaration keywords */
335 	    found_typename:
336 		if (ps.p_l_follow) {
337 		    /* inside parens: cast, param list, offsetof or sizeof */
338 		    ps.cast_mask |= (1 << ps.p_l_follow) & ~ps.not_cast_mask;
339 		    break;
340 		}
341 		last_code = decl;
342 		return (decl);
343 
344 	    case 5:		/* if, while, for */
345 		return (sp_paren);
346 
347 	    case 6:		/* do, else */
348 		return (sp_nparen);
349 
350 	    case 10:		/* storage class specifier */
351 		return (storage);
352 
353 	    case 11:		/* typedef */
354 		return (type_def);
355 
356 	    default:		/* all others are treated like any other
357 				 * identifier */
358 		return (ident);
359 	    }			/* end of switch */
360 	}			/* end of if (found_it) */
361 	if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0 &&
362 	    ps.in_parameter_declaration == 0 && ps.block_init == 0) {
363 	    char *tp = buf_ptr;
364 	    while (tp < buf_end)
365 		if (*tp++ == ')' && (*tp == ';' || *tp == ','))
366 		    goto not_proc;
367 	    strncpy(ps.procname, token, sizeof ps.procname - 1);
368 	    if (ps.in_decl)
369 		ps.in_parameter_declaration = 1;
370 	    return (last_code = funcname);
371     not_proc:;
372 	}
373 	/*
374 	 * The following hack attempts to guess whether or not the current
375 	 * token is in fact a declaration keyword -- one that has been
376 	 * typedefd
377 	 */
378 	if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
379 		&& !ps.p_l_follow
380 	        && !ps.block_init
381 		&& (ps.last_token == rparen || ps.last_token == semicolon ||
382 		    ps.last_token == decl ||
383 		    ps.last_token == lbrace || ps.last_token == rbrace)) {
384 	    ps.keyword = 4;	/* a type name */
385 	    ps.last_u_d = true;
386 	    last_code = decl;
387 	    return decl;
388 	}
389 	if (last_code == decl)	/* if this is a declared variable, then
390 				 * following sign is unary */
391 	    ps.last_u_d = true;	/* will make "int a -1" work */
392 	last_code = ident;
393 	return (ident);		/* the ident is not in the list */
394     }				/* end of procesing for alpanum character */
395 
396     /* Scan a non-alphanumeric token */
397 
398     *e_token++ = *buf_ptr;		/* if it is only a one-character token, it is
399 				 * moved here */
400     *e_token = '\0';
401     if (++buf_ptr >= buf_end)
402 	fill_buffer();
403 
404     switch (*token) {
405     case '\n':
406 	unary_delim = ps.last_u_d;
407 	ps.last_nl = true;	/* remember that we just had a newline */
408 	code = (had_eof ? 0 : newline);
409 
410 	/*
411 	 * if data has been exhausted, the newline is a dummy, and we should
412 	 * return code to stop
413 	 */
414 	break;
415 
416     case '\'':			/* start of quoted character */
417     case '"':			/* start of string */
418 	qchar = *token;
419 	if (troff) {
420 	    e_token[-1] = '`';
421 	    if (qchar == '"')
422 		*e_token++ = '`';
423 	    e_token = chfont(&bodyf, &stringf, e_token);
424 	}
425 	do {			/* copy the string */
426 	    while (1) {		/* move one character or [/<char>]<char> */
427 		if (*buf_ptr == '\n') {
428 		    diag2(1, "Unterminated literal");
429 		    goto stop_lit;
430 		}
431 		CHECK_SIZE_TOKEN;	/* Only have to do this once in this loop,
432 					 * since CHECK_SIZE guarantees that there
433 					 * are at least 5 entries left */
434 		*e_token = *buf_ptr++;
435 		if (buf_ptr >= buf_end)
436 		    fill_buffer();
437 		if (*e_token == BACKSLASH) {	/* if escape, copy extra char */
438 		    if (*buf_ptr == '\n')	/* check for escaped newline */
439 			++line_no;
440 		    if (troff) {
441 			*++e_token = BACKSLASH;
442 			if (*buf_ptr == BACKSLASH)
443 			    *++e_token = BACKSLASH;
444 		    }
445 		    *++e_token = *buf_ptr++;
446 		    ++e_token;	/* we must increment this again because we
447 				 * copied two chars */
448 		    if (buf_ptr >= buf_end)
449 			fill_buffer();
450 		}
451 		else
452 		    break;	/* we copied one character */
453 	    }			/* end of while (1) */
454 	} while (*e_token++ != qchar);
455 	if (troff) {
456 	    e_token = chfont(&stringf, &bodyf, e_token - 1);
457 	    if (qchar == '"')
458 		*e_token++ = '\'';
459 	}
460 stop_lit:
461 	code = ident;
462 	break;
463 
464     case ('('):
465     case ('['):
466 	unary_delim = true;
467 	code = lparen;
468 	break;
469 
470     case (')'):
471     case (']'):
472 	code = rparen;
473 	break;
474 
475     case '#':
476 	unary_delim = ps.last_u_d;
477 	code = preesc;
478 	break;
479 
480     case '?':
481 	unary_delim = true;
482 	code = question;
483 	break;
484 
485     case (':'):
486 	code = colon;
487 	unary_delim = true;
488 	break;
489 
490     case (';'):
491 	unary_delim = true;
492 	code = semicolon;
493 	break;
494 
495     case ('{'):
496 	unary_delim = true;
497 
498 	/*
499 	 * if (ps.in_or_st) ps.block_init = 1;
500 	 */
501 	/* ?	code = ps.block_init ? lparen : lbrace; */
502 	code = lbrace;
503 	break;
504 
505     case ('}'):
506 	unary_delim = true;
507 	/* ?	code = ps.block_init ? rparen : rbrace; */
508 	code = rbrace;
509 	break;
510 
511     case 014:			/* a form feed */
512 	unary_delim = ps.last_u_d;
513 	ps.last_nl = true;	/* remember this so we can set 'ps.col_1'
514 				 * right */
515 	code = form_feed;
516 	break;
517 
518     case (','):
519 	unary_delim = true;
520 	code = comma;
521 	break;
522 
523     case '.':
524 	unary_delim = false;
525 	code = period;
526 	break;
527 
528     case '-':
529     case '+':			/* check for -, +, --, ++ */
530 	code = (ps.last_u_d ? unary_op : binary_op);
531 	unary_delim = true;
532 
533 	if (*buf_ptr == token[0]) {
534 	    /* check for doubled character */
535 	    *e_token++ = *buf_ptr++;
536 	    /* buffer overflow will be checked at end of loop */
537 	    if (last_code == ident || last_code == rparen) {
538 		code = (ps.last_u_d ? unary_op : postop);
539 		/* check for following ++ or -- */
540 		unary_delim = false;
541 	    }
542 	}
543 	else if (*buf_ptr == '=')
544 	    /* check for operator += */
545 	    *e_token++ = *buf_ptr++;
546 	else if (*buf_ptr == '>') {
547 	    /* check for operator -> */
548 	    *e_token++ = *buf_ptr++;
549 	    if (!pointer_as_binop) {
550 		unary_delim = false;
551 		code = unary_op;
552 		ps.want_blank = false;
553 	    }
554 	}
555 	break;			/* buffer overflow will be checked at end of
556 				 * switch */
557 
558     case '=':
559 	if (ps.in_or_st)
560 	    ps.block_init = 1;
561 #ifdef undef
562 	if (chartype[*buf_ptr] == opchar) {	/* we have two char assignment */
563 	    e_token[-1] = *buf_ptr++;
564 	    if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
565 		*e_token++ = *buf_ptr++;
566 	    *e_token++ = '=';	/* Flip =+ to += */
567 	    *e_token = 0;
568 	}
569 #else
570 	if (*buf_ptr == '=') {/* == */
571 	    *e_token++ = '=';	/* Flip =+ to += */
572 	    buf_ptr++;
573 	    *e_token = 0;
574 	}
575 #endif
576 	code = binary_op;
577 	unary_delim = true;
578 	break;
579 	/* can drop thru!!! */
580 
581     case '>':
582     case '<':
583     case '!':			/* ops like <, <<, <=, !=, etc */
584 	if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
585 	    *e_token++ = *buf_ptr;
586 	    if (++buf_ptr >= buf_end)
587 		fill_buffer();
588 	}
589 	if (*buf_ptr == '=')
590 	    *e_token++ = *buf_ptr++;
591 	code = (ps.last_u_d ? unary_op : binary_op);
592 	unary_delim = true;
593 	break;
594 
595     default:
596 	if (token[0] == '/' && *buf_ptr == '*') {
597 	    /* it is start of comment */
598 	    *e_token++ = '*';
599 
600 	    if (++buf_ptr >= buf_end)
601 		fill_buffer();
602 
603 	    code = comment;
604 	    unary_delim = ps.last_u_d;
605 	    break;
606 	}
607 	while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
608 	    /*
609 	     * handle ||, &&, etc, and also things as in int *****i
610 	     */
611 	    *e_token++ = *buf_ptr;
612 	    if (++buf_ptr >= buf_end)
613 		fill_buffer();
614 	}
615 	code = (ps.last_u_d ? unary_op : binary_op);
616 	unary_delim = true;
617 
618 
619     }				/* end of switch */
620     if (code != newline) {
621 	l_struct = false;
622 	last_code = code;
623     }
624     if (buf_ptr >= buf_end)	/* check for input buffer empty */
625 	fill_buffer();
626     ps.last_u_d = unary_delim;
627     *e_token = '\0';		/* null terminate the token */
628     return (code);
629 }
630 
631 void
632 alloc_typenames(void)
633 {
634 
635     typenames = (const char **)malloc(sizeof(typenames[0]) *
636         (typename_count = 16));
637     if (typenames == NULL)
638 	err(1, NULL);
639 }
640 
641 void
642 add_typename(const char *key)
643 {
644     int comparison;
645     const char *copy;
646 
647     if (typename_top + 1 >= typename_count) {
648 	typenames = realloc((void *)typenames,
649 	    sizeof(typenames[0]) * (typename_count *= 2));
650 	if (typenames == NULL)
651 	    err(1, NULL);
652     }
653     if (typename_top == -1)
654 	typenames[++typename_top] = copy = strdup(key);
655     else if ((comparison = strcmp(key, typenames[typename_top])) >= 0) {
656 	/* take advantage of sorted input */
657 	if (comparison == 0)	/* remove duplicates */
658 	    return;
659 	typenames[++typename_top] = copy = strdup(key);
660     }
661     else {
662 	int p;
663 
664 	for (p = 0; (comparison = strcmp(key, typenames[p])) > 0; p++)
665 	    /* find place for the new key */;
666 	if (comparison == 0)	/* remove duplicates */
667 	    return;
668 	memmove(&typenames[p + 1], &typenames[p],
669 	    sizeof(typenames[0]) * (++typename_top - p));
670 	typenames[p] = copy = strdup(key);
671     }
672 
673     if (copy == NULL)
674 	err(1, NULL);
675 }
676