xref: /freebsd/usr.bin/indent/lexi.c (revision 1fb62fb074788ca4713551be09d6569966a3abee)
1 /*-
2  * Copyright (c) 1985 Sun Microsystems, Inc.
3  * Copyright (c) 1980, 1993
4  *	The Regents of the University of California.  All rights reserved.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. All advertising materials mentioning features or use of this software
16  *    must display the following acknowledgement:
17  *	This product includes software developed by the University of
18  *	California, Berkeley and its contributors.
19  * 4. Neither the name of the University nor the names of its contributors
20  *    may be used to endorse or promote products derived from this software
21  *    without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 
36 #if 0
37 #ifndef lint
38 static char sccsid[] = "@(#)lexi.c	8.1 (Berkeley) 6/6/93";
39 #endif /* not lint */
40 #endif
41 #include <sys/cdefs.h>
42 __FBSDID("$FreeBSD$");
43 
44 /*
45  * Here we have the token scanner for indent.  It scans off one token and puts
46  * it in the global variable "token".  It returns a code, indicating the type
47  * of token scanned.
48  */
49 
50 #include <err.h>
51 #include <stdio.h>
52 #include <ctype.h>
53 #include <stdlib.h>
54 #include <string.h>
55 #include "indent_globs.h"
56 #include "indent_codes.h"
57 #include "indent.h"
58 
/*
 * Character classes stored in chartype[].  A table entry of 0 means the
 * character belongs to neither class.
 */
enum {
    alphanum = 1,		/* character can be part of a word or number */
    opchar = 3			/* character can be part of an operator */
};
61 
62 struct templ {
63     const char *rwd;
64     int         rwcode;
65 };
66 
67 /*
68  * This table has to be sorted alphabetically, because it'll be used in binary
69  * search. For the same reason, string must be the first thing in struct templ.
70  */
71 struct templ specials[] =
72 {
73     {"break", 9},
74     {"case", 8},
75     {"char", 4},
76     {"const", 4},
77     {"default", 8},
78     {"do", 6},
79     {"double", 4},
80     {"else", 6},
81     {"enum", 3},
82     {"extern", 4},
83     {"float", 4},
84     {"for", 5},
85     {"global", 4},
86     {"goto", 9},
87     {"if", 5},
88     {"int", 4},
89     {"long", 4},
90     {"offsetof", 1},
91     {"register", 4},
92     {"return", 9},
93     {"short", 4},
94     {"sizeof", 2},
95     {"static", 4},
96     {"struct", 3},
97     {"switch", 7},
98     {"typedef", 4},
99     {"union", 3},
100     {"unsigned", 4},
101     {"void", 4},
102     {"volatile", 4},
103     {"while", 5}
104 };
105 
/*
 * Table of type names registered through add_typename() and searched by
 * lexi() with bsearch(); add_typename() keeps it in strcmp() order.
 */
const char **typenames;		/* the array of name pointers */
int typename_count;		/* number of slots allocated in typenames */
int typename_top = -1;		/* index of the last slot in use, -1 when
				 * the table holds no names */
109 
/*
 * Class of each 7-bit character, used to decide whether a character can be
 * part of an alphanumeric token (1) or of an operator (3).  Characters that
 * are not listed keep the value 0.
 */
char        chartype[128] =
{
    /* characters that may form an operator */
    ['!'] = 3, ['%'] = 3, ['&'] = 3, ['*'] = 3, ['+'] = 3, ['-'] = 3,
    ['/'] = 3, ['<'] = 3, ['='] = 3, ['>'] = 3, ['?'] = 3, ['^'] = 3,
    ['|'] = 3, ['~'] = 3,

    /* characters that may form an identifier or a number */
    ['$'] = 1, ['_'] = 1,

    ['0'] = 1, ['1'] = 1, ['2'] = 1, ['3'] = 1, ['4'] = 1,
    ['5'] = 1, ['6'] = 1, ['7'] = 1, ['8'] = 1, ['9'] = 1,

    ['A'] = 1, ['B'] = 1, ['C'] = 1, ['D'] = 1, ['E'] = 1, ['F'] = 1,
    ['G'] = 1, ['H'] = 1, ['I'] = 1, ['J'] = 1, ['K'] = 1, ['L'] = 1,
    ['M'] = 1, ['N'] = 1, ['O'] = 1, ['P'] = 1, ['Q'] = 1, ['R'] = 1,
    ['S'] = 1, ['T'] = 1, ['U'] = 1, ['V'] = 1, ['W'] = 1, ['X'] = 1,
    ['Y'] = 1, ['Z'] = 1,

    ['a'] = 1, ['b'] = 1, ['c'] = 1, ['d'] = 1, ['e'] = 1, ['f'] = 1,
    ['g'] = 1, ['h'] = 1, ['i'] = 1, ['j'] = 1, ['k'] = 1, ['l'] = 1,
    ['m'] = 1, ['n'] = 1, ['o'] = 1, ['p'] = 1, ['q'] = 1, ['r'] = 1,
    ['s'] = 1, ['t'] = 1, ['u'] = 1, ['v'] = 1, ['w'] = 1, ['x'] = 1,
    ['y'] = 1, ['z'] = 1
};
131 
/*
 * Comparator for bsearch(): e1 is the string being looked up, e2 is the
 * address of a table element that starts with a "const char *" (an entry of
 * typenames[], or a struct templ whose first member is the string).
 */
static int
strcmp_type(const void *e1, const void *e2)
{
    const char *wanted = e1;
    const char *const *slot = e2;

    return (strcmp(wanted, *slot));
}
137 
138 int
139 lexi(void)
140 {
141     int         unary_delim;	/* this is set to 1 if the current token
142 				 * forces a following operator to be unary */
143     static int  last_code;	/* the last token type returned */
144     static int  l_struct;	/* set to 1 if the last token was 'struct' */
145     int         code;		/* internal code to be returned */
146     char        qchar;		/* the delimiter character for a string */
147 
148     e_token = s_token;		/* point to start of place to save token */
149     unary_delim = false;
150     ps.col_1 = ps.last_nl;	/* tell world that this token started in
151 				 * column 1 iff the last thing scanned was nl */
152     ps.last_nl = false;
153 
154     while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
155 	ps.col_1 = false;	/* leading blanks imply token is not in column
156 				 * 1 */
157 	if (++buf_ptr >= buf_end)
158 	    fill_buffer();
159     }
160 
161     /* Scan an alphanumeric token */
162     if (chartype[(int)*buf_ptr] == alphanum || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
163 	/*
164 	 * we have a character or number
165 	 */
166 	struct templ *p;
167 
168 	if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
169 	    int         seendot = 0,
170 	                seenexp = 0,
171 			seensfx = 0;
172 	    if (*buf_ptr == '0' &&
173 		    (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
174 		*e_token++ = *buf_ptr++;
175 		*e_token++ = *buf_ptr++;
176 		while (isxdigit(*buf_ptr)) {
177 		    CHECK_SIZE_TOKEN;
178 		    *e_token++ = *buf_ptr++;
179 		}
180 	    }
181 	    else
182 		while (1) {
183 		    if (*buf_ptr == '.') {
184 			if (seendot)
185 			    break;
186 			else
187 			    seendot++;
188 		    }
189 		    CHECK_SIZE_TOKEN;
190 		    *e_token++ = *buf_ptr++;
191 		    if (!isdigit(*buf_ptr) && *buf_ptr != '.') {
192 			if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
193 			    break;
194 			else {
195 			    seenexp++;
196 			    seendot++;
197 			    CHECK_SIZE_TOKEN;
198 			    *e_token++ = *buf_ptr++;
199 			    if (*buf_ptr == '+' || *buf_ptr == '-')
200 				*e_token++ = *buf_ptr++;
201 			}
202 		    }
203 		}
204 	    while (1) {
205 		if (!(seensfx & 1) && (*buf_ptr == 'U' || *buf_ptr == 'u')) {
206 		    CHECK_SIZE_TOKEN;
207 		    *e_token++ = *buf_ptr++;
208 		    seensfx |= 1;
209 		    continue;
210 		}
211 		if (!(seensfx & 2) && (strchr("fFlL", *buf_ptr) != NULL)) {
212 		    CHECK_SIZE_TOKEN;
213 		    if (buf_ptr[1] == buf_ptr[0])
214 		        *e_token++ = *buf_ptr++;
215 		    *e_token++ = *buf_ptr++;
216 		    seensfx |= 2;
217 		    continue;
218 		}
219 		break;
220 	    }
221 	}
222 	else
223 	    while (chartype[(int)*buf_ptr] == alphanum || *buf_ptr == BACKSLASH) {
224 		/* fill_buffer() terminates buffer with newline */
225 		if (*buf_ptr == BACKSLASH) {
226 		    if (*(buf_ptr + 1) == '\n') {
227 			buf_ptr += 2;
228 			if (buf_ptr >= buf_end)
229 			    fill_buffer();
230 			} else
231 			    break;
232 		}
233 		CHECK_SIZE_TOKEN;
234 		/* copy it over */
235 		*e_token++ = *buf_ptr++;
236 		if (buf_ptr >= buf_end)
237 		    fill_buffer();
238 	    }
239 	*e_token++ = '\0';
240 
241 	if (s_token[0] == 'L' && s_token[1] == '\0' &&
242 	      (*buf_ptr == '"' || *buf_ptr == '\''))
243 	    return (strpfx);
244 
245 	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
246 	    if (++buf_ptr >= buf_end)
247 		fill_buffer();
248 	}
249 	ps.keyword = 0;
250 	if (l_struct && !ps.p_l_follow) {
251 				/* if last token was 'struct' and we're not
252 				 * in parentheses, then this token
253 				 * should be treated as a declaration */
254 	    l_struct = false;
255 	    last_code = ident;
256 	    ps.last_u_d = true;
257 	    return (decl);
258 	}
259 	ps.last_u_d = l_struct;	/* Operator after identifier is binary
260 				 * unless last token was 'struct' */
261 	l_struct = false;
262 	last_code = ident;	/* Remember that this is the code we will
263 				 * return */
264 
265 	p = bsearch(s_token,
266 	    specials,
267 	    sizeof(specials) / sizeof(specials[0]),
268 	    sizeof(specials[0]),
269 	    strcmp_type);
270 	if (p == NULL) {	/* not a special keyword... */
271 	    char *u;
272 
273 	    /* ... so maybe a type_t or a typedef */
274 	    if ((auto_typedefs && ((u = strrchr(s_token, '_')) != NULL) &&
275 	        strcmp(u, "_t") == 0) || (typename_top >= 0 &&
276 		  bsearch(s_token, typenames, typename_top + 1,
277 		    sizeof(typenames[0]), strcmp_type))) {
278 		ps.keyword = 4;	/* a type name */
279 		ps.last_u_d = true;
280 	        goto found_typename;
281 	    }
282 	} else {			/* we have a keyword */
283 	    ps.keyword = p->rwcode;
284 	    ps.last_u_d = true;
285 	    switch (p->rwcode) {
286 	    case 7:		/* it is a switch */
287 		return (swstmt);
288 	    case 8:		/* a case or default */
289 		return (casestmt);
290 
291 	    case 3:		/* a "struct" */
292 		/*
293 		 * Next time around, we will want to know that we have had a
294 		 * 'struct'
295 		 */
296 		l_struct = true;
297 		/* FALLTHROUGH */
298 
299 	    case 4:		/* one of the declaration keywords */
300 	    found_typename:
301 		if (ps.p_l_follow) {
302 		    /* inside parens: cast, param list, offsetof or sizeof */
303 		    ps.cast_mask |= (1 << ps.p_l_follow) & ~ps.not_cast_mask;
304 		    break;
305 		}
306 		last_code = decl;
307 		return (decl);
308 
309 	    case 5:		/* if, while, for */
310 		return (sp_paren);
311 
312 	    case 6:		/* do, else */
313 		return (sp_nparen);
314 
315 	    default:		/* all others are treated like any other
316 				 * identifier */
317 		return (ident);
318 	    }			/* end of switch */
319 	}			/* end of if (found_it) */
320 	if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
321 	    char *tp = buf_ptr;
322 	    while (tp < buf_end)
323 		if (*tp++ == ')' && (*tp == ';' || *tp == ','))
324 		    goto not_proc;
325 	    strncpy(ps.procname, token, sizeof ps.procname - 1);
326 	    ps.in_parameter_declaration = 1;
327 	    rparen_count = 1;
328     not_proc:;
329 	}
330 	/*
331 	 * The following hack attempts to guess whether or not the current
332 	 * token is in fact a declaration keyword -- one that has been
333 	 * typedefd
334 	 */
335 	if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
336 		&& !ps.p_l_follow
337 	        && !ps.block_init
338 		&& (ps.last_token == rparen || ps.last_token == semicolon ||
339 		    ps.last_token == decl ||
340 		    ps.last_token == lbrace || ps.last_token == rbrace)) {
341 	    ps.keyword = 4;	/* a type name */
342 	    ps.last_u_d = true;
343 	    last_code = decl;
344 	    return decl;
345 	}
346 	if (last_code == decl)	/* if this is a declared variable, then
347 				 * following sign is unary */
348 	    ps.last_u_d = true;	/* will make "int a -1" work */
349 	last_code = ident;
350 	return (ident);		/* the ident is not in the list */
351     }				/* end of procesing for alpanum character */
352 
353     /* Scan a non-alphanumeric token */
354 
355     *e_token++ = *buf_ptr;		/* if it is only a one-character token, it is
356 				 * moved here */
357     *e_token = '\0';
358     if (++buf_ptr >= buf_end)
359 	fill_buffer();
360 
361     switch (*token) {
362     case '\n':
363 	unary_delim = ps.last_u_d;
364 	ps.last_nl = true;	/* remember that we just had a newline */
365 	code = (had_eof ? 0 : newline);
366 
367 	/*
368 	 * if data has been exhausted, the newline is a dummy, and we should
369 	 * return code to stop
370 	 */
371 	break;
372 
373     case '\'':			/* start of quoted character */
374     case '"':			/* start of string */
375 	qchar = *token;
376 	if (troff) {
377 	    e_token[-1] = '`';
378 	    if (qchar == '"')
379 		*e_token++ = '`';
380 	    e_token = chfont(&bodyf, &stringf, e_token);
381 	}
382 	do {			/* copy the string */
383 	    while (1) {		/* move one character or [/<char>]<char> */
384 		if (*buf_ptr == '\n') {
385 		    diag2(1, "Unterminated literal");
386 		    goto stop_lit;
387 		}
388 		CHECK_SIZE_TOKEN;	/* Only have to do this once in this loop,
389 					 * since CHECK_SIZE guarantees that there
390 					 * are at least 5 entries left */
391 		*e_token = *buf_ptr++;
392 		if (buf_ptr >= buf_end)
393 		    fill_buffer();
394 		if (*e_token == BACKSLASH) {	/* if escape, copy extra char */
395 		    if (*buf_ptr == '\n')	/* check for escaped newline */
396 			++line_no;
397 		    if (troff) {
398 			*++e_token = BACKSLASH;
399 			if (*buf_ptr == BACKSLASH)
400 			    *++e_token = BACKSLASH;
401 		    }
402 		    *++e_token = *buf_ptr++;
403 		    ++e_token;	/* we must increment this again because we
404 				 * copied two chars */
405 		    if (buf_ptr >= buf_end)
406 			fill_buffer();
407 		}
408 		else
409 		    break;	/* we copied one character */
410 	    }			/* end of while (1) */
411 	} while (*e_token++ != qchar);
412 	if (troff) {
413 	    e_token = chfont(&stringf, &bodyf, e_token - 1);
414 	    if (qchar == '"')
415 		*e_token++ = '\'';
416 	}
417 stop_lit:
418 	code = ident;
419 	break;
420 
421     case ('('):
422     case ('['):
423 	unary_delim = true;
424 	code = lparen;
425 	break;
426 
427     case (')'):
428     case (']'):
429 	code = rparen;
430 	break;
431 
432     case '#':
433 	unary_delim = ps.last_u_d;
434 	code = preesc;
435 	break;
436 
437     case '?':
438 	unary_delim = true;
439 	code = question;
440 	break;
441 
442     case (':'):
443 	code = colon;
444 	unary_delim = true;
445 	break;
446 
447     case (';'):
448 	unary_delim = true;
449 	code = semicolon;
450 	break;
451 
452     case ('{'):
453 	unary_delim = true;
454 
455 	/*
456 	 * if (ps.in_or_st) ps.block_init = 1;
457 	 */
458 	/* ?	code = ps.block_init ? lparen : lbrace; */
459 	code = lbrace;
460 	break;
461 
462     case ('}'):
463 	unary_delim = true;
464 	/* ?	code = ps.block_init ? rparen : rbrace; */
465 	code = rbrace;
466 	break;
467 
468     case 014:			/* a form feed */
469 	unary_delim = ps.last_u_d;
470 	ps.last_nl = true;	/* remember this so we can set 'ps.col_1'
471 				 * right */
472 	code = form_feed;
473 	break;
474 
475     case (','):
476 	unary_delim = true;
477 	code = comma;
478 	break;
479 
480     case '.':
481 	unary_delim = false;
482 	code = period;
483 	break;
484 
485     case '-':
486     case '+':			/* check for -, +, --, ++ */
487 	code = (ps.last_u_d ? unary_op : binary_op);
488 	unary_delim = true;
489 
490 	if (*buf_ptr == token[0]) {
491 	    /* check for doubled character */
492 	    *e_token++ = *buf_ptr++;
493 	    /* buffer overflow will be checked at end of loop */
494 	    if (last_code == ident || last_code == rparen) {
495 		code = (ps.last_u_d ? unary_op : postop);
496 		/* check for following ++ or -- */
497 		unary_delim = false;
498 	    }
499 	}
500 	else if (*buf_ptr == '=')
501 	    /* check for operator += */
502 	    *e_token++ = *buf_ptr++;
503 	else if (*buf_ptr == '>') {
504 	    /* check for operator -> */
505 	    *e_token++ = *buf_ptr++;
506 	    if (!pointer_as_binop) {
507 		unary_delim = false;
508 		code = unary_op;
509 		ps.want_blank = false;
510 	    }
511 	}
512 	break;			/* buffer overflow will be checked at end of
513 				 * switch */
514 
515     case '=':
516 	if (ps.in_or_st)
517 	    ps.block_init = 1;
518 #ifdef undef
519 	if (chartype[*buf_ptr] == opchar) {	/* we have two char assignment */
520 	    e_token[-1] = *buf_ptr++;
521 	    if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
522 		*e_token++ = *buf_ptr++;
523 	    *e_token++ = '=';	/* Flip =+ to += */
524 	    *e_token = 0;
525 	}
526 #else
527 	if (*buf_ptr == '=') {/* == */
528 	    *e_token++ = '=';	/* Flip =+ to += */
529 	    buf_ptr++;
530 	    *e_token = 0;
531 	}
532 #endif
533 	code = binary_op;
534 	unary_delim = true;
535 	break;
536 	/* can drop thru!!! */
537 
538     case '>':
539     case '<':
540     case '!':			/* ops like <, <<, <=, !=, etc */
541 	if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
542 	    *e_token++ = *buf_ptr;
543 	    if (++buf_ptr >= buf_end)
544 		fill_buffer();
545 	}
546 	if (*buf_ptr == '=')
547 	    *e_token++ = *buf_ptr++;
548 	code = (ps.last_u_d ? unary_op : binary_op);
549 	unary_delim = true;
550 	break;
551 
552     default:
553 	if (token[0] == '/' && *buf_ptr == '*') {
554 	    /* it is start of comment */
555 	    *e_token++ = '*';
556 
557 	    if (++buf_ptr >= buf_end)
558 		fill_buffer();
559 
560 	    code = comment;
561 	    unary_delim = ps.last_u_d;
562 	    break;
563 	}
564 	while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
565 	    /*
566 	     * handle ||, &&, etc, and also things as in int *****i
567 	     */
568 	    *e_token++ = *buf_ptr;
569 	    if (++buf_ptr >= buf_end)
570 		fill_buffer();
571 	}
572 	code = (ps.last_u_d ? unary_op : binary_op);
573 	unary_delim = true;
574 
575 
576     }				/* end of switch */
577     if (code != newline) {
578 	l_struct = false;
579 	last_code = code;
580     }
581     if (buf_ptr >= buf_end)	/* check for input buffer empty */
582 	fill_buffer();
583     ps.last_u_d = unary_delim;
584     *e_token = '\0';		/* null terminate the token */
585     return (code);
586 }
587 
588 void
589 alloc_typenames(void)
590 {
591 
592     typenames = (const char **)malloc(sizeof(typenames[0]) *
593         (typename_count = 16));
594     if (typenames == NULL)
595 	err(1, NULL);
596 }
597 
598 void
599 add_typename(const char *key)
600 {
601     int comparison;
602     const char *copy;
603 
604     if (typename_top + 1 >= typename_count) {
605 	typenames = realloc((void *)typenames,
606 	    sizeof(typenames[0]) * (typename_count *= 2));
607 	if (typenames == NULL)
608 	    err(1, NULL);
609     }
610     if (typename_top == -1)
611 	typenames[++typename_top] = copy = strdup(key);
612     else if ((comparison = strcmp(key, typenames[typename_top])) >= 0) {
613 	/* take advantage of sorted input */
614 	if (comparison == 0)	/* remove duplicates */
615 	    return;
616 	typenames[++typename_top] = copy = strdup(key);
617     }
618     else {
619 	int p;
620 
621 	for (p = 0; (comparison = strcmp(key, typenames[p])) > 0; p++)
622 	    /* find place for the new key */;
623 	if (comparison == 0)	/* remove duplicates */
624 	    return;
625 	memmove(&typenames[p + 1], &typenames[p],
626 	    sizeof(typenames[0]) * (++typename_top - p));
627 	typenames[p] = copy = strdup(key);
628     }
629 
630     if (copy == NULL)
631 	err(1, NULL);
632 }
633