xref: /freebsd/usr.bin/indent/lexi.c (revision c35b5d8372e4c4ec50e8653c2b51e6179a81769e)
1 /*-
2  * Copyright (c) 1985 Sun Microsystems, Inc.
3  * Copyright (c) 1980, 1993
4  *	The Regents of the University of California.  All rights reserved.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. All advertising materials mentioning features or use of this software
16  *    must display the following acknowledgement:
17  *	This product includes software developed by the University of
18  *	California, Berkeley and its contributors.
19  * 4. Neither the name of the University nor the names of its contributors
20  *    may be used to endorse or promote products derived from this software
21  *    without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 
36 #if 0
37 #ifndef lint
38 static char sccsid[] = "@(#)lexi.c	8.1 (Berkeley) 6/6/93";
39 #endif /* not lint */
40 #endif
41 #include <sys/cdefs.h>
42 __FBSDID("$FreeBSD$");
43 
44 /*
45  * Here we have the token scanner for indent.  It scans off one token and puts
46  * it in the global variable "token".  It returns a code, indicating the type
47  * of token scanned.
48  */
49 
50 #include <err.h>
51 #include <stdio.h>
52 #include <ctype.h>
53 #include <stdlib.h>
54 #include <string.h>
55 #include "indent_globs.h"
56 #include "indent_codes.h"
57 #include "indent.h"
58 
/*
 * Character classes stored in chartype[].  A table entry of 0 means the
 * character belongs to neither class.
 */
#define alphanum	1	/* may appear in an identifier or a number */
#define opchar		3	/* operator character; only referenced by the
				 * disabled "#ifdef undef" code in lexi() */
61 
/*
 * One reserved word together with the keyword class lexi() stores in
 * ps.keyword when it scans that word.
 */
struct templ {
    const char *rwd;		/* spelling of the reserved word; must stay
				 * the first member, see below */
    int         rwcode;		/* keyword class, see the list below */
};

/*
 * Reserved-word table searched by lexi() with bsearch().
 *
 * The entries have to stay in ascending strcmp() order, and the string has
 * to remain the first member of struct templ: the comparison callback
 * treats every element as a pointer to a "const char *".
 *
 * Keyword classes as dispatched by the switch in lexi():
 *	3	struct, union, enum (next identifier is handled as a tag)
 *	4	declaration keyword
 *	5	if, while, for		(returns sp_paren)
 *	6	do, else		(returns sp_nparen)
 *	7	switch			(returns swstmt)
 *	8	case, default		(returns casestmt)
 *	any other value (1, 2, 9 here) is returned as a plain ident
 */
struct templ specials[] =
{
    { .rwd = "break",    .rwcode = 9 },
    { .rwd = "case",     .rwcode = 8 },
    { .rwd = "char",     .rwcode = 4 },
    { .rwd = "const",    .rwcode = 4 },
    { .rwd = "default",  .rwcode = 8 },
    { .rwd = "do",       .rwcode = 6 },
    { .rwd = "double",   .rwcode = 4 },
    { .rwd = "else",     .rwcode = 6 },
    { .rwd = "enum",     .rwcode = 3 },
    { .rwd = "extern",   .rwcode = 4 },
    { .rwd = "float",    .rwcode = 4 },
    { .rwd = "for",      .rwcode = 5 },
    { .rwd = "global",   .rwcode = 4 },
    { .rwd = "goto",     .rwcode = 9 },
    { .rwd = "if",       .rwcode = 5 },
    { .rwd = "int",      .rwcode = 4 },
    { .rwd = "long",     .rwcode = 4 },
    { .rwd = "offsetof", .rwcode = 1 },
    { .rwd = "register", .rwcode = 4 },
    { .rwd = "return",   .rwcode = 9 },
    { .rwd = "short",    .rwcode = 4 },
    { .rwd = "sizeof",   .rwcode = 2 },
    { .rwd = "static",   .rwcode = 4 },
    { .rwd = "struct",   .rwcode = 3 },
    { .rwd = "switch",   .rwcode = 7 },
    { .rwd = "typedef",  .rwcode = 4 },
    { .rwd = "union",    .rwcode = 3 },
    { .rwd = "unsigned", .rwcode = 4 },
    { .rwd = "void",     .rwcode = 4 },
    { .rwd = "volatile", .rwcode = 4 },
    { .rwd = "while",    .rwcode = 5 }
};
105 
/*
 * Sorted table of user-supplied type names; it is allocated by
 * alloc_typenames(), filled by add_typename() and searched by lexi().
 */
const char **typenames = NULL;	/* the table itself */
int         typename_count = 0;	/* number of slots allocated in typenames */
int         typename_top = -1;	/* index of the last slot in use, -1 while
				 * the table is empty */
109 
/*
 * Character class of every 7-bit ASCII code, used by lexi() to decide what
 * kind of token a character starts or continues:
 *	0	neither class
 *	1	alphanumeric (letters, digits, underscore and dollar sign)
 *	3	operator character
 * Each row below covers eight consecutive character codes.
 */
char        chartype[128] =
{
    0, 0, 0, 0, 0, 0, 0, 0,	/*   0 -   7: control characters */
    0, 0, 0, 0, 0, 0, 0, 0,	/*   8 -  15: control characters */
    0, 0, 0, 0, 0, 0, 0, 0,	/*  16 -  23: control characters */
    0, 0, 0, 0, 0, 0, 0, 0,	/*  24 -  31: control characters */
    0, 3, 0, 0, 1, 3, 3, 0,	/*  32 -  39: space to single quote */
    0, 0, 3, 3, 0, 3, 0, 3,	/*  40 -  47: left paren to slash */
    1, 1, 1, 1, 1, 1, 1, 1,	/*  48 -  55: digits 0 to 7 */
    1, 1, 0, 0, 3, 3, 3, 3,	/*  56 -  63: digits 8, 9, then colon to
				 * question mark */
    0, 1, 1, 1, 1, 1, 1, 1,	/*  64 -  71: at sign, A to G */
    1, 1, 1, 1, 1, 1, 1, 1,	/*  72 -  79: H to O */
    1, 1, 1, 1, 1, 1, 1, 1,	/*  80 -  87: P to W */
    1, 1, 1, 0, 0, 0, 3, 1,	/*  88 -  95: X to Z, brackets, backslash,
				 * caret, underscore */
    0, 1, 1, 1, 1, 1, 1, 1,	/*  96 - 103: backquote, a to g */
    1, 1, 1, 1, 1, 1, 1, 1,	/* 104 - 111: h to o */
    1, 1, 1, 1, 1, 1, 1, 1,	/* 112 - 119: p to w */
    1, 1, 1, 0, 3, 0, 3, 0	/* 120 - 127: x to z, braces, bar, tilde,
				 * DEL */
};
131 
/*
 * bsearch() comparison callback: e1 is the key, a NUL-terminated string;
 * e2 points at a table element whose first bytes are a "const char *"
 * (a typenames[] slot, or a struct templ whose first member is the string).
 * Returns the strcmp() ordering of the key against that string.
 */
static int
strcmp_type(const void *e1, const void *e2)
{
    const char *key = e1;
    const char *const *entry = e2;

    return (strcmp(key, *entry));
}
137 
138 int
139 lexi(void)
140 {
141     int         unary_delim;	/* this is set to 1 if the current token
142 				 * forces a following operator to be unary */
143     static int  last_code;	/* the last token type returned */
144     static int  l_struct;	/* set to 1 if the last token was 'struct' */
145     int         code;		/* internal code to be returned */
146     char        qchar;		/* the delimiter character for a string */
147 
148     e_token = s_token;		/* point to start of place to save token */
149     unary_delim = false;
150     ps.col_1 = ps.last_nl;	/* tell world that this token started in
151 				 * column 1 iff the last thing scanned was nl */
152     ps.last_nl = false;
153 
154     while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
155 	ps.col_1 = false;	/* leading blanks imply token is not in column
156 				 * 1 */
157 	if (++buf_ptr >= buf_end)
158 	    fill_buffer();
159     }
160 
161     /* Scan an alphanumeric token */
162     if (chartype[(int)*buf_ptr] == alphanum || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
163 	/*
164 	 * we have a character or number
165 	 */
166 	struct templ *p;
167 
168 	if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
169 	    int         seendot = 0,
170 	                seenexp = 0,
171 			seensfx = 0;
172 	    if (*buf_ptr == '0' &&
173 		    (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
174 		*e_token++ = *buf_ptr++;
175 		*e_token++ = *buf_ptr++;
176 		while (isxdigit(*buf_ptr)) {
177 		    CHECK_SIZE_TOKEN;
178 		    *e_token++ = *buf_ptr++;
179 		}
180 	    }
181 	    else
182 		while (1) {
183 		    if (*buf_ptr == '.') {
184 			if (seendot)
185 			    break;
186 			else
187 			    seendot++;
188 		    }
189 		    CHECK_SIZE_TOKEN;
190 		    *e_token++ = *buf_ptr++;
191 		    if (!isdigit(*buf_ptr) && *buf_ptr != '.') {
192 			if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
193 			    break;
194 			else {
195 			    seenexp++;
196 			    seendot++;
197 			    CHECK_SIZE_TOKEN;
198 			    *e_token++ = *buf_ptr++;
199 			    if (*buf_ptr == '+' || *buf_ptr == '-')
200 				*e_token++ = *buf_ptr++;
201 			}
202 		    }
203 		}
204 	    while (1) {
205 		if (!(seensfx & 1) && (*buf_ptr == 'U' || *buf_ptr == 'u')) {
206 		    CHECK_SIZE_TOKEN;
207 		    *e_token++ = *buf_ptr++;
208 		    seensfx |= 1;
209 		    continue;
210 		}
211 		if (!(seensfx & 2) && (strchr("fFlL", *buf_ptr) != NULL)) {
212 		    CHECK_SIZE_TOKEN;
213 		    if (buf_ptr[1] == buf_ptr[0])
214 		        *e_token++ = *buf_ptr++;
215 		    *e_token++ = *buf_ptr++;
216 		    seensfx |= 2;
217 		    continue;
218 		}
219 		break;
220 	    }
221 	}
222 	else
223 	    while (chartype[(int)*buf_ptr] == alphanum || *buf_ptr == BACKSLASH) {
224 		/* fill_buffer() terminates buffer with newline */
225 		if (*buf_ptr == BACKSLASH) {
226 		    if (*(buf_ptr + 1) == '\n') {
227 			buf_ptr += 2;
228 			if (buf_ptr >= buf_end)
229 			    fill_buffer();
230 			} else
231 			    break;
232 		}
233 		CHECK_SIZE_TOKEN;
234 		/* copy it over */
235 		*e_token++ = *buf_ptr++;
236 		if (buf_ptr >= buf_end)
237 		    fill_buffer();
238 	    }
239 	*e_token++ = '\0';
240 	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
241 	    if (++buf_ptr >= buf_end)
242 		fill_buffer();
243 	}
244 	ps.keyword = 0;
245 	if (l_struct && !ps.p_l_follow) {
246 				/* if last token was 'struct' and we're not
247 				 * in parentheses, then this token
248 				 * should be treated as a declaration */
249 	    l_struct = false;
250 	    last_code = ident;
251 	    ps.last_u_d = true;
252 	    return (decl);
253 	}
254 	ps.last_u_d = l_struct;	/* Operator after identifier is binary
255 				 * unless last token was 'struct' */
256 	l_struct = false;
257 	last_code = ident;	/* Remember that this is the code we will
258 				 * return */
259 
260 	p = bsearch(s_token,
261 	    specials,
262 	    sizeof(specials) / sizeof(specials[0]),
263 	    sizeof(specials[0]),
264 	    strcmp_type);
265 	if (p == NULL) {	/* not a special keyword... */
266 	    char *u;
267 
268 	    /* ... so maybe a type_t or a typedef */
269 	    if ((auto_typedefs && ((u = strrchr(s_token, '_')) != NULL) &&
270 	        strcmp(u, "_t") == 0) || (typename_top >= 0 &&
271 		  bsearch(s_token, typenames, typename_top + 1,
272 		    sizeof(typenames[0]), strcmp_type))) {
273 		ps.keyword = 4;	/* a type name */
274 		ps.last_u_d = true;
275 	        goto found_typename;
276 	    }
277 	} else {			/* we have a keyword */
278 	    ps.keyword = p->rwcode;
279 	    ps.last_u_d = true;
280 	    switch (p->rwcode) {
281 	    case 7:		/* it is a switch */
282 		return (swstmt);
283 	    case 8:		/* a case or default */
284 		return (casestmt);
285 
286 	    case 3:		/* a "struct" */
287 		/*
288 		 * Next time around, we will want to know that we have had a
289 		 * 'struct'
290 		 */
291 		l_struct = true;
292 		/* FALLTHROUGH */
293 
294 	    case 4:		/* one of the declaration keywords */
295 	    found_typename:
296 		if (ps.p_l_follow) {
297 		    /* inside parens: cast, param list, offsetof or sizeof */
298 		    ps.cast_mask |= (1 << ps.p_l_follow) & ~ps.not_cast_mask;
299 		    break;
300 		}
301 		last_code = decl;
302 		return (decl);
303 
304 	    case 5:		/* if, while, for */
305 		return (sp_paren);
306 
307 	    case 6:		/* do, else */
308 		return (sp_nparen);
309 
310 	    default:		/* all others are treated like any other
311 				 * identifier */
312 		return (ident);
313 	    }			/* end of switch */
314 	}			/* end of if (found_it) */
315 	if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
316 	    char *tp = buf_ptr;
317 	    while (tp < buf_end)
318 		if (*tp++ == ')' && (*tp == ';' || *tp == ','))
319 		    goto not_proc;
320 	    strncpy(ps.procname, token, sizeof ps.procname - 1);
321 	    ps.in_parameter_declaration = 1;
322 	    rparen_count = 1;
323     not_proc:;
324 	}
325 	/*
326 	 * The following hack attempts to guess whether or not the current
327 	 * token is in fact a declaration keyword -- one that has been
328 	 * typedefd
329 	 */
330 	if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
331 		&& !ps.p_l_follow
332 	        && !ps.block_init
333 		&& (ps.last_token == rparen || ps.last_token == semicolon ||
334 		    ps.last_token == decl ||
335 		    ps.last_token == lbrace || ps.last_token == rbrace)) {
336 	    ps.keyword = 4;	/* a type name */
337 	    ps.last_u_d = true;
338 	    last_code = decl;
339 	    return decl;
340 	}
341 	if (last_code == decl)	/* if this is a declared variable, then
342 				 * following sign is unary */
343 	    ps.last_u_d = true;	/* will make "int a -1" work */
344 	last_code = ident;
345 	return (ident);		/* the ident is not in the list */
346     }				/* end of procesing for alpanum character */
347 
348     /* Scan a non-alphanumeric token */
349 
350     *e_token++ = *buf_ptr;		/* if it is only a one-character token, it is
351 				 * moved here */
352     *e_token = '\0';
353     if (++buf_ptr >= buf_end)
354 	fill_buffer();
355 
356     switch (*token) {
357     case '\n':
358 	unary_delim = ps.last_u_d;
359 	ps.last_nl = true;	/* remember that we just had a newline */
360 	code = (had_eof ? 0 : newline);
361 
362 	/*
363 	 * if data has been exhausted, the newline is a dummy, and we should
364 	 * return code to stop
365 	 */
366 	break;
367 
368     case '\'':			/* start of quoted character */
369     case '"':			/* start of string */
370 	qchar = *token;
371 	if (troff) {
372 	    e_token[-1] = '`';
373 	    if (qchar == '"')
374 		*e_token++ = '`';
375 	    e_token = chfont(&bodyf, &stringf, e_token);
376 	}
377 	do {			/* copy the string */
378 	    while (1) {		/* move one character or [/<char>]<char> */
379 		if (*buf_ptr == '\n') {
380 		    diag2(1, "Unterminated literal");
381 		    goto stop_lit;
382 		}
383 		CHECK_SIZE_TOKEN;	/* Only have to do this once in this loop,
384 					 * since CHECK_SIZE guarantees that there
385 					 * are at least 5 entries left */
386 		*e_token = *buf_ptr++;
387 		if (buf_ptr >= buf_end)
388 		    fill_buffer();
389 		if (*e_token == BACKSLASH) {	/* if escape, copy extra char */
390 		    if (*buf_ptr == '\n')	/* check for escaped newline */
391 			++line_no;
392 		    if (troff) {
393 			*++e_token = BACKSLASH;
394 			if (*buf_ptr == BACKSLASH)
395 			    *++e_token = BACKSLASH;
396 		    }
397 		    *++e_token = *buf_ptr++;
398 		    ++e_token;	/* we must increment this again because we
399 				 * copied two chars */
400 		    if (buf_ptr >= buf_end)
401 			fill_buffer();
402 		}
403 		else
404 		    break;	/* we copied one character */
405 	    }			/* end of while (1) */
406 	} while (*e_token++ != qchar);
407 	if (troff) {
408 	    e_token = chfont(&stringf, &bodyf, e_token - 1);
409 	    if (qchar == '"')
410 		*e_token++ = '\'';
411 	}
412 stop_lit:
413 	code = ident;
414 	break;
415 
416     case ('('):
417     case ('['):
418 	unary_delim = true;
419 	code = lparen;
420 	break;
421 
422     case (')'):
423     case (']'):
424 	code = rparen;
425 	break;
426 
427     case '#':
428 	unary_delim = ps.last_u_d;
429 	code = preesc;
430 	break;
431 
432     case '?':
433 	unary_delim = true;
434 	code = question;
435 	break;
436 
437     case (':'):
438 	code = colon;
439 	unary_delim = true;
440 	break;
441 
442     case (';'):
443 	unary_delim = true;
444 	code = semicolon;
445 	break;
446 
447     case ('{'):
448 	unary_delim = true;
449 
450 	/*
451 	 * if (ps.in_or_st) ps.block_init = 1;
452 	 */
453 	/* ?	code = ps.block_init ? lparen : lbrace; */
454 	code = lbrace;
455 	break;
456 
457     case ('}'):
458 	unary_delim = true;
459 	/* ?	code = ps.block_init ? rparen : rbrace; */
460 	code = rbrace;
461 	break;
462 
463     case 014:			/* a form feed */
464 	unary_delim = ps.last_u_d;
465 	ps.last_nl = true;	/* remember this so we can set 'ps.col_1'
466 				 * right */
467 	code = form_feed;
468 	break;
469 
470     case (','):
471 	unary_delim = true;
472 	code = comma;
473 	break;
474 
475     case '.':
476 	unary_delim = false;
477 	code = period;
478 	break;
479 
480     case '-':
481     case '+':			/* check for -, +, --, ++ */
482 	code = (ps.last_u_d ? unary_op : binary_op);
483 	unary_delim = true;
484 
485 	if (*buf_ptr == token[0]) {
486 	    /* check for doubled character */
487 	    *e_token++ = *buf_ptr++;
488 	    /* buffer overflow will be checked at end of loop */
489 	    if (last_code == ident || last_code == rparen) {
490 		code = (ps.last_u_d ? unary_op : postop);
491 		/* check for following ++ or -- */
492 		unary_delim = false;
493 	    }
494 	}
495 	else if (*buf_ptr == '=')
496 	    /* check for operator += */
497 	    *e_token++ = *buf_ptr++;
498 	else if (*buf_ptr == '>') {
499 	    /* check for operator -> */
500 	    *e_token++ = *buf_ptr++;
501 	    if (!pointer_as_binop) {
502 		unary_delim = false;
503 		code = unary_op;
504 		ps.want_blank = false;
505 	    }
506 	}
507 	break;			/* buffer overflow will be checked at end of
508 				 * switch */
509 
510     case '=':
511 	if (ps.in_or_st)
512 	    ps.block_init = 1;
513 #ifdef undef
514 	if (chartype[*buf_ptr] == opchar) {	/* we have two char assignment */
515 	    e_token[-1] = *buf_ptr++;
516 	    if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
517 		*e_token++ = *buf_ptr++;
518 	    *e_token++ = '=';	/* Flip =+ to += */
519 	    *e_token = 0;
520 	}
521 #else
522 	if (*buf_ptr == '=') {/* == */
523 	    *e_token++ = '=';	/* Flip =+ to += */
524 	    buf_ptr++;
525 	    *e_token = 0;
526 	}
527 #endif
528 	code = binary_op;
529 	unary_delim = true;
530 	break;
531 	/* can drop thru!!! */
532 
533     case '>':
534     case '<':
535     case '!':			/* ops like <, <<, <=, !=, etc */
536 	if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
537 	    *e_token++ = *buf_ptr;
538 	    if (++buf_ptr >= buf_end)
539 		fill_buffer();
540 	}
541 	if (*buf_ptr == '=')
542 	    *e_token++ = *buf_ptr++;
543 	code = (ps.last_u_d ? unary_op : binary_op);
544 	unary_delim = true;
545 	break;
546 
547     default:
548 	if (token[0] == '/' && *buf_ptr == '*') {
549 	    /* it is start of comment */
550 	    *e_token++ = '*';
551 
552 	    if (++buf_ptr >= buf_end)
553 		fill_buffer();
554 
555 	    code = comment;
556 	    unary_delim = ps.last_u_d;
557 	    break;
558 	}
559 	while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
560 	    /*
561 	     * handle ||, &&, etc, and also things as in int *****i
562 	     */
563 	    *e_token++ = *buf_ptr;
564 	    if (++buf_ptr >= buf_end)
565 		fill_buffer();
566 	}
567 	code = (ps.last_u_d ? unary_op : binary_op);
568 	unary_delim = true;
569 
570 
571     }				/* end of switch */
572     if (code != newline) {
573 	l_struct = false;
574 	last_code = code;
575     }
576     if (buf_ptr >= buf_end)	/* check for input buffer empty */
577 	fill_buffer();
578     ps.last_u_d = unary_delim;
579     *e_token = '\0';		/* null terminate the token */
580     return (code);
581 }
582 
583 void
584 alloc_typenames(void)
585 {
586 
587     typenames = (const char **)malloc(sizeof(typenames[0]) *
588         (typename_count = 16));
589     if (typenames == NULL)
590 	err(1, NULL);
591 }
592 
593 void
594 add_typename(const char *key)
595 {
596     int comparison;
597     const char *copy;
598 
599     if (typename_top + 1 >= typename_count) {
600 	typenames = realloc((void *)typenames,
601 	    sizeof(typenames[0]) * (typename_count *= 2));
602 	if (typenames == NULL)
603 	    err(1, NULL);
604     }
605     if (typename_top == -1)
606 	typenames[++typename_top] = copy = strdup(key);
607     else if ((comparison = strcmp(key, typenames[typename_top])) >= 0) {
608 	/* take advantage of sorted input */
609 	if (comparison == 0)	/* remove duplicates */
610 	    return;
611 	typenames[++typename_top] = copy = strdup(key);
612     }
613     else {
614 	int p;
615 
616 	for (p = 0; (comparison = strcmp(key, typenames[p])) > 0; p++)
617 	    /* find place for the new key */;
618 	if (comparison == 0)	/* remove duplicates */
619 	    return;
620 	memmove(&typenames[p + 1], &typenames[p],
621 	    sizeof(typenames[0]) * (++typename_top - p));
622 	typenames[p] = copy = strdup(key);
623     }
624 
625     if (copy == NULL)
626 	err(1, NULL);
627 }
628