xref: /freebsd/usr.bin/indent/lexi.c (revision 70ed590b393173d4ea697be2a27054ed171f0c1a)
1 /*
2  * Copyright (c) 1985 Sun Microsystems, Inc.
3  * Copyright (c) 1980, 1993
4  *	The Regents of the University of California.  All rights reserved.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. All advertising materials mentioning features or use of this software
16  *    must display the following acknowledgement:
17  *	This product includes software developed by the University of
18  *	California, Berkeley and its contributors.
19  * 4. Neither the name of the University nor the names of its contributors
20  *    may be used to endorse or promote products derived from this software
21  *    without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 
36 #if 0
37 #ifndef lint
38 static char sccsid[] = "@(#)lexi.c	8.1 (Berkeley) 6/6/93";
39 #endif /* not lint */
40 #endif
41 #include <sys/cdefs.h>
42 __FBSDID("$FreeBSD$");
43 
44 /*
45  * Here we have the token scanner for indent.  It scans off one token and puts
46  * it in the global variable "token".  It returns a code, indicating the type
47  * of token scanned.
48  */
49 
50 #include <err.h>
51 #include <stdio.h>
52 #include <ctype.h>
53 #include <stdlib.h>
54 #include <string.h>
55 #include "indent_globs.h"
56 #include "indent_codes.h"
57 #include "indent.h"
58 
/*
 * Character class codes stored in chartype[].  An entry of 0 means the
 * character belongs to neither class.
 */
/* letters, digits, '_' and '$': characters that make up identifiers/numbers */
#define alphanum 1
/* characters that can make up an operator (only tested in disabled code) */
#define opchar 3
61 
/*
 * One entry of the reserved-word table: the spelling of the word and the
 * class code that lexi() switches on when the word is recognised.
 */
struct templ {
    const char *rwd;		/* keyword text; a null pointer ends the table */
    int         rwcode;		/* keyword class, see the list below */
};

/*
 * Reserved-word table searched by lexi() and extended by addkey().
 *
 * Meaning of rwcode, as interpreted by lexi():
 *   0  no special treatment (returned as a plain identifier)
 *   1  "switch"
 *   2  "case" / "default"
 *   3  "struct", "union", "enum" (the following name is a declaration)
 *   4  declaration keyword (types, storage classes, qualifiers)
 *   5  keyword followed by a parenthesised expression (if, while, for)
 *   6  keyword not followed by parentheses (else, do)
 *   7  "sizeof"
 *
 * The array is deliberately much larger than the built-in list so that
 * addkey() can append further keywords at run time; the entry with a null
 * rwd marks the current end of the list.
 */
struct templ specials[1000] =
{
    {"switch", 1},
    {"case", 2},
    {"break", 0},
    {"struct", 3},
    {"union", 3},
    {"enum", 3},
    {"default", 2},
    {"int", 4},
    {"char", 4},
    {"float", 4},
    {"double", 4},
    {"long", 4},
    {"short", 4},
    {"typedef", 4},		/* was misspelled "typdef" and never matched */
    {"unsigned", 4},
    {"register", 4},
    {"static", 4},
    {"global", 4},
    {"extern", 4},
    {"void", 4},
    {"const", 4},
    {"volatile", 4},
    {"goto", 0},
    {"return", 0},
    {"if", 5},
    {"while", 5},
    {"for", 5},
    {"else", 6},
    {"do", 6},
    {"sizeof", 7},
    {0, 0}
};
101 
/*
 * Classification table indexed by 7-bit character code.  Each entry is 1 for
 * a character that can appear in an identifier or number, 3 for a character
 * that can be part of an operator, and 0 for everything else.  One row below
 * covers sixteen consecutive character codes.
 */
char        chartype[128] =
{
    /* 0x00 - 0x0f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 - 0x1f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 - 0x2f: space and punctuation; '$' counts as alphanumeric */
    0, 3, 0, 0, 1, 3, 3, 0, 0, 0, 3, 3, 0, 3, 0, 3,
    /* 0x30 - 0x3f: digits, then colon, semicolon and comparison operators */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 3, 3, 3,
    /* 0x40 - 0x4f: at-sign, upper case A through O */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 - 0x5f: upper case P through Z, brackets, caret, underscore */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 1,
    /* 0x60 - 0x6f: backquote, lower case a through o */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 - 0x7f: lower case p through z, braces, bar, tilde, DEL */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 0, 3, 0
};
123 
124 int
125 lexi(void)
126 {
127     int         unary_delim;	/* this is set to 1 if the current token
128 				 * forces a following operator to be unary */
129     static int  last_code;	/* the last token type returned */
130     static int  l_struct;	/* set to 1 if the last token was 'struct' */
131     int         code;		/* internal code to be returned */
132     char        qchar;		/* the delimiter character for a string */
133 
134     e_token = s_token;		/* point to start of place to save token */
135     unary_delim = false;
136     ps.col_1 = ps.last_nl;	/* tell world that this token started in
137 				 * column 1 iff the last thing scanned was nl */
138     ps.last_nl = false;
139 
140     while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
141 	ps.col_1 = false;	/* leading blanks imply token is not in column
142 				 * 1 */
143 	if (++buf_ptr >= buf_end)
144 	    fill_buffer();
145     }
146 
147     /* Scan an alphanumeric token */
148     if (chartype[(int)*buf_ptr] == alphanum || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
149 	/*
150 	 * we have a character or number
151 	 */
152 	const char *j;		/* used for searching thru list of
153 				 *
154 				 * reserved words */
155 	struct templ *p;
156 
157 	if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
158 	    int         seendot = 0,
159 	                seenexp = 0,
160 			seensfx = 0;
161 	    if (*buf_ptr == '0' &&
162 		    (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
163 		*e_token++ = *buf_ptr++;
164 		*e_token++ = *buf_ptr++;
165 		while (isxdigit(*buf_ptr)) {
166 		    CHECK_SIZE_TOKEN;
167 		    *e_token++ = *buf_ptr++;
168 		}
169 	    }
170 	    else
171 		while (1) {
172 		    if (*buf_ptr == '.') {
173 			if (seendot)
174 			    break;
175 			else
176 			    seendot++;
177 		    }
178 		    CHECK_SIZE_TOKEN;
179 		    *e_token++ = *buf_ptr++;
180 		    if (!isdigit(*buf_ptr) && *buf_ptr != '.') {
181 			if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
182 			    break;
183 			else {
184 			    seenexp++;
185 			    seendot++;
186 			    CHECK_SIZE_TOKEN;
187 			    *e_token++ = *buf_ptr++;
188 			    if (*buf_ptr == '+' || *buf_ptr == '-')
189 				*e_token++ = *buf_ptr++;
190 			}
191 		    }
192 		}
193 	    while (1) {
194 		if (!(seensfx & 1) &&
195 			(*buf_ptr == 'U' || *buf_ptr == 'u')) {
196 		    CHECK_SIZE_TOKEN;
197 		    *e_token++ = *buf_ptr++;
198 		    seensfx |= 1;
199 		    continue;
200 		}
201         	if (!(seensfx & 2) &&
202 			(*buf_ptr == 'L' || *buf_ptr == 'l')) {
203 		    CHECK_SIZE_TOKEN;
204 		    if (buf_ptr[1] == buf_ptr[0])
205 		        *e_token++ = *buf_ptr++;
206 		    *e_token++ = *buf_ptr++;
207 		    seensfx |= 2;
208 		    continue;
209 		}
210 		break;
211 	    }
212 	}
213 	else
214 	    while (chartype[(int)*buf_ptr] == alphanum || *buf_ptr == BACKSLASH) {
215 		/* fill_buffer() terminates buffer with newline */
216 		if (*buf_ptr == BACKSLASH) {
217 		    if (*(buf_ptr + 1) == '\n') {
218 			buf_ptr += 2;
219 			if (buf_ptr >= buf_end)
220 			    fill_buffer();
221 			} else
222 			    break;
223 		}
224 		CHECK_SIZE_TOKEN;
225 		/* copy it over */
226 		*e_token++ = *buf_ptr++;
227 		if (buf_ptr >= buf_end)
228 		    fill_buffer();
229 	    }
230 	*e_token++ = '\0';
231 	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
232 	    if (++buf_ptr >= buf_end)
233 		fill_buffer();
234 	}
235 	ps.its_a_keyword = false;
236 	ps.sizeof_keyword = false;
237 	if (l_struct && !ps.p_l_follow) {
238 				/* if last token was 'struct' and we're not
239 				 * in parentheses, then this token
240 				 * should be treated as a declaration */
241 	    l_struct = false;
242 	    last_code = ident;
243 	    ps.last_u_d = true;
244 	    return (decl);
245 	}
246 	ps.last_u_d = l_struct;	/* Operator after identifier is binary
247 				 * unless last token was 'struct' */
248 	l_struct = false;
249 	last_code = ident;	/* Remember that this is the code we will
250 				 * return */
251 
252 	if (auto_typedefs) {
253 	    const char *q = s_token;
254 	    /* Check if we have an "_t" in the end */
255 	    if (q[0] && q[1] &&
256 	        (strcmp(q + strlen(q) - 2, "_t") == 0)) {
257 	        ps.its_a_keyword = true;
258 		ps.last_u_d = true;
259 	        goto found_auto_typedef;
260 	    }
261 	}
262 
263 	/*
264 	 * This loop will check if the token is a keyword.
265 	 */
266 	for (p = specials; (j = p->rwd) != 0; p++) {
267 	    const char *q = s_token;	/* point at scanned token */
268 	    if (*j++ != *q++ || *j++ != *q++)
269 		continue;	/* This test depends on the fact that
270 				 * identifiers are always at least 1 character
271 				 * long (ie. the first two bytes of the
272 				 * identifier are always meaningful) */
273 	    if (q[-1] == 0)
274 		break;		/* If its a one-character identifier */
275 	    while (*q++ == *j)
276 		if (*j++ == 0)
277 		    goto found_keyword;	/* I wish that C had a multi-level
278 					 * break... */
279 	}
280 	if (p->rwd) {		/* we have a keyword */
281     found_keyword:
282 	    ps.its_a_keyword = true;
283 	    ps.last_u_d = true;
284 	    switch (p->rwcode) {
285 	    case 1:		/* it is a switch */
286 		return (swstmt);
287 	    case 2:		/* a case or default */
288 		return (casestmt);
289 
290 	    case 3:		/* a "struct" */
291 		/*
292 		 * Next time around, we will want to know that we have had a
293 		 * 'struct'
294 		 */
295 		l_struct = true;
296 		/* FALLTHROUGH */
297 
298 	    case 4:		/* one of the declaration keywords */
299 	    found_auto_typedef:
300 		if (ps.p_l_follow) {
301 		    ps.cast_mask |= (1 << ps.p_l_follow) & ~ps.sizeof_mask;
302 		    break;	/* inside parens: cast, param list or sizeof */
303 		}
304 		last_code = decl;
305 		return (decl);
306 
307 	    case 5:		/* if, while, for */
308 		return (sp_paren);
309 
310 	    case 6:		/* do, else */
311 		return (sp_nparen);
312 
313 	    case 7:
314 		ps.sizeof_keyword = true;
315 	    default:		/* all others are treated like any other
316 				 * identifier */
317 		return (ident);
318 	    }			/* end of switch */
319 	}			/* end of if (found_it) */
320 	if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
321 	    char *tp = buf_ptr;
322 	    while (tp < buf_end)
323 		if (*tp++ == ')' && (*tp == ';' || *tp == ','))
324 		    goto not_proc;
325 	    strncpy(ps.procname, token, sizeof ps.procname - 1);
326 	    ps.in_parameter_declaration = 1;
327 	    rparen_count = 1;
328     not_proc:;
329 	}
330 	/*
331 	 * The following hack attempts to guess whether or not the current
332 	 * token is in fact a declaration keyword -- one that has been
333 	 * typedefd
334 	 */
335 	if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
336 		&& !ps.p_l_follow
337 	        && !ps.block_init
338 		&& (ps.last_token == rparen || ps.last_token == semicolon ||
339 		    ps.last_token == decl ||
340 		    ps.last_token == lbrace || ps.last_token == rbrace)) {
341 	    ps.its_a_keyword = true;
342 	    ps.last_u_d = true;
343 	    last_code = decl;
344 	    return decl;
345 	}
346 	if (last_code == decl)	/* if this is a declared variable, then
347 				 * following sign is unary */
348 	    ps.last_u_d = true;	/* will make "int a -1" work */
349 	last_code = ident;
350 	return (ident);		/* the ident is not in the list */
351     }				/* end of procesing for alpanum character */
352 
353     /* Scan a non-alphanumeric token */
354 
355     *e_token++ = *buf_ptr;		/* if it is only a one-character token, it is
356 				 * moved here */
357     *e_token = '\0';
358     if (++buf_ptr >= buf_end)
359 	fill_buffer();
360 
361     switch (*token) {
362     case '\n':
363 	unary_delim = ps.last_u_d;
364 	ps.last_nl = true;	/* remember that we just had a newline */
365 	code = (had_eof ? 0 : newline);
366 
367 	/*
368 	 * if data has been exhausted, the newline is a dummy, and we should
369 	 * return code to stop
370 	 */
371 	break;
372 
373     case '\'':			/* start of quoted character */
374     case '"':			/* start of string */
375 	qchar = *token;
376 	if (troff) {
377 	    e_token[-1] = '`';
378 	    if (qchar == '"')
379 		*e_token++ = '`';
380 	    e_token = chfont(&bodyf, &stringf, e_token);
381 	}
382 	do {			/* copy the string */
383 	    while (1) {		/* move one character or [/<char>]<char> */
384 		if (*buf_ptr == '\n') {
385 		    diag2(1, "Unterminated literal");
386 		    goto stop_lit;
387 		}
388 		CHECK_SIZE_TOKEN;	/* Only have to do this once in this loop,
389 					 * since CHECK_SIZE guarantees that there
390 					 * are at least 5 entries left */
391 		*e_token = *buf_ptr++;
392 		if (buf_ptr >= buf_end)
393 		    fill_buffer();
394 		if (*e_token == BACKSLASH) {	/* if escape, copy extra char */
395 		    if (*buf_ptr == '\n')	/* check for escaped newline */
396 			++line_no;
397 		    if (troff) {
398 			*++e_token = BACKSLASH;
399 			if (*buf_ptr == BACKSLASH)
400 			    *++e_token = BACKSLASH;
401 		    }
402 		    *++e_token = *buf_ptr++;
403 		    ++e_token;	/* we must increment this again because we
404 				 * copied two chars */
405 		    if (buf_ptr >= buf_end)
406 			fill_buffer();
407 		}
408 		else
409 		    break;	/* we copied one character */
410 	    }			/* end of while (1) */
411 	} while (*e_token++ != qchar);
412 	if (troff) {
413 	    e_token = chfont(&stringf, &bodyf, e_token - 1);
414 	    if (qchar == '"')
415 		*e_token++ = '\'';
416 	}
417 stop_lit:
418 	code = ident;
419 	break;
420 
421     case ('('):
422     case ('['):
423 	unary_delim = true;
424 	code = lparen;
425 	break;
426 
427     case (')'):
428     case (']'):
429 	code = rparen;
430 	break;
431 
432     case '#':
433 	unary_delim = ps.last_u_d;
434 	code = preesc;
435 	break;
436 
437     case '?':
438 	unary_delim = true;
439 	code = question;
440 	break;
441 
442     case (':'):
443 	code = colon;
444 	unary_delim = true;
445 	break;
446 
447     case (';'):
448 	unary_delim = true;
449 	code = semicolon;
450 	break;
451 
452     case ('{'):
453 	unary_delim = true;
454 
455 	/*
456 	 * if (ps.in_or_st) ps.block_init = 1;
457 	 */
458 	/* ?	code = ps.block_init ? lparen : lbrace; */
459 	code = lbrace;
460 	break;
461 
462     case ('}'):
463 	unary_delim = true;
464 	/* ?	code = ps.block_init ? rparen : rbrace; */
465 	code = rbrace;
466 	break;
467 
468     case 014:			/* a form feed */
469 	unary_delim = ps.last_u_d;
470 	ps.last_nl = true;	/* remember this so we can set 'ps.col_1'
471 				 * right */
472 	code = form_feed;
473 	break;
474 
475     case (','):
476 	unary_delim = true;
477 	code = comma;
478 	break;
479 
480     case '.':
481 	unary_delim = false;
482 	code = period;
483 	break;
484 
485     case '-':
486     case '+':			/* check for -, +, --, ++ */
487 	code = (ps.last_u_d ? unary_op : binary_op);
488 	unary_delim = true;
489 
490 	if (*buf_ptr == token[0]) {
491 	    /* check for doubled character */
492 	    *e_token++ = *buf_ptr++;
493 	    /* buffer overflow will be checked at end of loop */
494 	    if (last_code == ident || last_code == rparen) {
495 		code = (ps.last_u_d ? unary_op : postop);
496 		/* check for following ++ or -- */
497 		unary_delim = false;
498 	    }
499 	}
500 	else if (*buf_ptr == '=')
501 	    /* check for operator += */
502 	    *e_token++ = *buf_ptr++;
503 	else if (*buf_ptr == '>') {
504 	    /* check for operator -> */
505 	    *e_token++ = *buf_ptr++;
506 	    if (!pointer_as_binop) {
507 		unary_delim = false;
508 		code = unary_op;
509 		ps.want_blank = false;
510 	    }
511 	}
512 	break;			/* buffer overflow will be checked at end of
513 				 * switch */
514 
515     case '=':
516 	if (ps.in_or_st)
517 	    ps.block_init = 1;
518 #ifdef undef
519 	if (chartype[*buf_ptr] == opchar) {	/* we have two char assignment */
520 	    e_token[-1] = *buf_ptr++;
521 	    if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
522 		*e_token++ = *buf_ptr++;
523 	    *e_token++ = '=';	/* Flip =+ to += */
524 	    *e_token = 0;
525 	}
526 #else
527 	if (*buf_ptr == '=') {/* == */
528 	    *e_token++ = '=';	/* Flip =+ to += */
529 	    buf_ptr++;
530 	    *e_token = 0;
531 	}
532 #endif
533 	code = binary_op;
534 	unary_delim = true;
535 	break;
536 	/* can drop thru!!! */
537 
538     case '>':
539     case '<':
540     case '!':			/* ops like <, <<, <=, !=, etc */
541 	if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
542 	    *e_token++ = *buf_ptr;
543 	    if (++buf_ptr >= buf_end)
544 		fill_buffer();
545 	}
546 	if (*buf_ptr == '=')
547 	    *e_token++ = *buf_ptr++;
548 	code = (ps.last_u_d ? unary_op : binary_op);
549 	unary_delim = true;
550 	break;
551 
552     default:
553 	if (token[0] == '/' && *buf_ptr == '*') {
554 	    /* it is start of comment */
555 	    *e_token++ = '*';
556 
557 	    if (++buf_ptr >= buf_end)
558 		fill_buffer();
559 
560 	    code = comment;
561 	    unary_delim = ps.last_u_d;
562 	    break;
563 	}
564 	while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
565 	    /*
566 	     * handle ||, &&, etc, and also things as in int *****i
567 	     */
568 	    *e_token++ = *buf_ptr;
569 	    if (++buf_ptr >= buf_end)
570 		fill_buffer();
571 	}
572 	code = (ps.last_u_d ? unary_op : binary_op);
573 	unary_delim = true;
574 
575 
576     }				/* end of switch */
577     if (code != newline) {
578 	l_struct = false;
579 	last_code = code;
580     }
581     if (buf_ptr >= buf_end)	/* check for input buffer empty */
582 	fill_buffer();
583     ps.last_u_d = unary_delim;
584     *e_token = '\0';		/* null terminate the token */
585     return (code);
586 }
587 
588 /*
589  * Add the given keyword to the keyword table, using val as the keyword type
590  */
591 void
592 addkey(char *key, int val)
593 {
594     struct templ *p = specials;
595     while (p->rwd)
596 	if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
597 	    return;
598 	else
599 	    p++;
600     if (p >= specials + sizeof specials / sizeof specials[0])
601 	return;			/* For now, table overflows are silently
602 				 * ignored */
603     p->rwd = key;
604     p->rwcode = val;
605     p[1].rwd = 0;
606     p[1].rwcode = 0;
607 }
608