xref: /freebsd/usr.bin/indent/lexi.c (revision 884a2a699669ec61e2366e3e358342dbc94be24a)
1 /*
2  * Copyright (c) 1985 Sun Microsystems, Inc.
3  * Copyright (c) 1980, 1993
4  *	The Regents of the University of California.  All rights reserved.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. All advertising materials mentioning features or use of this software
16  *    must display the following acknowledgement:
17  *	This product includes software developed by the University of
18  *	California, Berkeley and its contributors.
19  * 4. Neither the name of the University nor the names of its contributors
20  *    may be used to endorse or promote products derived from this software
21  *    without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 
36 #if 0
37 #ifndef lint
38 static char sccsid[] = "@(#)lexi.c	8.1 (Berkeley) 6/6/93";
39 #endif /* not lint */
40 #endif
41 #include <sys/cdefs.h>
42 __FBSDID("$FreeBSD$");
43 
44 /*
45  * Here we have the token scanner for indent.  It scans off one token and puts
46  * it in the global variable "token".  It returns a code, indicating the type
47  * of token scanned.
48  */
49 
50 #include <err.h>
51 #include <stdio.h>
52 #include <ctype.h>
53 #include <stdlib.h>
54 #include <string.h>
55 #include "indent_globs.h"
56 #include "indent_codes.h"
57 #include "indent.h"
58 
/*
 * Character class codes stored in chartype[].  A value of 0 in the table
 * means the character belongs to neither class.
 */
#define alphanum 1		/* character may appear in an identifier or
				 * number */
#define opchar 3		/* character is an operator character */
61 
/*
 * One entry of the reserved-word table: the spelling of the word and the
 * class code that lexi() switches on when the word is recognized.
 */
struct templ {
    const char *rwd;		/* keyword text; NULL marks the end of the
				 * table */
    int         rwcode;		/* keyword class, see the list below */
};

/*
 * Reserved-word table searched by lexi() and extended by addkey().  The
 * table is terminated by an entry whose rwd is NULL; the unused tail of the
 * array leaves room for keywords added at run time.
 *
 * Meaning of rwcode, as interpreted by the switch in lexi():
 *	0	no special treatment (returned as a plain identifier)
 *	1	"switch"
 *	2	"case" or "default"
 *	3	"struct", "union", "enum" (remembered for the next token)
 *	4	declaration keyword
 *	5	keyword followed by a parenthesized expression (if, while, for)
 *	6	keyword not followed by parentheses (else, do)
 *	7	"sizeof"
 */
struct templ specials[1000] =
{
    {"switch", 1},
    {"case", 2},
    {"break", 0},
    {"struct", 3},
    {"union", 3},
    {"enum", 3},
    {"default", 2},
    {"int", 4},
    {"char", 4},
    {"float", 4},
    {"double", 4},
    {"long", 4},
    {"short", 4},
    {"typedef", 4},
    {"unsigned", 4},
    {"register", 4},
    {"static", 4},
    {"global", 4},		/* not a C keyword; kept for compatibility
				 * with existing behavior */
    {"extern", 4},
    {"void", 4},
    {"const", 4},
    {"volatile", 4},
    {"goto", 0},
    {"return", 0},
    {"if", 5},
    {"while", 5},
    {"for", 5},
    {"else", 6},
    {"do", 6},
    {"sizeof", 7},
    {NULL, 0}
};
101 
/*
 * Classification table indexed by 7-bit character code.  Each entry is 1 for
 * characters that may appear in identifiers and numbers, 3 for operator
 * characters, and 0 for everything else.  One row below covers 16 codes.
 */
char        chartype[128] =
{
    /* 0x00 - 0x0f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 - 0x1f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 - 0x2f: space and punctuation; '$' is classed as alphanumeric */
    0, 3, 0, 0, 1, 3, 3, 0, 0, 0, 3, 3, 0, 3, 0, 3,
    /* 0x30 - 0x3f: digits, then colon, semicolon and four operators */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 3, 3, 3,
    /* 0x40 - 0x4f: '@', then 'A' through 'O' */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 - 0x5f: 'P' through 'Z', brackets, backslash, '^', '_' */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 1,
    /* 0x60 - 0x6f: backquote, then 'a' through 'o' */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 - 0x7f: 'p' through 'z', braces, '|', '~', DEL */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 0, 3, 0
};
123 
124 int
125 lexi(void)
126 {
127     int         unary_delim;	/* this is set to 1 if the current token
128 				 * forces a following operator to be unary */
129     static int  last_code;	/* the last token type returned */
130     static int  l_struct;	/* set to 1 if the last token was 'struct' */
131     int         code;		/* internal code to be returned */
132     char        qchar;		/* the delimiter character for a string */
133 
134     e_token = s_token;		/* point to start of place to save token */
135     unary_delim = false;
136     ps.col_1 = ps.last_nl;	/* tell world that this token started in
137 				 * column 1 iff the last thing scanned was nl */
138     ps.last_nl = false;
139 
140     while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
141 	ps.col_1 = false;	/* leading blanks imply token is not in column
142 				 * 1 */
143 	if (++buf_ptr >= buf_end)
144 	    fill_buffer();
145     }
146 
147     /* Scan an alphanumeric token */
148     if (chartype[(int)*buf_ptr] == alphanum || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
149 	/*
150 	 * we have a character or number
151 	 */
152 	const char *j;		/* used for searching thru list of
153 				 *
154 				 * reserved words */
155 	struct templ *p;
156 
157 	if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
158 	    int         seendot = 0,
159 	                seenexp = 0,
160 			seensfx = 0;
161 	    if (*buf_ptr == '0' &&
162 		    (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
163 		*e_token++ = *buf_ptr++;
164 		*e_token++ = *buf_ptr++;
165 		while (isxdigit(*buf_ptr)) {
166 		    CHECK_SIZE_TOKEN;
167 		    *e_token++ = *buf_ptr++;
168 		}
169 	    }
170 	    else
171 		while (1) {
172 		    if (*buf_ptr == '.') {
173 			if (seendot)
174 			    break;
175 			else
176 			    seendot++;
177 		    }
178 		    CHECK_SIZE_TOKEN;
179 		    *e_token++ = *buf_ptr++;
180 		    if (!isdigit(*buf_ptr) && *buf_ptr != '.') {
181 			if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
182 			    break;
183 			else {
184 			    seenexp++;
185 			    seendot++;
186 			    CHECK_SIZE_TOKEN;
187 			    *e_token++ = *buf_ptr++;
188 			    if (*buf_ptr == '+' || *buf_ptr == '-')
189 				*e_token++ = *buf_ptr++;
190 			}
191 		    }
192 		}
193 	    while (1) {
194 		if (!(seensfx & 1) &&
195 			(*buf_ptr == 'U' || *buf_ptr == 'u')) {
196 		    CHECK_SIZE_TOKEN;
197 		    *e_token++ = *buf_ptr++;
198 		    seensfx |= 1;
199 		    continue;
200 		}
201         	if (!(seensfx & 2) &&
202 			(*buf_ptr == 'L' || *buf_ptr == 'l')) {
203 		    CHECK_SIZE_TOKEN;
204 		    if (buf_ptr[1] == buf_ptr[0])
205 		        *e_token++ = *buf_ptr++;
206 		    *e_token++ = *buf_ptr++;
207 		    seensfx |= 2;
208 		    continue;
209 		}
210 		break;
211 	    }
212 	}
213 	else
214 	    while (chartype[(int)*buf_ptr] == alphanum || *buf_ptr == BACKSLASH) {
215 		/* fill_buffer() terminates buffer with newline */
216 		if (*buf_ptr == BACKSLASH) {
217 		    if (*(buf_ptr + 1) == '\n') {
218 			buf_ptr += 2;
219 			if (buf_ptr >= buf_end)
220 			    fill_buffer();
221 			} else
222 			    break;
223 		}
224 		CHECK_SIZE_TOKEN;
225 		/* copy it over */
226 		*e_token++ = *buf_ptr++;
227 		if (buf_ptr >= buf_end)
228 		    fill_buffer();
229 	    }
230 	*e_token++ = '\0';
231 	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
232 	    if (++buf_ptr >= buf_end)
233 		fill_buffer();
234 	}
235 	ps.its_a_keyword = false;
236 	ps.sizeof_keyword = false;
237 	if (l_struct && !ps.p_l_follow) {
238 				/* if last token was 'struct' and we're not
239 				 * in parentheses, then this token
240 				 * should be treated as a declaration */
241 	    l_struct = false;
242 	    last_code = ident;
243 	    ps.last_u_d = true;
244 	    return (decl);
245 	}
246 	ps.last_u_d = l_struct;	/* Operator after identifier is binary
247 				 * unless last token was 'struct' */
248 	l_struct = false;
249 	last_code = ident;	/* Remember that this is the code we will
250 				 * return */
251 
252 	if (auto_typedefs) {
253 	    const char *q = s_token;
254 	    size_t q_len = strlen(q);
255 	    /* Check if we have an "_t" in the end */
256 	    if (q_len > 2 &&
257 	        (strcmp(q + q_len - 2, "_t") == 0)) {
258 	        ps.its_a_keyword = true;
259 		ps.last_u_d = true;
260 	        goto found_auto_typedef;
261 	    }
262 	}
263 
264 	/*
265 	 * This loop will check if the token is a keyword.
266 	 */
267 	for (p = specials; (j = p->rwd) != 0; p++) {
268 	    const char *q = s_token;	/* point at scanned token */
269 	    if (*j++ != *q++ || *j++ != *q++)
270 		continue;	/* This test depends on the fact that
271 				 * identifiers are always at least 1 character
272 				 * long (ie. the first two bytes of the
273 				 * identifier are always meaningful) */
274 	    if (q[-1] == 0)
275 		break;		/* If its a one-character identifier */
276 	    while (*q++ == *j)
277 		if (*j++ == 0)
278 		    goto found_keyword;	/* I wish that C had a multi-level
279 					 * break... */
280 	}
281 	if (p->rwd) {		/* we have a keyword */
282     found_keyword:
283 	    ps.its_a_keyword = true;
284 	    ps.last_u_d = true;
285 	    switch (p->rwcode) {
286 	    case 1:		/* it is a switch */
287 		return (swstmt);
288 	    case 2:		/* a case or default */
289 		return (casestmt);
290 
291 	    case 3:		/* a "struct" */
292 		/*
293 		 * Next time around, we will want to know that we have had a
294 		 * 'struct'
295 		 */
296 		l_struct = true;
297 		/* FALLTHROUGH */
298 
299 	    case 4:		/* one of the declaration keywords */
300 	    found_auto_typedef:
301 		if (ps.p_l_follow) {
302 		    ps.cast_mask |= (1 << ps.p_l_follow) & ~ps.sizeof_mask;
303 		    break;	/* inside parens: cast, param list or sizeof */
304 		}
305 		last_code = decl;
306 		return (decl);
307 
308 	    case 5:		/* if, while, for */
309 		return (sp_paren);
310 
311 	    case 6:		/* do, else */
312 		return (sp_nparen);
313 
314 	    case 7:
315 		ps.sizeof_keyword = true;
316 	    default:		/* all others are treated like any other
317 				 * identifier */
318 		return (ident);
319 	    }			/* end of switch */
320 	}			/* end of if (found_it) */
321 	if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
322 	    char *tp = buf_ptr;
323 	    while (tp < buf_end)
324 		if (*tp++ == ')' && (*tp == ';' || *tp == ','))
325 		    goto not_proc;
326 	    strncpy(ps.procname, token, sizeof ps.procname - 1);
327 	    ps.in_parameter_declaration = 1;
328 	    rparen_count = 1;
329     not_proc:;
330 	}
331 	/*
332 	 * The following hack attempts to guess whether or not the current
333 	 * token is in fact a declaration keyword -- one that has been
334 	 * typedefd
335 	 */
336 	if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
337 		&& !ps.p_l_follow
338 	        && !ps.block_init
339 		&& (ps.last_token == rparen || ps.last_token == semicolon ||
340 		    ps.last_token == decl ||
341 		    ps.last_token == lbrace || ps.last_token == rbrace)) {
342 	    ps.its_a_keyword = true;
343 	    ps.last_u_d = true;
344 	    last_code = decl;
345 	    return decl;
346 	}
347 	if (last_code == decl)	/* if this is a declared variable, then
348 				 * following sign is unary */
349 	    ps.last_u_d = true;	/* will make "int a -1" work */
350 	last_code = ident;
351 	return (ident);		/* the ident is not in the list */
352     }				/* end of procesing for alpanum character */
353 
354     /* Scan a non-alphanumeric token */
355 
356     *e_token++ = *buf_ptr;		/* if it is only a one-character token, it is
357 				 * moved here */
358     *e_token = '\0';
359     if (++buf_ptr >= buf_end)
360 	fill_buffer();
361 
362     switch (*token) {
363     case '\n':
364 	unary_delim = ps.last_u_d;
365 	ps.last_nl = true;	/* remember that we just had a newline */
366 	code = (had_eof ? 0 : newline);
367 
368 	/*
369 	 * if data has been exhausted, the newline is a dummy, and we should
370 	 * return code to stop
371 	 */
372 	break;
373 
374     case '\'':			/* start of quoted character */
375     case '"':			/* start of string */
376 	qchar = *token;
377 	if (troff) {
378 	    e_token[-1] = '`';
379 	    if (qchar == '"')
380 		*e_token++ = '`';
381 	    e_token = chfont(&bodyf, &stringf, e_token);
382 	}
383 	do {			/* copy the string */
384 	    while (1) {		/* move one character or [/<char>]<char> */
385 		if (*buf_ptr == '\n') {
386 		    diag2(1, "Unterminated literal");
387 		    goto stop_lit;
388 		}
389 		CHECK_SIZE_TOKEN;	/* Only have to do this once in this loop,
390 					 * since CHECK_SIZE guarantees that there
391 					 * are at least 5 entries left */
392 		*e_token = *buf_ptr++;
393 		if (buf_ptr >= buf_end)
394 		    fill_buffer();
395 		if (*e_token == BACKSLASH) {	/* if escape, copy extra char */
396 		    if (*buf_ptr == '\n')	/* check for escaped newline */
397 			++line_no;
398 		    if (troff) {
399 			*++e_token = BACKSLASH;
400 			if (*buf_ptr == BACKSLASH)
401 			    *++e_token = BACKSLASH;
402 		    }
403 		    *++e_token = *buf_ptr++;
404 		    ++e_token;	/* we must increment this again because we
405 				 * copied two chars */
406 		    if (buf_ptr >= buf_end)
407 			fill_buffer();
408 		}
409 		else
410 		    break;	/* we copied one character */
411 	    }			/* end of while (1) */
412 	} while (*e_token++ != qchar);
413 	if (troff) {
414 	    e_token = chfont(&stringf, &bodyf, e_token - 1);
415 	    if (qchar == '"')
416 		*e_token++ = '\'';
417 	}
418 stop_lit:
419 	code = ident;
420 	break;
421 
422     case ('('):
423     case ('['):
424 	unary_delim = true;
425 	code = lparen;
426 	break;
427 
428     case (')'):
429     case (']'):
430 	code = rparen;
431 	break;
432 
433     case '#':
434 	unary_delim = ps.last_u_d;
435 	code = preesc;
436 	break;
437 
438     case '?':
439 	unary_delim = true;
440 	code = question;
441 	break;
442 
443     case (':'):
444 	code = colon;
445 	unary_delim = true;
446 	break;
447 
448     case (';'):
449 	unary_delim = true;
450 	code = semicolon;
451 	break;
452 
453     case ('{'):
454 	unary_delim = true;
455 
456 	/*
457 	 * if (ps.in_or_st) ps.block_init = 1;
458 	 */
459 	/* ?	code = ps.block_init ? lparen : lbrace; */
460 	code = lbrace;
461 	break;
462 
463     case ('}'):
464 	unary_delim = true;
465 	/* ?	code = ps.block_init ? rparen : rbrace; */
466 	code = rbrace;
467 	break;
468 
469     case 014:			/* a form feed */
470 	unary_delim = ps.last_u_d;
471 	ps.last_nl = true;	/* remember this so we can set 'ps.col_1'
472 				 * right */
473 	code = form_feed;
474 	break;
475 
476     case (','):
477 	unary_delim = true;
478 	code = comma;
479 	break;
480 
481     case '.':
482 	unary_delim = false;
483 	code = period;
484 	break;
485 
486     case '-':
487     case '+':			/* check for -, +, --, ++ */
488 	code = (ps.last_u_d ? unary_op : binary_op);
489 	unary_delim = true;
490 
491 	if (*buf_ptr == token[0]) {
492 	    /* check for doubled character */
493 	    *e_token++ = *buf_ptr++;
494 	    /* buffer overflow will be checked at end of loop */
495 	    if (last_code == ident || last_code == rparen) {
496 		code = (ps.last_u_d ? unary_op : postop);
497 		/* check for following ++ or -- */
498 		unary_delim = false;
499 	    }
500 	}
501 	else if (*buf_ptr == '=')
502 	    /* check for operator += */
503 	    *e_token++ = *buf_ptr++;
504 	else if (*buf_ptr == '>') {
505 	    /* check for operator -> */
506 	    *e_token++ = *buf_ptr++;
507 	    if (!pointer_as_binop) {
508 		unary_delim = false;
509 		code = unary_op;
510 		ps.want_blank = false;
511 	    }
512 	}
513 	break;			/* buffer overflow will be checked at end of
514 				 * switch */
515 
516     case '=':
517 	if (ps.in_or_st)
518 	    ps.block_init = 1;
519 #ifdef undef
520 	if (chartype[*buf_ptr] == opchar) {	/* we have two char assignment */
521 	    e_token[-1] = *buf_ptr++;
522 	    if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
523 		*e_token++ = *buf_ptr++;
524 	    *e_token++ = '=';	/* Flip =+ to += */
525 	    *e_token = 0;
526 	}
527 #else
528 	if (*buf_ptr == '=') {/* == */
529 	    *e_token++ = '=';	/* Flip =+ to += */
530 	    buf_ptr++;
531 	    *e_token = 0;
532 	}
533 #endif
534 	code = binary_op;
535 	unary_delim = true;
536 	break;
537 	/* can drop thru!!! */
538 
539     case '>':
540     case '<':
541     case '!':			/* ops like <, <<, <=, !=, etc */
542 	if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
543 	    *e_token++ = *buf_ptr;
544 	    if (++buf_ptr >= buf_end)
545 		fill_buffer();
546 	}
547 	if (*buf_ptr == '=')
548 	    *e_token++ = *buf_ptr++;
549 	code = (ps.last_u_d ? unary_op : binary_op);
550 	unary_delim = true;
551 	break;
552 
553     default:
554 	if (token[0] == '/' && *buf_ptr == '*') {
555 	    /* it is start of comment */
556 	    *e_token++ = '*';
557 
558 	    if (++buf_ptr >= buf_end)
559 		fill_buffer();
560 
561 	    code = comment;
562 	    unary_delim = ps.last_u_d;
563 	    break;
564 	}
565 	while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
566 	    /*
567 	     * handle ||, &&, etc, and also things as in int *****i
568 	     */
569 	    *e_token++ = *buf_ptr;
570 	    if (++buf_ptr >= buf_end)
571 		fill_buffer();
572 	}
573 	code = (ps.last_u_d ? unary_op : binary_op);
574 	unary_delim = true;
575 
576 
577     }				/* end of switch */
578     if (code != newline) {
579 	l_struct = false;
580 	last_code = code;
581     }
582     if (buf_ptr >= buf_end)	/* check for input buffer empty */
583 	fill_buffer();
584     ps.last_u_d = unary_delim;
585     *e_token = '\0';		/* null terminate the token */
586     return (code);
587 }
588 
589 /*
590  * Add the given keyword to the keyword table, using val as the keyword type
591  */
592 void
593 addkey(char *key, int val)
594 {
595     struct templ *p = specials;
596     while (p->rwd)
597 	if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
598 	    return;
599 	else
600 	    p++;
601     if (p >= specials + sizeof specials / sizeof specials[0])
602 	return;			/* For now, table overflows are silently
603 				 * ignored */
604     p->rwd = key;
605     p->rwcode = val;
606     p[1].rwd = 0;
607     p[1].rwcode = 0;
608 }
609