xref: /freebsd/usr.bin/indent/lexi.c (revision 999c1fd64b489eda8c04f1e1529f828ebe5c7794)
1 /*
2  * Copyright (c) 1985 Sun Microsystems, Inc.
3  * Copyright (c) 1980, 1993
4  *	The Regents of the University of California.  All rights reserved.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. All advertising materials mentioning features or use of this software
16  *    must display the following acknowledgement:
17  *	This product includes software developed by the University of
18  *	California, Berkeley and its contributors.
19  * 4. Neither the name of the University nor the names of its contributors
20  *    may be used to endorse or promote products derived from this software
21  *    without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 
36 #if 0
37 #ifndef lint
38 static char sccsid[] = "@(#)lexi.c	8.1 (Berkeley) 6/6/93";
39 #endif /* not lint */
40 #endif
41 #include <sys/cdefs.h>
42 __FBSDID("$FreeBSD$");
43 
44 /*
45  * Here we have the token scanner for indent.  It scans off one token and puts
46  * it in the global variable "token".  It returns a code, indicating the type
47  * of token scanned.
48  */
49 
50 #include <err.h>
51 #include <stdio.h>
52 #include <ctype.h>
53 #include <stdlib.h>
54 #include <string.h>
55 #include "indent_globs.h"
56 #include "indent_codes.h"
57 #include "indent.h"
58 
/*
 * Character classes stored in chartype[]: a character that may appear in
 * an identifier or number, and a character that may be part of an operator.
 */
enum {
    alphanum = 1,
    opchar = 3
};
61 
/*
 * One entry of the reserved-word table: the spelling of a word and the
 * class code that lexi() switches on when the word is recognized.
 */
struct templ {
    const char *rwd;		/* the word itself; NULL marks the table end */
    int rwcode;			/* class of the word, see lexi() */
};
66 
67 struct templ specials[1000] =
68 {
69     {"switch", 1},
70     {"case", 2},
71     {"break", 0},
72     {"struct", 3},
73     {"union", 3},
74     {"enum", 3},
75     {"default", 2},
76     {"int", 4},
77     {"char", 4},
78     {"float", 4},
79     {"double", 4},
80     {"long", 4},
81     {"short", 4},
82     {"typedef", 4},
83     {"unsigned", 4},
84     {"register", 4},
85     {"static", 4},
86     {"global", 4},
87     {"extern", 4},
88     {"void", 4},
89     {"const", 4},
90     {"volatile", 4},
91     {"goto", 0},
92     {"return", 0},
93     {"if", 5},
94     {"while", 5},
95     {"for", 5},
96     {"else", 6},
97     {"do", 6},
98     {"sizeof", 7},
99     {0, 0}
100 };
101 
/*
 * Character classification table indexed by 7-bit ASCII code.  An entry is
 * 1 for characters that can be part of an identifier or number (letters,
 * digits, '_' and '$'), 3 for operator characters, and 0 for everything
 * else.
 */
char chartype[128] = {
    0, 0, 0, 0, 0, 0, 0, 0,	/* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,	/* 0x08 - 0x0f */
    0, 0, 0, 0, 0, 0, 0, 0,	/* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,	/* 0x18 - 0x1f */
    0, 3, 0, 0, 1, 3, 3, 0,	/* space ! " # $ % & ' */
    0, 0, 3, 3, 0, 3, 0, 3,	/* ( ) * + , - . / */
    1, 1, 1, 1, 1, 1, 1, 1,	/* 0 1 2 3 4 5 6 7 */
    1, 1, 0, 0, 3, 3, 3, 3,	/* 8 9 : ; < = > ? */
    0, 1, 1, 1, 1, 1, 1, 1,	/* @ A B C D E F G */
    1, 1, 1, 1, 1, 1, 1, 1,	/* H I J K L M N O */
    1, 1, 1, 1, 1, 1, 1, 1,	/* P Q R S T U V W */
    1, 1, 1, 0, 0, 0, 3, 1,	/* X Y Z [ \ ] ^ _ */
    0, 1, 1, 1, 1, 1, 1, 1,	/* ` a b c d e f g */
    1, 1, 1, 1, 1, 1, 1, 1,	/* h i j k l m n o */
    1, 1, 1, 1, 1, 1, 1, 1,	/* p q r s t u v w */
    1, 1, 1, 0, 3, 0, 3, 0	/* x y z { | } ~ DEL */
};
123 
124 int
125 lexi(void)
126 {
127     int         unary_delim;	/* this is set to 1 if the current token
128 				 * forces a following operator to be unary */
129     static int  last_code;	/* the last token type returned */
130     static int  l_struct;	/* set to 1 if the last token was 'struct' */
131     int         code;		/* internal code to be returned */
132     char        qchar;		/* the delimiter character for a string */
133 
134     e_token = s_token;		/* point to start of place to save token */
135     unary_delim = false;
136     ps.col_1 = ps.last_nl;	/* tell world that this token started in
137 				 * column 1 iff the last thing scanned was nl */
138     ps.last_nl = false;
139 
140     while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
141 	ps.col_1 = false;	/* leading blanks imply token is not in column
142 				 * 1 */
143 	if (++buf_ptr >= buf_end)
144 	    fill_buffer();
145     }
146 
147     /* Scan an alphanumeric token */
148     if (chartype[(int)*buf_ptr] == alphanum || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
149 	/*
150 	 * we have a character or number
151 	 */
152 	const char *j;		/* used for searching thru list of
153 				 *
154 				 * reserved words */
155 	struct templ *p;
156 
157 	if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
158 	    int         seendot = 0,
159 	                seenexp = 0,
160 			seensfx = 0;
161 	    if (*buf_ptr == '0' &&
162 		    (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
163 		*e_token++ = *buf_ptr++;
164 		*e_token++ = *buf_ptr++;
165 		while (isxdigit(*buf_ptr)) {
166 		    CHECK_SIZE_TOKEN;
167 		    *e_token++ = *buf_ptr++;
168 		}
169 	    }
170 	    else
171 		while (1) {
172 		    if (*buf_ptr == '.') {
173 			if (seendot)
174 			    break;
175 			else
176 			    seendot++;
177 		    }
178 		    CHECK_SIZE_TOKEN;
179 		    *e_token++ = *buf_ptr++;
180 		    if (!isdigit(*buf_ptr) && *buf_ptr != '.') {
181 			if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
182 			    break;
183 			else {
184 			    seenexp++;
185 			    seendot++;
186 			    CHECK_SIZE_TOKEN;
187 			    *e_token++ = *buf_ptr++;
188 			    if (*buf_ptr == '+' || *buf_ptr == '-')
189 				*e_token++ = *buf_ptr++;
190 			}
191 		    }
192 		}
193 	    while (1) {
194 		if (!(seensfx & 1) && (*buf_ptr == 'U' || *buf_ptr == 'u')) {
195 		    CHECK_SIZE_TOKEN;
196 		    *e_token++ = *buf_ptr++;
197 		    seensfx |= 1;
198 		    continue;
199 		}
200 		if (!(seensfx & 2) && (strchr("fFlL", *buf_ptr) != NULL)) {
201 		    CHECK_SIZE_TOKEN;
202 		    if (buf_ptr[1] == buf_ptr[0])
203 		        *e_token++ = *buf_ptr++;
204 		    *e_token++ = *buf_ptr++;
205 		    seensfx |= 2;
206 		    continue;
207 		}
208 		break;
209 	    }
210 	}
211 	else
212 	    while (chartype[(int)*buf_ptr] == alphanum || *buf_ptr == BACKSLASH) {
213 		/* fill_buffer() terminates buffer with newline */
214 		if (*buf_ptr == BACKSLASH) {
215 		    if (*(buf_ptr + 1) == '\n') {
216 			buf_ptr += 2;
217 			if (buf_ptr >= buf_end)
218 			    fill_buffer();
219 			} else
220 			    break;
221 		}
222 		CHECK_SIZE_TOKEN;
223 		/* copy it over */
224 		*e_token++ = *buf_ptr++;
225 		if (buf_ptr >= buf_end)
226 		    fill_buffer();
227 	    }
228 	*e_token++ = '\0';
229 	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
230 	    if (++buf_ptr >= buf_end)
231 		fill_buffer();
232 	}
233 	ps.its_a_keyword = false;
234 	ps.sizeof_keyword = false;
235 	if (l_struct && !ps.p_l_follow) {
236 				/* if last token was 'struct' and we're not
237 				 * in parentheses, then this token
238 				 * should be treated as a declaration */
239 	    l_struct = false;
240 	    last_code = ident;
241 	    ps.last_u_d = true;
242 	    return (decl);
243 	}
244 	ps.last_u_d = l_struct;	/* Operator after identifier is binary
245 				 * unless last token was 'struct' */
246 	l_struct = false;
247 	last_code = ident;	/* Remember that this is the code we will
248 				 * return */
249 
250 	if (auto_typedefs) {
251 	    const char *q = s_token;
252 	    size_t q_len = strlen(q);
253 	    /* Check if we have an "_t" in the end */
254 	    if (q_len > 2 &&
255 	        (strcmp(q + q_len - 2, "_t") == 0)) {
256 	        ps.its_a_keyword = true;
257 		ps.last_u_d = true;
258 	        goto found_auto_typedef;
259 	    }
260 	}
261 
262 	/*
263 	 * This loop will check if the token is a keyword.
264 	 */
265 	for (p = specials; (j = p->rwd) != NULL; p++) {
266 	    const char *q = s_token;	/* point at scanned token */
267 	    if (*j++ != *q++ || *j++ != *q++)
268 		continue;	/* This test depends on the fact that
269 				 * identifiers are always at least 1 character
270 				 * long (ie. the first two bytes of the
271 				 * identifier are always meaningful) */
272 	    if (q[-1] == 0)
273 		break;		/* If its a one-character identifier */
274 	    while (*q++ == *j)
275 		if (*j++ == 0)
276 		    goto found_keyword;	/* I wish that C had a multi-level
277 					 * break... */
278 	}
279 	if (p->rwd) {		/* we have a keyword */
280     found_keyword:
281 	    ps.its_a_keyword = true;
282 	    ps.last_u_d = true;
283 	    switch (p->rwcode) {
284 	    case 1:		/* it is a switch */
285 		return (swstmt);
286 	    case 2:		/* a case or default */
287 		return (casestmt);
288 
289 	    case 3:		/* a "struct" */
290 		/*
291 		 * Next time around, we will want to know that we have had a
292 		 * 'struct'
293 		 */
294 		l_struct = true;
295 		/* FALLTHROUGH */
296 
297 	    case 4:		/* one of the declaration keywords */
298 	    found_auto_typedef:
299 		if (ps.p_l_follow) {
300 		    ps.cast_mask |= (1 << ps.p_l_follow) & ~ps.sizeof_mask;
301 		    break;	/* inside parens: cast, param list or sizeof */
302 		}
303 		last_code = decl;
304 		return (decl);
305 
306 	    case 5:		/* if, while, for */
307 		return (sp_paren);
308 
309 	    case 6:		/* do, else */
310 		return (sp_nparen);
311 
312 	    case 7:
313 		ps.sizeof_keyword = true;
314 	    default:		/* all others are treated like any other
315 				 * identifier */
316 		return (ident);
317 	    }			/* end of switch */
318 	}			/* end of if (found_it) */
319 	if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
320 	    char *tp = buf_ptr;
321 	    while (tp < buf_end)
322 		if (*tp++ == ')' && (*tp == ';' || *tp == ','))
323 		    goto not_proc;
324 	    strncpy(ps.procname, token, sizeof ps.procname - 1);
325 	    ps.in_parameter_declaration = 1;
326 	    rparen_count = 1;
327     not_proc:;
328 	}
329 	/*
330 	 * The following hack attempts to guess whether or not the current
331 	 * token is in fact a declaration keyword -- one that has been
332 	 * typedefd
333 	 */
334 	if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
335 		&& !ps.p_l_follow
336 	        && !ps.block_init
337 		&& (ps.last_token == rparen || ps.last_token == semicolon ||
338 		    ps.last_token == decl ||
339 		    ps.last_token == lbrace || ps.last_token == rbrace)) {
340 	    ps.its_a_keyword = true;
341 	    ps.last_u_d = true;
342 	    last_code = decl;
343 	    return decl;
344 	}
345 	if (last_code == decl)	/* if this is a declared variable, then
346 				 * following sign is unary */
347 	    ps.last_u_d = true;	/* will make "int a -1" work */
348 	last_code = ident;
349 	return (ident);		/* the ident is not in the list */
350     }				/* end of procesing for alpanum character */
351 
352     /* Scan a non-alphanumeric token */
353 
354     *e_token++ = *buf_ptr;		/* if it is only a one-character token, it is
355 				 * moved here */
356     *e_token = '\0';
357     if (++buf_ptr >= buf_end)
358 	fill_buffer();
359 
360     switch (*token) {
361     case '\n':
362 	unary_delim = ps.last_u_d;
363 	ps.last_nl = true;	/* remember that we just had a newline */
364 	code = (had_eof ? 0 : newline);
365 
366 	/*
367 	 * if data has been exhausted, the newline is a dummy, and we should
368 	 * return code to stop
369 	 */
370 	break;
371 
372     case '\'':			/* start of quoted character */
373     case '"':			/* start of string */
374 	qchar = *token;
375 	if (troff) {
376 	    e_token[-1] = '`';
377 	    if (qchar == '"')
378 		*e_token++ = '`';
379 	    e_token = chfont(&bodyf, &stringf, e_token);
380 	}
381 	do {			/* copy the string */
382 	    while (1) {		/* move one character or [/<char>]<char> */
383 		if (*buf_ptr == '\n') {
384 		    diag2(1, "Unterminated literal");
385 		    goto stop_lit;
386 		}
387 		CHECK_SIZE_TOKEN;	/* Only have to do this once in this loop,
388 					 * since CHECK_SIZE guarantees that there
389 					 * are at least 5 entries left */
390 		*e_token = *buf_ptr++;
391 		if (buf_ptr >= buf_end)
392 		    fill_buffer();
393 		if (*e_token == BACKSLASH) {	/* if escape, copy extra char */
394 		    if (*buf_ptr == '\n')	/* check for escaped newline */
395 			++line_no;
396 		    if (troff) {
397 			*++e_token = BACKSLASH;
398 			if (*buf_ptr == BACKSLASH)
399 			    *++e_token = BACKSLASH;
400 		    }
401 		    *++e_token = *buf_ptr++;
402 		    ++e_token;	/* we must increment this again because we
403 				 * copied two chars */
404 		    if (buf_ptr >= buf_end)
405 			fill_buffer();
406 		}
407 		else
408 		    break;	/* we copied one character */
409 	    }			/* end of while (1) */
410 	} while (*e_token++ != qchar);
411 	if (troff) {
412 	    e_token = chfont(&stringf, &bodyf, e_token - 1);
413 	    if (qchar == '"')
414 		*e_token++ = '\'';
415 	}
416 stop_lit:
417 	code = ident;
418 	break;
419 
420     case ('('):
421     case ('['):
422 	unary_delim = true;
423 	code = lparen;
424 	break;
425 
426     case (')'):
427     case (']'):
428 	code = rparen;
429 	break;
430 
431     case '#':
432 	unary_delim = ps.last_u_d;
433 	code = preesc;
434 	break;
435 
436     case '?':
437 	unary_delim = true;
438 	code = question;
439 	break;
440 
441     case (':'):
442 	code = colon;
443 	unary_delim = true;
444 	break;
445 
446     case (';'):
447 	unary_delim = true;
448 	code = semicolon;
449 	break;
450 
451     case ('{'):
452 	unary_delim = true;
453 
454 	/*
455 	 * if (ps.in_or_st) ps.block_init = 1;
456 	 */
457 	/* ?	code = ps.block_init ? lparen : lbrace; */
458 	code = lbrace;
459 	break;
460 
461     case ('}'):
462 	unary_delim = true;
463 	/* ?	code = ps.block_init ? rparen : rbrace; */
464 	code = rbrace;
465 	break;
466 
467     case 014:			/* a form feed */
468 	unary_delim = ps.last_u_d;
469 	ps.last_nl = true;	/* remember this so we can set 'ps.col_1'
470 				 * right */
471 	code = form_feed;
472 	break;
473 
474     case (','):
475 	unary_delim = true;
476 	code = comma;
477 	break;
478 
479     case '.':
480 	unary_delim = false;
481 	code = period;
482 	break;
483 
484     case '-':
485     case '+':			/* check for -, +, --, ++ */
486 	code = (ps.last_u_d ? unary_op : binary_op);
487 	unary_delim = true;
488 
489 	if (*buf_ptr == token[0]) {
490 	    /* check for doubled character */
491 	    *e_token++ = *buf_ptr++;
492 	    /* buffer overflow will be checked at end of loop */
493 	    if (last_code == ident || last_code == rparen) {
494 		code = (ps.last_u_d ? unary_op : postop);
495 		/* check for following ++ or -- */
496 		unary_delim = false;
497 	    }
498 	}
499 	else if (*buf_ptr == '=')
500 	    /* check for operator += */
501 	    *e_token++ = *buf_ptr++;
502 	else if (*buf_ptr == '>') {
503 	    /* check for operator -> */
504 	    *e_token++ = *buf_ptr++;
505 	    if (!pointer_as_binop) {
506 		unary_delim = false;
507 		code = unary_op;
508 		ps.want_blank = false;
509 	    }
510 	}
511 	break;			/* buffer overflow will be checked at end of
512 				 * switch */
513 
514     case '=':
515 	if (ps.in_or_st)
516 	    ps.block_init = 1;
517 #ifdef undef
518 	if (chartype[*buf_ptr] == opchar) {	/* we have two char assignment */
519 	    e_token[-1] = *buf_ptr++;
520 	    if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
521 		*e_token++ = *buf_ptr++;
522 	    *e_token++ = '=';	/* Flip =+ to += */
523 	    *e_token = 0;
524 	}
525 #else
526 	if (*buf_ptr == '=') {/* == */
527 	    *e_token++ = '=';	/* Flip =+ to += */
528 	    buf_ptr++;
529 	    *e_token = 0;
530 	}
531 #endif
532 	code = binary_op;
533 	unary_delim = true;
534 	break;
535 	/* can drop thru!!! */
536 
537     case '>':
538     case '<':
539     case '!':			/* ops like <, <<, <=, !=, etc */
540 	if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
541 	    *e_token++ = *buf_ptr;
542 	    if (++buf_ptr >= buf_end)
543 		fill_buffer();
544 	}
545 	if (*buf_ptr == '=')
546 	    *e_token++ = *buf_ptr++;
547 	code = (ps.last_u_d ? unary_op : binary_op);
548 	unary_delim = true;
549 	break;
550 
551     default:
552 	if (token[0] == '/' && *buf_ptr == '*') {
553 	    /* it is start of comment */
554 	    *e_token++ = '*';
555 
556 	    if (++buf_ptr >= buf_end)
557 		fill_buffer();
558 
559 	    code = comment;
560 	    unary_delim = ps.last_u_d;
561 	    break;
562 	}
563 	while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
564 	    /*
565 	     * handle ||, &&, etc, and also things as in int *****i
566 	     */
567 	    *e_token++ = *buf_ptr;
568 	    if (++buf_ptr >= buf_end)
569 		fill_buffer();
570 	}
571 	code = (ps.last_u_d ? unary_op : binary_op);
572 	unary_delim = true;
573 
574 
575     }				/* end of switch */
576     if (code != newline) {
577 	l_struct = false;
578 	last_code = code;
579     }
580     if (buf_ptr >= buf_end)	/* check for input buffer empty */
581 	fill_buffer();
582     ps.last_u_d = unary_delim;
583     *e_token = '\0';		/* null terminate the token */
584     return (code);
585 }
586 
587 /*
588  * Add the given keyword to the keyword table, using val as the keyword type
589  */
590 void
591 addkey(char *key, int val)
592 {
593     struct templ *p = specials;
594     while (p->rwd)
595 	if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
596 	    return;
597 	else
598 	    p++;
599     if (p >= specials + sizeof specials / sizeof specials[0])
600 	return;			/* For now, table overflows are silently
601 				 * ignored */
602     p->rwd = key;
603     p->rwcode = val;
604     p[1].rwd = NULL;
605     p[1].rwcode = 0;
606 }
607