xref: /freebsd/usr.bin/indent/lexi.c (revision 1e413cf93298b5b97441a21d9a50fdcd0ee9945e)
1 /*
2  * Copyright (c) 1985 Sun Microsystems, Inc.
3  * Copyright (c) 1980, 1993
4  *	The Regents of the University of California.  All rights reserved.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. All advertising materials mentioning features or use of this software
16  *    must display the following acknowledgement:
17  *	This product includes software developed by the University of
18  *	California, Berkeley and its contributors.
19  * 4. Neither the name of the University nor the names of its contributors
20  *    may be used to endorse or promote products derived from this software
21  *    without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 
36 #if 0
37 #ifndef lint
38 static char sccsid[] = "@(#)lexi.c	8.1 (Berkeley) 6/6/93";
39 #endif /* not lint */
40 #endif
41 #include <sys/cdefs.h>
42 __FBSDID("$FreeBSD$");
43 
44 /*
45  * Here we have the token scanner for indent.  It scans off one token and puts
46  * it in the global variable "token".  It returns a code, indicating the type
47  * of token scanned.
48  */
49 
50 #include <err.h>
51 #include <stdio.h>
52 #include <ctype.h>
53 #include <stdlib.h>
54 #include <string.h>
55 #include "indent_globs.h"
56 #include "indent_codes.h"
57 #include "indent.h"
58 
/*
 * Character classes stored in chartype[] and compared against by lexi().
 * A table value of 0 means the character belongs to neither class.
 */
#define alphanum 1	/* letters, digits, '_' and '$' in chartype[] */
#define opchar 3	/* characters that can be part of an operator */
61 
/*
 * One entry of the reserved-word table.
 *
 * rwd    - spelling of the keyword; a null pointer terminates the table.
 * rwcode - class of the keyword, as interpreted by the switch in lexi():
 *	      0  no special handling (returned as ident)
 *	      1  "switch"			-> swstmt
 *	      2  "case" / "default"		-> casestmt
 *	      3  "struct"-like tag keyword	-> decl, next identifier is a tag
 *	      4  declaration keyword		-> decl
 *	      5  "if" / "while" / "for"		-> sp_paren
 *	      6  "else" / "do"			-> sp_nparen
 *	      7  "sizeof"			-> ident, sets ps.sizeof_keyword
 */
struct templ {
    const char *rwd;
    int         rwcode;
};

/*
 * Reserved-word table searched by lexi() and extended at run time by
 * addkey().  The table is oversized so that addkey() has room to append;
 * the entry with a null rwd marks the current end.
 */
struct templ specials[1000] =
{
    {"switch", 1},
    {"case", 2},
    {"break", 0},
    {"struct", 3},
    {"union", 3},
    {"enum", 3},
    {"default", 2},
    {"int", 4},
    {"char", 4},
    {"float", 4},
    {"double", 4},
    {"long", 4},
    {"short", 4},
    {"typedef", 4},		/* was misspelled "typdef" and never matched */
    {"unsigned", 4},
    {"register", 4},
    {"static", 4},
    {"global", 4},
    {"extern", 4},
    {"void", 4},
    {"const", 4},
    {"volatile", 4},
    {"goto", 0},
    {"return", 0},
    {"if", 5},
    {"while", 5},
    {"for", 5},
    {"else", 6},
    {"do", 6},
    {"sizeof", 7},
    {NULL, 0}
};
101 
/*
 * Character classification table, indexed by 7-bit character code:
 *   0 - neither class
 *   1 - alphanumeric (letters, digits, '_' and '$')
 *   3 - operator character
 * Each row below covers sixteen consecutive codes.
 */
char        chartype[128] =
{
    /* 0x00 - 0x0f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 - 0x1f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 - 0x2f: space and punctuation */
    0, 3, 0, 0, 1, 3, 3, 0, 0, 0, 3, 3, 0, 3, 0, 3,
    /* 0x30 - 0x3f: digits, then punctuation */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 3, 3, 3,
    /* 0x40 - 0x4f: at-sign, upper case A-O */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 - 0x5f: upper case P-Z, brackets, caret, underscore */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 1,
    /* 0x60 - 0x6f: backquote, lower case a-o */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 - 0x7f: lower case p-z, braces, bar, tilde, DEL */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 0, 3, 0
};
123 
124 int
125 lexi(void)
126 {
127     int         unary_delim;	/* this is set to 1 if the current token
128 				 * forces a following operator to be unary */
129     static int  last_code;	/* the last token type returned */
130     static int  l_struct;	/* set to 1 if the last token was 'struct' */
131     int         code;		/* internal code to be returned */
132     char        qchar;		/* the delimiter character for a string */
133 
134     e_token = s_token;		/* point to start of place to save token */
135     unary_delim = false;
136     ps.col_1 = ps.last_nl;	/* tell world that this token started in
137 				 * column 1 iff the last thing scanned was nl */
138     ps.last_nl = false;
139 
140     while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
141 	ps.col_1 = false;	/* leading blanks imply token is not in column
142 				 * 1 */
143 	if (++buf_ptr >= buf_end)
144 	    fill_buffer();
145     }
146 
147     /* Scan an alphanumeric token */
148     if (chartype[(int)*buf_ptr] == alphanum || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
149 	/*
150 	 * we have a character or number
151 	 */
152 	const char *j;		/* used for searching thru list of
153 				 *
154 				 * reserved words */
155 	struct templ *p;
156 
157 	if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
158 	    int         seendot = 0,
159 	                seenexp = 0,
160 			seensfx = 0;
161 	    if (*buf_ptr == '0' &&
162 		    (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
163 		*e_token++ = *buf_ptr++;
164 		*e_token++ = *buf_ptr++;
165 		while (isxdigit(*buf_ptr)) {
166 		    CHECK_SIZE_TOKEN;
167 		    *e_token++ = *buf_ptr++;
168 		}
169 	    }
170 	    else
171 		while (1) {
172 		    if (*buf_ptr == '.') {
173 			if (seendot)
174 			    break;
175 			else
176 			    seendot++;
177 		    }
178 		    CHECK_SIZE_TOKEN;
179 		    *e_token++ = *buf_ptr++;
180 		    if (!isdigit(*buf_ptr) && *buf_ptr != '.') {
181 			if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
182 			    break;
183 			else {
184 			    seenexp++;
185 			    seendot++;
186 			    CHECK_SIZE_TOKEN;
187 			    *e_token++ = *buf_ptr++;
188 			    if (*buf_ptr == '+' || *buf_ptr == '-')
189 				*e_token++ = *buf_ptr++;
190 			}
191 		    }
192 		}
193 	    while (1) {
194 		if (!(seensfx & 1) &&
195 			(*buf_ptr == 'U' || *buf_ptr == 'u')) {
196 		    CHECK_SIZE_TOKEN;
197 		    *e_token++ = *buf_ptr++;
198 		    seensfx |= 1;
199 		    continue;
200 		}
201         	if (!(seensfx & 2) &&
202 			(*buf_ptr == 'L' || *buf_ptr == 'l')) {
203 		    CHECK_SIZE_TOKEN;
204 		    if (buf_ptr[1] == buf_ptr[0])
205 		        *e_token++ = *buf_ptr++;
206 		    *e_token++ = *buf_ptr++;
207 		    seensfx |= 2;
208 		    continue;
209 		}
210 		break;
211 	    }
212 	}
213 	else
214 	    while (chartype[(int)*buf_ptr] == alphanum || *buf_ptr == BACKSLASH) {
215 		/* fill_buffer() terminates buffer with newline */
216 		if (*buf_ptr == BACKSLASH) {
217 		    if (*(buf_ptr + 1) == '\n') {
218 			buf_ptr += 2;
219 			if (buf_ptr >= buf_end)
220 			    fill_buffer();
221 			} else
222 			    break;
223 		}
224 		CHECK_SIZE_TOKEN;
225 		/* copy it over */
226 		*e_token++ = *buf_ptr++;
227 		if (buf_ptr >= buf_end)
228 		    fill_buffer();
229 	    }
230 	*e_token++ = '\0';
231 	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
232 	    if (++buf_ptr >= buf_end)
233 		fill_buffer();
234 	}
235 	ps.its_a_keyword = false;
236 	ps.sizeof_keyword = false;
237 	if (l_struct && !ps.p_l_follow) {
238 				/* if last token was 'struct' and we're not
239 				 * in parentheses, then this token
240 				 * should be treated as a declaration */
241 	    l_struct = false;
242 	    last_code = ident;
243 	    ps.last_u_d = true;
244 	    return (decl);
245 	}
246 	ps.last_u_d = l_struct;	/* Operator after identifier is binary
247 				 * unless last token was 'struct' */
248 	l_struct = false;
249 	last_code = ident;	/* Remember that this is the code we will
250 				 * return */
251 
252 	/*
253 	 * This loop will check if the token is a keyword.
254 	 */
255 	for (p = specials; (j = p->rwd) != 0; p++) {
256 	    const char *q = s_token;	/* point at scanned token */
257 	    if (*j++ != *q++ || *j++ != *q++)
258 		continue;	/* This test depends on the fact that
259 				 * identifiers are always at least 1 character
260 				 * long (ie. the first two bytes of the
261 				 * identifier are always meaningful) */
262 	    if (q[-1] == 0)
263 		break;		/* If its a one-character identifier */
264 	    while (*q++ == *j)
265 		if (*j++ == 0)
266 		    goto found_keyword;	/* I wish that C had a multi-level
267 					 * break... */
268 	}
269 	if (p->rwd) {		/* we have a keyword */
270     found_keyword:
271 	    ps.its_a_keyword = true;
272 	    ps.last_u_d = true;
273 	    switch (p->rwcode) {
274 	    case 1:		/* it is a switch */
275 		return (swstmt);
276 	    case 2:		/* a case or default */
277 		return (casestmt);
278 
279 	    case 3:		/* a "struct" */
280 		/*
281 		 * Next time around, we will want to know that we have had a
282 		 * 'struct'
283 		 */
284 		l_struct = true;
285 		/* FALLTHROUGH */
286 
287 	    case 4:		/* one of the declaration keywords */
288 		if (ps.p_l_follow) {
289 		    ps.cast_mask |= (1 << ps.p_l_follow) & ~ps.sizeof_mask;
290 		    break;	/* inside parens: cast, param list or sizeof */
291 		}
292 		last_code = decl;
293 		return (decl);
294 
295 	    case 5:		/* if, while, for */
296 		return (sp_paren);
297 
298 	    case 6:		/* do, else */
299 		return (sp_nparen);
300 
301 	    case 7:
302 		ps.sizeof_keyword = true;
303 	    default:		/* all others are treated like any other
304 				 * identifier */
305 		return (ident);
306 	    }			/* end of switch */
307 	}			/* end of if (found_it) */
308 	if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
309 	    char *tp = buf_ptr;
310 	    while (tp < buf_end)
311 		if (*tp++ == ')' && (*tp == ';' || *tp == ','))
312 		    goto not_proc;
313 	    strncpy(ps.procname, token, sizeof ps.procname - 1);
314 	    ps.in_parameter_declaration = 1;
315 	    rparen_count = 1;
316     not_proc:;
317 	}
318 	/*
319 	 * The following hack attempts to guess whether or not the current
320 	 * token is in fact a declaration keyword -- one that has been
321 	 * typedefd
322 	 */
323 	if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
324 		&& !ps.p_l_follow
325 	        && !ps.block_init
326 		&& (ps.last_token == rparen || ps.last_token == semicolon ||
327 		    ps.last_token == decl ||
328 		    ps.last_token == lbrace || ps.last_token == rbrace)) {
329 	    ps.its_a_keyword = true;
330 	    ps.last_u_d = true;
331 	    last_code = decl;
332 	    return decl;
333 	}
334 	if (last_code == decl)	/* if this is a declared variable, then
335 				 * following sign is unary */
336 	    ps.last_u_d = true;	/* will make "int a -1" work */
337 	last_code = ident;
338 	return (ident);		/* the ident is not in the list */
339     }				/* end of procesing for alpanum character */
340 
341     /* Scan a non-alphanumeric token */
342 
343     *e_token++ = *buf_ptr;		/* if it is only a one-character token, it is
344 				 * moved here */
345     *e_token = '\0';
346     if (++buf_ptr >= buf_end)
347 	fill_buffer();
348 
349     switch (*token) {
350     case '\n':
351 	unary_delim = ps.last_u_d;
352 	ps.last_nl = true;	/* remember that we just had a newline */
353 	code = (had_eof ? 0 : newline);
354 
355 	/*
356 	 * if data has been exhausted, the newline is a dummy, and we should
357 	 * return code to stop
358 	 */
359 	break;
360 
361     case '\'':			/* start of quoted character */
362     case '"':			/* start of string */
363 	qchar = *token;
364 	if (troff) {
365 	    e_token[-1] = '`';
366 	    if (qchar == '"')
367 		*e_token++ = '`';
368 	    e_token = chfont(&bodyf, &stringf, e_token);
369 	}
370 	do {			/* copy the string */
371 	    while (1) {		/* move one character or [/<char>]<char> */
372 		if (*buf_ptr == '\n') {
373 		    diag2(1, "Unterminated literal");
374 		    goto stop_lit;
375 		}
376 		CHECK_SIZE_TOKEN;	/* Only have to do this once in this loop,
377 					 * since CHECK_SIZE guarantees that there
378 					 * are at least 5 entries left */
379 		*e_token = *buf_ptr++;
380 		if (buf_ptr >= buf_end)
381 		    fill_buffer();
382 		if (*e_token == BACKSLASH) {	/* if escape, copy extra char */
383 		    if (*buf_ptr == '\n')	/* check for escaped newline */
384 			++line_no;
385 		    if (troff) {
386 			*++e_token = BACKSLASH;
387 			if (*buf_ptr == BACKSLASH)
388 			    *++e_token = BACKSLASH;
389 		    }
390 		    *++e_token = *buf_ptr++;
391 		    ++e_token;	/* we must increment this again because we
392 				 * copied two chars */
393 		    if (buf_ptr >= buf_end)
394 			fill_buffer();
395 		}
396 		else
397 		    break;	/* we copied one character */
398 	    }			/* end of while (1) */
399 	} while (*e_token++ != qchar);
400 	if (troff) {
401 	    e_token = chfont(&stringf, &bodyf, e_token - 1);
402 	    if (qchar == '"')
403 		*e_token++ = '\'';
404 	}
405 stop_lit:
406 	code = ident;
407 	break;
408 
409     case ('('):
410     case ('['):
411 	unary_delim = true;
412 	code = lparen;
413 	break;
414 
415     case (')'):
416     case (']'):
417 	code = rparen;
418 	break;
419 
420     case '#':
421 	unary_delim = ps.last_u_d;
422 	code = preesc;
423 	break;
424 
425     case '?':
426 	unary_delim = true;
427 	code = question;
428 	break;
429 
430     case (':'):
431 	code = colon;
432 	unary_delim = true;
433 	break;
434 
435     case (';'):
436 	unary_delim = true;
437 	code = semicolon;
438 	break;
439 
440     case ('{'):
441 	unary_delim = true;
442 
443 	/*
444 	 * if (ps.in_or_st) ps.block_init = 1;
445 	 */
446 	/* ?	code = ps.block_init ? lparen : lbrace; */
447 	code = lbrace;
448 	break;
449 
450     case ('}'):
451 	unary_delim = true;
452 	/* ?	code = ps.block_init ? rparen : rbrace; */
453 	code = rbrace;
454 	break;
455 
456     case 014:			/* a form feed */
457 	unary_delim = ps.last_u_d;
458 	ps.last_nl = true;	/* remember this so we can set 'ps.col_1'
459 				 * right */
460 	code = form_feed;
461 	break;
462 
463     case (','):
464 	unary_delim = true;
465 	code = comma;
466 	break;
467 
468     case '.':
469 	unary_delim = false;
470 	code = period;
471 	break;
472 
473     case '-':
474     case '+':			/* check for -, +, --, ++ */
475 	code = (ps.last_u_d ? unary_op : binary_op);
476 	unary_delim = true;
477 
478 	if (*buf_ptr == token[0]) {
479 	    /* check for doubled character */
480 	    *e_token++ = *buf_ptr++;
481 	    /* buffer overflow will be checked at end of loop */
482 	    if (last_code == ident || last_code == rparen) {
483 		code = (ps.last_u_d ? unary_op : postop);
484 		/* check for following ++ or -- */
485 		unary_delim = false;
486 	    }
487 	}
488 	else if (*buf_ptr == '=')
489 	    /* check for operator += */
490 	    *e_token++ = *buf_ptr++;
491 	else if (*buf_ptr == '>') {
492 	    /* check for operator -> */
493 	    *e_token++ = *buf_ptr++;
494 	    if (!pointer_as_binop) {
495 		unary_delim = false;
496 		code = unary_op;
497 		ps.want_blank = false;
498 	    }
499 	}
500 	break;			/* buffer overflow will be checked at end of
501 				 * switch */
502 
503     case '=':
504 	if (ps.in_or_st)
505 	    ps.block_init = 1;
506 #ifdef undef
507 	if (chartype[*buf_ptr] == opchar) {	/* we have two char assignment */
508 	    e_token[-1] = *buf_ptr++;
509 	    if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
510 		*e_token++ = *buf_ptr++;
511 	    *e_token++ = '=';	/* Flip =+ to += */
512 	    *e_token = 0;
513 	}
514 #else
515 	if (*buf_ptr == '=') {/* == */
516 	    *e_token++ = '=';	/* Flip =+ to += */
517 	    buf_ptr++;
518 	    *e_token = 0;
519 	}
520 #endif
521 	code = binary_op;
522 	unary_delim = true;
523 	break;
524 	/* can drop thru!!! */
525 
526     case '>':
527     case '<':
528     case '!':			/* ops like <, <<, <=, !=, etc */
529 	if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
530 	    *e_token++ = *buf_ptr;
531 	    if (++buf_ptr >= buf_end)
532 		fill_buffer();
533 	}
534 	if (*buf_ptr == '=')
535 	    *e_token++ = *buf_ptr++;
536 	code = (ps.last_u_d ? unary_op : binary_op);
537 	unary_delim = true;
538 	break;
539 
540     default:
541 	if (token[0] == '/' && *buf_ptr == '*') {
542 	    /* it is start of comment */
543 	    *e_token++ = '*';
544 
545 	    if (++buf_ptr >= buf_end)
546 		fill_buffer();
547 
548 	    code = comment;
549 	    unary_delim = ps.last_u_d;
550 	    break;
551 	}
552 	while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
553 	    /*
554 	     * handle ||, &&, etc, and also things as in int *****i
555 	     */
556 	    *e_token++ = *buf_ptr;
557 	    if (++buf_ptr >= buf_end)
558 		fill_buffer();
559 	}
560 	code = (ps.last_u_d ? unary_op : binary_op);
561 	unary_delim = true;
562 
563 
564     }				/* end of switch */
565     if (code != newline) {
566 	l_struct = false;
567 	last_code = code;
568     }
569     if (buf_ptr >= buf_end)	/* check for input buffer empty */
570 	fill_buffer();
571     ps.last_u_d = unary_delim;
572     *e_token = '\0';		/* null terminate the token */
573     return (code);
574 }
575 
576 /*
577  * Add the given keyword to the keyword table, using val as the keyword type
578  */
579 void
580 addkey(char *key, int val)
581 {
582     struct templ *p = specials;
583     while (p->rwd)
584 	if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
585 	    return;
586 	else
587 	    p++;
588     if (p >= specials + sizeof specials / sizeof specials[0])
589 	return;			/* For now, table overflows are silently
590 				 * ignored */
591     p->rwd = key;
592     p->rwcode = val;
593     p[1].rwd = 0;
594     p[1].rwcode = 0;
595 }
596