xref: /freebsd/usr.bin/indent/lexi.c (revision 6780ab54325a71e7e70112b11657973edde8655e)
1 /*
2  * Copyright (c) 1985 Sun Microsystems, Inc.
3  * Copyright (c) 1980, 1993
4  *	The Regents of the University of California.  All rights reserved.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. All advertising materials mentioning features or use of this software
16  *    must display the following acknowledgement:
17  *	This product includes software developed by the University of
18  *	California, Berkeley and its contributors.
19  * 4. Neither the name of the University nor the names of its contributors
20  *    may be used to endorse or promote products derived from this software
21  *    without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 
36 #if 0
37 #ifndef lint
38 static char sccsid[] = "@(#)lexi.c	8.1 (Berkeley) 6/6/93";
39 #endif /* not lint */
40 #endif
41 #include <sys/cdefs.h>
42 __FBSDID("$FreeBSD$");
43 
44 /*
45  * Here we have the token scanner for indent.  It scans off one token and puts
46  * it in the global variable "token".  It returns a code, indicating the type
47  * of token scanned.
48  */
49 
50 #include <stdio.h>
51 #include <ctype.h>
52 #include <stdlib.h>
53 #include <string.h>
54 #include "indent_globs.h"
55 #include "indent_codes.h"
56 #include "indent.h"
57 
/*
 * Character classes stored in the chartype[] table below.  A table value of
 * 0 means the character belongs to neither class.
 */
#define alphanum	1	/* may appear in an identifier or a number */
#define opchar		3	/* may appear in an operator */

/*
 * Defined elsewhere in indent.  The scanner calls it whenever buf_ptr
 * reaches buf_end.
 */
void fill_buffer(void);
62 
/*
 * One entry of the reserved-word table.
 *
 *   rwd    - spelling of the reserved word (a null pointer ends the table)
 *   rwcode - class of the word, interpreted by the switch in lexi():
 *              1  switch                      -> swstmt
 *              2  case, default               -> casestmt
 *              3  struct, union, enum         -> decl (remembers 'struct')
 *              4  declaration keywords        -> decl
 *              5  if, while, for              -> sp_paren
 *              6  else, do                    -> sp_nparen
 *              7  sizeof                      -> ident, sets sizeof_keyword
 *            any other value                  -> ident
 */
struct templ {
    const char *rwd;
    int         rwcode;
};

/*
 * The reserved-word table.  It is oversized on purpose: addkey() appends
 * user-supplied keywords after the built-in ones.  The list is terminated by
 * an entry whose rwd is a null pointer.
 */
struct templ specials[1000] =
{
    {"switch", 1},
    {"case", 2},
    {"break", 0},
    {"struct", 3},
    {"union", 3},
    {"enum", 3},
    {"default", 2},
    {"int", 4},
    {"char", 4},
    {"float", 4},
    {"double", 4},
    {"long", 4},
    {"short", 4},
    {"typedef", 4},		/* was misspelled "typdef", so it never
				 * matched */
    {"unsigned", 4},
    {"register", 4},
    {"static", 4},
    {"global", 4},		/* NOTE(review): not a C keyword; kept for
				 * compatibility with existing behaviour */
    {"extern", 4},
    {"void", 4},
    {"goto", 0},
    {"return", 0},
    {"if", 5},
    {"while", 5},
    {"for", 5},
    {"else", 6},
    {"do", 6},
    {"sizeof", 7},
    {"const", 9},
    {"volatile", 9},
    {0, 0}
};
102 
/*
 * Classification of the 128 ASCII characters, used to decide what kind of
 * token a character can start or continue:
 *   0 - neither of the classes below
 *   1 - alphanumeric: letters, digits, '_' and '$'
 *   3 - operator character
 * Each row covers sixteen consecutive character codes.
 */
char        chartype[128] =
{
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 000 - 017 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 020 - 037 */
    0, 3, 0, 0, 1, 3, 3, 0, 0, 0, 3, 3, 0, 3, 0, 3,	/* ' ' - '/' */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 3, 3, 3,	/* '0' - '?' */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	/* '@' - 'O' */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 1,	/* 'P' - '_' */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	/* '`' - 'o' */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 0, 3, 0	/* 'p' - DEL */
};
124 
125 int
126 lexi(void)
127 {
128     int         unary_delim;	/* this is set to 1 if the current token
129 				 * forces a following operator to be unary */
130     static int  last_code;	/* the last token type returned */
131     static int  l_struct;	/* set to 1 if the last token was 'struct' */
132     int         code;		/* internal code to be returned */
133     char        qchar;		/* the delimiter character for a string */
134 
135     e_token = s_token;		/* point to start of place to save token */
136     unary_delim = false;
137     ps.col_1 = ps.last_nl;	/* tell world that this token started in
138 				 * column 1 iff the last thing scanned was nl */
139     ps.last_nl = false;
140 
141     while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
142 	ps.col_1 = false;	/* leading blanks imply token is not in column
143 				 * 1 */
144 	if (++buf_ptr >= buf_end)
145 	    fill_buffer();
146     }
147 
148     /* Scan an alphanumeric token */
149     if (chartype[(int)*buf_ptr] == alphanum || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
150 	/*
151 	 * we have a character or number
152 	 */
153 	const char *j;		/* used for searching thru list of
154 				 *
155 				 * reserved words */
156 	struct templ *p;
157 
158 	if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
159 	    int         seendot = 0,
160 	                seenexp = 0,
161 			seensfx = 0;
162 	    if (*buf_ptr == '0' &&
163 		    (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
164 		*e_token++ = *buf_ptr++;
165 		*e_token++ = *buf_ptr++;
166 		while (isxdigit(*buf_ptr)) {
167 		    CHECK_SIZE_TOKEN;
168 		    *e_token++ = *buf_ptr++;
169 		}
170 	    }
171 	    else
172 		while (1) {
173 		    if (*buf_ptr == '.') {
174 			if (seendot)
175 			    break;
176 			else
177 			    seendot++;
178 		    }
179 		    CHECK_SIZE_TOKEN;
180 		    *e_token++ = *buf_ptr++;
181 		    if (!isdigit(*buf_ptr) && *buf_ptr != '.') {
182 			if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
183 			    break;
184 			else {
185 			    seenexp++;
186 			    seendot++;
187 			    CHECK_SIZE_TOKEN;
188 			    *e_token++ = *buf_ptr++;
189 			    if (*buf_ptr == '+' || *buf_ptr == '-')
190 				*e_token++ = *buf_ptr++;
191 			}
192 		    }
193 		}
194 	    while (1) {
195 		if (!(seensfx & 1) &&
196 			(*buf_ptr == 'U' || *buf_ptr == 'u')) {
197 		    CHECK_SIZE_TOKEN;
198 		    *e_token++ = *buf_ptr++;
199 		    seensfx |= 1;
200 		    continue;
201 		}
202         	if (!(seensfx & 2) &&
203 			(*buf_ptr == 'L' || *buf_ptr == 'l')) {
204 		    CHECK_SIZE_TOKEN;
205 		    if (buf_ptr[1] == buf_ptr[0])
206 		        *e_token++ = *buf_ptr++;
207 		    *e_token++ = *buf_ptr++;
208 		    seensfx |= 2;
209 		    continue;
210 		}
211 		break;
212 	    }
213 	}
214 	else
215 	    while (chartype[(int)*buf_ptr] == alphanum || *buf_ptr == BACKSLASH) {
216 		/* fill_buffer() terminates buffer with newline */
217 		if (*buf_ptr == BACKSLASH) {
218 		    if (*(buf_ptr + 1) == '\n') {
219 			buf_ptr += 2;
220 			if (buf_ptr >= buf_end)
221 			    fill_buffer();
222 			} else
223 			    break;
224 		}
225 		CHECK_SIZE_TOKEN;
226 		/* copy it over */
227 		*e_token++ = *buf_ptr++;
228 		if (buf_ptr >= buf_end)
229 		    fill_buffer();
230 	    }
231 	*e_token++ = '\0';
232 	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
233 	    if (++buf_ptr >= buf_end)
234 		fill_buffer();
235 	}
236 	ps.its_a_keyword = false;
237 	ps.sizeof_keyword = false;
238 	if (l_struct) {		/* if last token was 'struct', then this token
239 				 * should be treated as a declaration */
240 	    l_struct = false;
241 	    last_code = ident;
242 	    ps.last_u_d = true;
243 	    return (decl);
244 	}
245 	ps.last_u_d = false;	/* Operator after identifier is binary */
246 	last_code = ident;	/* Remember that this is the code we will
247 				 * return */
248 
249 	/*
250 	 * This loop will check if the token is a keyword.
251 	 */
252 	for (p = specials; (j = p->rwd) != 0; p++) {
253 	    const char *q = s_token;	/* point at scanned token */
254 	    if (*j++ != *q++ || *j++ != *q++)
255 		continue;	/* This test depends on the fact that
256 				 * identifiers are always at least 1 character
257 				 * long (ie. the first two bytes of the
258 				 * identifier are always meaningful) */
259 	    if (q[-1] == 0)
260 		break;		/* If its a one-character identifier */
261 	    while (*q++ == *j)
262 		if (*j++ == 0)
263 		    goto found_keyword;	/* I wish that C had a multi-level
264 					 * break... */
265 	}
266 	if (p->rwd) {		/* we have a keyword */
267     found_keyword:
268 	    ps.its_a_keyword = true;
269 	    ps.last_u_d = true;
270 	    switch (p->rwcode) {
271 	    case 1:		/* it is a switch */
272 		return (swstmt);
273 	    case 2:		/* a case or default */
274 		return (casestmt);
275 
276 	    case 3:		/* a "struct" */
277 		if (ps.p_l_follow)
278 			break;	/* inside parens: cast */
279 		/*
280 		 * Next time around, we may want to know that we have had a
281 		 * 'struct'
282 		 */
283 		l_struct = true;
284 
285 		/*
286 		 * Fall through to test for a cast, function prototype or
287 		 * sizeof().
288 		 */
289 	    case 4:		/* one of the declaration keywords */
290 		if (ps.p_l_follow) {
291 		    ps.cast_mask |= 1 << ps.p_l_follow;
292 
293 		    /*
294 		     * Forget that we saw `struct' if we're in a sizeof().
295 		     */
296 		    if (ps.sizeof_mask)
297 			l_struct = false;
298 
299 		    break;	/* inside parens: cast, prototype or sizeof() */
300 		}
301 		last_code = decl;
302 		return (decl);
303 
304 	    case 5:		/* if, while, for */
305 		return (sp_paren);
306 
307 	    case 6:		/* do, else */
308 		return (sp_nparen);
309 
310 	    case 7:
311 		ps.sizeof_keyword = true;
312 	    default:		/* all others are treated like any other
313 				 * identifier */
314 		return (ident);
315 	    }			/* end of switch */
316 	}			/* end of if (found_it) */
317 	if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
318 	    char *tp = buf_ptr;
319 	    while (tp < buf_end)
320 		if (*tp++ == ')' && (*tp == ';' || *tp == ','))
321 		    goto not_proc;
322 	    strncpy(ps.procname, token, sizeof ps.procname - 1);
323 	    ps.in_parameter_declaration = 1;
324 	    rparen_count = 1;
325     not_proc:;
326 	}
327 	/*
328 	 * The following hack attempts to guess whether or not the current
329 	 * token is in fact a declaration keyword -- one that has been
330 	 * typedefd
331 	 */
332 	if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
333 		&& !ps.p_l_follow
334 	        && !ps.block_init
335 		&& (ps.last_token == rparen || ps.last_token == semicolon ||
336 		    ps.last_token == decl ||
337 		    ps.last_token == lbrace || ps.last_token == rbrace)) {
338 	    ps.its_a_keyword = true;
339 	    ps.last_u_d = true;
340 	    last_code = decl;
341 	    return decl;
342 	}
343 	if (last_code == decl)	/* if this is a declared variable, then
344 				 * following sign is unary */
345 	    ps.last_u_d = true;	/* will make "int a -1" work */
346 	last_code = ident;
347 	return (ident);		/* the ident is not in the list */
348     }				/* end of procesing for alpanum character */
349 
350     /* Scan a non-alphanumeric token */
351 
352     *e_token++ = *buf_ptr;		/* if it is only a one-character token, it is
353 				 * moved here */
354     *e_token = '\0';
355     if (++buf_ptr >= buf_end)
356 	fill_buffer();
357 
358     switch (*token) {
359     case '\n':
360 	unary_delim = ps.last_u_d;
361 	ps.last_nl = true;	/* remember that we just had a newline */
362 	code = (had_eof ? 0 : newline);
363 
364 	/*
365 	 * if data has been exhausted, the newline is a dummy, and we should
366 	 * return code to stop
367 	 */
368 	break;
369 
370     case '\'':			/* start of quoted character */
371     case '"':			/* start of string */
372 	qchar = *token;
373 	if (troff) {
374 	    e_token[-1] = '`';
375 	    if (qchar == '"')
376 		*e_token++ = '`';
377 	    e_token = chfont(&bodyf, &stringf, e_token);
378 	}
379 	do {			/* copy the string */
380 	    while (1) {		/* move one character or [/<char>]<char> */
381 		if (*buf_ptr == '\n') {
382 		    printf("%d: Unterminated literal\n", line_no);
383 		    goto stop_lit;
384 		}
385 		CHECK_SIZE_TOKEN;	/* Only have to do this once in this loop,
386 					 * since CHECK_SIZE guarantees that there
387 					 * are at least 5 entries left */
388 		*e_token = *buf_ptr++;
389 		if (buf_ptr >= buf_end)
390 		    fill_buffer();
391 		if (*e_token == BACKSLASH) {	/* if escape, copy extra char */
392 		    if (*buf_ptr == '\n')	/* check for escaped newline */
393 			++line_no;
394 		    if (troff) {
395 			*++e_token = BACKSLASH;
396 			if (*buf_ptr == BACKSLASH)
397 			    *++e_token = BACKSLASH;
398 		    }
399 		    *++e_token = *buf_ptr++;
400 		    ++e_token;	/* we must increment this again because we
401 				 * copied two chars */
402 		    if (buf_ptr >= buf_end)
403 			fill_buffer();
404 		}
405 		else
406 		    break;	/* we copied one character */
407 	    }			/* end of while (1) */
408 	} while (*e_token++ != qchar);
409 	if (troff) {
410 	    e_token = chfont(&stringf, &bodyf, e_token - 1);
411 	    if (qchar == '"')
412 		*e_token++ = '\'';
413 	}
414 stop_lit:
415 	code = ident;
416 	break;
417 
418     case ('('):
419     case ('['):
420 	unary_delim = true;
421 	code = lparen;
422 	break;
423 
424     case (')'):
425     case (']'):
426 	code = rparen;
427 	break;
428 
429     case '#':
430 	unary_delim = ps.last_u_d;
431 	code = preesc;
432 	break;
433 
434     case '?':
435 	unary_delim = true;
436 	code = question;
437 	break;
438 
439     case (':'):
440 	code = colon;
441 	unary_delim = true;
442 	break;
443 
444     case (';'):
445 	unary_delim = true;
446 	code = semicolon;
447 	break;
448 
449     case ('{'):
450 	unary_delim = true;
451 
452 	/*
453 	 * if (ps.in_or_st) ps.block_init = 1;
454 	 */
455 	/* ?	code = ps.block_init ? lparen : lbrace; */
456 	code = lbrace;
457 	break;
458 
459     case ('}'):
460 	unary_delim = true;
461 	/* ?	code = ps.block_init ? rparen : rbrace; */
462 	code = rbrace;
463 	break;
464 
465     case 014:			/* a form feed */
466 	unary_delim = ps.last_u_d;
467 	ps.last_nl = true;	/* remember this so we can set 'ps.col_1'
468 				 * right */
469 	code = form_feed;
470 	break;
471 
472     case (','):
473 	unary_delim = true;
474 	code = comma;
475 	break;
476 
477     case '.':
478 	unary_delim = false;
479 	code = period;
480 	break;
481 
482     case '-':
483     case '+':			/* check for -, +, --, ++ */
484 	code = (ps.last_u_d ? unary_op : binary_op);
485 	unary_delim = true;
486 
487 	if (*buf_ptr == token[0]) {
488 	    /* check for doubled character */
489 	    *e_token++ = *buf_ptr++;
490 	    /* buffer overflow will be checked at end of loop */
491 	    if (last_code == ident || last_code == rparen) {
492 		code = (ps.last_u_d ? unary_op : postop);
493 		/* check for following ++ or -- */
494 		unary_delim = false;
495 	    }
496 	}
497 	else if (*buf_ptr == '=')
498 	    /* check for operator += */
499 	    *e_token++ = *buf_ptr++;
500 	else if (*buf_ptr == '>') {
501 	    /* check for operator -> */
502 	    *e_token++ = *buf_ptr++;
503 	    if (!pointer_as_binop) {
504 		unary_delim = false;
505 		code = unary_op;
506 		ps.want_blank = false;
507 	    }
508 	}
509 	break;			/* buffer overflow will be checked at end of
510 				 * switch */
511 
512     case '=':
513 	if (ps.in_or_st)
514 	    ps.block_init = 1;
515 #ifdef undef
516 	if (chartype[*buf_ptr] == opchar) {	/* we have two char assignment */
517 	    e_token[-1] = *buf_ptr++;
518 	    if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
519 		*e_token++ = *buf_ptr++;
520 	    *e_token++ = '=';	/* Flip =+ to += */
521 	    *e_token = 0;
522 	}
523 #else
524 	if (*buf_ptr == '=') {/* == */
525 	    *e_token++ = '=';	/* Flip =+ to += */
526 	    buf_ptr++;
527 	    *e_token = 0;
528 	}
529 #endif
530 	code = binary_op;
531 	unary_delim = true;
532 	break;
533 	/* can drop thru!!! */
534 
535     case '>':
536     case '<':
537     case '!':			/* ops like <, <<, <=, !=, etc */
538 	if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
539 	    *e_token++ = *buf_ptr;
540 	    if (++buf_ptr >= buf_end)
541 		fill_buffer();
542 	}
543 	if (*buf_ptr == '=')
544 	    *e_token++ = *buf_ptr++;
545 	code = (ps.last_u_d ? unary_op : binary_op);
546 	unary_delim = true;
547 	break;
548 
549     default:
550 	if (token[0] == '/' && *buf_ptr == '*') {
551 	    /* it is start of comment */
552 	    *e_token++ = '*';
553 
554 	    if (++buf_ptr >= buf_end)
555 		fill_buffer();
556 
557 	    code = comment;
558 	    unary_delim = ps.last_u_d;
559 	    break;
560 	}
561 	while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
562 	    /*
563 	     * handle ||, &&, etc, and also things as in int *****i
564 	     */
565 	    *e_token++ = *buf_ptr;
566 	    if (++buf_ptr >= buf_end)
567 		fill_buffer();
568 	}
569 	code = (ps.last_u_d ? unary_op : binary_op);
570 	unary_delim = true;
571 
572 
573     }				/* end of switch */
574     if (code != newline) {
575 	l_struct = false;
576 	last_code = code;
577     }
578     if (buf_ptr >= buf_end)	/* check for input buffer empty */
579 	fill_buffer();
580     ps.last_u_d = unary_delim;
581     *e_token = '\0';		/* null terminate the token */
582     return (code);
583 }
584 
585 /*
586  * Add the given keyword to the keyword table, using val as the keyword type
587  */
588 void
589 addkey(char *key, int val)
590 {
591     struct templ *p = specials;
592     while (p->rwd)
593 	if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
594 	    return;
595 	else
596 	    p++;
597     if (p >= specials + sizeof specials / sizeof specials[0])
598 	return;			/* For now, table overflows are silently
599 				 * ignored */
600     p->rwd = key;
601     p->rwcode = val;
602     p[1].rwd = 0;
603     p[1].rwcode = 0;
604 }
605