xref: /freebsd/usr.bin/indent/lexi.c (revision 5521ff5a4d1929056e7ffc982fac3341ca54df7c)
1 /*
2  * Copyright (c) 1985 Sun Microsystems, Inc.
3  * Copyright (c) 1980, 1993
4  *	The Regents of the University of California.  All rights reserved.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. All advertising materials mentioning features or use of this software
16  *    must display the following acknowledgement:
17  *	This product includes software developed by the University of
18  *	California, Berkeley and its contributors.
19  * 4. Neither the name of the University nor the names of its contributors
20  *    may be used to endorse or promote products derived from this software
21  *    without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 
#ifndef lint
/*
 * Source-control identification strings.  They are compiled into the object
 * file unless the build defines "lint".
 */
static char sccsid[] = "@(#)lexi.c	8.1 (Berkeley) 6/6/93";
static const char rcsid[] = "$FreeBSD$";
#endif				/* not lint */
41 
42 /*
43  * Here we have the token scanner for indent.  It scans off one token and puts
44  * it in the global variable "token".  It returns a code, indicating the type
45  * of token scanned.
46  */
47 
48 #include <stdio.h>
49 #include <ctype.h>
50 #include <stdlib.h>
51 #include <string.h>
52 #include "indent_globs.h"
53 #include "indent_codes.h"
54 
/* Character classes stored in the chartype table (0 means "neither"). */
#define alphanum	1	/* may be part of an identifier or number */
#define opchar		3	/* may be part of an operator */
57 
/*
 * One entry of the reserved-word table: the spelling of the word and the
 * class code that lexi() switches on when the word is recognised.
 */
struct templ {
    char       *rwd;		/* keyword text; 0 marks the end of the table */
    int         rwcode;		/* keyword class, see the list below */
};

/*
 * Reserved-word table, terminated by an entry whose rwd is 0.  The array is
 * deliberately larger than the built-in list so that addkey() can append
 * user-supplied keywords at run time.
 *
 * rwcode values as used by lexi():
 *   1  switch
 *   2  case / default
 *   3  struct, union, enum
 *   4  declaration keywords
 *   5  if, while, for
 *   6  else, do
 *   7  sizeof
 *   anything else: treated like an ordinary identifier
 */
struct templ specials[1000] =
{
    {"switch", 1},
    {"case", 2},
    {"break", 0},
    {"struct", 3},
    {"union", 3},
    {"enum", 3},
    {"default", 2},
    {"int", 4},
    {"char", 4},
    {"float", 4},
    {"double", 4},
    {"long", 4},
    {"short", 4},
    {"typedef", 4},		/* was misspelled "typdef", so the real
				 * keyword was never matched */
    {"unsigned", 4},
    {"register", 4},
    {"static", 4},
    {"global", 4},
    {"extern", 4},
    {"void", 4},
    {"goto", 0},
    {"return", 0},
    {"if", 5},
    {"while", 5},
    {"for", 5},
    {"else", 6},
    {"do", 6},
    {"sizeof", 7},
    {"const", 9},
    {"volatile", 9},
    {0, 0}
};
97 
/*
 * Character classification table, indexed by 7-bit character code.  It is
 * used to decide what type each character is:
 *   0  neither of the classes below
 *   1  alphanumeric (letters, digits, '_' and '$')
 *   3  operator character
 */
char        chartype[128] =
{
    /* 0x00 - 0x0f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 - 0x1f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 - 0x2f: space ! " # $ % & ' ( ) * + , - . / */
    0, 3, 0, 0, 1, 3, 3, 0, 0, 0, 3, 3, 0, 3, 0, 3,
    /* 0x30 - 0x3f: 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 3, 3, 3,
    /* 0x40 - 0x4f: @ A B C D E F G H I J K L M N O */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 - 0x5f: P Q R S T U V W X Y Z [ \ ] ^ _ */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 1,
    /* 0x60 - 0x6f: ` a b c d e f g h i j k l m n o */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 - 0x7f: p q r s t u v w x y z { | } ~ DEL */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 0, 3, 0
};
119 
120 
121 
122 
123 int
124 lexi()
125 {
126     int         unary_delim;	/* this is set to 1 if the current token
127 				 *
128 				 * forces a following operator to be unary */
129     static int  last_code;	/* the last token type returned */
130     static int  l_struct;	/* set to 1 if the last token was 'struct' */
131     int         code;		/* internal code to be returned */
132     char        qchar;		/* the delimiter character for a string */
133 
134     e_token = s_token;		/* point to start of place to save token */
135     unary_delim = false;
136     ps.col_1 = ps.last_nl;	/* tell world that this token started in
137 				 * column 1 iff the last thing scanned was nl */
138     ps.last_nl = false;
139 
140     while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
141 	ps.col_1 = false;	/* leading blanks imply token is not in column
142 				 * 1 */
143 	if (++buf_ptr >= buf_end)
144 	    fill_buffer();
145     }
146 
147     /* Scan an alphanumeric token */
148     if (chartype[*buf_ptr] == alphanum || buf_ptr[0] == '.' && isdigit(buf_ptr[1])) {
149 	/*
150 	 * we have a character or number
151 	 */
152 	register char *j;	/* used for searching thru list of
153 				 *
154 				 * reserved words */
155 	register struct templ *p;
156 
157 	if (isdigit(*buf_ptr) || buf_ptr[0] == '.' && isdigit(buf_ptr[1])) {
158 	    int         seendot = 0,
159 	                seenexp = 0,
160 			seensfx = 0;
161 	    if (*buf_ptr == '0' &&
162 		    (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
163 		*e_token++ = *buf_ptr++;
164 		*e_token++ = *buf_ptr++;
165 		while (isxdigit(*buf_ptr)) {
166 		    CHECK_SIZE_TOKEN;
167 		    *e_token++ = *buf_ptr++;
168 		}
169 	    }
170 	    else
171 		while (1) {
172 		    if (*buf_ptr == '.')
173 			if (seendot)
174 			    break;
175 			else
176 			    seendot++;
177 		    CHECK_SIZE_TOKEN;
178 		    *e_token++ = *buf_ptr++;
179 		    if (!isdigit(*buf_ptr) && *buf_ptr != '.')
180 			if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
181 			    break;
182 			else {
183 			    seenexp++;
184 			    seendot++;
185 			    CHECK_SIZE_TOKEN;
186 			    *e_token++ = *buf_ptr++;
187 			    if (*buf_ptr == '+' || *buf_ptr == '-')
188 				*e_token++ = *buf_ptr++;
189 			}
190 		}
191 	    while (1) {
192 		if (!(seensfx & 1) &&
193 			(*buf_ptr == 'U' || *buf_ptr == 'u')) {
194 		    CHECK_SIZE_TOKEN;
195 		    *e_token++ = *buf_ptr++;
196 		    seensfx |= 1;
197 		    continue;
198 		}
199         	if (!(seensfx & 2) &&
200 			(*buf_ptr == 'L' || *buf_ptr == 'l')) {
201 		    CHECK_SIZE_TOKEN;
202 		    if (buf_ptr[1] == buf_ptr[0])
203 		        *e_token++ = *buf_ptr++;
204 		    *e_token++ = *buf_ptr++;
205 		    seensfx |= 2;
206 		    continue;
207 		}
208 		break;
209 	    }
210 	}
211 	else
212 	    while (chartype[*buf_ptr] == alphanum) {	/* copy it over */
213 		CHECK_SIZE_TOKEN;
214 		*e_token++ = *buf_ptr++;
215 		if (buf_ptr >= buf_end)
216 		    fill_buffer();
217 	    }
218 	*e_token++ = '\0';
219 	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
220 	    if (++buf_ptr >= buf_end)
221 		fill_buffer();
222 	}
223 	ps.its_a_keyword = false;
224 	ps.sizeof_keyword = false;
225 	if (l_struct) {		/* if last token was 'struct', then this token
226 				 * should be treated as a declaration */
227 	    l_struct = false;
228 	    last_code = ident;
229 	    ps.last_u_d = true;
230 	    return (decl);
231 	}
232 	ps.last_u_d = false;	/* Operator after indentifier is binary */
233 	last_code = ident;	/* Remember that this is the code we will
234 				 * return */
235 
236 	/*
237 	 * This loop will check if the token is a keyword.
238 	 */
239 	for (p = specials; (j = p->rwd) != 0; p++) {
240 	    register char *p = s_token;	/* point at scanned token */
241 	    if (*j++ != *p++ || *j++ != *p++)
242 		continue;	/* This test depends on the fact that
243 				 * identifiers are always at least 1 character
244 				 * long (ie. the first two bytes of the
245 				 * identifier are always meaningful) */
246 	    if (p[-1] == 0)
247 		break;		/* If its a one-character identifier */
248 	    while (*p++ == *j)
249 		if (*j++ == 0)
250 		    goto found_keyword;	/* I wish that C had a multi-level
251 					 * break... */
252 	}
253 	if (p->rwd) {		/* we have a keyword */
254     found_keyword:
255 	    ps.its_a_keyword = true;
256 	    ps.last_u_d = true;
257 	    switch (p->rwcode) {
258 	    case 1:		/* it is a switch */
259 		return (swstmt);
260 	    case 2:		/* a case or default */
261 		return (casestmt);
262 
263 	    case 3:		/* a "struct" */
264 		if (ps.p_l_follow)
265 			break;	/* inside parens: cast */
266 		/*
267 		 * Next time around, we may want to know that we have had a
268 		 * 'struct'
269 		 */
270 		l_struct = true;
271 
272 		/*
273 		 * Fall through to test for a cast, function prototype or
274 		 * sizeof().
275 		 */
276 	    case 4:		/* one of the declaration keywords */
277 		if (ps.p_l_follow) {
278 		    ps.cast_mask |= 1 << ps.p_l_follow;
279 
280 		    /*
281 		     * Forget that we saw `struct' if we're in a sizeof().
282 		     */
283 		    if (ps.sizeof_mask)
284 			l_struct = false;
285 
286 		    break;	/* inside parens: cast, prototype or sizeof() */
287 		}
288 		last_code = decl;
289 		return (decl);
290 
291 	    case 5:		/* if, while, for */
292 		return (sp_paren);
293 
294 	    case 6:		/* do, else */
295 		return (sp_nparen);
296 
297 	    case 7:
298 		ps.sizeof_keyword = true;
299 	    default:		/* all others are treated like any other
300 				 * identifier */
301 		return (ident);
302 	    }			/* end of switch */
303 	}			/* end of if (found_it) */
304 	if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
305 	    register char *tp = buf_ptr;
306 	    while (tp < buf_end)
307 		if (*tp++ == ')' && (*tp == ';' || *tp == ','))
308 		    goto not_proc;
309 	    strncpy(ps.procname, token, sizeof ps.procname - 1);
310 	    ps.in_parameter_declaration = 1;
311 	    rparen_count = 1;
312     not_proc:;
313 	}
314 	/*
315 	 * The following hack attempts to guess whether or not the current
316 	 * token is in fact a declaration keyword -- one that has been
317 	 * typedefd
318 	 */
319 	if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
320 		&& !ps.p_l_follow
321 	        && !ps.block_init
322 		&& (ps.last_token == rparen || ps.last_token == semicolon ||
323 		    ps.last_token == decl ||
324 		    ps.last_token == lbrace || ps.last_token == rbrace)) {
325 	    ps.its_a_keyword = true;
326 	    ps.last_u_d = true;
327 	    last_code = decl;
328 	    return decl;
329 	}
330 	if (last_code == decl)	/* if this is a declared variable, then
331 				 * following sign is unary */
332 	    ps.last_u_d = true;	/* will make "int a -1" work */
333 	last_code = ident;
334 	return (ident);		/* the ident is not in the list */
335     }				/* end of procesing for alpanum character */
336 
337     /* Scan a non-alphanumeric token */
338 
339     *e_token++ = *buf_ptr;		/* if it is only a one-character token, it is
340 				 * moved here */
341     *e_token = '\0';
342     if (++buf_ptr >= buf_end)
343 	fill_buffer();
344 
345     switch (*token) {
346     case '\n':
347 	unary_delim = ps.last_u_d;
348 	ps.last_nl = true;	/* remember that we just had a newline */
349 	code = (had_eof ? 0 : newline);
350 
351 	/*
352 	 * if data has been exausted, the newline is a dummy, and we should
353 	 * return code to stop
354 	 */
355 	break;
356 
357     case '\'':			/* start of quoted character */
358     case '"':			/* start of string */
359 	qchar = *token;
360 	if (troff) {
361 	    e_token[-1] = '`';
362 	    if (qchar == '"')
363 		*e_token++ = '`';
364 	    e_token = chfont(&bodyf, &stringf, e_token);
365 	}
366 	do {			/* copy the string */
367 	    while (1) {		/* move one character or [/<char>]<char> */
368 		if (*buf_ptr == '\n') {
369 		    printf("%d: Unterminated literal\n", line_no);
370 		    goto stop_lit;
371 		}
372 		CHECK_SIZE_TOKEN;	/* Only have to do this once in this loop,
373 					 * since CHECK_SIZE guarantees that there
374 					 * are at least 5 entries left */
375 		*e_token = *buf_ptr++;
376 		if (buf_ptr >= buf_end)
377 		    fill_buffer();
378 		if (*e_token == BACKSLASH) {	/* if escape, copy extra char */
379 		    if (*buf_ptr == '\n')	/* check for escaped newline */
380 			++line_no;
381 		    if (troff) {
382 			*++e_token = BACKSLASH;
383 			if (*buf_ptr == BACKSLASH)
384 			    *++e_token = BACKSLASH;
385 		    }
386 		    *++e_token = *buf_ptr++;
387 		    ++e_token;	/* we must increment this again because we
388 				 * copied two chars */
389 		    if (buf_ptr >= buf_end)
390 			fill_buffer();
391 		}
392 		else
393 		    break;	/* we copied one character */
394 	    }			/* end of while (1) */
395 	} while (*e_token++ != qchar);
396 	if (troff) {
397 	    e_token = chfont(&stringf, &bodyf, e_token - 1);
398 	    if (qchar == '"')
399 		*e_token++ = '\'';
400 	}
401 stop_lit:
402 	code = ident;
403 	break;
404 
405     case ('('):
406     case ('['):
407 	unary_delim = true;
408 	code = lparen;
409 	break;
410 
411     case (')'):
412     case (']'):
413 	code = rparen;
414 	break;
415 
416     case '#':
417 	unary_delim = ps.last_u_d;
418 	code = preesc;
419 	break;
420 
421     case '?':
422 	unary_delim = true;
423 	code = question;
424 	break;
425 
426     case (':'):
427 	code = colon;
428 	unary_delim = true;
429 	break;
430 
431     case (';'):
432 	unary_delim = true;
433 	code = semicolon;
434 	break;
435 
436     case ('{'):
437 	unary_delim = true;
438 
439 	/*
440 	 * if (ps.in_or_st) ps.block_init = 1;
441 	 */
442 	/* ?	code = ps.block_init ? lparen : lbrace; */
443 	code = lbrace;
444 	break;
445 
446     case ('}'):
447 	unary_delim = true;
448 	/* ?	code = ps.block_init ? rparen : rbrace; */
449 	code = rbrace;
450 	break;
451 
452     case 014:			/* a form feed */
453 	unary_delim = ps.last_u_d;
454 	ps.last_nl = true;	/* remember this so we can set 'ps.col_1'
455 				 * right */
456 	code = form_feed;
457 	break;
458 
459     case (','):
460 	unary_delim = true;
461 	code = comma;
462 	break;
463 
464     case '.':
465 	unary_delim = false;
466 	code = period;
467 	break;
468 
469     case '-':
470     case '+':			/* check for -, +, --, ++ */
471 	code = (ps.last_u_d ? unary_op : binary_op);
472 	unary_delim = true;
473 
474 	if (*buf_ptr == token[0]) {
475 	    /* check for doubled character */
476 	    *e_token++ = *buf_ptr++;
477 	    /* buffer overflow will be checked at end of loop */
478 	    if (last_code == ident || last_code == rparen) {
479 		code = (ps.last_u_d ? unary_op : postop);
480 		/* check for following ++ or -- */
481 		unary_delim = false;
482 	    }
483 	}
484 	else if (*buf_ptr == '=')
485 	    /* check for operator += */
486 	    *e_token++ = *buf_ptr++;
487 	else if (*buf_ptr == '>') {
488 	    /* check for operator -> */
489 	    *e_token++ = *buf_ptr++;
490 	    if (!pointer_as_binop) {
491 		unary_delim = false;
492 		code = unary_op;
493 		ps.want_blank = false;
494 	    }
495 	}
496 	break;			/* buffer overflow will be checked at end of
497 				 * switch */
498 
499     case '=':
500 	if (ps.in_or_st)
501 	    ps.block_init = 1;
502 #ifdef undef
503 	if (chartype[*buf_ptr] == opchar) {	/* we have two char assignment */
504 	    e_token[-1] = *buf_ptr++;
505 	    if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
506 		*e_token++ = *buf_ptr++;
507 	    *e_token++ = '=';	/* Flip =+ to += */
508 	    *e_token = 0;
509 	}
510 #else
511 	if (*buf_ptr == '=') {/* == */
512 	    *e_token++ = '=';	/* Flip =+ to += */
513 	    buf_ptr++;
514 	    *e_token = 0;
515 	}
516 #endif
517 	code = binary_op;
518 	unary_delim = true;
519 	break;
520 	/* can drop thru!!! */
521 
522     case '>':
523     case '<':
524     case '!':			/* ops like <, <<, <=, !=, etc */
525 	if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
526 	    *e_token++ = *buf_ptr;
527 	    if (++buf_ptr >= buf_end)
528 		fill_buffer();
529 	}
530 	if (*buf_ptr == '=')
531 	    *e_token++ = *buf_ptr++;
532 	code = (ps.last_u_d ? unary_op : binary_op);
533 	unary_delim = true;
534 	break;
535 
536     default:
537 	if (token[0] == '/' && *buf_ptr == '*') {
538 	    /* it is start of comment */
539 	    *e_token++ = '*';
540 
541 	    if (++buf_ptr >= buf_end)
542 		fill_buffer();
543 
544 	    code = comment;
545 	    unary_delim = ps.last_u_d;
546 	    break;
547 	}
548 	while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
549 	    /*
550 	     * handle ||, &&, etc, and also things as in int *****i
551 	     */
552 	    *e_token++ = *buf_ptr;
553 	    if (++buf_ptr >= buf_end)
554 		fill_buffer();
555 	}
556 	code = (ps.last_u_d ? unary_op : binary_op);
557 	unary_delim = true;
558 
559 
560     }				/* end of switch */
561     if (code != newline) {
562 	l_struct = false;
563 	last_code = code;
564     }
565     if (buf_ptr >= buf_end)	/* check for input buffer empty */
566 	fill_buffer();
567     ps.last_u_d = unary_delim;
568     *e_token = '\0';		/* null terminate the token */
569     return (code);
570 }
571 
572 /*
573  * Add the given keyword to the keyword table, using val as the keyword type
574  */
575 addkey(key, val)
576     char       *key;
577 {
578     register struct templ *p = specials;
579     while (p->rwd)
580 	if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
581 	    return;
582 	else
583 	    p++;
584     if (p >= specials + sizeof specials / sizeof specials[0])
585 	return;			/* For now, table overflows are silently
586 				 * ignored */
587     p->rwd = key;
588     p->rwcode = val;
589     p[1].rwd = 0;
590     p[1].rwcode = 0;
591     return;
592 }
593