xref: /freebsd/usr.bin/indent/lexi.c (revision 1d66272a85cde1c8a69c58f4b5dd649babd6eca6)
1 /*
2  * Copyright (c) 1985 Sun Microsystems, Inc.
3  * Copyright (c) 1980, 1993
4  *	The Regents of the University of California.  All rights reserved.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. All advertising materials mentioning features or use of this software
16  *    must display the following acknowledgement:
17  *	This product includes software developed by the University of
18  *	California, Berkeley and its contributors.
19  * 4. Neither the name of the University nor the names of its contributors
20  *    may be used to endorse or promote products derived from this software
21  *    without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 
36 #ifndef lint
37 static char sccsid[] = "@(#)lexi.c	8.1 (Berkeley) 6/6/93";
38 static const char rcsid[] =
39   "$FreeBSD$";
40 #endif /* not lint */
41 
42 /*
43  * Here we have the token scanner for indent.  It scans off one token and puts
44  * it in the global variable "token".  It returns a code, indicating the type
45  * of token scanned.
46  */
47 
48 #include <stdio.h>
49 #include <ctype.h>
50 #include <stdlib.h>
51 #include <string.h>
52 #include "indent_globs.h"
53 #include "indent_codes.h"
54 
/* Character classes stored in chartype[] and tested by lexi(). */
#define alphanum 1		/* may appear in an identifier or number */
#define opchar 3		/* may appear in an operator */

/*
 * One entry of the reserved-word table.
 *
 * rwd is the spelling of the word; rwcode selects how lexi() treats it:
 *   1      switch                     (lexi returns swstmt)
 *   2      case, default              (lexi returns casestmt)
 *   3      struct, union, enum        (declaration; next word is a tag)
 *   4      declaration keywords       (lexi returns decl outside parens)
 *   5      if, while, for             (lexi returns sp_paren)
 *   6      else, do                   (lexi returns sp_nparen)
 *   7      sizeof                     (ident, with ps.sizeof_keyword set)
 *   other  treated like any other identifier
 */
struct templ {
    char       *rwd;
    int         rwcode;
};

/*
 * The reserved-word table.  It is terminated by an entry whose rwd is a
 * null pointer; addkey() appends to it at run time, which is why it is
 * sized far beyond the built-in entries.
 */
struct templ specials[1000] =
{
    {"switch", 1},
    {"case", 2},
    {"break", 0},
    {"struct", 3},
    {"union", 3},
    {"enum", 3},
    {"default", 2},
    {"int", 4},
    {"char", 4},
    {"float", 4},
    {"double", 4},
    {"long", 4},
    {"short", 4},
    {"typedef", 4},		/* was misspelled "typdef" and never matched */
    {"unsigned", 4},
    {"register", 4},
    {"static", 4},
    {"global", 4},
    {"extern", 4},
    {"void", 4},
    {"goto", 0},
    {"return", 0},
    {"if", 5},
    {"while", 5},
    {"for", 5},
    {"else", 6},
    {"do", 6},
    {"sizeof", 7},
    {"const", 9},
    {"volatile", 9},
    {0, 0}
};
97 
/*
 * Character classification table, indexed by 7-bit ASCII code.  Each entry
 * is 1 for characters that may appear in an identifier or number, 3 for
 * characters that may appear in an operator, and 0 for everything else.
 * One row below covers sixteen consecutive codes.
 */
char        chartype[128] =
{
    /* 0x00 - 0x0f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 - 0x1f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 - 0x2f: space and punctuation; '$' counts as alphanumeric */
    0, 3, 0, 0, 1, 3, 3, 0, 0, 0, 3, 3, 0, 3, 0, 3,
    /* 0x30 - 0x3f: digits, then colon, semicolon and four operator chars */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 3, 3, 3,
    /* 0x40 - 0x4f: at-sign, then 'A' through 'O' */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 - 0x5f: 'P' through 'Z', brackets, caret, underscore */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 1,
    /* 0x60 - 0x6f: backquote, then 'a' through 'o' */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 - 0x7f: 'p' through 'z', braces, bar, tilde, DEL */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 0, 3, 0
};
119 
120 
121 
122 
123 int
124 lexi()
125 {
126     int         unary_delim;	/* this is set to 1 if the current token
127 				 *
128 				 * forces a following operator to be unary */
129     static int  last_code;	/* the last token type returned */
130     static int  l_struct;	/* set to 1 if the last token was 'struct' */
131     int         code;		/* internal code to be returned */
132     char        qchar;		/* the delimiter character for a string */
133 
134     e_token = s_token;		/* point to start of place to save token */
135     unary_delim = false;
136     ps.col_1 = ps.last_nl;	/* tell world that this token started in
137 				 * column 1 iff the last thing scanned was nl */
138     ps.last_nl = false;
139 
140     while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
141 	ps.col_1 = false;	/* leading blanks imply token is not in column
142 				 * 1 */
143 	if (++buf_ptr >= buf_end)
144 	    fill_buffer();
145     }
146 
147     /* Scan an alphanumeric token */
148     if (chartype[*buf_ptr] == alphanum || buf_ptr[0] == '.' && isdigit(buf_ptr[1])) {
149 	/*
150 	 * we have a character or number
151 	 */
152 	register char *j;	/* used for searching thru list of
153 				 *
154 				 * reserved words */
155 	register struct templ *p;
156 
157 	if (isdigit(*buf_ptr) || buf_ptr[0] == '.' && isdigit(buf_ptr[1])) {
158 	    int         seendot = 0,
159 	                seenexp = 0,
160 			seensfx = 0;
161 	    if (*buf_ptr == '0' &&
162 		    (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
163 		*e_token++ = *buf_ptr++;
164 		*e_token++ = *buf_ptr++;
165 		while (isxdigit(*buf_ptr)) {
166 		    CHECK_SIZE_TOKEN;
167 		    *e_token++ = *buf_ptr++;
168 		}
169 	    }
170 	    else
171 		while (1) {
172 		    if (*buf_ptr == '.')
173 			if (seendot)
174 			    break;
175 			else
176 			    seendot++;
177 		    CHECK_SIZE_TOKEN;
178 		    *e_token++ = *buf_ptr++;
179 		    if (!isdigit(*buf_ptr) && *buf_ptr != '.')
180 			if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
181 			    break;
182 			else {
183 			    seenexp++;
184 			    seendot++;
185 			    CHECK_SIZE_TOKEN;
186 			    *e_token++ = *buf_ptr++;
187 			    if (*buf_ptr == '+' || *buf_ptr == '-')
188 				*e_token++ = *buf_ptr++;
189 			}
190 		}
191 	    while (1) {
192 		if (!(seensfx & 1) &&
193 			(*buf_ptr == 'U' || *buf_ptr == 'u')) {
194 		    CHECK_SIZE_TOKEN;
195 		    *e_token++ = *buf_ptr++;
196 		    seensfx |= 1;
197 		    continue;
198 		}
199         	if (!(seensfx & 2) &&
200 			(*buf_ptr == 'L' || *buf_ptr == 'l')) {
201 		    CHECK_SIZE_TOKEN;
202 		    if (buf_ptr[1] == buf_ptr[0])
203 		        *e_token++ = *buf_ptr++;
204 		    *e_token++ = *buf_ptr++;
205 		    seensfx |= 2;
206 		    continue;
207 		}
208 		break;
209 	    }
210 	}
211 	else
212 	    while (chartype[*buf_ptr] == alphanum) {	/* copy it over */
213 		CHECK_SIZE_TOKEN;
214 		*e_token++ = *buf_ptr++;
215 		if (buf_ptr >= buf_end)
216 		    fill_buffer();
217 	    }
218 	*e_token++ = '\0';
219 	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
220 	    if (++buf_ptr >= buf_end)
221 		fill_buffer();
222 	}
223 	ps.its_a_keyword = false;
224 	ps.sizeof_keyword = false;
225 	if (l_struct) {		/* if last token was 'struct', then this token
226 				 * should be treated as a declaration */
227 	    l_struct = false;
228 	    last_code = ident;
229 	    ps.last_u_d = true;
230 	    return (decl);
231 	}
232 	ps.last_u_d = false;	/* Operator after indentifier is binary */
233 	last_code = ident;	/* Remember that this is the code we will
234 				 * return */
235 
236 	/*
237 	 * This loop will check if the token is a keyword.
238 	 */
239 	for (p = specials; (j = p->rwd) != 0; p++) {
240 	    register char *p = s_token;	/* point at scanned token */
241 	    if (*j++ != *p++ || *j++ != *p++)
242 		continue;	/* This test depends on the fact that
243 				 * identifiers are always at least 1 character
244 				 * long (ie. the first two bytes of the
245 				 * identifier are always meaningful) */
246 	    if (p[-1] == 0)
247 		break;		/* If its a one-character identifier */
248 	    while (*p++ == *j)
249 		if (*j++ == 0)
250 		    goto found_keyword;	/* I wish that C had a multi-level
251 					 * break... */
252 	}
253 	if (p->rwd) {		/* we have a keyword */
254     found_keyword:
255 	    ps.its_a_keyword = true;
256 	    ps.last_u_d = true;
257 	    switch (p->rwcode) {
258 	    case 1:		/* it is a switch */
259 		return (swstmt);
260 	    case 2:		/* a case or default */
261 		return (casestmt);
262 
263 	    case 3:		/* a "struct" */
264 		/*
265 		 * Next time around, we may want to know that we have had a
266 		 * 'struct'
267 		 */
268 		l_struct = true;
269 
270 		/*
271 		 * Fall through to test for a cast, function prototype or
272 		 * sizeof().
273 		 */
274 	    case 4:		/* one of the declaration keywords */
275 		if (ps.p_l_follow) {
276 		    ps.cast_mask |= 1 << ps.p_l_follow;
277 
278 		    /*
279 		     * Forget that we saw `struct' if we're in a sizeof().
280 		     */
281 		    if (ps.sizeof_mask)
282 			l_struct = false;
283 
284 		    break;	/* inside parens: cast, prototype or sizeof() */
285 		}
286 		last_code = decl;
287 		return (decl);
288 
289 	    case 5:		/* if, while, for */
290 		return (sp_paren);
291 
292 	    case 6:		/* do, else */
293 		return (sp_nparen);
294 
295 	    case 7:
296 		ps.sizeof_keyword = true;
297 	    default:		/* all others are treated like any other
298 				 * identifier */
299 		return (ident);
300 	    }			/* end of switch */
301 	}			/* end of if (found_it) */
302 	if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
303 	    register char *tp = buf_ptr;
304 	    while (tp < buf_end)
305 		if (*tp++ == ')' && (*tp == ';' || *tp == ','))
306 		    goto not_proc;
307 	    strncpy(ps.procname, token, sizeof ps.procname - 1);
308 	    ps.in_parameter_declaration = 1;
309 	    rparen_count = 1;
310     not_proc:;
311 	}
312 	/*
313 	 * The following hack attempts to guess whether or not the current
314 	 * token is in fact a declaration keyword -- one that has been
315 	 * typedefd
316 	 */
317 	if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
318 		&& !ps.p_l_follow
319 	        && !ps.block_init
320 		&& (ps.last_token == rparen || ps.last_token == semicolon ||
321 		    ps.last_token == decl ||
322 		    ps.last_token == lbrace || ps.last_token == rbrace)) {
323 	    ps.its_a_keyword = true;
324 	    ps.last_u_d = true;
325 	    last_code = decl;
326 	    return decl;
327 	}
328 	if (last_code == decl)	/* if this is a declared variable, then
329 				 * following sign is unary */
330 	    ps.last_u_d = true;	/* will make "int a -1" work */
331 	last_code = ident;
332 	return (ident);		/* the ident is not in the list */
333     }				/* end of procesing for alpanum character */
334 
335     /* Scan a non-alphanumeric token */
336 
337     *e_token++ = *buf_ptr;		/* if it is only a one-character token, it is
338 				 * moved here */
339     *e_token = '\0';
340     if (++buf_ptr >= buf_end)
341 	fill_buffer();
342 
343     switch (*token) {
344     case '\n':
345 	unary_delim = ps.last_u_d;
346 	ps.last_nl = true;	/* remember that we just had a newline */
347 	code = (had_eof ? 0 : newline);
348 
349 	/*
350 	 * if data has been exausted, the newline is a dummy, and we should
351 	 * return code to stop
352 	 */
353 	break;
354 
355     case '\'':			/* start of quoted character */
356     case '"':			/* start of string */
357 	qchar = *token;
358 	if (troff) {
359 	    e_token[-1] = '`';
360 	    if (qchar == '"')
361 		*e_token++ = '`';
362 	    e_token = chfont(&bodyf, &stringf, e_token);
363 	}
364 	do {			/* copy the string */
365 	    while (1) {		/* move one character or [/<char>]<char> */
366 		if (*buf_ptr == '\n') {
367 		    printf("%d: Unterminated literal\n", line_no);
368 		    goto stop_lit;
369 		}
370 		CHECK_SIZE_TOKEN;	/* Only have to do this once in this loop,
371 					 * since CHECK_SIZE guarantees that there
372 					 * are at least 5 entries left */
373 		*e_token = *buf_ptr++;
374 		if (buf_ptr >= buf_end)
375 		    fill_buffer();
376 		if (*e_token == BACKSLASH) {	/* if escape, copy extra char */
377 		    if (*buf_ptr == '\n')	/* check for escaped newline */
378 			++line_no;
379 		    if (troff) {
380 			*++e_token = BACKSLASH;
381 			if (*buf_ptr == BACKSLASH)
382 			    *++e_token = BACKSLASH;
383 		    }
384 		    *++e_token = *buf_ptr++;
385 		    ++e_token;	/* we must increment this again because we
386 				 * copied two chars */
387 		    if (buf_ptr >= buf_end)
388 			fill_buffer();
389 		}
390 		else
391 		    break;	/* we copied one character */
392 	    }			/* end of while (1) */
393 	} while (*e_token++ != qchar);
394 	if (troff) {
395 	    e_token = chfont(&stringf, &bodyf, e_token - 1);
396 	    if (qchar == '"')
397 		*e_token++ = '\'';
398 	}
399 stop_lit:
400 	code = ident;
401 	break;
402 
403     case ('('):
404     case ('['):
405 	unary_delim = true;
406 	code = lparen;
407 	break;
408 
409     case (')'):
410     case (']'):
411 	code = rparen;
412 	break;
413 
414     case '#':
415 	unary_delim = ps.last_u_d;
416 	code = preesc;
417 	break;
418 
419     case '?':
420 	unary_delim = true;
421 	code = question;
422 	break;
423 
424     case (':'):
425 	code = colon;
426 	unary_delim = true;
427 	break;
428 
429     case (';'):
430 	unary_delim = true;
431 	code = semicolon;
432 	break;
433 
434     case ('{'):
435 	unary_delim = true;
436 
437 	/*
438 	 * if (ps.in_or_st) ps.block_init = 1;
439 	 */
440 	/* ?	code = ps.block_init ? lparen : lbrace; */
441 	code = lbrace;
442 	break;
443 
444     case ('}'):
445 	unary_delim = true;
446 	/* ?	code = ps.block_init ? rparen : rbrace; */
447 	code = rbrace;
448 	break;
449 
450     case 014:			/* a form feed */
451 	unary_delim = ps.last_u_d;
452 	ps.last_nl = true;	/* remember this so we can set 'ps.col_1'
453 				 * right */
454 	code = form_feed;
455 	break;
456 
457     case (','):
458 	unary_delim = true;
459 	code = comma;
460 	break;
461 
462     case '.':
463 	unary_delim = false;
464 	code = period;
465 	break;
466 
467     case '-':
468     case '+':			/* check for -, +, --, ++ */
469 	code = (ps.last_u_d ? unary_op : binary_op);
470 	unary_delim = true;
471 
472 	if (*buf_ptr == token[0]) {
473 	    /* check for doubled character */
474 	    *e_token++ = *buf_ptr++;
475 	    /* buffer overflow will be checked at end of loop */
476 	    if (last_code == ident || last_code == rparen) {
477 		code = (ps.last_u_d ? unary_op : postop);
478 		/* check for following ++ or -- */
479 		unary_delim = false;
480 	    }
481 	}
482 	else if (*buf_ptr == '=')
483 	    /* check for operator += */
484 	    *e_token++ = *buf_ptr++;
485 	else if (*buf_ptr == '>') {
486 	    /* check for operator -> */
487 	    *e_token++ = *buf_ptr++;
488 	    if (!pointer_as_binop) {
489 		unary_delim = false;
490 		code = unary_op;
491 		ps.want_blank = false;
492 	    }
493 	}
494 	break;			/* buffer overflow will be checked at end of
495 				 * switch */
496 
497     case '=':
498 	if (ps.in_or_st)
499 	    ps.block_init = 1;
500 #ifdef undef
501 	if (chartype[*buf_ptr] == opchar) {	/* we have two char assignment */
502 	    e_token[-1] = *buf_ptr++;
503 	    if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
504 		*e_token++ = *buf_ptr++;
505 	    *e_token++ = '=';	/* Flip =+ to += */
506 	    *e_token = 0;
507 	}
508 #else
509 	if (*buf_ptr == '=') {/* == */
510 	    *e_token++ = '=';	/* Flip =+ to += */
511 	    buf_ptr++;
512 	    *e_token = 0;
513 	}
514 #endif
515 	code = binary_op;
516 	unary_delim = true;
517 	break;
518 	/* can drop thru!!! */
519 
520     case '>':
521     case '<':
522     case '!':			/* ops like <, <<, <=, !=, etc */
523 	if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
524 	    *e_token++ = *buf_ptr;
525 	    if (++buf_ptr >= buf_end)
526 		fill_buffer();
527 	}
528 	if (*buf_ptr == '=')
529 	    *e_token++ = *buf_ptr++;
530 	code = (ps.last_u_d ? unary_op : binary_op);
531 	unary_delim = true;
532 	break;
533 
534     default:
535 	if (token[0] == '/' && *buf_ptr == '*') {
536 	    /* it is start of comment */
537 	    *e_token++ = '*';
538 
539 	    if (++buf_ptr >= buf_end)
540 		fill_buffer();
541 
542 	    code = comment;
543 	    unary_delim = ps.last_u_d;
544 	    break;
545 	}
546 	while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
547 	    /*
548 	     * handle ||, &&, etc, and also things as in int *****i
549 	     */
550 	    *e_token++ = *buf_ptr;
551 	    if (++buf_ptr >= buf_end)
552 		fill_buffer();
553 	}
554 	code = (ps.last_u_d ? unary_op : binary_op);
555 	unary_delim = true;
556 
557 
558     }				/* end of switch */
559     if (code != newline) {
560 	l_struct = false;
561 	last_code = code;
562     }
563     if (buf_ptr >= buf_end)	/* check for input buffer empty */
564 	fill_buffer();
565     ps.last_u_d = unary_delim;
566     *e_token = '\0';		/* null terminate the token */
567     return (code);
568 }
569 
570 /*
571  * Add the given keyword to the keyword table, using val as the keyword type
572  */
573 addkey(key, val)
574     char       *key;
575 {
576     register struct templ *p = specials;
577     while (p->rwd)
578 	if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
579 	    return;
580 	else
581 	    p++;
582     if (p >= specials + sizeof specials / sizeof specials[0])
583 	return;			/* For now, table overflows are silently
584 				 * ignored */
585     p->rwd = key;
586     p->rwcode = val;
587     p[1].rwd = 0;
588     p[1].rwcode = 0;
589     return;
590 }
591