xref: /freebsd/usr.bin/indent/lexi.c (revision daf1cffce2e07931f27c6c6998652e90df6ba87e)
1 /*
2  * Copyright (c) 1985 Sun Microsystems, Inc.
3  * Copyright (c) 1980, 1993
4  *	The Regents of the University of California.  All rights reserved.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. All advertising materials mentioning features or use of this software
16  *    must display the following acknowledgement:
17  *	This product includes software developed by the University of
18  *	California, Berkeley and its contributors.
19  * 4. Neither the name of the University nor the names of its contributors
20  *    may be used to endorse or promote products derived from this software
21  *    without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 
36 #ifndef lint
37 static char sccsid[] = "@(#)lexi.c	8.1 (Berkeley) 6/6/93";
38 #endif /* not lint */
39 
40 /*
41  * Here we have the token scanner for indent.  It scans off one token and puts
42  * it in the global variable "token".  It returns a code, indicating the type
43  * of token scanned.
44  */
45 
46 #include <stdio.h>
47 #include <ctype.h>
48 #include <stdlib.h>
49 #include <string.h>
50 #include "indent_globs.h"
51 #include "indent_codes.h"
52 
53 #define alphanum 1
54 #define opchar 3
55 
/*
 * One entry of the reserved-word table: the keyword text and a small
 * integer class that lexi() switches on to pick the token code to return.
 */
struct templ {
    char       *rwd;		/* keyword text; 0 marks the end of the table */
    int         rwcode;		/* keyword class, see the list below */
};

/*
 * Reserved-word table, terminated by an entry whose rwd is 0.  It is sized
 * at 100 entries so that addkey() can append further keywords at run time.
 *
 * Keyword classes as interpreted by lexi():
 *   0  no special treatment (returned as an ordinary identifier)
 *   1  switch
 *   2  case / default
 *   3  struct / union / enum
 *   4  declaration keywords (types and storage classes)
 *   5  if / while / for
 *   6  else / do
 *   7  sizeof
 */
struct templ specials[100] =
{
    {"switch", 1},
    {"case", 2},
    {"break", 0},
    {"struct", 3},
    {"union", 3},
    {"enum", 3},
    {"default", 2},
    {"int", 4},
    {"char", 4},
    {"float", 4},
    {"double", 4},
    {"long", 4},
    {"short", 4},
    {"typedef", 4},		/* was misspelled "typdef", so it never matched */
    {"unsigned", 4},
    {"register", 4},
    {"static", 4},
    {"global", 4},
    {"extern", 4},
    {"void", 4},
    {"goto", 0},
    {"return", 0},
    {"if", 5},
    {"while", 5},
    {"for", 5},
    {"else", 6},
    {"do", 6},
    {"sizeof", 7},
    {0, 0}
};
93 
/*
 * Character classification table, indexed by 7-bit ASCII code.  Each entry
 * is one of:
 *   0  neither of the classes below
 *   1  alphanumeric: letters, digits, '_' and '$'
 *   3  operator character
 * The scanner uses it to decide how a token starting with a given character
 * is to be collected.
 */
char        chartype[128] =
{
    /* 0x00 - 0x0f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 - 0x1f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 - 0x2f: space and punctuation up to the slash */
    0, 3, 0, 0, 1, 3, 3, 0, 0, 0, 3, 3, 0, 3, 0, 3,
    /* 0x30 - 0x3f: digits, then colon, semicolon, relational ops, '?' */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 3, 3, 3,
    /* 0x40 - 0x4f: at-sign, then 'A' through 'O' */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 - 0x5f: 'P' through 'Z', brackets, backslash, caret, '_' */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 1,
    /* 0x60 - 0x6f: backquote, then 'a' through 'o' */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 - 0x7f: 'p' through 'z', braces, bar, tilde, DEL */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 0, 3, 0
};
115 
116 
117 
118 
119 int
120 lexi()
121 {
122     int         unary_delim;	/* this is set to 1 if the current token
123 				 *
124 				 * forces a following operator to be unary */
125     static int  last_code;	/* the last token type returned */
126     static int  l_struct;	/* set to 1 if the last token was 'struct' */
127     int         code;		/* internal code to be returned */
128     char        qchar;		/* the delimiter character for a string */
129 
130     e_token = s_token;		/* point to start of place to save token */
131     unary_delim = false;
132     ps.col_1 = ps.last_nl;	/* tell world that this token started in
133 				 * column 1 iff the last thing scanned was nl */
134     ps.last_nl = false;
135 
136     while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
137 	ps.col_1 = false;	/* leading blanks imply token is not in column
138 				 * 1 */
139 	if (++buf_ptr >= buf_end)
140 	    fill_buffer();
141     }
142 
143     /* Scan an alphanumeric token */
144     if (chartype[*buf_ptr] == alphanum || buf_ptr[0] == '.' && isdigit(buf_ptr[1])) {
145 	/*
146 	 * we have a character or number
147 	 */
148 	register char *j;	/* used for searching thru list of
149 				 *
150 				 * reserved words */
151 	register struct templ *p;
152 
153 	if (isdigit(*buf_ptr) || buf_ptr[0] == '.' && isdigit(buf_ptr[1])) {
154 	    int         seendot = 0,
155 	                seenexp = 0,
156 			seensfx = 0;
157 	    if (*buf_ptr == '0' &&
158 		    (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
159 		*e_token++ = *buf_ptr++;
160 		*e_token++ = *buf_ptr++;
161 		while (isxdigit(*buf_ptr)) {
162 		    CHECK_SIZE_TOKEN;
163 		    *e_token++ = *buf_ptr++;
164 		}
165 	    }
166 	    else
167 		while (1) {
168 		    if (*buf_ptr == '.')
169 			if (seendot)
170 			    break;
171 			else
172 			    seendot++;
173 		    CHECK_SIZE_TOKEN;
174 		    *e_token++ = *buf_ptr++;
175 		    if (!isdigit(*buf_ptr) && *buf_ptr != '.')
176 			if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
177 			    break;
178 			else {
179 			    seenexp++;
180 			    seendot++;
181 			    CHECK_SIZE_TOKEN;
182 			    *e_token++ = *buf_ptr++;
183 			    if (*buf_ptr == '+' || *buf_ptr == '-')
184 				*e_token++ = *buf_ptr++;
185 			}
186 		}
187 	    while (1) {
188 		if (!(seensfx & 1) &&
189 			(*buf_ptr == 'U' || *buf_ptr == 'u')) {
190 		    CHECK_SIZE_TOKEN;
191 		    *e_token++ = *buf_ptr++;
192 		    seensfx |= 1;
193 		    continue;
194 		}
195         	if (!(seensfx & 2) &&
196 			(*buf_ptr == 'L' || *buf_ptr == 'l')) {
197 		    CHECK_SIZE_TOKEN;
198 		    if (buf_ptr[1] == buf_ptr[0])
199 		        *e_token++ = *buf_ptr++;
200 		    *e_token++ = *buf_ptr++;
201 		    seensfx |= 2;
202 		    continue;
203 		}
204 		break;
205 	    }
206 	}
207 	else
208 	    while (chartype[*buf_ptr] == alphanum) {	/* copy it over */
209 		CHECK_SIZE_TOKEN;
210 		*e_token++ = *buf_ptr++;
211 		if (buf_ptr >= buf_end)
212 		    fill_buffer();
213 	    }
214 	*e_token++ = '\0';
215 	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
216 	    if (++buf_ptr >= buf_end)
217 		fill_buffer();
218 	}
219 	ps.its_a_keyword = false;
220 	ps.sizeof_keyword = false;
221 	if (l_struct) {		/* if last token was 'struct', then this token
222 				 * should be treated as a declaration */
223 	    l_struct = false;
224 	    last_code = ident;
225 	    ps.last_u_d = true;
226 	    return (decl);
227 	}
228 	ps.last_u_d = false;	/* Operator after indentifier is binary */
229 	last_code = ident;	/* Remember that this is the code we will
230 				 * return */
231 
232 	/*
233 	 * This loop will check if the token is a keyword.
234 	 */
235 	for (p = specials; (j = p->rwd) != 0; p++) {
236 	    register char *p = s_token;	/* point at scanned token */
237 	    if (*j++ != *p++ || *j++ != *p++)
238 		continue;	/* This test depends on the fact that
239 				 * identifiers are always at least 1 character
240 				 * long (ie. the first two bytes of the
241 				 * identifier are always meaningful) */
242 	    if (p[-1] == 0)
243 		break;		/* If its a one-character identifier */
244 	    while (*p++ == *j)
245 		if (*j++ == 0)
246 		    goto found_keyword;	/* I wish that C had a multi-level
247 					 * break... */
248 	}
249 	if (p->rwd) {		/* we have a keyword */
250     found_keyword:
251 	    ps.its_a_keyword = true;
252 	    ps.last_u_d = true;
253 	    switch (p->rwcode) {
254 	    case 1:		/* it is a switch */
255 		return (swstmt);
256 	    case 2:		/* a case or default */
257 		return (casestmt);
258 
259 	    case 3:		/* a "struct" */
260 		if (ps.p_l_follow)
261 		    break;	/* inside parens: cast */
262 		l_struct = true;
263 
264 		/*
265 		 * Next time around, we will want to know that we have had a
266 		 * 'struct'
267 		 */
268 	    case 4:		/* one of the declaration keywords */
269 		if (ps.p_l_follow) {
270 		    ps.cast_mask |= 1 << ps.p_l_follow;
271 		    break;	/* inside parens: cast */
272 		}
273 		last_code = decl;
274 		return (decl);
275 
276 	    case 5:		/* if, while, for */
277 		return (sp_paren);
278 
279 	    case 6:		/* do, else */
280 		return (sp_nparen);
281 
282 	    case 7:
283 		ps.sizeof_keyword = true;
284 	    default:		/* all others are treated like any other
285 				 * identifier */
286 		return (ident);
287 	    }			/* end of switch */
288 	}			/* end of if (found_it) */
289 	if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
290 	    register char *tp = buf_ptr;
291 	    while (tp < buf_end)
292 		if (*tp++ == ')' && (*tp == ';' || *tp == ','))
293 		    goto not_proc;
294 	    strncpy(ps.procname, token, sizeof ps.procname - 1);
295 	    ps.in_parameter_declaration = 1;
296 	    rparen_count = 1;
297     not_proc:;
298 	}
299 	/*
300 	 * The following hack attempts to guess whether or not the current
301 	 * token is in fact a declaration keyword -- one that has been
302 	 * typedefd
303 	 */
304 	if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
305 		&& !ps.p_l_follow
306 	        && !ps.block_init
307 		&& (ps.last_token == rparen || ps.last_token == semicolon ||
308 		    ps.last_token == decl ||
309 		    ps.last_token == lbrace || ps.last_token == rbrace)) {
310 	    ps.its_a_keyword = true;
311 	    ps.last_u_d = true;
312 	    last_code = decl;
313 	    return decl;
314 	}
315 	if (last_code == decl)	/* if this is a declared variable, then
316 				 * following sign is unary */
317 	    ps.last_u_d = true;	/* will make "int a -1" work */
318 	last_code = ident;
319 	return (ident);		/* the ident is not in the list */
320     }				/* end of procesing for alpanum character */
321 
322     /* Scan a non-alphanumeric token */
323 
324     *e_token++ = *buf_ptr;		/* if it is only a one-character token, it is
325 				 * moved here */
326     *e_token = '\0';
327     if (++buf_ptr >= buf_end)
328 	fill_buffer();
329 
330     switch (*token) {
331     case '\n':
332 	unary_delim = ps.last_u_d;
333 	ps.last_nl = true;	/* remember that we just had a newline */
334 	code = (had_eof ? 0 : newline);
335 
336 	/*
337 	 * if data has been exausted, the newline is a dummy, and we should
338 	 * return code to stop
339 	 */
340 	break;
341 
342     case '\'':			/* start of quoted character */
343     case '"':			/* start of string */
344 	qchar = *token;
345 	if (troff) {
346 	    e_token[-1] = '`';
347 	    if (qchar == '"')
348 		*e_token++ = '`';
349 	    e_token = chfont(&bodyf, &stringf, e_token);
350 	}
351 	do {			/* copy the string */
352 	    while (1) {		/* move one character or [/<char>]<char> */
353 		if (*buf_ptr == '\n') {
354 		    printf("%d: Unterminated literal\n", line_no);
355 		    goto stop_lit;
356 		}
357 		CHECK_SIZE_TOKEN;	/* Only have to do this once in this loop,
358 					 * since CHECK_SIZE guarantees that there
359 					 * are at least 5 entries left */
360 		*e_token = *buf_ptr++;
361 		if (buf_ptr >= buf_end)
362 		    fill_buffer();
363 		if (*e_token == BACKSLASH) {	/* if escape, copy extra char */
364 		    if (*buf_ptr == '\n')	/* check for escaped newline */
365 			++line_no;
366 		    if (troff) {
367 			*++e_token = BACKSLASH;
368 			if (*buf_ptr == BACKSLASH)
369 			    *++e_token = BACKSLASH;
370 		    }
371 		    *++e_token = *buf_ptr++;
372 		    ++e_token;	/* we must increment this again because we
373 				 * copied two chars */
374 		    if (buf_ptr >= buf_end)
375 			fill_buffer();
376 		}
377 		else
378 		    break;	/* we copied one character */
379 	    }			/* end of while (1) */
380 	} while (*e_token++ != qchar);
381 	if (troff) {
382 	    e_token = chfont(&stringf, &bodyf, e_token - 1);
383 	    if (qchar == '"')
384 		*e_token++ = '\'';
385 	}
386 stop_lit:
387 	code = ident;
388 	break;
389 
390     case ('('):
391     case ('['):
392 	unary_delim = true;
393 	code = lparen;
394 	break;
395 
396     case (')'):
397     case (']'):
398 	code = rparen;
399 	break;
400 
401     case '#':
402 	unary_delim = ps.last_u_d;
403 	code = preesc;
404 	break;
405 
406     case '?':
407 	unary_delim = true;
408 	code = question;
409 	break;
410 
411     case (':'):
412 	code = colon;
413 	unary_delim = true;
414 	break;
415 
416     case (';'):
417 	unary_delim = true;
418 	code = semicolon;
419 	break;
420 
421     case ('{'):
422 	unary_delim = true;
423 
424 	/*
425 	 * if (ps.in_or_st) ps.block_init = 1;
426 	 */
427 	/* ?	code = ps.block_init ? lparen : lbrace; */
428 	code = lbrace;
429 	break;
430 
431     case ('}'):
432 	unary_delim = true;
433 	/* ?	code = ps.block_init ? rparen : rbrace; */
434 	code = rbrace;
435 	break;
436 
437     case 014:			/* a form feed */
438 	unary_delim = ps.last_u_d;
439 	ps.last_nl = true;	/* remember this so we can set 'ps.col_1'
440 				 * right */
441 	code = form_feed;
442 	break;
443 
444     case (','):
445 	unary_delim = true;
446 	code = comma;
447 	break;
448 
449     case '.':
450 	unary_delim = false;
451 	code = period;
452 	break;
453 
454     case '-':
455     case '+':			/* check for -, +, --, ++ */
456 	code = (ps.last_u_d ? unary_op : binary_op);
457 	unary_delim = true;
458 
459 	if (*buf_ptr == token[0]) {
460 	    /* check for doubled character */
461 	    *e_token++ = *buf_ptr++;
462 	    /* buffer overflow will be checked at end of loop */
463 	    if (last_code == ident || last_code == rparen) {
464 		code = (ps.last_u_d ? unary_op : postop);
465 		/* check for following ++ or -- */
466 		unary_delim = false;
467 	    }
468 	}
469 	else if (*buf_ptr == '=')
470 	    /* check for operator += */
471 	    *e_token++ = *buf_ptr++;
472 	else if (*buf_ptr == '>') {
473 	    /* check for operator -> */
474 	    *e_token++ = *buf_ptr++;
475 	    if (!pointer_as_binop) {
476 		unary_delim = false;
477 		code = unary_op;
478 		ps.want_blank = false;
479 	    }
480 	}
481 	break;			/* buffer overflow will be checked at end of
482 				 * switch */
483 
484     case '=':
485 	if (ps.in_or_st)
486 	    ps.block_init = 1;
487 #ifdef undef
488 	if (chartype[*buf_ptr] == opchar) {	/* we have two char assignment */
489 	    e_token[-1] = *buf_ptr++;
490 	    if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
491 		*e_token++ = *buf_ptr++;
492 	    *e_token++ = '=';	/* Flip =+ to += */
493 	    *e_token = 0;
494 	}
495 #else
496 	if (*buf_ptr == '=') {/* == */
497 	    *e_token++ = '=';	/* Flip =+ to += */
498 	    buf_ptr++;
499 	    *e_token = 0;
500 	}
501 #endif
502 	code = binary_op;
503 	unary_delim = true;
504 	break;
505 	/* can drop thru!!! */
506 
507     case '>':
508     case '<':
509     case '!':			/* ops like <, <<, <=, !=, etc */
510 	if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
511 	    *e_token++ = *buf_ptr;
512 	    if (++buf_ptr >= buf_end)
513 		fill_buffer();
514 	}
515 	if (*buf_ptr == '=')
516 	    *e_token++ = *buf_ptr++;
517 	code = (ps.last_u_d ? unary_op : binary_op);
518 	unary_delim = true;
519 	break;
520 
521     default:
522 	if (token[0] == '/' && *buf_ptr == '*') {
523 	    /* it is start of comment */
524 	    *e_token++ = '*';
525 
526 	    if (++buf_ptr >= buf_end)
527 		fill_buffer();
528 
529 	    code = comment;
530 	    unary_delim = ps.last_u_d;
531 	    break;
532 	}
533 	while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
534 	    /*
535 	     * handle ||, &&, etc, and also things as in int *****i
536 	     */
537 	    *e_token++ = *buf_ptr;
538 	    if (++buf_ptr >= buf_end)
539 		fill_buffer();
540 	}
541 	code = (ps.last_u_d ? unary_op : binary_op);
542 	unary_delim = true;
543 
544 
545     }				/* end of switch */
546     if (code != newline) {
547 	l_struct = false;
548 	last_code = code;
549     }
550     if (buf_ptr >= buf_end)	/* check for input buffer empty */
551 	fill_buffer();
552     ps.last_u_d = unary_delim;
553     *e_token = '\0';		/* null terminate the token */
554     return (code);
555 }
556 
557 /*
558  * Add the given keyword to the keyword table, using val as the keyword type
559  */
560 addkey(key, val)
561     char       *key;
562 {
563     register struct templ *p = specials;
564     while (p->rwd)
565 	if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
566 	    return;
567 	else
568 	    p++;
569     if (p >= specials + sizeof specials / sizeof specials[0])
570 	return;			/* For now, table overflows are silently
571 				 * ignored */
572     p->rwd = key;
573     p->rwcode = val;
574     p[1].rwd = 0;
575     p[1].rwcode = 0;
576     return;
577 }
578