xref: /freebsd/usr.bin/indent/lexi.c (revision afe61c15161c324a7af299a9b8457aba5afc92db)
1 /*
2  * Copyright (c) 1985 Sun Microsystems, Inc.
3  * Copyright (c) 1980, 1993
4  *	The Regents of the University of California.  All rights reserved.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. All advertising materials mentioning features or use of this software
16  *    must display the following acknowledgement:
17  *	This product includes software developed by the University of
18  *	California, Berkeley and its contributors.
19  * 4. Neither the name of the University nor the names of its contributors
20  *    may be used to endorse or promote products derived from this software
21  *    without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 
#ifndef lint
/*
 * Version identification string for this file.  It is compiled out when
 * "lint" is defined.
 */
static char sccsid[] =
    "@(#)lexi.c\t8.1 (Berkeley) 6/6/93";
#endif /* not lint */
39 
40 /*
41  * Here we have the token scanner for indent.  It scans off one token and puts
42  * it in the global variable "token".  It returns a code, indicating the type
43  * of token scanned.
44  */
45 
46 #include <stdio.h>
47 #include <ctype.h>
48 #include <stdlib.h>
49 #include <string.h>
50 #include "indent_globs.h"
51 #include "indent_codes.h"
52 
/*
 * Character classes stored in the chartype[] table.  A table entry of 0
 * means the character belongs to neither class.
 */
enum {
    alphanum = 1,		/* may appear in an identifier or number */
    opchar = 3			/* may appear in an operator */
};
55 
/*
 * One entry of the reserved-word table: the spelling of the word and the
 * class code that lexi() switches on once the word has been recognized.
 */
struct templ {
    char       *rwd;		/* spelling of the word; 0 ends the table */
    int         rwcode;		/* class code, see the list below */
};

/*
 * Reserved words known to the scanner.  The table is terminated by an entry
 * whose rwd is 0; addkey() appends further entries in front of a new
 * terminator, which is why the array is larger than the initial contents.
 *
 * Class codes as interpreted by lexi():
 *	1	switch
 *	2	case, default
 *	3	struct, union, enum
 *	4	declaration keywords (types and storage classes)
 *	5	if, while, for
 *	6	else, do
 *	7	sizeof
 *	0	everything else; returned as an ordinary identifier
 */
struct templ specials[100] =
{
    {"switch", 1},
    {"case", 2},
    {"break", 0},
    {"struct", 3},
    {"union", 3},
    {"enum", 3},
    {"default", 2},
    {"int", 4},
    {"char", 4},
    {"float", 4},
    {"double", 4},
    {"long", 4},
    {"short", 4},
    {"typedef", 4},		/* was misspelled "typdef" and never matched */
    {"unsigned", 4},
    {"register", 4},
    {"static", 4},
    {"global", 4},
    {"extern", 4},
    {"void", 4},
    {"goto", 0},
    {"return", 0},
    {"if", 5},
    {"while", 5},
    {"for", 5},
    {"else", 6},
    {"do", 6},
    {"sizeof", 7},
    {0, 0}
};
93 
/*
 * Character class lookup for the 128 ASCII codes, indexed by character
 * value.  An entry is 1 for characters that may appear in an identifier or
 * number, 3 for characters that may appear in an operator, and 0 for
 * everything else.  Each row below covers sixteen consecutive codes.
 */
char        chartype[128] =
{
    /* 0x00 - 0x0f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 - 0x1f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 - 0x2f: space and punctuation up to the slash */
    0, 3, 0, 0, 1, 3, 3, 0, 0, 0, 3, 3, 0, 3, 0, 3,
    /* 0x30 - 0x3f: digits, then colon through question mark */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 3, 3, 3,
    /* 0x40 - 0x4f: at sign, then A through O */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 - 0x5f: P through Z, then brackets, caret and underscore */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 1,
    /* 0x60 - 0x6f: backquote, then a through o */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 - 0x7f: p through z, then braces, bar, tilde and DEL */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 0, 3, 0
};
115 
116 
117 
118 
119 int
120 lexi()
121 {
122     int         unary_delim;	/* this is set to 1 if the current token
123 				 *
124 				 * forces a following operator to be unary */
125     static int  last_code;	/* the last token type returned */
126     static int  l_struct;	/* set to 1 if the last token was 'struct' */
127     int         code;		/* internal code to be returned */
128     char        qchar;		/* the delimiter character for a string */
129 
130     e_token = s_token;		/* point to start of place to save token */
131     unary_delim = false;
132     ps.col_1 = ps.last_nl;	/* tell world that this token started in
133 				 * column 1 iff the last thing scanned was nl */
134     ps.last_nl = false;
135 
136     while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
137 	ps.col_1 = false;	/* leading blanks imply token is not in column
138 				 * 1 */
139 	if (++buf_ptr >= buf_end)
140 	    fill_buffer();
141     }
142 
143     /* Scan an alphanumeric token */
144     if (chartype[*buf_ptr] == alphanum || buf_ptr[0] == '.' && isdigit(buf_ptr[1])) {
145 	/*
146 	 * we have a character or number
147 	 */
148 	register char *j;	/* used for searching thru list of
149 				 *
150 				 * reserved words */
151 	register struct templ *p;
152 
153 	if (isdigit(*buf_ptr) || buf_ptr[0] == '.' && isdigit(buf_ptr[1])) {
154 	    int         seendot = 0,
155 	                seenexp = 0;
156 	    if (*buf_ptr == '0' &&
157 		    (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
158 		*e_token++ = *buf_ptr++;
159 		*e_token++ = *buf_ptr++;
160 		while (isxdigit(*buf_ptr)) {
161 		    CHECK_SIZE_TOKEN;
162 		    *e_token++ = *buf_ptr++;
163 		}
164 	    }
165 	    else
166 		while (1) {
167 		    if (*buf_ptr == '.')
168 			if (seendot)
169 			    break;
170 			else
171 			    seendot++;
172 		    CHECK_SIZE_TOKEN;
173 		    *e_token++ = *buf_ptr++;
174 		    if (!isdigit(*buf_ptr) && *buf_ptr != '.')
175 			if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
176 			    break;
177 			else {
178 			    seenexp++;
179 			    seendot++;
180 			    CHECK_SIZE_TOKEN;
181 			    *e_token++ = *buf_ptr++;
182 			    if (*buf_ptr == '+' || *buf_ptr == '-')
183 				*e_token++ = *buf_ptr++;
184 			}
185 		}
186 	    if (*buf_ptr == 'L' || *buf_ptr == 'l')
187 		*e_token++ = *buf_ptr++;
188 	}
189 	else
190 	    while (chartype[*buf_ptr] == alphanum) {	/* copy it over */
191 		CHECK_SIZE_TOKEN;
192 		*e_token++ = *buf_ptr++;
193 		if (buf_ptr >= buf_end)
194 		    fill_buffer();
195 	    }
196 	*e_token++ = '\0';
197 	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
198 	    if (++buf_ptr >= buf_end)
199 		fill_buffer();
200 	}
201 	ps.its_a_keyword = false;
202 	ps.sizeof_keyword = false;
203 	if (l_struct) {		/* if last token was 'struct', then this token
204 				 * should be treated as a declaration */
205 	    l_struct = false;
206 	    last_code = ident;
207 	    ps.last_u_d = true;
208 	    return (decl);
209 	}
210 	ps.last_u_d = false;	/* Operator after indentifier is binary */
211 	last_code = ident;	/* Remember that this is the code we will
212 				 * return */
213 
214 	/*
215 	 * This loop will check if the token is a keyword.
216 	 */
217 	for (p = specials; (j = p->rwd) != 0; p++) {
218 	    register char *p = s_token;	/* point at scanned token */
219 	    if (*j++ != *p++ || *j++ != *p++)
220 		continue;	/* This test depends on the fact that
221 				 * identifiers are always at least 1 character
222 				 * long (ie. the first two bytes of the
223 				 * identifier are always meaningful) */
224 	    if (p[-1] == 0)
225 		break;		/* If its a one-character identifier */
226 	    while (*p++ == *j)
227 		if (*j++ == 0)
228 		    goto found_keyword;	/* I wish that C had a multi-level
229 					 * break... */
230 	}
231 	if (p->rwd) {		/* we have a keyword */
232     found_keyword:
233 	    ps.its_a_keyword = true;
234 	    ps.last_u_d = true;
235 	    switch (p->rwcode) {
236 	    case 1:		/* it is a switch */
237 		return (swstmt);
238 	    case 2:		/* a case or default */
239 		return (casestmt);
240 
241 	    case 3:		/* a "struct" */
242 		if (ps.p_l_follow)
243 		    break;	/* inside parens: cast */
244 		l_struct = true;
245 
246 		/*
247 		 * Next time around, we will want to know that we have had a
248 		 * 'struct'
249 		 */
250 	    case 4:		/* one of the declaration keywords */
251 		if (ps.p_l_follow) {
252 		    ps.cast_mask |= 1 << ps.p_l_follow;
253 		    break;	/* inside parens: cast */
254 		}
255 		last_code = decl;
256 		return (decl);
257 
258 	    case 5:		/* if, while, for */
259 		return (sp_paren);
260 
261 	    case 6:		/* do, else */
262 		return (sp_nparen);
263 
264 	    case 7:
265 		ps.sizeof_keyword = true;
266 	    default:		/* all others are treated like any other
267 				 * identifier */
268 		return (ident);
269 	    }			/* end of switch */
270 	}			/* end of if (found_it) */
271 	if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
272 	    register char *tp = buf_ptr;
273 	    while (tp < buf_end)
274 		if (*tp++ == ')' && (*tp == ';' || *tp == ','))
275 		    goto not_proc;
276 	    strncpy(ps.procname, token, sizeof ps.procname - 1);
277 	    ps.in_parameter_declaration = 1;
278 	    rparen_count = 1;
279     not_proc:;
280 	}
281 	/*
282 	 * The following hack attempts to guess whether or not the current
283 	 * token is in fact a declaration keyword -- one that has been
284 	 * typedefd
285 	 */
286 	if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
287 		&& !ps.p_l_follow
288 	        && !ps.block_init
289 		&& (ps.last_token == rparen || ps.last_token == semicolon ||
290 		    ps.last_token == decl ||
291 		    ps.last_token == lbrace || ps.last_token == rbrace)) {
292 	    ps.its_a_keyword = true;
293 	    ps.last_u_d = true;
294 	    last_code = decl;
295 	    return decl;
296 	}
297 	if (last_code == decl)	/* if this is a declared variable, then
298 				 * following sign is unary */
299 	    ps.last_u_d = true;	/* will make "int a -1" work */
300 	last_code = ident;
301 	return (ident);		/* the ident is not in the list */
302     }				/* end of procesing for alpanum character */
303 
304     /* Scan a non-alphanumeric token */
305 
306     *e_token++ = *buf_ptr;		/* if it is only a one-character token, it is
307 				 * moved here */
308     *e_token = '\0';
309     if (++buf_ptr >= buf_end)
310 	fill_buffer();
311 
312     switch (*token) {
313     case '\n':
314 	unary_delim = ps.last_u_d;
315 	ps.last_nl = true;	/* remember that we just had a newline */
316 	code = (had_eof ? 0 : newline);
317 
318 	/*
319 	 * if data has been exausted, the newline is a dummy, and we should
320 	 * return code to stop
321 	 */
322 	break;
323 
324     case '\'':			/* start of quoted character */
325     case '"':			/* start of string */
326 	qchar = *token;
327 	if (troff) {
328 	    e_token[-1] = '`';
329 	    if (qchar == '"')
330 		*e_token++ = '`';
331 	    e_token = chfont(&bodyf, &stringf, e_token);
332 	}
333 	do {			/* copy the string */
334 	    while (1) {		/* move one character or [/<char>]<char> */
335 		if (*buf_ptr == '\n') {
336 		    printf("%d: Unterminated literal\n", line_no);
337 		    goto stop_lit;
338 		}
339 		CHECK_SIZE_TOKEN;	/* Only have to do this once in this loop,
340 					 * since CHECK_SIZE guarantees that there
341 					 * are at least 5 entries left */
342 		*e_token = *buf_ptr++;
343 		if (buf_ptr >= buf_end)
344 		    fill_buffer();
345 		if (*e_token == BACKSLASH) {	/* if escape, copy extra char */
346 		    if (*buf_ptr == '\n')	/* check for escaped newline */
347 			++line_no;
348 		    if (troff) {
349 			*++e_token = BACKSLASH;
350 			if (*buf_ptr == BACKSLASH)
351 			    *++e_token = BACKSLASH;
352 		    }
353 		    *++e_token = *buf_ptr++;
354 		    ++e_token;	/* we must increment this again because we
355 				 * copied two chars */
356 		    if (buf_ptr >= buf_end)
357 			fill_buffer();
358 		}
359 		else
360 		    break;	/* we copied one character */
361 	    }			/* end of while (1) */
362 	} while (*e_token++ != qchar);
363 	if (troff) {
364 	    e_token = chfont(&stringf, &bodyf, e_token - 1);
365 	    if (qchar == '"')
366 		*e_token++ = '\'';
367 	}
368 stop_lit:
369 	code = ident;
370 	break;
371 
372     case ('('):
373     case ('['):
374 	unary_delim = true;
375 	code = lparen;
376 	break;
377 
378     case (')'):
379     case (']'):
380 	code = rparen;
381 	break;
382 
383     case '#':
384 	unary_delim = ps.last_u_d;
385 	code = preesc;
386 	break;
387 
388     case '?':
389 	unary_delim = true;
390 	code = question;
391 	break;
392 
393     case (':'):
394 	code = colon;
395 	unary_delim = true;
396 	break;
397 
398     case (';'):
399 	unary_delim = true;
400 	code = semicolon;
401 	break;
402 
403     case ('{'):
404 	unary_delim = true;
405 
406 	/*
407 	 * if (ps.in_or_st) ps.block_init = 1;
408 	 */
409 	/* ?	code = ps.block_init ? lparen : lbrace; */
410 	code = lbrace;
411 	break;
412 
413     case ('}'):
414 	unary_delim = true;
415 	/* ?	code = ps.block_init ? rparen : rbrace; */
416 	code = rbrace;
417 	break;
418 
419     case 014:			/* a form feed */
420 	unary_delim = ps.last_u_d;
421 	ps.last_nl = true;	/* remember this so we can set 'ps.col_1'
422 				 * right */
423 	code = form_feed;
424 	break;
425 
426     case (','):
427 	unary_delim = true;
428 	code = comma;
429 	break;
430 
431     case '.':
432 	unary_delim = false;
433 	code = period;
434 	break;
435 
436     case '-':
437     case '+':			/* check for -, +, --, ++ */
438 	code = (ps.last_u_d ? unary_op : binary_op);
439 	unary_delim = true;
440 
441 	if (*buf_ptr == token[0]) {
442 	    /* check for doubled character */
443 	    *e_token++ = *buf_ptr++;
444 	    /* buffer overflow will be checked at end of loop */
445 	    if (last_code == ident || last_code == rparen) {
446 		code = (ps.last_u_d ? unary_op : postop);
447 		/* check for following ++ or -- */
448 		unary_delim = false;
449 	    }
450 	}
451 	else if (*buf_ptr == '=')
452 	    /* check for operator += */
453 	    *e_token++ = *buf_ptr++;
454 	else if (*buf_ptr == '>') {
455 	    /* check for operator -> */
456 	    *e_token++ = *buf_ptr++;
457 	    if (!pointer_as_binop) {
458 		unary_delim = false;
459 		code = unary_op;
460 		ps.want_blank = false;
461 	    }
462 	}
463 	break;			/* buffer overflow will be checked at end of
464 				 * switch */
465 
466     case '=':
467 	if (ps.in_or_st)
468 	    ps.block_init = 1;
469 #ifdef undef
470 	if (chartype[*buf_ptr] == opchar) {	/* we have two char assignment */
471 	    e_token[-1] = *buf_ptr++;
472 	    if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
473 		*e_token++ = *buf_ptr++;
474 	    *e_token++ = '=';	/* Flip =+ to += */
475 	    *e_token = 0;
476 	}
477 #else
478 	if (*buf_ptr == '=') {/* == */
479 	    *e_token++ = '=';	/* Flip =+ to += */
480 	    buf_ptr++;
481 	    *e_token = 0;
482 	}
483 #endif
484 	code = binary_op;
485 	unary_delim = true;
486 	break;
487 	/* can drop thru!!! */
488 
489     case '>':
490     case '<':
491     case '!':			/* ops like <, <<, <=, !=, etc */
492 	if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
493 	    *e_token++ = *buf_ptr;
494 	    if (++buf_ptr >= buf_end)
495 		fill_buffer();
496 	}
497 	if (*buf_ptr == '=')
498 	    *e_token++ = *buf_ptr++;
499 	code = (ps.last_u_d ? unary_op : binary_op);
500 	unary_delim = true;
501 	break;
502 
503     default:
504 	if (token[0] == '/' && *buf_ptr == '*') {
505 	    /* it is start of comment */
506 	    *e_token++ = '*';
507 
508 	    if (++buf_ptr >= buf_end)
509 		fill_buffer();
510 
511 	    code = comment;
512 	    unary_delim = ps.last_u_d;
513 	    break;
514 	}
515 	while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
516 	    /*
517 	     * handle ||, &&, etc, and also things as in int *****i
518 	     */
519 	    *e_token++ = *buf_ptr;
520 	    if (++buf_ptr >= buf_end)
521 		fill_buffer();
522 	}
523 	code = (ps.last_u_d ? unary_op : binary_op);
524 	unary_delim = true;
525 
526 
527     }				/* end of switch */
528     if (code != newline) {
529 	l_struct = false;
530 	last_code = code;
531     }
532     if (buf_ptr >= buf_end)	/* check for input buffer empty */
533 	fill_buffer();
534     ps.last_u_d = unary_delim;
535     *e_token = '\0';		/* null terminate the token */
536     return (code);
537 }
538 
539 /*
540  * Add the given keyword to the keyword table, using val as the keyword type
541  */
542 addkey(key, val)
543     char       *key;
544 {
545     register struct templ *p = specials;
546     while (p->rwd)
547 	if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
548 	    return;
549 	else
550 	    p++;
551     if (p >= specials + sizeof specials / sizeof specials[0])
552 	return;			/* For now, table overflows are silently
553 				 * ignored */
554     p->rwd = key;
555     p->rwcode = val;
556     p[1].rwd = 0;
557     p[1].rwcode = 0;
558     return;
559 }
560