/* xref: /illumos-gate/usr/src/contrib/ast/src/lib/libpp/ppfsm.c (revision b30d193948be5a7794d7ae3ba0ed9c2f72c88e0f) */
1 /***********************************************************************
2 *                                                                      *
3 *               This software is part of the ast package               *
4 *          Copyright (c) 1986-2011 AT&T Intellectual Property          *
5 *                      and is licensed under the                       *
6 *                 Eclipse Public License, Version 1.0                  *
7 *                    by AT&T Intellectual Property                     *
8 *                                                                      *
9 *                A copy of the License is available at                 *
10 *          http://www.eclipse.org/org/documents/epl-v10.html           *
11 *         (with md5 checksum b35adb5213ca9657e911e9befb180842)         *
12 *                                                                      *
13 *              Information and Software Systems Research               *
14 *                            AT&T Research                             *
15 *                           Florham Park NJ                            *
16 *                                                                      *
17 *                 Glenn Fowler <gsf@research.att.com>                  *
18 *                                                                      *
19 ***********************************************************************/
20 #pragma prototyped
/*
 * Glenn Fowler
 * AT&T Research
 *
 * preprocessor and proto lexical analyzer fsm
 * define PROTOMAIN for standalone proto
 */
28 
29 #include "pplib.h"
30 #include "ppfsm.h"
31 
/*
 * lexical FSM encoding
 * derived from a standalone ansi cpp by Dennis Ritchie
 * modified for libpp by Glenn Fowler
 *
 *   fsm[] is initialized from fsminit[].  The encoding is blown out into
 *   fsm[] for time efficiency.  When in state `state' and one of the
 *   characters in ch arrives, enter nextstate.  States >= TERMINAL are
 *   either final, or at least require special action.  In fsminit[] there
 *   is a line for each <state,charset,nextstate>.  Early entries are
 *   overwritten by later ones.  C_XXX is the universal set and should
 *   always be first.  Some of the fsminit[] entries are templates for
 *   groups of states.  The OP entries trigger the state copies.  States
 *   >= TERMINAL are represented in fsm[] as negative (complemented)
 *   values.  S_TOK and S_TOKB encode the resulting token type in the
 *   upper bits.  These actions differ in that S_TOKB has a lookahead char.
 *
 *   fsm[] has three start states:
 *
 *	PROTO	proto (ANSI -> K&R,C++,ANSI)
 *	QUICK	standalone ppcpp()
 *	TOKEN	tokenizing pplex()
 *
 *   If the next state remains the same then the fsm[] transition value is 0.
 *   MAX+1 is a power of 2 so that fsm[state][EOF==MAX+1] actually accesses
 *   fsm[state+1][0] which is ~S_EOB for all states.  This preserves the
 *   power of 2 fsm[] row size for efficient array indexing.  Thanks to
 *   D. G. Korn for the last two observations.  The pseudo non-terminal state
 *   fsm[TERMINAL][state+1] is used to differentiate EOB from EOF.
 *
 *   The bit layout is:
 *
 *	TERM	arg	SPLICE	next
 *	15	14-8	7	6-0
 */
67 
/*
 * NOTE: these must be `control' characters for all native codesets
 *       currently ok for {ascii,ebcdic1,ebcdic2,ebcdic3}
 */
72 
/*
 * character class codes for fsminit[].ch[]
 * ppfsm(FSM_INIT) expands each code into the matching character set
 */

#define C_DEC		001		/* each char in dec[]		*/
#define C_EOF		002		/* sets fsm[TERMINAL][state+1]	*/
#define C_HEX		003		/* each char in hex[]		*/
#define C_LET		021		/* each char in let[]		*/
#define C_OCT		022		/* each char in oct[]		*/
#define C_XXX		023		/* every char 0..MAX		*/

/*
 * fsminit[] rows with state==OP are operations selected by nextstate
 */

#define OP		(-1)
#define END		0		/* end of fsminit[]		*/
#define COPY		1		/* replicate a template state	*/

/*
 * copy the transitions of state f to state t
 * MAX+1 entries are copied starting at column 1, so the last one is
 * column MAX+1, i.e. column 0 of the following row
 * the fsm[TERMINAL][state+1] entry is copied too
 */

#define copy(t,f)	(memcpy(&fsm[(t)][1],&fsm[(f)][1],(MAX+1)*sizeof(short)),fsm[TERMINAL][(t)+1]=fsm[TERMINAL][(f)+1])
85 
/*
 * one fsminit[] row: <state,charset,nextstate>
 * ch[] holds up to 4 chars or C_* class codes; a 0 entry ends the list
 * early, and a full list of 4 has no terminator
 * when state==OP the row is an operation: nextstate selects it and
 * ch[] carries its operands
 */

struct fsminit				/* fsm initialization row	*/
{
	int		state;		/* if in this state		*/
	unsigned char	ch[4];		/* and see one of these		*/
	int		nextstate;	/* enter this state if <TERMINAL*/
};
92 
93 static struct fsminit	fsminit[] =
94 {
95 	/* proto start state */
96 	{	PROTO,	{ C_XXX },		S_CHR,			},
97 	{	PROTO,	{ C_EOF },		S_EOF,			},
98 	{	PROTO,	{ C_DEC },		BAD1,			},
99 	{	PROTO,	{ '.' },		DOT,			},
100 	{	PROTO,	{ C_LET },		NID,			},
101 	{	PROTO,	{ 'L' },		LIT,			},
102 	{	PROTO,	{ 'd', 'e', 'f', 'i' },	RES1,			},
103 	{	PROTO,	{ 'r', 's', 't', 'v' },	RES1,			},
104 	{	PROTO,	{ 'w', 'N' },		RES1,			},
105 	{	PROTO,	{ '"', '\'' },		S_LITBEG,		},
106 	{	PROTO,	{ '/' },		COM1,			},
107 	{	PROTO,	{ '\n' },		S_NL,			},
108 	{	PROTO,	{ ' ','\t','\f','\v' },	WS1,			},
109 
110 /* proto {do,else,extern,for,if,inline,return,static,typedef,va_start,void,while,NoN} */
111 	{	RES1,	{ C_XXX },		S_MACRO,		},
112 	{	RES1,	{ C_LET, C_DEC },	NID,			},
113 	{	RES1,	{ 'a' },		RES1a,			},
114 	{	RES1,	{ 'e' },		RES1e,			},
115 	{	RES1,	{ 'f' },		RES1f,			},
116 	{	RES1,	{ 'h' },		RES1h,			},
117 	{	RES1,	{ 'l' },		RES1l,			},
118 	{	RES1,	{ 'n' },		RES1n,			},
119 	{	RES1,	{ 'o' },		RES1o,			},
120 	{	RES1,	{ 't' },		RES1t,			},
121 	{	RES1,	{ 'x' },		RES1x,			},
122 	{	RES1,	{ 'y' },		RES1y,			},
123 
124 	/* proto reserved {va_start} */
125 	{	RES1a,	{ C_XXX },		S_RESERVED,		},
126 	{	RES1a,	{ C_LET, C_DEC },	NID,			},
127 	{	RES1a,	{ '_','s','t','a' },	RES1a,			},
128 	{	RES1a,	{ 'r' },		RES1a,			},
129 
130 	/* proto reserved {return} */
131 	{	RES1e,	{ C_XXX },		S_RESERVED,		},
132 	{	RES1e,	{ C_LET, C_DEC },	NID,			},
133 	{	RES1e,	{ 't','u','r','n' },	RES1e,			},
134 
135 	/* proto reserved {if} */
136 	{	RES1f,	{ C_XXX },		S_RESERVED,		},
137 	{	RES1f,	{ C_LET, C_DEC },	NID,			},
138 
139 	/* proto reserved {while} */
140 	{	RES1h,	{ C_XXX },		S_RESERVED,		},
141 	{	RES1h,	{ C_LET, C_DEC },	NID,			},
142 	{	RES1h,	{ 'i','l','e' },	RES1h,			},
143 
144 	/* proto reserved {else} */
145 	{	RES1l,	{ C_XXX },		S_RESERVED,		},
146 	{	RES1l,	{ C_LET, C_DEC },	NID,			},
147 	{	RES1l,	{ 's','e' },		RES1l,			},
148 
149 	/* proto reserved {inline} */
150 	{	RES1n,	{ C_XXX },		S_RESERVED,		},
151 	{	RES1n,	{ C_LET, C_DEC },	NID,			},
152 	{	RES1n,	{ 'l','i','n','e' },	RES1n,			},
153 
154 	/* proto reserved {do,for,void} */
155 	{	RES1o,	{ C_XXX },		S_RESERVED,		},
156 	{	RES1o,	{ C_LET, C_DEC },	NID,			},
157 	{	RES1o,	{ 'r','i','d','N' },	RES1o,			},
158 
159 	/* proto reserved {static} */
160 	{	RES1t,	{ C_XXX },		S_RESERVED,		},
161 	{	RES1t,	{ C_LET, C_DEC },	NID,			},
162 	{	RES1t,	{ 'a','t','i','c' },	RES1t,			},
163 
164 	/* proto reserved {extern} */
165 	{	RES1x,	{ C_XXX },		S_RESERVED,		},
166 	{	RES1x,	{ C_LET, C_DEC },	NID,			},
167 	{	RES1x,	{ 't','e','r','n' },	RES1x,			},
168 
169 	/* proto reserved {typedef} */
170 	{	RES1y,	{ C_XXX },		S_RESERVED,		},
171 	{	RES1y,	{ C_LET, C_DEC },	NID,			},
172 	{	RES1y,	{ 'p','e','d','f' },	RES1y,			},
173 
174 	/* saw /, perhaps start of comment */
175 	{	COM1,	{ C_XXX },		S_CHRB,			},
176 	{	COM1,	{ '*' },		COM2,			},
177 #if PROTOMAIN
178 	{	COM1,	{ '/' },		COM5,			},
179 #endif
180 
181 	/* saw / *, start of comment */
182 	{	COM2,	{ C_XXX },		COM2,			},
183 	{	COM2,	{ '\n', C_EOF },	S_COMMENT,		},
184 	{	COM2,	{ '/' },		COM4,			},
185 	{	COM2,	{ '*' },		COM3,			},
186 	{	COM2,	{ '#', ';', ')' },	QUAL(COM2),		},
187 
188 	/* saw the * possibly ending a comment */
189 	{	COM3,	{ C_XXX },		COM2,			},
190 	{	COM3,	{ '\n', C_EOF },	S_COMMENT,		},
191 	{	COM3,	{ '#', ';', ')' },	QUAL(COM2),		},
192 	{	COM3,	{ '*' },		COM3,			},
193 	{	COM3,	{ '/' },		S_COMMENT,		},
194 
195 	/* saw / in / * comment, possible malformed nest */
196 	{	COM4,	{ C_XXX },		COM2,			},
197 	{	COM4,	{ '*', '\n', C_EOF },	S_COMMENT,		},
198 	{	COM4,	{ '/' },		COM4,			},
199 
200 	/* saw / /, start of comment */
201 	{	COM5,	{ C_XXX },		COM5,			},
202 	{	COM5,	{ '\n', C_EOF },	S_COMMENT,		},
203 	{	COM5,	{ '/' },		COM6,			},
204 	{	COM5,	{ '*' },		COM7,			},
205 
206 	/* saw / in / / comment, possible malformed nest */
207 	{	COM6,	{ C_XXX },		COM5,			},
208 	{	COM6,	{ '*', '\n', C_EOF },	S_COMMENT,		},
209 	{	COM6,	{ '/' },		COM6,			},
210 
211 	/* saw * in / /, possible malformed nest */
212 	{	COM7,	{ C_XXX },		COM5,			},
213 	{	COM7,	{ '\n', C_EOF },	S_COMMENT,		},
214 	{	COM7,	{ '*' },		COM7,			},
215 	{	COM7,	{ '/' },		S_COMMENT,		},
216 
217 	/* normal identifier -- always a macro candidate */
218 	{	NID,	{ C_XXX },		S_MACRO,		},
219 	{	NID,	{ C_LET, C_DEC },	NID,			},
220 
221 	/* saw ., operator or dbl constant */
222 	{	DOT,	{ C_XXX },		S_CHRB,			},
223 	{	DOT,	{ '.' },		DOT2,			},
224 	{	DOT,	{ C_DEC },		BAD1,			},
225 
226 	/* saw .., possible ... */
227 	{	DOT2,	{ C_XXX },		BACK(T_INVALID),	},
228 	{	DOT2,	{ '.' },		KEEP(T_VARIADIC),	},
229 
230 	/* saw L (possible start of normal wide literal) */
231 	{	LIT,	{ C_XXX },		S_MACRO,		},
232 	{	LIT,	{ C_LET, C_DEC },	NID,			},
233 	{	LIT,	{ '"', '\'' },		QUAL(LIT1),		},
234 
235 	/* saw " or ' beginning literal */
236 	{	LIT1,	{ C_XXX },		LIT1,			},
237 	{	LIT1,	{ '"', '\'' },		S_LITEND,		},
238 	{	LIT1,	{ '\n', C_EOF },	S_LITEND,		},
239 	{	LIT1,	{ '\\' },		LIT2,			},
240 
241 	/* saw \ in literal */
242 	{	LIT2,	{ C_XXX },		S_LITESC,		},
243 	{	LIT2,	{ '\n', C_EOF },	S_LITEND,		},
244 
245 	/* eat malformed numeric constant */
246 	{	BAD1,	{ C_XXX },		BACK(T_INVALID),	},
247 	{	BAD1,	{ C_LET, C_DEC, '.' },	BAD1,			},
248 	{	BAD1,	{ 'e', 'E' },		BAD2,			},
249 
250 	/* eat malformed numeric fraction|exponent */
251 	{	BAD2,	{ C_XXX },		BACK(T_INVALID),	},
252 	{	BAD2,	{ C_LET, C_DEC, '.' },	BAD1,			},
253 	{	BAD2,	{ '+', '-' },		BAD1,			},
254 
255 	/* saw white space, eat it up */
256 	{	WS1,	{ C_XXX },		S_WS,			},
257 	{	WS1,	{ ' ', '\t' },		WS1,			},
258 	{	WS1,	{ '\f', '\v' },		S_VS,			},
259 
260 #if !PROTOMAIN
261 
262 	/* quick template */
263 	{	QUICK,	{ C_XXX },		QTOK,			},
264 	{	QUICK,	{ C_EOF, MARK },	S_CHRB,			},
265 	{	QUICK,	{ C_LET, C_DEC },	QID,			},
266 	{	QUICK,	{ 'L' },		LIT0,			},
267 	{	QUICK,	{ '"', '\'' },		S_LITBEG,		},
268 	{	QUICK,	{ '/' },		S_CHRB,			},
269 	{	QUICK,	{ '*' },		QCOM,			},
270 	{	QUICK,	{ '#' },		SHARP1,			},
271 	{	QUICK,	{ '\n' },		S_NL,			},
272 	{	QUICK,	{ '\f', '\v' },		S_VS,			},
273 
274 	/* copy QUICK to QUICK+1 through MAC0+1 */
275 	{	OP,	{QUICK,QUICK+1,MAC0+1},	COPY,			},
276 
277 	/* quick start state */
278 	{	QUICK,	{ C_EOF },		S_EOF,			},
279 	{	QUICK,	{ C_DEC },		QNUM,			},
280 	{	QUICK,	{ MARK },		QTOK,			},
281 	{	QUICK,	{ '/' },		COM1,			},
282 	{	QUICK,	{ ' ', '\t' },		QUICK,			},
283 
284 	/* grab non-macro tokens */
285 	{	QTOK,	{ C_DEC },		QNUM,			},
286 
287 	/* grab numeric and invalid tokens */
288 	{	QNUM,	{ C_LET, C_DEC, '.' },	QNUM,			},
289 	{	QNUM,	{ 'e', 'E' },		QEXP,			},
290 
291 	/* grab exponent token */
292 	{	QEXP,	{ C_LET, C_DEC, '.' },	QNUM,			},
293 	{	QEXP,	{ '+', '-' },		QNUM,			},
294 
295 	/* saw *, grab possible bad comment terminator */
296 	{	QCOM,	{ C_DEC },		QNUM,			},
297 	{	QCOM,	{ '/' },		S_COMMENT,		},
298 
299 	/* saw L (possible start of wide string or first macro char) */
300 	{	MAC0,	{ 'L' },		QID,			},
301 	{	MAC0,	{ '"', '\'' },		QUAL(LIT1),		},
302 
303 	/* macro candidate template */
304 	{	MAC0+1,	{ 'L' },		QID,			},
305 
306 	/* copy MAC0+1 to MAC0+2 through MACN */
307 	{	OP,	{MAC0+1,MAC0+2,MACN},	COPY			},
308 
309 	/* saw L (possible start of wide string or macro L) */
310 	{	HIT0,	{ C_XXX },		S_MACRO,		},
311 	{	HIT0,	{ C_LET, C_DEC },	QID,			},
312 	{	HIT0,	{ '"', '\'' },		QUAL(LIT1),		},
313 
314 	/* macro hit template */
315 	{	HIT0+1,	{ C_XXX },		S_MACRO,		},
316 	{	HIT0+1,	{ C_LET, C_DEC },	QID,			},
317 
318 	/* copy HIT0+1 to HIT0+2 through HITN */
319 	{	OP,	{HIT0+1,HIT0+2,HITN},	COPY			},
320 
321 	/* saw L (possible start of wide literal) */
322 	{	LIT0,	{ C_XXX },		S_MACRO,		},
323 	{	LIT0,	{ C_LET, C_DEC },	QID,			},
324 	{	LIT0,	{ '"', '\'' },		QUAL(LIT1),		},
325 
326 	/* (!PROTOMAIN COM1) saw /, perhaps start of comment or /= */
327 	{	COM1,	{ '=' },		KEEP(T_DIVEQ),		},
328 
329 	/* normal start state */
330 	{	TOKEN,	{ C_XXX },		S_HUH,			},
331 	{	TOKEN,	{ C_EOF },		S_EOF,			},
332 	{	TOKEN,	{ C_DEC },		DEC1,			},
333 	{	TOKEN,	{ '0' },		OCT1,			},
334 	{	TOKEN,	{ '.' },		DOT1,			},
335 	{	TOKEN,	{ C_LET },		NID,			},
336 	{	TOKEN,	{ 'L' },		LIT,			},
337 	{	TOKEN,	{ '"', '\'', '<' },	S_LITBEG,		},
338 	{	TOKEN,	{ '/' },		COM1,			},
339 	{	TOKEN,	{ '\n' },		S_NL,			},
340 	{	TOKEN,	{ ' ', '\t' },		WS1,			},
341 	{	TOKEN,	{ '\f', '\v' },		S_VS,			},
342 	{	TOKEN,	{ '#' },		SHARP1,			},
343 	{	TOKEN,	{ ':' },		COLON1,			},
344 	{	TOKEN,	{ '%' },		PCT1,			},
345 	{	TOKEN,	{ '&' },		AND1,			},
346 	{	TOKEN,	{ '*' },		STAR1,			},
347 	{	TOKEN,	{ '+' },		PLUS1,			},
348 	{	TOKEN,	{ '-' },		MINUS1,			},
349 	{	TOKEN,	{ '=' },		EQ1,			},
350 	{	TOKEN,	{ '!' },		NOT1,			},
351 	{	TOKEN,	{ '>' },		GT1,			},
352 	{	TOKEN,	{ '^' },		CIRC1,			},
353 	{	TOKEN,	{ '|' },		OR1,			},
354 	{	TOKEN,	{ '(', ')', '[', ']' },	S_CHR,			},
355 	{	TOKEN,	{ '{', '}', ',', ';' },	S_CHR,			},
356 	{	TOKEN,	{ '~', '?' },		S_CHR,			},
357 
358 	/* saw 0, possible oct|hex|dec|dbl constant */
359 	{	OCT1,	{ C_XXX },		BACK(T_DECIMAL),	},
360 	{	OCT1,	{ C_LET, C_DEC },	BAD1,			},
361 	{	OCT1,	{ C_OCT },		OCT2,			},
362 	{	OCT1,	{ 'e', 'E' },		DBL2,			},
363 	{	OCT1,	{ 'l', 'L', 'u', 'U' },	QUAL(DEC2),		},
364 	{	OCT1,	{ 'x', 'X' },		HEX1,			},
365 	{	OCT1,	{ '.' },		DBL1,			},
366 
367 	/* saw 0<oct>, oct constant */
368 	{	OCT2,	{ C_XXX },		BACK(T_OCTAL),		},
369 	{	OCT2,	{ C_LET, C_DEC },	BAD1,			},
370 	{	OCT2,	{ C_OCT },		OCT2,			},
371 	{	OCT2,	{ 'e', 'E' },		DBL2,			},
372 	{	OCT2,	{ 'l', 'L', 'u', 'U' },	QUAL(OCT3),		},
373 	{	OCT2,	{ '.' },		DBL1,			},
374 
375 	/* oct constant qualifier */
376 	{	OCT3,	{ C_XXX },		BACK(T_OCTAL),		},
377 	{	OCT3,	{ C_LET, C_DEC, '.' },	BAD1,			},
378 	{	OCT3,	{ 'l', 'L', 'u', 'U' },	QUAL(OCT3),		},
379 
380 	/* saw 0 [xX], hex constant */
381 	{	HEX1,	{ C_XXX },		BACK(T_HEXADECIMAL),	},
382 	{	HEX1,	{ C_LET },		BAD1,			},
383 	{	HEX1,	{ C_HEX },		HEX1,			},
384 	{	HEX1,	{ 'e', 'E' },		HEX3,			},
385 	{	HEX1,	{ 'l', 'L', 'u', 'U' },	QUAL(HEX2),		},
386 	{	HEX1,	{ '.' },		HEX4,			},
387 	{	HEX1,	{ 'p', 'P' },		HEX5,			},
388 
389 	/* hex constant qualifier */
390 	{	HEX2,	{ C_XXX },		BACK(T_HEXADECIMAL),	},
391 	{	HEX2,	{ C_LET, C_DEC, '.' },	BAD1,			},
392 	{	HEX2,	{ 'l', 'L', 'u', 'U' },	QUAL(HEX2),		},
393 
394 	/* hex [eE][-+] botch */
395 	{	HEX3,	{ C_XXX },		BACK(T_HEXADECIMAL),	},
396 	{	HEX3,	{ C_LET, '.', '-', '+'},BAD1,			},
397 	{	HEX3,	{ C_HEX },		HEX1,			},
398 	{	HEX3,	{ 'e', 'E' },		HEX3,			},
399 	{	HEX3,	{ 'l', 'L', 'u', 'U' },	QUAL(HEX2),		},
400 
401 	/* hex dbl fraction */
402 	{	HEX4,	{ C_XXX },		BACK(T_HEXDOUBLE),	},
403 	{	HEX4,	{ C_LET, '.' },		BAD1,			},
404 	{	HEX4,	{ C_HEX },		HEX4,			},
405 	{	HEX4,	{ 'p', 'P' },		HEX5,			},
406 	{	HEX4,	{ 'f', 'F', 'l', 'L' },	QUAL(HEX8),		},
407 
408 	/* optional hex dbl exponent sign */
409 	{	HEX5,	{ C_XXX },		BACK(T_INVALID),	},
410 	{	HEX5,	{ C_LET, '.' },		BAD1,			},
411 	{	HEX5,	{ '+', '-' },		HEX6,			},
412 	{	HEX5,	{ C_DEC },		HEX7,			},
413 
414 	/* mandatory hex dbl exponent first digit */
415 	{	HEX6,	{ C_XXX },		BACK(T_INVALID),	},
416 	{	HEX6,	{ C_LET, '.' },		BAD1,			},
417 	{	HEX6,	{ C_DEC },		HEX7,			},
418 
419 	/* hex dbl exponent digits */
420 	{	HEX7,	{ C_XXX },		BACK(T_HEXDOUBLE),	},
421 	{	HEX7,	{ C_LET, '.' },		BAD1,			},
422 	{	HEX7,	{ C_DEC },		HEX7,			},
423 	{	HEX7,	{ 'f', 'F', 'l', 'L' },	QUAL(HEX8),		},
424 
425 	/* hex dbl constant qualifier */
426 	{	HEX8,	{ C_XXX },		BACK(T_HEXDOUBLE),	},
427 	{	HEX8,	{ C_LET, '.' },		BAD1,			},
428 	{	HEX8,	{ 'f', 'F', 'l', 'L' },	QUAL(HEX8),		},
429 
430 	/* saw <dec>, dec constant */
431 	{	DEC1,	{ C_XXX },		BACK(T_DECIMAL),	},
432 	{	DEC1,	{ C_LET },		BAD1,			},
433 	{	DEC1,	{ C_DEC },		DEC1,			},
434 	{	DEC1,	{ 'e', 'E' },		DBL2,			},
435 	{	DEC1,	{ 'l', 'L', 'u', 'U' },	QUAL(DEC2),		},
436 	{	DEC1,	{ '.' },		DBL1,			},
437 
438 	/* dec constant qualifier */
439 	{	DEC2,	{ C_XXX },		BACK(T_DECIMAL),	},
440 	{	DEC2,	{ C_LET, C_DEC },	BAD1,			},
441 	{	DEC2,	{ 'l', 'L', 'u', 'U' },	QUAL(DEC2),		},
442 
443 	/* saw ., operator or dbl constant */
444 	{	DOT1,	{ C_XXX },		S_CHRB,			},
445 	{	DOT1,	{ '.' },		DOT2,			},
446 	{	DOT1,	{ C_DEC },		DBL1,			},
447 
448 	/* dbl fraction */
449 	{	DBL1,	{ C_XXX },		BACK(T_DOUBLE),		},
450 	{	DBL1,	{ C_LET, '.' },		BAD1,			},
451 	{	DBL1,	{ C_DEC },		DBL1,			},
452 	{	DBL1,	{ 'e', 'E' },		DBL2,			},
453 	{	DBL1,	{ 'f', 'F', 'l', 'L' },	QUAL(DBL5),		},
454 
455 	/* optional dbl exponent sign */
456 	{	DBL2,	{ C_XXX },		BACK(T_INVALID),	},
457 	{	DBL2,	{ C_LET, '.' },		BAD1,			},
458 	{	DBL2,	{ '+', '-' },		DBL3,			},
459 	{	DBL2,	{ C_DEC },		DBL4,			},
460 
461 	/* mandatory dbl exponent first digit */
462 	{	DBL3,	{ C_XXX },		BACK(T_INVALID),	},
463 	{	DBL3,	{ C_LET, '.' },		BAD1,			},
464 	{	DBL3,	{ C_DEC },		DBL4,			},
465 
466 	/* dbl exponent digits */
467 	{	DBL4,	{ C_XXX },		BACK(T_DOUBLE),		},
468 	{	DBL4,	{ C_LET, '.' },		BAD1,			},
469 	{	DBL4,	{ C_DEC },		DBL4,			},
470 	{	DBL4,	{ 'f', 'F', 'l', 'L' },	QUAL(DBL5),		},
471 
472 	/* dbl constant qualifier */
473 	{	DBL5,	{ C_XXX },		BACK(T_DOUBLE),		},
474 	{	DBL5,	{ C_LET, '.' },		BAD1,			},
475 	{	DBL5,	{ 'f', 'F', 'l', 'L' },	QUAL(DBL5),		},
476 
477 	/* saw < starting include header */
478 	{	HDR1,	{ C_XXX },		HDR1,			},
479 	{	HDR1,	{ '>', '\n', C_EOF },	S_LITEND,		},
480 
481 	/* saw <binop><space> expecting = */
482 	{	BIN1,	{ C_XXX },		S_HUH,			},
483 	{	BIN1,	{ ' ', '\t' },		BIN1,			},
484 
485 	/* 2-char ops */
486 
487 	{	SHARP1,	{ C_XXX },		S_SHARP,		},
488 
489 	{	PCT1,	{ C_XXX },		S_CHRB,			},
490 	{	PCT1,	{ '=' },		KEEP(T_MODEQ),		},
491 
492 	{	AND1,	{ C_XXX },		S_CHRB,			},
493 	{	AND1,	{ '=' },		KEEP(T_ANDEQ),		},
494 	{	AND1,	{ '&' },		KEEP(T_ANDAND),		},
495 
496 	{	STAR1,	{ C_XXX },		S_CHRB,			},
497 	{	STAR1,	{ '=' },		KEEP(T_MPYEQ),		},
498 	{	STAR1,	{ '/' },		S_COMMENT,		},
499 
500 	{	PLUS1,	{ C_XXX },		S_CHRB,			},
501 	{	PLUS1,	{ '=' },		KEEP(T_ADDEQ),		},
502 	{	PLUS1,	{ '+' },		KEEP(T_ADDADD),		},
503 
504 	{	MINUS1,	{ C_XXX },		S_CHRB,			},
505 	{	MINUS1,	{ '=' },		KEEP(T_SUBEQ),		},
506 	{	MINUS1,	{ '-' },		KEEP(T_SUBSUB),		},
507 	{	MINUS1,	{ '>' },		KEEP(T_PTRMEM),		},
508 
509 	{	COLON1,	{ C_XXX },		S_CHRB,			},
510 	{	COLON1,	{ '=', '>' },		S_HUH,			},
511 
512 	{	LT1,	{ C_XXX },		S_CHRB,			},
513 	{	LT1,	{ '=' },		KEEP(T_LE),		},
514 	{	LT1,	{ '<' },		LSH1,			},
515 
516 	{	EQ1,	{ C_XXX },		S_CHRB,			},
517 	{	EQ1,	{ '=' },		KEEP(T_EQ),		},
518 
519 	{	NOT1,	{ C_XXX },		S_CHRB,			},
520 	{	NOT1,	{ '=' },		KEEP(T_NE),		},
521 
522 	{	GT1,	{ C_XXX },		S_CHRB,			},
523 	{	GT1,	{ '=' },		KEEP(T_GE),		},
524 	{	GT1,	{ '>' },		RSH1,			},
525 
526 	{	CIRC1,	{ C_XXX },		S_CHRB,			},
527 	{	CIRC1,	{ '=' },		KEEP(T_XOREQ),		},
528 
529 	{	OR1,	{ C_XXX },		S_CHRB,			},
530 	{	OR1,	{ '=' },		KEEP(T_OREQ),		},
531 	{	OR1,	{ '|' },		KEEP(T_OROR),		},
532 
533 	/* 3-char ops */
534 
535 	{	ARROW1,	{ C_XXX },		BACK(T_PTRMEM),		},
536 	{	ARROW1,	{ '*' },		KEEP(T_PTRMEMREF),	},
537 
538 	{	LSH1,	{ C_XXX },		BACK(T_LSHIFT),		},
539 	{	LSH1,	{ '=' },		KEEP(T_LSHIFTEQ),	},
540 
541 	{	RSH1,	{ C_XXX },		BACK(T_RSHIFT),		},
542 	{	RSH1,	{ '=' },		KEEP(T_RSHIFTEQ),	},
543 
544 #endif
545 
546 	/* end */
547 	{	OP,	{ 0 },			END,			}
548 };
549 
550 short		fsm[TERMINAL+1][MAX+1];
551 
552 char		trigraph[MAX+1];
553 
554 #if PROTOMAIN
555 static char	spl[] = { '\\', '\r', 0 };
556 static char	aln[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz_$@";
557 #else
558 static char	spl[] = { MARK, '?', '\\', '\r', CC_sub, 0 };
559 static char	aln[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz_";
560 #endif
561 static char*	let = &aln[10];
562 static char	hex[] = "fedcbaFEDCBA9876543210";
563 static char*	dec = &hex[12];
564 static char*	oct = &hex[14];
565 
566 /*
567  * runtime FSM modifications
568  * ppfsm(FSM_INIT,0) must be called first
569  */
570 
571 void
ppfsm(int op,register char * s)572 ppfsm(int op, register char* s)
573 {
574 	register int			c;
575 	register int			n;
576 	register int			i;
577 	register short*			rp;
578 	register struct fsminit*	fp;
579 #if !PROTOMAIN
580 	char*				t;
581 	int				x;
582 #endif
583 
584 	switch (op)
585 	{
586 
587 #if !PROTOMAIN
588 
589 	case FSM_IDADD:
590 		while (c = *s++)
591 			if (!ppisid(c))
592 			{
593 				if (fsm[TOKEN][c] == ~S_HUH)
594 				{
595 					setid(c);
596 					for (i = 0; i < TERMINAL; i++)
597 						fsm[i][c] = IDSTATE(fsm[i]['_']);
598 				}
599 				else error(2, "%c: cannot add to identifier set", c);
600 			}
601 		break;
602 
603 	case FSM_IDDEL:
604 		while (c = *s++)
605 			if (ppisid(c))
606 			{
607 				clrid(c);
608 				for (i = 0; i < TERMINAL; i++)
609 					fsm[i][c] = ~S_HUH;
610 			}
611 		break;
612 
613 #endif
614 
615 	case FSM_INIT:
616 		for (fp = fsminit;; fp++)
617 		{
618 			if ((n = fp->nextstate) >= TERMINAL) n = ~n;
619 			if (fp->state == OP)
620 			{
621 #if !PROTOMAIN
622 				switch (n)
623 				{
624 				case COPY:
625 					c = fp->ch[0];
626 					n = fp->ch[2];
627 					for (i = fp->ch[1]; i <= n; i++)
628 						copy(i, c);
629 					continue;
630 				default:
631 					break;
632 				}
633 #endif
634 				break;
635 			}
636 			rp = fsm[fp->state];
637 			for (i = 0; i < sizeof(fp->ch) && (c = fp->ch[i]); i++)
638 			{
639 				switch (c)
640 				{
641 				case C_XXX:
642 					for (c = 0; c <= MAX; c++)
643 						rp[c] = n;
644 					/*FALLTHROUGH*/
645 
646 				case C_EOF:
647 					fsm[TERMINAL][fp->state+1] = n < 0 ? ~n : n;
648 					continue;
649 
650 				case C_LET:
651 					s = let;
652 					break;
653 
654 				case C_HEX:
655 					s = hex;
656 					break;
657 
658 				case C_DEC:
659 					s = dec;
660 					break;
661 
662 				case C_OCT:
663 					s = oct;
664 					break;
665 
666 				default:
667 					rp[c] = n;
668 					continue;
669 				}
670 				while (c = *s++)
671 					rp[c] = n;
672 			}
673 		}
674 
675 		/*
676 		 * install splice special cases
677 		 * and same non-terminal transitions
678 		 */
679 
680 		for (i = 0; i < TERMINAL; i++)
681 		{
682 			rp = fsm[i];
683 			s = spl;
684 			while (c = *s++)
685 				if (c != MARK || !INCOMMENT(rp))
686 				{
687 					if (rp[c] >= 0) rp[c] = ~rp[c];
688 					rp[c] &= ~SPLICE;
689 				}
690 			rp[EOB] = ~S_EOB;
691 			for (c = 0; c <= MAX; c++)
692 				if (rp[c] == i)
693 					rp[c] = 0;
694 		}
695 		fsm[TERMINAL][0] = ~S_EOB;
696 
697 #if !PROTOMAIN
698 
699 		/*
700 		 * default character types
701 		 */
702 
703 		s = let;
704 		while (c = *s++)
705 			setid(c);
706 		s = dec;
707 		while (c = *s++)
708 			setdig(c);
709 		s = spl;
710 		do setsplice(c = *s++); while (c);
711 
712 		/*
713 		 * trigraph map
714 		 */
715 
716 		trigraph['='] = '#';
717 		trigraph['('] = '[';
718 		trigraph['/'] = '\\';
719 		trigraph[')'] = ']';
720 		trigraph['\''] = '^';
721 		trigraph['<'] = '{';
722 		trigraph['!'] = '|';
723 		trigraph['>'] = '}';
724 		trigraph['-'] = '~';
725 #endif
726 		break;
727 
728 #if !PROTOMAIN
729 
730 	case FSM_PLUSPLUS:
731 		if (pp.option & PLUSPLUS)
732 		{
733 			fsm[COLON1][':'] = ~KEEP(T_SCOPE);
734 			fsm[DOT1]['*'] = ~KEEP(T_DOTREF);
735 			fsm[MINUS1]['>'] = ARROW1;
736 			fsm[COM1]['/'] = COM5;
737 			t = "%<:";
738 			for (i = 0; i < TERMINAL; i++)
739 			{
740 				rp = fsm[i];
741 				if (!INCOMMENT(rp) && !INQUOTE(rp))
742 				{
743 					s = t;
744 					while (c = *s++)
745 					{
746 						if (rp[c] > 0) rp[c] = ~rp[c];
747 						else if (!rp[c]) rp[c] = ~i;
748 						rp[c] &= ~SPLICE;
749 					}
750 				}
751 			}
752 			s = t;
753 			while (c = *s++) setsplice(c);
754 		}
755 		else
756 		{
757 			fsm[COLON1][':'] = ~S_CHRB;
758 			fsm[DOT1]['*'] = ~S_CHRB;
759 			fsm[MINUS1]['>'] = ~KEEP(T_PTRMEM);
760 			fsm[COM1]['/'] = (pp.option & PLUSCOMMENT) ? COM5 : ~S_CHRB;
761 		}
762 		break;
763 
764 #if COMPATIBLE
765 
766 	case FSM_COMPATIBILITY:
767 		if (pp.state & COMPATIBILITY)
768 		{
769 			fsm[HEX1]['e'] = HEX1;
770 			fsm[HEX1]['E'] = HEX1;
771 			fsm[QNUM]['e'] = QNUM;
772 			fsm[QNUM]['E'] = QNUM;
773 			fsm[QNUM]['u'] = ~QUAL(QNUM);
774 			fsm[QNUM]['U'] = ~QUAL(QNUM);
775 		}
776 		else
777 		{
778 			fsm[HEX1]['e'] = HEX3;
779 			fsm[HEX1]['E'] = HEX3;
780 			fsm[QNUM]['e'] = QEXP;
781 			fsm[QNUM]['E'] = QEXP;
782 			fsm[QNUM]['u'] = QNUM;
783 			fsm[QNUM]['U'] = QNUM;
784 		}
785 		break;
786 
787 #endif
788 
789 	case FSM_QUOTADD:
790 		while (c = *s++)
791 			if (fsm[TOKEN][c] == ~S_HUH)
792 				for (i = 0; i < TERMINAL; i++)
793 					fsm[i][c] = fsm[i]['"'];
794 			else error(2, "%c: cannot add to quote set", c);
795 		break;
796 
797 	case FSM_QUOTDEL:
798 		while (c = *s++)
799 			if (c != '"' && fsm[TOKEN][c] == fsm[TOKEN]['"'])
800 				for (i = 0; i < TERMINAL; i++)
801 					fsm[i][c] = fsm[i]['_'];
802 		break;
803 
804 	case FSM_OPSPACE:
805 		n = s ? BIN1 : ~S_CHRB;
806 		fsm[COM1][' '] = fsm[COM1]['\t'] = n;
807 		fsm[AND1][' '] = fsm[AND1]['\t'] = n;
808 		fsm[STAR1][' '] = fsm[STAR1]['\t'] = n;
809 		fsm[PCT1][' '] = fsm[PCT1]['\t'] = n;
810 		fsm[PLUS1][' '] = fsm[PLUS1]['\t'] = n;
811 		fsm[MINUS1][' '] = fsm[MINUS1]['\t'] = n;
812 		fsm[CIRC1][' '] = fsm[CIRC1]['\t'] = n;
813 		fsm[OR1][' '] = fsm[OR1]['\t'] = n;
814 		fsm[LSH1][' '] = fsm[LSH1]['\t'] = s ? BIN1 : ~BACK(T_LSHIFT);
815 		fsm[RSH1][' '] = fsm[RSH1]['\t'] = s ? BIN1 : ~BACK(T_RSHIFT);
816 		break;
817 
818 	case FSM_MACRO:
819 		if (pp.truncate && strlen(s) >= pp.truncate)
820 		{
821 			x = s[pp.truncate];
822 			s[pp.truncate] = 0;
823 		}
824 		else x = -1;
825 		i = MAC0 + ((c = *s++) != 'L');
826 		if ((n = fsm[QUICK][c]) != (i + NMAC))
827 		{
828 			n = i;
829 			if (!*s) n += NMAC;
830 		}
831 		if (fsm[QUICK][c] != n)
832 			fsm[QUICK][c] = fsm[QCOM][c] = fsm[QTOK][c] = n;
833 		if (c = *s++)
834 		{
835 			for (;;)
836 			{
837 				if ((i = n) < HIT0)
838 				{
839 					if (n < MACN) n++;
840 					if (!*s)
841 					{
842 						n += NMAC;
843 						break;
844 					}
845 					if (fsm[i][c] < HIT0)
846 						fsm[i][c] = n;
847 					if (fsm[i + NMAC][c] < HIT0)
848 						fsm[i + NMAC][c] = n;
849 				}
850 				else
851 				{
852 					if (n < HITN) n++;
853 					if (!*s) break;
854 					if (fsm[i][c] < HIT0)
855 					{
856 						n -= NMAC;
857 						fsm[i][c] = n;
858 					}
859 				}
860 				c = *s++;
861 			}
862 			if (x >= 0)
863 			{
864 				*s = x;
865 				for (n = CHAR_MIN; n <= CHAR_MAX; n++)
866 					if (ppisidig(n))
867 						fsm[HITN][n] = HITN;
868 				n = HITN;
869 			}
870 			if (fsm[i][c] < n)
871 				fsm[i][c] = n;
872 			if (i < HIT0 && fsm[i + NMAC][c] < n)
873 				fsm[i + NMAC][c] = n;
874 		}
875 		break;
876 
877 #endif
878 
879 	}
880 }
881 
882 #if !PROTOMAIN
883 
884 /*
885  * file buffer refill
886  * c is current input char
887  */
888 
889 void
refill(register int c)890 refill(register int c)
891 {
892 	if (pp.in->flags & IN_eof)
893 	{
894 		pp.in->nextchr--;
895 		c = 0;
896 	}
897 	else
898 	{
899 		*((pp.in->nextchr = pp.in->buffer + PPBAKSIZ) - 1) = c;
900 		c =
901 #if PROTOTYPE
902 		(pp.in->flags & IN_prototype) ? pppread(pp.in->nextchr) :
903 #endif
904 		read(pp.in->fd, pp.in->nextchr, PPBUFSIZ);
905 	}
906 	if (c > 0)
907 	{
908 		if (pp.in->nextchr[c - 1] == '\n') pp.in->flags |= IN_newline;
909 		else pp.in->flags &= ~IN_newline;
910 #if PROTOTYPE
911 		if (!(pp.in->flags & IN_prototype))
912 #endif
913 		if (c < PPBUFSIZ && (pp.in->flags & IN_regular))
914 		{
915 			pp.in->flags |= IN_eof;
916 			close(pp.in->fd);
917 			pp.in->fd = -1;
918 		}
919 	}
920 	else
921 	{
922 		if (c < 0)
923 		{
924 			error(ERROR_SYSTEM|3, "read error");
925 			c = 0;
926 		}
927 		else if ((pp.in->flags ^ pp.in->prev->flags) & IN_c)
928 		{
929 			static char	ket[] = { 0, '}', '\n', 0 };
930 
931 			pp.in->flags ^= IN_c;
932 			pp.in->nextchr = ket + 1;
933 			c = 2;
934 		}
935 		pp.in->flags |= IN_eof;
936 	}
937 #if CHECKPOINT
938 	pp.in->buflen = c;
939 #endif
940 	pp.in->nextchr[c] = 0;
941 	debug((-7, "refill(\"%s\") = %d = \"%-.*s%s\"", error_info.file, c, (c > 32 ? 32 : c), pp.in->nextchr, c > 32 ? "..." : ""));
942 	if (pp.test & 0x0080)
943 		sfprintf(sfstderr, "===== refill(\"%s\") = %d =====\n%s\n===== eob(\"%s\") =====\n", error_info.file, c, pp.in->nextchr, error_info.file);
944 }
945 
946 #endif
947