xref: /freebsd/contrib/bc/include/lex.h (revision 2faf504d1ab821fe2b9df9d2afb49bb35e1334f4)
1 /*
2  * *****************************************************************************
3  *
4  * SPDX-License-Identifier: BSD-2-Clause
5  *
6  * Copyright (c) 2018-2021 Gavin D. Howard and contributors.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions are met:
10  *
11  * * Redistributions of source code must retain the above copyright notice, this
12  *   list of conditions and the following disclaimer.
13  *
14  * * Redistributions in binary form must reproduce the above copyright notice,
15  *   this list of conditions and the following disclaimer in the documentation
16  *   and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
22  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28  * POSSIBILITY OF SUCH DAMAGE.
29  *
30  * *****************************************************************************
31  *
32  * Definitions for bc's lexer.
33  *
34  */
35 
36 #ifndef BC_LEX_H
37 #define BC_LEX_H
38 
39 #include <stdbool.h>
40 #include <stddef.h>
41 
42 #include <status.h>
43 #include <vector.h>
44 #include <lang.h>
45 
46 // Two convenience macros for throwing errors in lex code. Both hand the error
47 // and the lexer's current line to bc_vm_handleError(); verr adds extra args.
48 #define bc_lex_err(l, e) (bc_vm_handleError((e), (l)->line))
49 #define bc_lex_verr(l, e, ...) (bc_vm_handleError((e), (l)->line, __VA_ARGS__))
50 
51 // BC_LEX_NEG_CHAR returns the char that marks a negative number for the
52 // current calculator: '-' for bc and '_' for dc.
53 //
54 // BC_LEX_LAST_NUM_CHAR returns the char that corresponds to the last valid
55 // char for numbers. In bc and dc, capital letters are part of numbers, to a
56 // point: bc goes up to 'Z', but dc only goes up to hex, so it stops at 'F'.
57 #if BC_ENABLED
58 
59 #if DC_ENABLED
60 #define BC_LEX_NEG_CHAR (BC_IS_BC ? '-' : '_')
61 #define BC_LEX_LAST_NUM_CHAR (BC_IS_BC ? 'Z' : 'F')
62 #else // DC_ENABLED
63 #define BC_LEX_NEG_CHAR ('-')
64 #define BC_LEX_LAST_NUM_CHAR ('Z')
65 #endif // DC_ENABLED
66 
67 #else // BC_ENABLED
68 
69 #define BC_LEX_NEG_CHAR ('_')
70 #define BC_LEX_LAST_NUM_CHAR ('F')
71 
72 #endif // BC_ENABLED
73 
74 /**
75  * Returns true if c is a valid number character. c is evaluated several times.
76  * @param c         The char to check; cast to unsigned char so isdigit() is safe.
77  * @param pt        If a decimal point has already been seen.
78  * @param int_only  True if the number is expected to be an int only, false if
79  *                  non-integers are allowed.
80  * @return          True if @a c is a valid number character.
81  */
82 #define BC_LEX_NUM_CHAR(c, pt, int_only) (isdigit((unsigned char) (c)) != 0 || \
83 	((c) >= 'A' && (c) <= BC_LEX_LAST_NUM_CHAR) ||                           \
84 	((c) == '.' && !(pt) && !(int_only)))
85 
86 /// An enum of lex token types.
87 typedef enum BcLexType {
88 
89 	/// End of file.
90 	BC_LEX_EOF,
91 
92 	/// Marker for invalid tokens, used by bc and dc for const data.
93 	BC_LEX_INVALID,
94 
95 #if BC_ENABLED
96 
97 	/// Increment operator.
98 	BC_LEX_OP_INC,
99 
100 	/// Decrement operator.
101 	BC_LEX_OP_DEC,
102 
103 #endif // BC_ENABLED
104 
105 	/// BC_LEX_NEG is not used in lexing; it is only for parsing. The lexer
106 	/// marks all '-' characters as BC_LEX_OP_MINUS, but the parser needs to be
107 	/// able to distinguish them.
108 	BC_LEX_NEG,
109 
110 	/// Boolean not.
111 	BC_LEX_OP_BOOL_NOT,
112 
113 #if BC_ENABLE_EXTRA_MATH
114 
115 	/// Truncation operator.
116 	BC_LEX_OP_TRUNC,
117 
118 #endif // BC_ENABLE_EXTRA_MATH
119 
120 	/// Power operator.
121 	BC_LEX_OP_POWER,
122 
123 	/// Multiplication operator.
124 	BC_LEX_OP_MULTIPLY,
125 
126 	/// Division operator.
127 	BC_LEX_OP_DIVIDE,
128 
129 	/// Modulus operator.
130 	BC_LEX_OP_MODULUS,
131 
132 	/// Addition operator.
133 	BC_LEX_OP_PLUS,
134 
135 	/// Subtraction operator.
136 	BC_LEX_OP_MINUS,
137 
138 #if BC_ENABLE_EXTRA_MATH
139 	/// Places (truncate or extend) operator.
140 	BC_LEX_OP_PLACES,
141 
142 	/// Left (decimal) shift operator.
143 	BC_LEX_OP_LSHIFT,
144 
145 	/// Right (decimal) shift operator.
146 	BC_LEX_OP_RSHIFT,
147 #endif // BC_ENABLE_EXTRA_MATH
148 
149 	/// Equal operator.
150 	BC_LEX_OP_REL_EQ,
151 
152 	/// Less than or equal operator.
153 	BC_LEX_OP_REL_LE,
154 
155 	/// Greater than or equal operator.
156 	BC_LEX_OP_REL_GE,
157 
158 	/// Not equal operator.
159 	BC_LEX_OP_REL_NE,
160 
161 	/// Less than operator.
162 	BC_LEX_OP_REL_LT,
163 
164 	/// Greater than operator.
165 	BC_LEX_OP_REL_GT,
166 
167 	/// Boolean or operator.
168 	BC_LEX_OP_BOOL_OR,
169 
170 	/// Boolean and operator.
171 	BC_LEX_OP_BOOL_AND,
172 
173 #if BC_ENABLED
174 	/// Power assignment operator.
175 	BC_LEX_OP_ASSIGN_POWER,
176 
177 	/// Multiplication assignment operator.
178 	BC_LEX_OP_ASSIGN_MULTIPLY,
179 
180 	/// Division assignment operator.
181 	BC_LEX_OP_ASSIGN_DIVIDE,
182 
183 	/// Modulus assignment operator.
184 	BC_LEX_OP_ASSIGN_MODULUS,
185 
186 	/// Addition assignment operator.
187 	BC_LEX_OP_ASSIGN_PLUS,
188 
189 	/// Subtraction assignment operator.
190 	BC_LEX_OP_ASSIGN_MINUS,
191 
192 #if BC_ENABLE_EXTRA_MATH
193 
194 	/// Places (truncate or extend) assignment operator.
195 	BC_LEX_OP_ASSIGN_PLACES,
196 
197 	/// Left (decimal) shift assignment operator.
198 	BC_LEX_OP_ASSIGN_LSHIFT,
199 
200 	/// Right (decimal) shift assignment operator.
201 	BC_LEX_OP_ASSIGN_RSHIFT,
202 
203 #endif // BC_ENABLE_EXTRA_MATH
204 #endif // BC_ENABLED
205 
206 	/// Assignment operator.
207 	BC_LEX_OP_ASSIGN,
208 
209 	/// Newline.
210 	BC_LEX_NLINE,
211 
212 	/// Whitespace.
213 	BC_LEX_WHITESPACE,
214 
215 	/// Left parenthesis.
216 	BC_LEX_LPAREN,
217 
218 	/// Right parenthesis.
219 	BC_LEX_RPAREN,
220 
221 	/// Left bracket.
222 	BC_LEX_LBRACKET,
223 
224 	/// Comma.
225 	BC_LEX_COMMA,
226 
227 	/// Right bracket.
228 	BC_LEX_RBRACKET,
229 
230 	/// Left brace.
231 	BC_LEX_LBRACE,
232 
233 	/// Semicolon.
234 	BC_LEX_SCOLON,
235 
236 	/// Right brace.
237 	BC_LEX_RBRACE,
238 
239 	/// String.
240 	BC_LEX_STR,
241 
242 	/// Identifier/name.
243 	BC_LEX_NAME,
244 
245 	/// Constant number.
246 	BC_LEX_NUMBER,
247 
248 	// These keywords are in the order they are in for a reason. Don't change
249 	// the order unless you want a bunch of weird failures in the test suite.
250 	// In fact, almost all of these tokens are in a specific order for a reason.
251 
252 #if BC_ENABLED
253 
254 	/// bc auto keyword.
255 	BC_LEX_KW_AUTO,
256 
257 	/// bc break keyword.
258 	BC_LEX_KW_BREAK,
259 
260 	/// bc continue keyword.
261 	BC_LEX_KW_CONTINUE,
262 
263 	/// bc define keyword.
264 	BC_LEX_KW_DEFINE,
265 
266 	/// bc for keyword.
267 	BC_LEX_KW_FOR,
268 
269 	/// bc if keyword.
270 	BC_LEX_KW_IF,
271 
272 	/// bc limits keyword.
273 	BC_LEX_KW_LIMITS,
274 
275 	/// bc return keyword.
276 	BC_LEX_KW_RETURN,
277 
278 	/// bc while keyword.
279 	BC_LEX_KW_WHILE,
280 
281 	/// bc halt keyword.
282 	BC_LEX_KW_HALT,
283 
284 	/// bc last keyword.
285 	BC_LEX_KW_LAST,
286 
287 #endif // BC_ENABLED
288 
289 	/// bc ibase keyword.
290 	BC_LEX_KW_IBASE,
291 
292 	/// bc obase keyword.
293 	BC_LEX_KW_OBASE,
294 
295 	/// bc scale keyword.
296 	BC_LEX_KW_SCALE,
297 
298 #if BC_ENABLE_EXTRA_MATH
299 
300 	/// bc seed keyword.
301 	BC_LEX_KW_SEED,
302 
303 #endif // BC_ENABLE_EXTRA_MATH
304 
305 	/// bc length keyword.
306 	BC_LEX_KW_LENGTH,
307 
308 	/// bc print keyword.
309 	BC_LEX_KW_PRINT,
310 
311 	/// bc sqrt keyword.
312 	BC_LEX_KW_SQRT,
313 
314 	/// bc abs keyword.
315 	BC_LEX_KW_ABS,
316 
317 #if BC_ENABLE_EXTRA_MATH
318 
319 	/// bc irand keyword.
320 	BC_LEX_KW_IRAND,
321 
322 #endif // BC_ENABLE_EXTRA_MATH
323 
324 	/// bc asciify keyword.
325 	BC_LEX_KW_ASCIIFY,
326 
327 	/// bc modexp keyword.
328 	BC_LEX_KW_MODEXP,
329 
330 	/// bc divmod keyword.
331 	BC_LEX_KW_DIVMOD,
332 
333 	/// bc quit keyword.
334 	BC_LEX_KW_QUIT,
335 
336 	/// bc read keyword.
337 	BC_LEX_KW_READ,
338 
339 #if BC_ENABLE_EXTRA_MATH
340 
341 	/// bc rand keyword.
342 	BC_LEX_KW_RAND,
343 
344 #endif // BC_ENABLE_EXTRA_MATH
345 
346 	/// bc maxibase keyword.
347 	BC_LEX_KW_MAXIBASE,
348 
349 	/// bc maxobase keyword.
350 	BC_LEX_KW_MAXOBASE,
351 
352 	/// bc maxscale keyword.
353 	BC_LEX_KW_MAXSCALE,
354 
355 #if BC_ENABLE_EXTRA_MATH
356 	/// bc maxrand keyword.
357 	BC_LEX_KW_MAXRAND,
358 #endif // BC_ENABLE_EXTRA_MATH
359 
360 	/// bc stream keyword.
361 	BC_LEX_KW_STREAM,
362 
363 	/// bc else keyword.
364 	BC_LEX_KW_ELSE,
365 
366 #if DC_ENABLED
367 
368 	/// A special token for dc to calculate equal without a register.
369 	BC_LEX_EQ_NO_REG,
370 
371 	/// Colon (array) operator.
372 	BC_LEX_COLON,
373 
374 	/// Execute command.
375 	BC_LEX_EXECUTE,
376 
377 	/// Print stack command.
378 	BC_LEX_PRINT_STACK,
379 
380 	/// Clear stack command.
381 	BC_LEX_CLEAR_STACK,
382 
383 	/// Register stack level command.
384 	BC_LEX_REG_STACK_LEVEL,
385 
386 	/// Main stack level command.
387 	BC_LEX_STACK_LEVEL,
388 
389 	/// Duplicate command.
390 	BC_LEX_DUPLICATE,
391 
392 	/// Swap (reverse) command.
393 	BC_LEX_SWAP,
394 
395 	/// Pop (remove) command.
396 	BC_LEX_POP,
397 
398 	/// Store ibase command.
399 	BC_LEX_STORE_IBASE,
400 
401 	/// Store obase command.
402 	BC_LEX_STORE_OBASE,
403 
404 	/// Store scale command.
405 	BC_LEX_STORE_SCALE,
406 
407 #if BC_ENABLE_EXTRA_MATH
408 	/// Store seed command.
409 	BC_LEX_STORE_SEED,
410 #endif // BC_ENABLE_EXTRA_MATH
411 
412 	/// Load variable onto stack command.
413 	BC_LEX_LOAD,
414 
415 	/// Pop off of variable stack onto results stack command.
416 	BC_LEX_LOAD_POP,
417 
418 	/// Push onto variable stack command.
419 	BC_LEX_STORE_PUSH,
420 
421 	/// Print with pop command.
422 	BC_LEX_PRINT_POP,
423 
424 	/// Parameterized quit command.
425 	BC_LEX_NQUIT,
426 
427 	/// Execution stack depth command.
428 	BC_LEX_EXEC_STACK_LENGTH,
429 
430 	/// Scale of number command. This is needed specifically for dc because bc
431 	/// parses the scale function in parts.
432 	BC_LEX_SCALE_FACTOR,
433 
434 	/// Array length command. This is needed specifically for dc because bc
435 	/// just reuses its length keyword.
436 	BC_LEX_ARRAY_LENGTH,
437 
438 #endif // DC_ENABLED
439 
440 } BcLexType;
441 
442 struct BcLex; // Forward declaration so BcLexNext can name the lexer type.
443 
444 /**
445  * A function pointer to call when another token is needed. Mostly called by the
446  * parser.
447  * @param l  The lexer.
448  */
449 typedef void (*BcLexNext)(struct BcLex* l);
450 
451 /// The lexer.
452 typedef struct BcLex {
453 
454 	/// A pointer to the text to lex.
455 	const char *buf;
456 
457 	/// The current index into buf.
458 	size_t i;
459 
460 	/// The current line.
461 	size_t line;
462 
463 	/// The length of buf.
464 	size_t len;
465 
466 	/// The current token.
467 	BcLexType t;
468 
469 	/// The previous token.
470 	BcLexType last;
471 
472 	/// A string to store extra data for tokens. For example, the @a BC_LEX_STR
473 	/// token really needs to store the actual string, and numbers also need the
474 	/// string.
475 	BcVec str;
476 
477 	/// If this is true, the lexer is processing stdin and can ask for more data
478 	/// if a string or comment is not properly terminated.
479 	bool is_stdin;
480 
481 } BcLex;
482 
483 /**
484  * Initializes a lexer.
485  * @param l  The lexer to initialize.
486  */
487 void bc_lex_init(BcLex *l);
488 
489 /**
490  * Frees a lexer. This is not guarded by #ifndef NDEBUG because a separate
491  * parser is created at runtime to parse read() expressions and dc strings, and
492  * that parser needs a lexer.
493  * @param l  The lexer to free.
494  */
495 void bc_lex_free(BcLex *l);
496 
497 /**
498  * Sets the filename that the lexer will be lexing.
499  * @param l     The lexer.
500  * @param file  The filename that the lexer will lex.
501  */
502 void bc_lex_file(BcLex *l, const char *file);
503 
504 /**
505  * Sets the text the lexer will lex.
506  * @param l         The lexer.
507  * @param text      The text to lex.
508  * @param is_stdin  True if the text is from stdin, false otherwise.
509  */
510 void bc_lex_text(BcLex *l, const char *text, bool is_stdin);
511 
512 /**
513  * Generic next function for the parser to call. It takes care of calling the
514  * correct @a BcLexNext function and consuming whitespace.
515  * @param l  The lexer.
516  */
517 void bc_lex_next(BcLex *l);
518 
519 /**
520  * Lexes a line comment (one beginning with '#' and going to a newline).
521  * @param l  The lexer.
522  */
523 void bc_lex_lineComment(BcLex *l);
524 
525 /**
526  * Lexes a general comment (C-style comment).
527  * @param l  The lexer.
528  */
529 void bc_lex_comment(BcLex *l);
530 
531 /**
532  * Lexes whitespace, finding as much as possible.
533  * @param l  The lexer.
534  */
535 void bc_lex_whitespace(BcLex *l);
536 
537 /**
538  * Lexes a number that begins with char @a start. This takes care of parsing
539  * numbers in scientific and engineering notations.
540  * @param l      The lexer.
541  * @param start  The starting char of the number. To detect a number and call
542  *               this function, the lexer had to eat the first char. It fixes
543  *               that by passing it in.
544  */
545 void bc_lex_number(BcLex *l, char start);
546 
547 /**
548  * Lexes a name/identifier.
549  * @param l  The lexer.
550  */
551 void bc_lex_name(BcLex *l);
552 
553 /**
554  * Lexes common whitespace characters.
555  * @param l  The lexer.
556  * @param c  The character to lex.
557  */
558 void bc_lex_commonTokens(BcLex *l, char c);
559 
560 /**
561  * Throws a parse error because char @a c was invalid.
562  * @param l  The lexer.
563  * @param c  The problem character.
564  */
565 void bc_lex_invalidChar(BcLex *l, char c);
566 
567 /**
568  * Reads a line from stdin and puts it into the lexer's buffer.
569  * @param l  The lexer.
 * @return   NOTE(review): the meaning of the result is not documented here;
 *           presumably true if a line was read, false otherwise. Confirm
 *           against the definition.
570  */
571 bool bc_lex_readLine(BcLex *l);
572 
573 #endif // BC_LEX_H
574