xref: /freebsd/contrib/bc/include/lex.h (revision b64c5a0ace59af62eff52bfe110a521dc73c937b)
1 /*
2  * *****************************************************************************
3  *
4  * SPDX-License-Identifier: BSD-2-Clause
5  *
6  * Copyright (c) 2018-2024 Gavin D. Howard and contributors.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions are met:
10  *
11  * * Redistributions of source code must retain the above copyright notice, this
12  *   list of conditions and the following disclaimer.
13  *
14  * * Redistributions in binary form must reproduce the above copyright notice,
15  *   this list of conditions and the following disclaimer in the documentation
16  *   and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
22  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28  * POSSIBILITY OF SUCH DAMAGE.
29  *
30  * *****************************************************************************
31  *
32  * Definitions for bc's lexer.
33  *
34  */
35 
36 #ifndef BC_LEX_H
37 #define BC_LEX_H
38 
39 #include <stdbool.h>
40 #include <stddef.h>
41 
42 #include <status.h>
43 #include <vector.h>
44 #include <lang.h>
45 
/**
 * Reports error @a e from lexer code by forwarding it to bc_vm_handleError()
 * together with the line the lexer is currently on, so that call sites do not
 * have to pass the line themselves. When BC_DEBUG is enabled, the C source
 * file and line where the macro is used are passed as well.
 * @param l  The lexer; its @a line field is read.
 * @param e  The error.
 */
#if !BC_DEBUG
#define bc_lex_err(l, e) (bc_vm_handleError((e), (l)->line))
#else // !BC_DEBUG
#define bc_lex_err(l, e) (bc_vm_handleError((e), __FILE__, __LINE__, (l)->line))
#endif // !BC_DEBUG
57 
/**
 * The variadic counterpart of bc_lex_err(): reports error @a e from lexer code
 * by forwarding it to bc_vm_handleError() together with the line the lexer is
 * currently on, followed by any extra arguments. When BC_DEBUG is enabled, the
 * C source file and line where the macro is used are passed as well.
 * @param l    The lexer; its @a line field is read.
 * @param e    The error.
 * @param ...  Extra arguments, passed through unchanged after the line. At
 *             least one is required because they follow a comma in the
 *             expansion; use bc_lex_err() when there are none.
 */
#if !BC_DEBUG
#define bc_lex_verr(l, e, ...) (bc_vm_handleError((e), (l)->line, __VA_ARGS__))
#else // !BC_DEBUG
#define bc_lex_verr(l, e, ...) \
	(bc_vm_handleError((e), __FILE__, __LINE__, (l)->line, __VA_ARGS__))
#endif // !BC_DEBUG
70 
// BC_LEX_NEG_CHAR is the char that means "negative" for the current
// calculator: '-' for bc and '_' for dc.
//
// BC_LEX_LAST_NUM_CHAR is the last char that is valid in a number. In bc and
// dc, capital letters are part of numbers, to a point: bc accepts up to 'Z',
// while dc only goes up to hex, so its last valid char is 'F'.
//
// When both calculators are built, the value is selected with BC_IS_BC;
// otherwise, it is a plain constant for the one calculator that is built.
#if BC_ENABLED && DC_ENABLED

#define BC_LEX_NEG_CHAR (BC_IS_BC ? '-' : '_')
#define BC_LEX_LAST_NUM_CHAR (BC_IS_BC ? 'Z' : 'F')

#elif BC_ENABLED

#define BC_LEX_NEG_CHAR ('-')
#define BC_LEX_LAST_NUM_CHAR ('Z')

#else // !BC_ENABLED

#define BC_LEX_NEG_CHAR ('_')
#define BC_LEX_LAST_NUM_CHAR ('F')

#endif // BC_ENABLED && DC_ENABLED
93 
/**
 * Returns true if c is a valid number character: a decimal digit, a capital
 * letter from 'A' through BC_LEX_LAST_NUM_CHAR, or a decimal point when one is
 * still allowed.
 *
 * The argument @a c is expanded more than once, so it must not have side
 * effects. It is converted to unsigned char before being handed to isdigit()
 * because passing a negative value other than EOF to a <ctype.h> function is
 * undefined behavior, and plain char may be signed.
 * @param c         The char to check.
 * @param pt        If a decimal point has already been seen.
 * @param int_only  True if the number is expected to be an int only, false if
 *                  non-integers are allowed.
 * @return          True if @a c is a valid number character.
 */
#define BC_LEX_NUM_CHAR(c, pt, int_only)                \
	(isdigit((unsigned char) (c)) != 0 ||               \
	 ((c) >= 'A' && (c) <= BC_LEX_LAST_NUM_CHAR) ||     \
	 ((c) == '.' && !(pt) && !(int_only)))
105 
/// The types of tokens that the lexer can produce.
///
/// The order of the enumerators is significant, and which enumerators exist
/// depends on BC_ENABLED, DC_ENABLED, and BC_ENABLE_EXTRA_MATH. Do not reorder
/// them.
enum BcLexType
{
	/// The end of the file.
	BC_LEX_EOF = 0,

	/// Marks an invalid token; bc and dc use this for const data.
	BC_LEX_INVALID,

#if BC_ENABLED

	/// The increment operator.
	BC_LEX_OP_INC,

	/// The decrement operator.
	BC_LEX_OP_DEC,

#endif // BC_ENABLED

	/// Negation. The lexer never produces this; it marks every '-' character
	/// as BC_LEX_OP_MINUS. It exists only so that the parser can tell the two
	/// apart.
	BC_LEX_NEG,

	/// The boolean not operator.
	BC_LEX_OP_BOOL_NOT,

#if BC_ENABLE_EXTRA_MATH

	/// The truncation operator.
	BC_LEX_OP_TRUNC,

#endif // BC_ENABLE_EXTRA_MATH

	/// The power operator.
	BC_LEX_OP_POWER,

	/// The multiplication operator.
	BC_LEX_OP_MULTIPLY,

	/// The division operator.
	BC_LEX_OP_DIVIDE,

	/// The modulus operator.
	BC_LEX_OP_MODULUS,

	/// The addition operator.
	BC_LEX_OP_PLUS,

	/// The subtraction operator.
	BC_LEX_OP_MINUS,

#if BC_ENABLE_EXTRA_MATH

	/// The places (truncate or extend) operator.
	BC_LEX_OP_PLACES,

	/// The left (decimal) shift operator.
	BC_LEX_OP_LSHIFT,

	/// The right (decimal) shift operator.
	BC_LEX_OP_RSHIFT,

#endif // BC_ENABLE_EXTRA_MATH

	/// The equal operator.
	BC_LEX_OP_REL_EQ,

	/// The less than or equal operator.
	BC_LEX_OP_REL_LE,

	/// The greater than or equal operator.
	BC_LEX_OP_REL_GE,

	/// The not equal operator.
	BC_LEX_OP_REL_NE,

	/// The less than operator.
	BC_LEX_OP_REL_LT,

	/// The greater than operator.
	BC_LEX_OP_REL_GT,

	/// The boolean or operator.
	BC_LEX_OP_BOOL_OR,

	/// The boolean and operator.
	BC_LEX_OP_BOOL_AND,

#if BC_ENABLED

	/// The power assignment operator.
	BC_LEX_OP_ASSIGN_POWER,

	/// The multiplication assignment operator.
	BC_LEX_OP_ASSIGN_MULTIPLY,

	/// The division assignment operator.
	BC_LEX_OP_ASSIGN_DIVIDE,

	/// The modulus assignment operator.
	BC_LEX_OP_ASSIGN_MODULUS,

	/// The addition assignment operator.
	BC_LEX_OP_ASSIGN_PLUS,

	/// The subtraction assignment operator.
	BC_LEX_OP_ASSIGN_MINUS,

#if BC_ENABLE_EXTRA_MATH

	/// The places (truncate or extend) assignment operator.
	BC_LEX_OP_ASSIGN_PLACES,

	/// The left (decimal) shift assignment operator.
	BC_LEX_OP_ASSIGN_LSHIFT,

	/// The right (decimal) shift assignment operator.
	BC_LEX_OP_ASSIGN_RSHIFT,

#endif // BC_ENABLE_EXTRA_MATH
#endif // BC_ENABLED

	/// The plain assignment operator.
	BC_LEX_OP_ASSIGN,

	/// A newline.
	BC_LEX_NLINE,

	/// Whitespace.
	BC_LEX_WHITESPACE,

	/// A left parenthesis.
	BC_LEX_LPAREN,

	/// A right parenthesis.
	BC_LEX_RPAREN,

	/// A left bracket.
	BC_LEX_LBRACKET,

	/// A comma.
	BC_LEX_COMMA,

	/// A right bracket.
	BC_LEX_RBRACKET,

	/// A left brace.
	BC_LEX_LBRACE,

	/// A semicolon.
	BC_LEX_SCOLON,

	/// A right brace.
	BC_LEX_RBRACE,

	/// A string.
	BC_LEX_STR,

	/// An identifier/name.
	BC_LEX_NAME,

	/// A constant number.
	BC_LEX_NUMBER,

	// The keywords below are ordered the way they are for a reason. Changing
	// the order will cause a bunch of weird failures in the test suite. In
	// fact, almost all of the tokens in this enum are in a specific order for
	// a reason.

#if BC_ENABLED

	/// The bc auto keyword.
	BC_LEX_KW_AUTO,

	/// The bc break keyword.
	BC_LEX_KW_BREAK,

	/// The bc continue keyword.
	BC_LEX_KW_CONTINUE,

	/// The bc define keyword.
	BC_LEX_KW_DEFINE,

	/// The bc for keyword.
	BC_LEX_KW_FOR,

	/// The bc if keyword.
	BC_LEX_KW_IF,

	/// The bc limits keyword.
	BC_LEX_KW_LIMITS,

	/// The bc return keyword.
	BC_LEX_KW_RETURN,

	/// The bc while keyword.
	BC_LEX_KW_WHILE,

	/// The bc halt keyword.
	BC_LEX_KW_HALT,

	/// The bc last keyword.
	BC_LEX_KW_LAST,

#endif // BC_ENABLED

	/// The bc ibase keyword.
	BC_LEX_KW_IBASE,

	/// The bc obase keyword.
	BC_LEX_KW_OBASE,

	/// The bc scale keyword.
	BC_LEX_KW_SCALE,

#if BC_ENABLE_EXTRA_MATH

	/// The bc seed keyword.
	BC_LEX_KW_SEED,

#endif // BC_ENABLE_EXTRA_MATH

	/// The bc length keyword.
	BC_LEX_KW_LENGTH,

	/// The bc print keyword.
	BC_LEX_KW_PRINT,

	/// The bc sqrt keyword.
	BC_LEX_KW_SQRT,

	/// The bc abs keyword.
	BC_LEX_KW_ABS,

	/// The bc is_number keyword.
	BC_LEX_KW_IS_NUMBER,

	/// The bc is_string keyword.
	BC_LEX_KW_IS_STRING,

#if BC_ENABLE_EXTRA_MATH

	/// The bc irand keyword.
	BC_LEX_KW_IRAND,

#endif // BC_ENABLE_EXTRA_MATH

	/// The bc asciify keyword.
	BC_LEX_KW_ASCIIFY,

	/// The bc modexp keyword.
	BC_LEX_KW_MODEXP,

	/// The bc divmod keyword.
	BC_LEX_KW_DIVMOD,

	/// The bc quit keyword.
	BC_LEX_KW_QUIT,

	/// The bc read keyword.
	BC_LEX_KW_READ,

#if BC_ENABLE_EXTRA_MATH

	/// The bc rand keyword.
	BC_LEX_KW_RAND,

#endif // BC_ENABLE_EXTRA_MATH

	/// The bc maxibase keyword.
	BC_LEX_KW_MAXIBASE,

	/// The bc maxobase keyword.
	BC_LEX_KW_MAXOBASE,

	/// The bc maxscale keyword.
	BC_LEX_KW_MAXSCALE,

#if BC_ENABLE_EXTRA_MATH

	/// The bc maxrand keyword.
	BC_LEX_KW_MAXRAND,

#endif // BC_ENABLE_EXTRA_MATH

	/// The bc line_length keyword.
	BC_LEX_KW_LINE_LENGTH,

#if BC_ENABLED

	/// The bc global_stacks keyword.
	BC_LEX_KW_GLOBAL_STACKS,

#endif // BC_ENABLED

	/// The bc leading_zero keyword.
	BC_LEX_KW_LEADING_ZERO,

	/// The bc stream keyword.
	BC_LEX_KW_STREAM,

	/// The bc else keyword.
	BC_LEX_KW_ELSE,

#if DC_ENABLED

	/// The dc extended registers keyword.
	BC_LEX_EXTENDED_REGISTERS,

	/// A special token that lets dc calculate equal without a register.
	BC_LEX_EQ_NO_REG,

	/// The colon (array) operator.
	BC_LEX_COLON,

	/// The execute command.
	BC_LEX_EXECUTE,

	/// The print stack command.
	BC_LEX_PRINT_STACK,

	/// The clear stack command.
	BC_LEX_CLEAR_STACK,

	/// The register stack level command.
	BC_LEX_REG_STACK_LEVEL,

	/// The main stack level command.
	BC_LEX_STACK_LEVEL,

	/// The duplicate command.
	BC_LEX_DUPLICATE,

	/// The swap (reverse) command.
	BC_LEX_SWAP,

	/// The pop (remove) command.
	BC_LEX_POP,

	/// The store ibase command.
	BC_LEX_STORE_IBASE,

	/// The store obase command.
	BC_LEX_STORE_OBASE,

	/// The store scale command.
	BC_LEX_STORE_SCALE,

#if BC_ENABLE_EXTRA_MATH

	/// The store seed command.
	BC_LEX_STORE_SEED,

#endif // BC_ENABLE_EXTRA_MATH

	/// The command to load a variable onto the stack.
	BC_LEX_LOAD,

	/// The command to pop off of a variable stack onto the results stack.
	BC_LEX_LOAD_POP,

	/// The command to push onto a variable stack.
	BC_LEX_STORE_PUSH,

	/// The print with pop command.
	BC_LEX_PRINT_POP,

	/// The parameterized quit command.
	BC_LEX_NQUIT,

	/// The execution stack depth command.
	BC_LEX_EXEC_STACK_LENGTH,

	/// The scale of number command. dc needs its own token for this because
	/// bc parses the scale function in parts.
	BC_LEX_SCALE_FACTOR,

	/// The array length command. dc needs its own token for this because bc
	/// just reuses its length keyword.
	BC_LEX_ARRAY_LENGTH,

#endif // DC_ENABLED

};

typedef enum BcLexType BcLexType;
490 
491 struct BcLex;
492 
493 /**
494  * A function pointer to call when another token is needed. Mostly called by the
495  * parser.
496  * @param l  The lexer.
497  */
498 typedef void (*BcLexNext)(struct BcLex* l);
499 
500 /// The lexer.
501 typedef struct BcLex
502 {
503 	/// A pointer to the text to lex.
504 	const char* buf;
505 
506 	/// The current index into buf.
507 	size_t i;
508 
509 	/// The current line.
510 	size_t line;
511 
512 	/// The length of buf.
513 	size_t len;
514 
515 	/// The current token.
516 	BcLexType t;
517 
518 	/// The previous token.
519 	BcLexType last;
520 
521 	/// A string to store extra data for tokens. For example, the @a BC_LEX_STR
522 	/// token really needs to store the actual string, and numbers also need the
523 	/// string.
524 	BcVec str;
525 
526 	/// The mode the lexer is in.
527 	BcMode mode;
528 
529 } BcLex;
530 
531 /**
532  * Initializes a lexer.
533  * @param l  The lexer to initialize.
534  */
535 void
536 bc_lex_init(BcLex* l);
537 
538 /**
539  * Frees a lexer. This is not guarded by #if BC_DEBUG because a separate
540  * parser is created at runtime to parse read() expressions and dc strings, and
541  * that parser needs a lexer.
542  * @param l  The lexer to free.
543  */
544 void
545 bc_lex_free(BcLex* l);
546 
547 /**
548  * Sets the filename that the lexer will be lexing.
549  * @param l     The lexer.
550  * @param file  The filename that the lexer will lex.
551  */
552 void
553 bc_lex_file(BcLex* l, const char* file);
554 
555 /**
556  * Sets the text the lexer will lex.
557  * @param l     The lexer.
558  * @param text  The text to lex.
559  * @param mode  The mode to lex in.
560  */
561 void
562 bc_lex_text(BcLex* l, const char* text, BcMode mode);
563 
564 /**
565  * Generic next function for the parser to call. It takes care of calling the
566  * correct @a BcLexNext function and consuming whitespace.
567  * @param l  The lexer.
568  */
569 void
570 bc_lex_next(BcLex* l);
571 
572 /**
573  * Lexes a line comment (one beginning with '#' and going to a newline).
574  * @param l  The lexer.
575  */
576 void
577 bc_lex_lineComment(BcLex* l);
578 
579 /**
580  * Lexes a general comment (C-style comment).
581  * @param l  The lexer.
582  */
583 void
584 bc_lex_comment(BcLex* l);
585 
586 /**
587  * Lexes whitespace, finding as much as possible.
588  * @param l  The lexer.
589  */
590 void
591 bc_lex_whitespace(BcLex* l);
592 
593 /**
594  * Lexes a number that begins with char @a start. This takes care of parsing
595  * numbers in scientific and engineering notations.
596  * @param l      The lexer.
597  * @param start  The starting char of the number. To detect a number and call
598  *               this function, the lexer had to eat the first char. It fixes
599  *               that by passing it in.
600  */
601 void
602 bc_lex_number(BcLex* l, char start);
603 
604 /**
605  * Lexes a name/identifier.
606  * @param l  The lexer.
607  */
608 void
609 bc_lex_name(BcLex* l);
610 
611 /**
612  * Lexes common whitespace characters.
613  * @param l  The lexer.
614  * @param c  The character to lex.
615  */
616 void
617 bc_lex_commonTokens(BcLex* l, char c);
618 
619 /**
620  * Throws a parse error because char @a c was invalid.
621  * @param l  The lexer.
622  * @param c  The problem character.
623  */
624 void
625 bc_lex_invalidChar(BcLex* l, char c);
626 
627 /**
628  * Reads a line from stdin and puts it into the lexer's buffer.
629  * @param l  The lexer.
630  */
631 bool
632 bc_lex_readLine(BcLex* l);
633 
634 #endif // BC_LEX_H
635