xref: /freebsd/contrib/bc/include/lex.h (revision 5956d97f4b3204318ceb6aa9c77bd0bc6ea87a41)
1 /*
2  * *****************************************************************************
3  *
4  * SPDX-License-Identifier: BSD-2-Clause
5  *
6  * Copyright (c) 2018-2021 Gavin D. Howard and contributors.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions are met:
10  *
11  * * Redistributions of source code must retain the above copyright notice, this
12  *   list of conditions and the following disclaimer.
13  *
14  * * Redistributions in binary form must reproduce the above copyright notice,
15  *   this list of conditions and the following disclaimer in the documentation
16  *   and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
22  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28  * POSSIBILITY OF SUCH DAMAGE.
29  *
30  * *****************************************************************************
31  *
32  * Definitions for bc's lexer.
33  *
34  */
35 
36 #ifndef BC_LEX_H
37 #define BC_LEX_H
38 
39 #include <stdbool.h>
40 #include <stddef.h>
41 
42 #include <status.h>
43 #include <vector.h>
44 #include <lang.h>
45 
// Two convenience macros for throwing errors in lex code. They take care of
// plumbing like passing in the current line the lexer is on.
//
// bc_lex_err(l, e) hands error e and the lexer's current line (l->line) to
// bc_vm_handleError().
//
// bc_lex_verr(l, e, ...) does the same, but also forwards its extra arguments
// to bc_vm_handleError(). Because __VA_ARGS__ follows a comma in the
// expansion, at least one extra argument has to be supplied.
#define bc_lex_err(l, e) (bc_vm_handleError((e), (l)->line))
#define bc_lex_verr(l, e, ...) (bc_vm_handleError((e), (l)->line, __VA_ARGS__))
50 
// Character constants that differ between the two calculators.
//
// BC_LEX_NEG_CHAR is the char that corresponds to negative for the current
// calculator.
//
// BC_LEX_LAST_NUM_CHAR is the last char that is valid inside a number. In bc
// and dc, capital letters are part of numbers, to a point. (dc only goes up to
// hex, so its last valid char is 'F'.)
//
// When both calculators are built, the value is selected with BC_IS_BC;
// otherwise it is a plain constant.
#if BC_ENABLED && DC_ENABLED

#define BC_LEX_NEG_CHAR (BC_IS_BC ? '-' : '_')
#define BC_LEX_LAST_NUM_CHAR (BC_IS_BC ? 'Z' : 'F')

#elif BC_ENABLED // BC_ENABLED && !DC_ENABLED

#define BC_LEX_NEG_CHAR ('-')
#define BC_LEX_LAST_NUM_CHAR ('Z')

#else // !BC_ENABLED

#define BC_LEX_NEG_CHAR ('_')
#define BC_LEX_LAST_NUM_CHAR ('F')

#endif // BC_ENABLED && DC_ENABLED
73 
/**
 * Returns true if c is a valid number character.
 *
 * A valid number character is a decimal digit, a capital letter from 'A' up to
 * and including BC_LEX_LAST_NUM_CHAR, or a '.' when no decimal point has been
 * seen yet and non-integers are allowed.
 *
 * @a c is converted to unsigned char before it is handed to isdigit(). Passing
 * a negative plain char (any byte above 0x7F where char is signed) to a
 * <ctype.h> function is undefined behavior, so the conversion is required for
 * arbitrary input bytes.
 *
 * This is a macro and it evaluates @a c more than once, so @a c must not have
 * side effects. This header does not include <ctype.h> directly; it must be in
 * scope wherever the macro is expanded.
 *
 * @param c         The char to check.
 * @param pt        If a decimal point has already been seen.
 * @param int_only  True if the number is expected to be an int only, false if
 *                  non-integers are allowed.
 * @return          True if @a c is a valid number character.
 */
#define BC_LEX_NUM_CHAR(c, pt, int_only)               \
	(isdigit((unsigned char) (c)) != 0 ||              \
	 ((c) >= 'A' && (c) <= BC_LEX_LAST_NUM_CHAR) ||    \
	 ((c) == '.' && !(pt) && !(int_only)))
85 
/// An enum of lex token types.
///
/// The enumerators are declared in a deliberate order (see the note in front
/// of the keyword tokens), so do not insert or reorder entries casually.
/// Several groups are conditionally compiled, which means the numeric value of
/// an enumerator depends on the BC_ENABLED, DC_ENABLED, and
/// BC_ENABLE_EXTRA_MATH build settings.
typedef enum BcLexType
{
	/// End of file.
	BC_LEX_EOF,

	/// Marker for invalid tokens, used by bc and dc for const data.
	BC_LEX_INVALID,

#if BC_ENABLED

	/// Increment operator.
	BC_LEX_OP_INC,

	/// Decrement operator.
	BC_LEX_OP_DEC,

#endif // BC_ENABLED

	/// BC_LEX_NEG is not used in lexing; it is only for parsing. The lexer
	/// marks all '-' characters as BC_LEX_OP_MINUS, but the parser needs to be
	/// able to distinguish them.
	BC_LEX_NEG,

	/// Boolean not.
	BC_LEX_OP_BOOL_NOT,

#if BC_ENABLE_EXTRA_MATH

	/// Truncation operator.
	BC_LEX_OP_TRUNC,

#endif // BC_ENABLE_EXTRA_MATH

	/// Power operator.
	BC_LEX_OP_POWER,

	/// Multiplication operator.
	BC_LEX_OP_MULTIPLY,

	/// Division operator.
	BC_LEX_OP_DIVIDE,

	/// Modulus operator.
	BC_LEX_OP_MODULUS,

	/// Addition operator.
	BC_LEX_OP_PLUS,

	/// Subtraction operator.
	BC_LEX_OP_MINUS,

#if BC_ENABLE_EXTRA_MATH
	/// Places (truncate or extend) operator.
	BC_LEX_OP_PLACES,

	/// Left (decimal) shift operator.
	BC_LEX_OP_LSHIFT,

	/// Right (decimal) shift operator.
	BC_LEX_OP_RSHIFT,
#endif // BC_ENABLE_EXTRA_MATH

	/// Equal operator.
	BC_LEX_OP_REL_EQ,

	/// Less than or equal operator.
	BC_LEX_OP_REL_LE,

	/// Greater than or equal operator.
	BC_LEX_OP_REL_GE,

	/// Not equal operator.
	BC_LEX_OP_REL_NE,

	/// Less than operator.
	BC_LEX_OP_REL_LT,

	/// Greater than operator.
	BC_LEX_OP_REL_GT,

	/// Boolean or operator.
	BC_LEX_OP_BOOL_OR,

	/// Boolean and operator.
	BC_LEX_OP_BOOL_AND,

#if BC_ENABLED
	/// Power assignment operator.
	BC_LEX_OP_ASSIGN_POWER,

	/// Multiplication assignment operator.
	BC_LEX_OP_ASSIGN_MULTIPLY,

	/// Division assignment operator.
	BC_LEX_OP_ASSIGN_DIVIDE,

	/// Modulus assignment operator.
	BC_LEX_OP_ASSIGN_MODULUS,

	/// Addition assignment operator.
	BC_LEX_OP_ASSIGN_PLUS,

	/// Subtraction assignment operator.
	BC_LEX_OP_ASSIGN_MINUS,

#if BC_ENABLE_EXTRA_MATH

	/// Places (truncate or extend) assignment operator.
	BC_LEX_OP_ASSIGN_PLACES,

	/// Left (decimal) shift assignment operator.
	BC_LEX_OP_ASSIGN_LSHIFT,

	/// Right (decimal) shift assignment operator.
	BC_LEX_OP_ASSIGN_RSHIFT,

#endif // BC_ENABLE_EXTRA_MATH
#endif // BC_ENABLED

	/// Assignment operator.
	BC_LEX_OP_ASSIGN,

	/// Newline.
	BC_LEX_NLINE,

	/// Whitespace.
	BC_LEX_WHITESPACE,

	/// Left parenthesis.
	BC_LEX_LPAREN,

	/// Right parenthesis.
	BC_LEX_RPAREN,

	/// Left bracket.
	BC_LEX_LBRACKET,

	/// Comma.
	BC_LEX_COMMA,

	/// Right bracket.
	BC_LEX_RBRACKET,

	/// Left brace.
	BC_LEX_LBRACE,

	/// Semicolon.
	BC_LEX_SCOLON,

	/// Right brace.
	BC_LEX_RBRACE,

	/// String.
	BC_LEX_STR,

	/// Identifier/name.
	BC_LEX_NAME,

	/// Constant number.
	BC_LEX_NUMBER,

	// These keywords are in the order they are in for a reason. Don't change
	// the order unless you want a bunch of weird failures in the test suite.
	// In fact, almost all of these tokens are in a specific order for a reason.
	//
	// Note that only some of the keyword tokens below are guarded by
	// BC_ENABLED; the unguarded ones are compiled in every build.

#if BC_ENABLED

	/// bc auto keyword.
	BC_LEX_KW_AUTO,

	/// bc break keyword.
	BC_LEX_KW_BREAK,

	/// bc continue keyword.
	BC_LEX_KW_CONTINUE,

	/// bc define keyword.
	BC_LEX_KW_DEFINE,

	/// bc for keyword.
	BC_LEX_KW_FOR,

	/// bc if keyword.
	BC_LEX_KW_IF,

	/// bc limits keyword.
	BC_LEX_KW_LIMITS,

	/// bc return keyword.
	BC_LEX_KW_RETURN,

	/// bc while keyword.
	BC_LEX_KW_WHILE,

	/// bc halt keyword.
	BC_LEX_KW_HALT,

	/// bc last keyword.
	BC_LEX_KW_LAST,

#endif // BC_ENABLED

	/// bc ibase keyword.
	BC_LEX_KW_IBASE,

	/// bc obase keyword.
	BC_LEX_KW_OBASE,

	/// bc scale keyword.
	BC_LEX_KW_SCALE,

#if BC_ENABLE_EXTRA_MATH

	/// bc seed keyword.
	BC_LEX_KW_SEED,

#endif // BC_ENABLE_EXTRA_MATH

	/// bc length keyword.
	BC_LEX_KW_LENGTH,

	/// bc print keyword.
	BC_LEX_KW_PRINT,

	/// bc sqrt keyword.
	BC_LEX_KW_SQRT,

	/// bc abs keyword.
	BC_LEX_KW_ABS,

#if BC_ENABLE_EXTRA_MATH

	/// bc irand keyword.
	BC_LEX_KW_IRAND,

#endif // BC_ENABLE_EXTRA_MATH

	/// bc asciify keyword.
	BC_LEX_KW_ASCIIFY,

	/// bc modexp keyword.
	BC_LEX_KW_MODEXP,

	/// bc divmod keyword.
	BC_LEX_KW_DIVMOD,

	/// bc quit keyword.
	BC_LEX_KW_QUIT,

	/// bc read keyword.
	BC_LEX_KW_READ,

#if BC_ENABLE_EXTRA_MATH

	/// bc rand keyword.
	BC_LEX_KW_RAND,

#endif // BC_ENABLE_EXTRA_MATH

	/// bc maxibase keyword.
	BC_LEX_KW_MAXIBASE,

	/// bc maxobase keyword.
	BC_LEX_KW_MAXOBASE,

	/// bc maxscale keyword.
	BC_LEX_KW_MAXSCALE,

#if BC_ENABLE_EXTRA_MATH
	/// bc maxrand keyword.
	BC_LEX_KW_MAXRAND,
#endif // BC_ENABLE_EXTRA_MATH

	/// bc line_length keyword.
	BC_LEX_KW_LINE_LENGTH,

#if BC_ENABLED

	/// bc global_stacks keyword.
	BC_LEX_KW_GLOBAL_STACKS,

#endif // BC_ENABLED

	/// bc leading_zero keyword.
	BC_LEX_KW_LEADING_ZERO,

	/// bc stream keyword.
	BC_LEX_KW_STREAM,

	/// bc else keyword.
	BC_LEX_KW_ELSE,

#if DC_ENABLED

	// Everything from here to the end of the enum only exists when dc is
	// built.

	/// A special token for dc to calculate equal without a register.
	BC_LEX_EQ_NO_REG,

	/// Colon (array) operator.
	BC_LEX_COLON,

	/// Execute command.
	BC_LEX_EXECUTE,

	/// Print stack command.
	BC_LEX_PRINT_STACK,

	/// Clear stack command.
	BC_LEX_CLEAR_STACK,

	/// Register stack level command.
	BC_LEX_REG_STACK_LEVEL,

	/// Main stack level command.
	BC_LEX_STACK_LEVEL,

	/// Duplicate command.
	BC_LEX_DUPLICATE,

	/// Swap (reverse) command.
	BC_LEX_SWAP,

	/// Pop (remove) command.
	BC_LEX_POP,

	/// Store ibase command.
	BC_LEX_STORE_IBASE,

	/// Store obase command.
	BC_LEX_STORE_OBASE,

	/// Store scale command.
	BC_LEX_STORE_SCALE,

#if BC_ENABLE_EXTRA_MATH
	/// Store seed command.
	BC_LEX_STORE_SEED,
#endif // BC_ENABLE_EXTRA_MATH

	/// Load variable onto stack command.
	BC_LEX_LOAD,

	/// Pop off of variable stack onto results stack command.
	BC_LEX_LOAD_POP,

	/// Push onto variable stack command.
	BC_LEX_STORE_PUSH,

	/// Print with pop command.
	BC_LEX_PRINT_POP,

	/// Parameterized quit command.
	BC_LEX_NQUIT,

	/// Execution stack depth command.
	BC_LEX_EXEC_STACK_LENGTH,

	/// Scale of number command. This is needed specifically for dc because bc
	/// parses the scale function in parts.
	BC_LEX_SCALE_FACTOR,

	/// Array length command. This is needed specifically for dc because bc
	/// just reuses its length keyword.
	BC_LEX_ARRAY_LENGTH,

#endif // DC_ENABLED

} BcLexType;
454 
// Forward declaration, so that BcLexNext can take a pointer to the lexer
// before the struct itself is defined.
struct BcLex;

/**
 * A function pointer to call when another token is needed. Mostly called by the
 * parser. The function returns nothing.
 * @param l  The lexer.
 */
typedef void (*BcLexNext)(struct BcLex* l);
463 
/// The lexer. Holds the text being lexed, the position within it, and the
/// current and previous tokens.
typedef struct BcLex
{
	/// A pointer to the text to lex. The lexer only reads through this pointer
	/// (it is a pointer to const char).
	const char* buf;

	/// The current index into buf.
	size_t i;

	/// The current line. This is the value that bc_lex_err() and
	/// bc_lex_verr() pass along when reporting an error.
	size_t line;

	/// The length of buf.
	size_t len;

	/// The current token.
	BcLexType t;

	/// The previous token.
	BcLexType last;

	/// A string to store extra data for tokens. For example, the @a BC_LEX_STR
	/// token really needs to store the actual string, and numbers also need the
	/// string.
	BcVec str;

	/// If this is true, the lexer is processing stdin and can ask for more data
	/// if a string or comment is not properly terminated.
	bool is_stdin;

	/// If this is true, the lexer is processing expressions from the
	/// command-line and can ask for more data if a string or comment is not
	/// properly terminated.
	bool is_exprs;

} BcLex;
500 
/**
 * Initializes a lexer. The counterpart for releasing a lexer is
 * bc_lex_free().
 * @param l  The lexer to initialize.
 */
void
bc_lex_init(BcLex* l);
507 
/**
 * Frees a lexer. This is not guarded by #ifndef NDEBUG (that is, it is declared
 * in every build) because a separate parser is created at runtime to parse
 * read() expressions and dc strings, and that parser needs a lexer.
 * @param l  The lexer to free.
 */
void
bc_lex_free(BcLex* l);
516 
/**
 * Sets the filename that the lexer will be lexing.
 * @param l     The lexer.
 * @param file  The filename that the lexer will lex. It is not modified
 *              through this pointer. NOTE(review): whether the name is copied
 *              or merely borrowed is not visible from this header; confirm
 *              against the definition.
 */
void
bc_lex_file(BcLex* l, const char* file);
524 
/**
 * Sets the text the lexer will lex.
 * @param l         The lexer.
 * @param text      The text to lex. It is not modified through this pointer.
 * @param is_stdin  True if the text is from stdin, false otherwise.
 * @param is_exprs  True if the text is from command-line expressions, false
 *                  otherwise.
 */
void
bc_lex_text(BcLex* l, const char* text, bool is_stdin, bool is_exprs);
535 
/**
 * Generic next function for the parser to call. It takes care of calling the
 * correct @a BcLexNext function and consuming whitespace. It returns nothing.
 * @param l  The lexer.
 */
void
bc_lex_next(BcLex* l);
543 
/**
 * Lexes a line comment, i.e., one that begins with '#' and runs to a newline.
 * @param l  The lexer.
 */
void
bc_lex_lineComment(BcLex* l);
550 
/**
 * Lexes a general comment (C-style comment), as opposed to the '#' line
 * comments handled by bc_lex_lineComment().
 * @param l  The lexer.
 */
void
bc_lex_comment(BcLex* l);
557 
/**
 * Lexes whitespace, consuming as much of it as possible.
 * @param l  The lexer.
 */
void
bc_lex_whitespace(BcLex* l);
564 
/**
 * Lexes a number that begins with char @a start. This takes care of parsing
 * numbers in scientific and engineering notations.
 * @param l      The lexer.
 * @param start  The starting char of the number. To detect a number and call
 *               this function, the lexer had to eat the first char, so the
 *               caller passes that char back in here.
 */
void
bc_lex_number(BcLex* l, char start);
575 
/**
 * Lexes a name (identifier).
 * @param l  The lexer.
 */
void
bc_lex_name(BcLex* l);
582 
/**
 * Lexes common whitespace characters.
 *
 * NOTE(review): the function name says "common tokens" while this summary only
 * mentions whitespace; confirm against the definition which characters are
 * actually handled.
 * @param l  The lexer.
 * @param c  The character to lex.
 */
void
bc_lex_commonTokens(BcLex* l, char c);
590 
/**
 * Throws a parse error because char @a c was invalid.
 * @param l  The lexer.
 * @param c  The problem (invalid) character.
 */
void
bc_lex_invalidChar(BcLex* l, char c);
598 
/**
 * Reads a line from stdin and puts it into the lexer's buffer.
 * @param l  The lexer.
 * @return   NOTE(review): the meaning of the boolean result is not documented
 *           in this header; confirm against the definition before relying on
 *           it.
 */
bool
bc_lex_readLine(BcLex* l);
605 
606 #endif // BC_LEX_H
607