xref: /freebsd/contrib/bc/src/dc_lex.c (revision de9468837c92cab304c658480bd32dbe4e022d01)
1 /*
2  * *****************************************************************************
3  *
4  * SPDX-License-Identifier: BSD-2-Clause
5  *
6  * Copyright (c) 2018-2024 Gavin D. Howard and contributors.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions are met:
10  *
11  * * Redistributions of source code must retain the above copyright notice, this
12  *   list of conditions and the following disclaimer.
13  *
14  * * Redistributions in binary form must reproduce the above copyright notice,
15  *   this list of conditions and the following disclaimer in the documentation
16  *   and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
22  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28  * POSSIBILITY OF SUCH DAMAGE.
29  *
30  * *****************************************************************************
31  *
32  * The lexer for dc.
33  *
34  */
35 
36 #if DC_ENABLED
37 
38 #include <ctype.h>
39 
40 #include <dc.h>
41 #include <vm.h>
42 
43 bool
44 dc_lex_negCommand(BcLex* l)
45 {
46 	char c = l->buf[l->i];
47 	return !BC_LEX_NUM_CHAR(c, false, false);
48 }
49 
50 /**
51  * Processes a dc command that needs a register. This is where the
52  * extended-register extension is implemented.
53  * @param l  The lexer.
54  */
55 static void
56 dc_lex_register(BcLex* l)
57 {
58 	// If extended register is enabled and the character is whitespace...
59 	if (DC_X && isspace(l->buf[l->i - 1]))
60 	{
61 		char c;
62 
63 		// Eat the whitespace.
64 		bc_lex_whitespace(l);
65 		c = l->buf[l->i];
66 
67 		// Check for a letter or underscore.
68 		if (BC_ERR(!isalpha(c) && c != '_'))
69 		{
70 			bc_lex_verr(l, BC_ERR_PARSE_CHAR, c);
71 		}
72 
73 		// Parse a normal identifier.
74 		l->i += 1;
75 		bc_lex_name(l);
76 	}
77 	else
78 	{
79 		// I don't allow newlines because newlines are used for controlling when
80 		// execution happens, and allowing newlines would just be complex.
81 		if (BC_ERR(l->buf[l->i - 1] == '\n'))
82 		{
83 			bc_lex_verr(l, BC_ERR_PARSE_CHAR, l->buf[l->i - 1]);
84 		}
85 
86 		// Set the lexer string and token.
87 		bc_vec_popAll(&l->str);
88 		bc_vec_pushByte(&l->str, (uchar) l->buf[l->i - 1]);
89 		bc_vec_pushByte(&l->str, '\0');
90 		l->t = BC_LEX_NAME;
91 	}
92 }
93 
94 /**
95  * Parses a dc string. Since dc's strings need to check for balanced brackets,
96  * we can't just parse bc and dc strings with different start and end
97  * characters. Oh, and dc strings need to check for escaped brackets.
98  * @param l  The lexer.
99  */
100 static void
101 dc_lex_string(BcLex* l)
102 {
103 	size_t depth, nls, i;
104 	char c;
105 	bool got_more;
106 
107 	// Set the token and clear the string.
108 	l->t = BC_LEX_STR;
109 	bc_vec_popAll(&l->str);
110 
111 	do
112 	{
113 		depth = 1;
114 		nls = 0;
115 		got_more = false;
116 
117 #if !BC_ENABLE_OSSFUZZ
118 		assert(l->mode != BC_MODE_STDIN || l->buf == vm->buffer.v);
119 #endif // !BC_ENABLE_OSSFUZZ
120 
121 		// This is the meat. As long as we don't run into the NUL byte, and we
122 		// have "depth", which means we haven't completely balanced brackets
123 		// yet, we continue eating the string.
124 		for (i = l->i; (c = l->buf[i]) && depth; ++i)
125 		{
126 			// Check for escaped brackets and set the depths as appropriate.
127 			if (c == '\\')
128 			{
129 				c = l->buf[++i];
130 				if (!c) break;
131 			}
132 			else
133 			{
134 				depth += (c == '[');
135 				depth -= (c == ']');
136 			}
137 
138 			// We want to adjust the line in the lexer as necessary.
139 			nls += (c == '\n');
140 
141 			if (depth) bc_vec_push(&l->str, &c);
142 		}
143 
144 		if (BC_ERR(c == '\0' && depth))
145 		{
146 			if (!vm->eof && l->mode != BC_MODE_FILE)
147 			{
148 				got_more = bc_lex_readLine(l);
149 			}
150 
151 			if (got_more)
152 			{
153 				bc_vec_popAll(&l->str);
154 			}
155 		}
156 	}
157 	while (got_more && depth);
158 
159 	// Obviously, if we didn't balance, that's an error.
160 	if (BC_ERR(c == '\0' && depth))
161 	{
162 		l->i = i;
163 		bc_lex_err(l, BC_ERR_PARSE_STRING);
164 	}
165 
166 	bc_vec_pushByte(&l->str, '\0');
167 
168 	l->i = i;
169 	l->line += nls;
170 }
171 
172 /**
173  * Lexes a dc token. This is the dc implementation of BcLexNext.
174  * @param l  The lexer.
175  */
176 void
177 dc_lex_token(BcLex* l)
178 {
179 	char c = l->buf[l->i++], c2;
180 	size_t i;
181 
182 	BC_SIG_ASSERT_LOCKED;
183 
184 	// If the last token was a command that needs a register, we need to parse a
185 	// register, so do so.
186 	for (i = 0; i < dc_lex_regs_len; ++i)
187 	{
188 		// If the token is a register token, take care of it and return.
189 		if (l->last == dc_lex_regs[i])
190 		{
191 			dc_lex_register(l);
192 			return;
193 		}
194 	}
195 
196 	// These lines are for tokens that easily correspond to one character. We
197 	// just set the token.
198 	if (c >= '"' && c <= '~' &&
199 	    (l->t = dc_lex_tokens[(c - '"')]) != BC_LEX_INVALID)
200 	{
201 		return;
202 	}
203 
204 	// This is the workhorse of the lexer when more complicated things are
205 	// needed.
206 	switch (c)
207 	{
208 		case '\0':
209 		case '\n':
210 		case '\t':
211 		case '\v':
212 		case '\f':
213 		case '\r':
214 		case ' ':
215 		{
216 			bc_lex_commonTokens(l, c);
217 			break;
218 		}
219 
220 		// We don't have the ! command, so we always expect certain things
221 		// after the exclamation point.
222 		case '!':
223 		{
224 			c2 = l->buf[l->i];
225 
226 			if (c2 == '=') l->t = BC_LEX_OP_REL_NE;
227 			else if (c2 == '<') l->t = BC_LEX_OP_REL_LE;
228 			else if (c2 == '>') l->t = BC_LEX_OP_REL_GE;
229 			else bc_lex_invalidChar(l, c);
230 
231 			l->i += 1;
232 
233 			break;
234 		}
235 
236 		case '#':
237 		{
238 			bc_lex_lineComment(l);
239 			break;
240 		}
241 
242 		case '.':
243 		{
244 			c2 = l->buf[l->i];
245 
246 			// If the character after is a number, this dot is part of a number.
247 			// Otherwise, it's the BSD dot (equivalent to last).
248 			if (BC_NO_ERR(BC_LEX_NUM_CHAR(c2, true, false)))
249 			{
250 				bc_lex_number(l, c);
251 			}
252 			else bc_lex_invalidChar(l, c);
253 
254 			break;
255 		}
256 
257 		case '0':
258 		case '1':
259 		case '2':
260 		case '3':
261 		case '4':
262 		case '5':
263 		case '6':
264 		case '7':
265 		case '8':
266 		case '9':
267 		case 'A':
268 		case 'B':
269 		case 'C':
270 		case 'D':
271 		case 'E':
272 		case 'F':
273 		{
274 			bc_lex_number(l, c);
275 			break;
276 		}
277 
278 		case 'g':
279 		{
280 			c2 = l->buf[l->i];
281 
282 			if (c2 == 'l') l->t = BC_LEX_KW_LINE_LENGTH;
283 			else if (c2 == 'x') l->t = BC_LEX_EXTENDED_REGISTERS;
284 			else if (c2 == 'z') l->t = BC_LEX_KW_LEADING_ZERO;
285 			else bc_lex_invalidChar(l, c2);
286 
287 			l->i += 1;
288 
289 			break;
290 		}
291 
292 		case '[':
293 		{
294 			dc_lex_string(l);
295 			break;
296 		}
297 
298 		default:
299 		{
300 			bc_lex_invalidChar(l, c);
301 		}
302 	}
303 }
304 #endif // DC_ENABLED
305