xref: /freebsd/contrib/bc/src/dc_lex.c (revision 656d68a711952ac2b92ed258502978c5ba1dbc73)
1 /*
2  * *****************************************************************************
3  *
4  * SPDX-License-Identifier: BSD-2-Clause
5  *
6  * Copyright (c) 2018-2021 Gavin D. Howard and contributors.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions are met:
10  *
11  * * Redistributions of source code must retain the above copyright notice, this
12  *   list of conditions and the following disclaimer.
13  *
14  * * Redistributions in binary form must reproduce the above copyright notice,
15  *   this list of conditions and the following disclaimer in the documentation
16  *   and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
22  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28  * POSSIBILITY OF SUCH DAMAGE.
29  *
30  * *****************************************************************************
31  *
32  * The lexer for dc.
33  *
34  */
35 
36 #if DC_ENABLED
37 
38 #include <ctype.h>
39 
40 #include <dc.h>
41 #include <vm.h>
42 
43 bool dc_lex_negCommand(BcLex *l) {
44 	char c = l->buf[l->i];
45 	return !BC_LEX_NUM_CHAR(c, false, false);
46 }
47 
48 /**
49  * Processes a dc command that needs a register. This is where the
50  * extended-register extension is implemented.
51  * @param l  The lexer.
52  */
53 static void dc_lex_register(BcLex *l) {
54 
55 	// If extended register is enabled and the character is whitespace...
56 	if (DC_X && isspace(l->buf[l->i - 1])) {
57 
58 		char c;
59 
60 		// Eat the whitespace.
61 		bc_lex_whitespace(l);
62 		c = l->buf[l->i];
63 
64 		// Check for a letter or underscore.
65 		if (BC_ERR(!isalpha(c) && c != '_'))
66 			bc_lex_verr(l, BC_ERR_PARSE_CHAR, c);
67 
68 		// Parse a normal identifier.
69 		l->i += 1;
70 		bc_lex_name(l);
71 	}
72 	else {
73 
74 		// I don't allow newlines because newlines are used for controlling when
75 		// execution happens, and allowing newlines would just be complex.
76 		if (BC_ERR(l->buf[l->i - 1] == '\n'))
77 			bc_lex_verr(l, BC_ERR_PARSE_CHAR, l->buf[l->i - 1]);
78 
79 		// Set the lexer string and token.
80 		bc_vec_popAll(&l->str);
81 		bc_vec_pushByte(&l->str, (uchar) l->buf[l->i - 1]);
82 		bc_vec_pushByte(&l->str, '\0');
83 		l->t = BC_LEX_NAME;
84 	}
85 }
86 
87 /**
88  * Parses a dc string. Since dc's strings need to check for balanced brackets,
89  * we can't just parse bc and dc strings with different start and end
90  * characters. Oh, and dc strings need to check for escaped brackets.
91  * @param l  The lexer.
92  */
93 static void dc_lex_string(BcLex *l) {
94 
95 	size_t depth, nls, i;
96 	char c;
97 	bool got_more;
98 
99 	// Set the token and clear the string.
100 	l->t = BC_LEX_STR;
101 	bc_vec_popAll(&l->str);
102 
103 	do {
104 
105 		depth = 1;
106 		nls = 0;
107 		got_more = false;
108 
109 		assert(!l->is_stdin || l->buf == vm.buffer.v);
110 
111 		// This is the meat. As long as we don't run into the NUL byte, and we
112 		// have "depth", which means we haven't completely balanced brackets
113 		// yet, we continue eating the string.
114 		for (i = l->i; (c = l->buf[i]) && depth; ++i) {
115 
116 			// Check for escaped brackets and set the depths as appropriate.
117 			if (c == '\\') {
118 				c = l->buf[++i];
119 				if (!c) break;
120 			}
121 			else {
122 				depth += (c == '[');
123 				depth -= (c == ']');
124 			}
125 
126 			// We want to adjust the line in the lexer as necessary.
127 			nls += (c == '\n');
128 
129 			if (depth) bc_vec_push(&l->str, &c);
130 		}
131 
132 		if (BC_ERR(c == '\0' && depth)) {
133 			if (!vm.eof && (l->is_stdin || l->is_exprs))
134 				got_more = bc_lex_readLine(l);
135 			if (got_more) bc_vec_popAll(&l->str);
136 		}
137 
138 	} while (got_more && depth);
139 
140 	// Obviously, if we didn't balance, that's an error.
141 	if (BC_ERR(c == '\0' && depth)) {
142 		l->i = i;
143 		bc_lex_err(l, BC_ERR_PARSE_STRING);
144 	}
145 
146 	bc_vec_pushByte(&l->str, '\0');
147 
148 	l->i = i;
149 	l->line += nls;
150 }
151 
152 /**
153  * Lexes a dc token. This is the dc implementation of BcLexNext.
154  * @param l  The lexer.
155  */
156 void dc_lex_token(BcLex *l) {
157 
158 	char c = l->buf[l->i++], c2;
159 	size_t i;
160 
161 	BC_SIG_ASSERT_LOCKED;
162 
163 	// If the last token was a command that needs a register, we need to parse a
164 	// register, so do so.
165 	for (i = 0; i < dc_lex_regs_len; ++i) {
166 
167 		// If the token is a register token, take care of it and return.
168 		if (l->last == dc_lex_regs[i]) {
169 			dc_lex_register(l);
170 			return;
171 		}
172 	}
173 
174 	// These lines are for tokens that easily correspond to one character. We
175 	// just set the token.
176 	if (c >= '"' && c <= '~' &&
177 	    (l->t = dc_lex_tokens[(c - '"')]) != BC_LEX_INVALID)
178 	{
179 		return;
180 	}
181 
182 	// This is the workhorse of the lexer when more complicated things are
183 	// needed.
184 	switch (c) {
185 
186 		case '\0':
187 		case '\n':
188 		case '\t':
189 		case '\v':
190 		case '\f':
191 		case '\r':
192 		case ' ':
193 		{
194 			bc_lex_commonTokens(l, c);
195 			break;
196 		}
197 
198 		// We don't have the ! command, so we always expect certain things
199 		// after the exclamation point.
200 		case '!':
201 		{
202 			c2 = l->buf[l->i];
203 
204 			if (c2 == '=') l->t = BC_LEX_OP_REL_NE;
205 			else if (c2 == '<') l->t = BC_LEX_OP_REL_LE;
206 			else if (c2 == '>') l->t = BC_LEX_OP_REL_GE;
207 			else bc_lex_invalidChar(l, c);
208 
209 			l->i += 1;
210 
211 			break;
212 		}
213 
214 		case '#':
215 		{
216 			bc_lex_lineComment(l);
217 			break;
218 		}
219 
220 		case '.':
221 		{
222 			c2 = l->buf[l->i];
223 
224 			// If the character after is a number, this dot is part of a number.
225 			// Otherwise, it's the BSD dot (equivalent to last).
226 			if (BC_NO_ERR(BC_LEX_NUM_CHAR(c2, true, false)))
227 				bc_lex_number(l, c);
228 			else bc_lex_invalidChar(l, c);
229 
230 			break;
231 		}
232 
233 		case '0':
234 		case '1':
235 		case '2':
236 		case '3':
237 		case '4':
238 		case '5':
239 		case '6':
240 		case '7':
241 		case '8':
242 		case '9':
243 		case 'A':
244 		case 'B':
245 		case 'C':
246 		case 'D':
247 		case 'E':
248 		case 'F':
249 		{
250 			bc_lex_number(l, c);
251 			break;
252 		}
253 
254 		case 'g':
255 		{
256 			c2 = l->buf[l->i];
257 
258 			if (c2 == 'l') l->t = BC_LEX_KW_LINE_LENGTH;
259 			else if (c2 == 'z') l->t = BC_LEX_KW_LEADING_ZERO;
260 			else bc_lex_invalidChar(l, c2);
261 
262 			l->i += 1;
263 
264 			break;
265 		}
266 
267 		case '[':
268 		{
269 			dc_lex_string(l);
270 			break;
271 		}
272 
273 		default:
274 		{
275 			bc_lex_invalidChar(l, c);
276 		}
277 	}
278 }
279 #endif // DC_ENABLED
280