xref: /freebsd/contrib/mandoc/roff_escape.c (revision b64c5a0ace59af62eff52bfe110a521dc73c937b)
1 /* $Id: roff_escape.c,v 1.15 2024/05/16 21:23:00 schwarze Exp $ */
2 /*
3  * Copyright (c) 2011, 2012, 2013, 2014, 2015, 2017, 2018, 2020, 2022
4  *               Ingo Schwarze <schwarze@openbsd.org>
5  * Copyright (c) 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
6  *
7  * Permission to use, copy, modify, and distribute this software for any
8  * purpose with or without fee is hereby granted, provided that the above
9  * copyright notice and this permission notice appear in all copies.
10  *
11  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
12  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
14  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18  *
19  * Parser for roff(7) escape sequences.
20  * To be used by all mandoc(1) parsers and formatters.
21  */
22 #include <assert.h>
23 #include <ctype.h>
24 #include <limits.h>
25 #include <stdio.h>
26 #include <string.h>
27 
28 #include "mandoc.h"
29 #include "roff.h"
30 #include "roff_int.h"
31 
32 /*
33  * Traditional escape sequence interpreter for general use
34  * including in high-level formatters.  This function does not issue
35  * diagnostics and is not usable for expansion in the roff(7) parser.
36  * It is documented in the mandoc_escape(3) manual page.
37  */
38 enum mandoc_esc
39 mandoc_escape(const char **rendarg, const char **rarg, int *rargl)
40 {
41         int		 iarg, iendarg, iend;
42         enum mandoc_esc  rval;
43 
44         rval = roff_escape(--*rendarg, 0, 0,
45 	    NULL, NULL, &iarg, &iendarg, &iend);
46         assert(rval != ESCAPE_EXPAND);
47         if (rarg != NULL)
48 	       *rarg = *rendarg + iarg;
49         if (rargl != NULL)
50 	       *rargl = iendarg - iarg;
51         *rendarg += iend;
52         return rval;
53 }
54 
55 /*
56  * Full-featured escape sequence parser.
57  * If it encounters a nested escape sequence that requires expansion
58  * by the parser and re-parsing, the positions of that inner escape
59  * sequence are returned in *resc ... *rend.
60  * Otherwise, *resc is set to aesc and the positions of the escape
61  * sequence starting at aesc are returned.
62  * Diagnostic messages are generated if and only if ln != 0,
63  * that is, if and only if called by roff_expand().
64  */
65 enum mandoc_esc
66 roff_escape(const char *buf, const int ln, const int aesc,
67     int *resc, int *rnam, int *rarg, int *rendarg, int *rend)
68 {
69 	int		 iesc;		/* index of leading escape char */
70 	int		 inam;		/* index of escape name */
71 	int		 iarg;		/* index beginning the argument */
72 	int		 iendarg;	/* index right after the argument */
73 	int		 iend;		/* index right after the sequence */
74 	int		 sesc, snam, sarg, sendarg, send; /* for sub-escape */
75 	int		 escterm;	/* whether term is escaped */
76 	int		 maxl;		/* expected length of the argument */
77 	int		 argl;		/* actual length of the argument */
78 	int		 c, i;		/* for \[char...] parsing */
79 	int 		 valid_A;	/* for \A parsing */
80 	enum mandoc_esc	 rval;		/* return value */
81 	enum mandoc_esc	 stype;		/* for sub-escape */
82 	enum mandocerr	 err;		/* diagnostic code */
83 	char		 term;		/* byte terminating the argument */
84 
85 	/*
86 	 * Treat "\E" just like "\";
87 	 * it only makes a difference in copy mode.
88 	 */
89 
90 	iesc = inam = aesc;
91 	do {
92 		inam++;
93 	} while (buf[inam] == 'E');
94 
95 	/*
96 	 * Sort the following cases first by syntax category,
97 	 * then by escape sequence type, and finally by ASCII code.
98 	 */
99 
100 	iarg = iendarg = iend = inam + 1;
101 	maxl = INT_MAX;
102 	term = '\0';
103 	err = MANDOCERR_OK;
104 	switch (buf[inam]) {
105 
106 	/* Escape sequences taking no arguments at all. */
107 
108 	case '!':
109 	case '?':
110 	case 'r':
111 		rval = ESCAPE_UNSUPP;
112 		goto out;
113 
114 	case '%':
115 	case '&':
116 	case ')':
117 	case ',':
118 	case '/':
119 	case '^':
120 	case 'a':
121 	case 'd':
122 	case 't':
123 	case 'u':
124 	case '{':
125 	case '|':
126 	case '}':
127 		rval = ESCAPE_IGNORE;
128 		goto out;
129 
130 	case '\0':
131 		iendarg = --iend;
132 		/* FALLTHROUGH */
133 	case '.':
134 	case '\\':
135 	default:
136 		iarg--;
137 		rval = ESCAPE_UNDEF;
138 		goto out;
139 
140 	case ' ':
141 	case '\'':
142 	case '-':
143 	case '0':
144 	case ':':
145 	case '_':
146 	case '`':
147 	case 'e':
148 	case '~':
149 		iarg--;
150 		argl = 1;
151 		rval = ESCAPE_SPECIAL;
152 		goto out;
153 	case 'p':
154 		rval = ESCAPE_BREAK;
155 		goto out;
156 	case 'c':
157 		rval = ESCAPE_NOSPACE;
158 		goto out;
159 	case 'z':
160 		rval = ESCAPE_SKIPCHAR;
161 		goto out;
162 
163 	/* Standard argument format. */
164 
165 	case '$':
166 	case '*':
167 	case 'V':
168 	case 'g':
169 	case 'n':
170 		rval = ESCAPE_EXPAND;
171 		break;
172 	case 'F':
173 	case 'M':
174 	case 'O':
175 	case 'Y':
176 	case 'k':
177 	case 'm':
178 		rval = ESCAPE_IGNORE;
179 		break;
180 	case '(':
181 	case '[':
182 		rval = ESCAPE_SPECIAL;
183 		iendarg = iend = --iarg;
184 		break;
185 	case 'f':
186 		rval = ESCAPE_FONT;
187 		break;
188 
189 	/* Quoted arguments */
190 
191 	case 'A':
192 	case 'B':
193 	case 'w':
194 		rval = ESCAPE_EXPAND;
195 		term = '\b';
196 		break;
197 	case 'D':
198 	case 'H':
199 	case 'L':
200 	case 'R':
201 	case 'S':
202 	case 'X':
203 	case 'Z':
204 	case 'b':
205 	case 'v':
206 	case 'x':
207 		rval = ESCAPE_IGNORE;
208 		term = '\b';
209 		break;
210 	case 'C':
211 		rval = ESCAPE_SPECIAL;
212 		term = '\b';
213 		break;
214 	case 'N':
215 		rval = ESCAPE_NUMBERED;
216 		term = '\b';
217 		break;
218 	case 'h':
219 		rval = ESCAPE_HORIZ;
220 		term = '\b';
221 		break;
222 	case 'l':
223 		rval = ESCAPE_HLINE;
224 		term = '\b';
225 		break;
226 	case 'o':
227 		rval = ESCAPE_OVERSTRIKE;
228 		term = '\b';
229 		break;
230 
231 	/* Sizes support both forms, with additional peculiarities. */
232 
233 	case 's':
234 		rval = ESCAPE_IGNORE;
235 		if (buf[iarg] == '+' || buf[iarg] == '-'||
236 		    buf[iarg] == ASCII_HYPH)
237 			iarg++;
238 		switch (buf[iarg]) {
239 		case '(':
240 			maxl = 2;
241 			iarg++;
242 			break;
243 		case '[':
244 			term = ']';
245 			iarg++;
246 			break;
247 		case '\'':
248 			term = '\'';
249 			iarg++;
250 			break;
251 		case '1':
252 		case '2':
253 		case '3':
254 			if (buf[iarg - 1] == 's' &&
255 			    isdigit((unsigned char)buf[iarg + 1])) {
256 				maxl = 2;
257 				break;
258 			}
259 			/* FALLTHROUGH */
260 		default:
261 			maxl = 1;
262 			break;
263 		}
264 		iendarg = iend = iarg;
265 	}
266 
267 	/* Decide how to end the argument. */
268 
269 	escterm = 0;
270 	stype = ESCAPE_EXPAND;
271 	if ((term == '\b' || (term == '\0' && maxl == INT_MAX)) &&
272 	    buf[iarg] == buf[iesc]) {
273 		stype = roff_escape(buf, ln, iendarg,
274 		    &sesc, &snam, &sarg, &sendarg, &send);
275 		if (stype == ESCAPE_EXPAND)
276 			goto out_sub;
277 	}
278 
279 	if (term == '\b') {
280 		if (stype == ESCAPE_UNDEF)
281 			iarg++;
282 		if (stype != ESCAPE_EXPAND && stype != ESCAPE_UNDEF) {
283 			if (strchr("BHLRSNhlvx", buf[inam]) != NULL &&
284 			    strchr(" ,.0DLOXYZ^abdhlortuvx|~",
285 			    buf[snam]) != NULL) {
286 				err = MANDOCERR_ESC_DELIM;
287 				iend = send;
288 				iarg = iendarg = sesc;
289 				goto out;
290 			}
291 			escterm = 1;
292 			iarg = send;
293 			term = buf[snam];
294 		} else if (strchr("BDHLRSvxNhl", buf[inam]) != NULL &&
295 		    strchr(" %&()*+-./0123456789:<=>", buf[iarg]) != NULL) {
296 			err = MANDOCERR_ESC_DELIM;
297 			if (rval != ESCAPE_EXPAND)
298 				rval = ESCAPE_ERROR;
299 			if (buf[inam] != 'D') {
300 				iendarg = iend = iarg + 1;
301 				goto out;
302 			}
303 		}
304 		if (term == '\b')
305 			term = buf[iarg++];
306 	} else if (term == '\0' && maxl == INT_MAX) {
307 		if (buf[inam] == 'n' && (buf[iarg] == '+' || buf[iarg] == '-'))
308 			iarg++;
309 		switch (buf[iarg]) {
310 		case '(':
311 			maxl = 2;
312 			iarg++;
313 			break;
314 		case '[':
315 			if (buf[++iarg] == ' ') {
316 				iendarg = iend = iarg + 1;
317 				err = MANDOCERR_ESC_ARG;
318 				rval = ESCAPE_ERROR;
319 				goto out;
320 			}
321 			term = ']';
322 			break;
323 		default:
324 			maxl = 1;
325 			break;
326 		}
327 	}
328 
329 	/* Advance to the end of the argument. */
330 
331 	valid_A = 1;
332 	iendarg = iarg;
333 	while (maxl > 0) {
334 		if (buf[iendarg] == '\0') {
335 			err = MANDOCERR_ESC_INCOMPLETE;
336 			if (rval != ESCAPE_EXPAND &&
337 			    rval != ESCAPE_OVERSTRIKE)
338 				rval = ESCAPE_ERROR;
339 			/* Usually, ignore an incomplete argument. */
340 			if (strchr("Aow", buf[inam]) == NULL)
341 				iendarg = iarg;
342 			break;
343 		}
344 		if (escterm == 0 && buf[iendarg] == term) {
345 			iend = iendarg + 1;
346 			break;
347 		}
348 		if (buf[iendarg] == buf[iesc]) {
349 			stype = roff_escape(buf, ln, iendarg,
350 			    &sesc, &snam, &sarg, &sendarg, &send);
351 			if (stype == ESCAPE_EXPAND)
352 				goto out_sub;
353 			iend = send;
354 			if (escterm == 1 &&
355 			    (buf[snam] == term || buf[inam] == 'N'))
356 				break;
357 			if (stype != ESCAPE_UNDEF)
358 				valid_A = 0;
359 			iendarg = send;
360 		} else if (buf[inam] == 'N' &&
361 		    isdigit((unsigned char)buf[iendarg]) == 0) {
362 			iend = iendarg + 1;
363 			break;
364 		} else {
365 			if (buf[iendarg] == ' ' || buf[iendarg] == '\t')
366 				valid_A = 0;
367 			if (maxl != INT_MAX)
368 				maxl--;
369 			iend = ++iendarg;
370 		}
371 	}
372 
373 	/* Post-process depending on the content of the argument. */
374 
375 	argl = iendarg - iarg;
376 	switch (buf[inam]) {
377 	case '*':
378 		if (resc == NULL && argl == 2 &&
379 		    buf[iarg] == '.' && buf[iarg + 1] == 'T')
380 			rval = ESCAPE_DEVICE;
381 		break;
382 	case 'A':
383 		if (valid_A == 0)
384 			iendarg = iarg;
385 		break;
386 	case 'O':
387 		switch (buf[iarg]) {
388 		case '0':
389 			rval = ESCAPE_UNSUPP;
390 			break;
391 		case '1':
392 		case '2':
393 		case '3':
394 		case '4':
395 			if (argl == 1)
396 				rval = ESCAPE_IGNORE;
397 			else {
398 				err = MANDOCERR_ESC_ARG;
399 				rval = ESCAPE_ERROR;
400 			}
401 			break;
402 		case '5':
403 			if (buf[iarg - 1] == '[')
404 				rval = ESCAPE_UNSUPP;
405 			else {
406 				err = MANDOCERR_ESC_ARG;
407 				rval = ESCAPE_ERROR;
408 			}
409 			break;
410 		default:
411 			err = MANDOCERR_ESC_ARG;
412 			rval = ESCAPE_ERROR;
413 			break;
414 		}
415 		break;
416 	default:
417 		break;
418 	}
419 
420 	switch (rval) {
421 	case ESCAPE_FONT:
422 		rval = mandoc_font(buf + iarg, argl);
423 		if (rval == ESCAPE_ERROR)
424 			err = MANDOCERR_ESC_ARG;
425 		break;
426 
427 	case ESCAPE_SPECIAL:
428 		if (argl == 0) {
429 			err = MANDOCERR_ESC_BADCHAR;
430 			rval = ESCAPE_ERROR;
431 			break;
432 		}
433 
434 		/*
435 		 * The file chars.c only provides one common list of
436 		 * character names, but \[-] == \- is the only one of
437 		 * the characters with one-byte names that allows
438 		 * enclosing the name in brackets.
439 		 */
440 
441 		if (term != '\0' && argl == 1 && buf[iarg] != '-') {
442 			err = MANDOCERR_ESC_BADCHAR;
443 			rval = ESCAPE_ERROR;
444 			break;
445 		}
446 
447 		/* Treat \[char...] as an alias for \N'...'. */
448 
449 		if (buf[iarg] == 'c') {
450 			if (argl < 6 || argl > 7 ||
451 			    strncmp(buf + iarg, "char", 4) != 0 ||
452 			    (int)strspn(buf + iarg + 4, "0123456789")
453 			     + 4 < argl)
454 				break;
455 			c = 0;
456 			for (i = iarg; i < iendarg; i++)
457 				c = 10 * c + (buf[i] - '0');
458 			if (c < 0x21 || (c > 0x7e && c < 0xa0) || c > 0xff) {
459 				err = MANDOCERR_ESC_BADCHAR;
460 				break;
461 			}
462 			iarg += 4;
463 			rval = ESCAPE_NUMBERED;
464 			break;
465 		}
466 
467 		/*
468 		 * Unicode escapes are defined in groff as \[u0000]
469 		 * to \[u10FFFF], where the contained value must be
470 		 * a valid Unicode codepoint.
471 		 */
472 
473 		if (buf[iarg] != 'u' || argl < 5 || argl > 7)
474 			break;
475 		if (argl == 7 &&  /* beyond the Unicode range */
476 		    (buf[iarg + 1] != '1' || buf[iarg + 2] != '0')) {
477 			err = MANDOCERR_ESC_BADCHAR;
478 			break;
479 		}
480 		if (argl == 6 && buf[iarg + 1] == '0') {
481 			err = MANDOCERR_ESC_BADCHAR;
482 			break;
483 		}
484 		if (argl == 5 &&  /* UTF-16 surrogate */
485 		    toupper((unsigned char)buf[iarg + 1]) == 'D' &&
486 		    strchr("89ABCDEFabcdef", buf[iarg + 2]) != NULL) {
487 			err = MANDOCERR_ESC_BADCHAR;
488 			break;
489 		}
490 		if ((int)strspn(buf + iarg + 1, "0123456789ABCDEFabcdef")
491 		    + 1 == argl)
492 			rval = ESCAPE_UNICODE;
493 		break;
494 	default:
495 		break;
496 	}
497 	goto out;
498 
499 out_sub:
500 	iesc = sesc;
501 	inam = snam;
502 	iarg = sarg;
503 	iendarg = sendarg;
504 	iend = send;
505 	rval = ESCAPE_EXPAND;
506 
507 out:
508 	if (resc != NULL)
509 		*resc = iesc;
510 	if (rnam != NULL)
511 		*rnam = inam;
512 	if (rarg != NULL)
513 		*rarg = iarg;
514 	if (rendarg != NULL)
515 		*rendarg = iendarg;
516 	if (rend != NULL)
517 		*rend = iend;
518 	if (ln == 0)
519 		return rval;
520 
521 	/*
522 	 * Diagnostic messages are only issued when called
523 	 * from the parser, not when called from the formatters.
524 	 */
525 
526 	switch (rval) {
527 	case ESCAPE_UNSUPP:
528 		err = MANDOCERR_ESC_UNSUPP;
529 		break;
530 	case ESCAPE_UNDEF:
531 		if (buf[inam] != '\\' && buf[inam] != '.')
532 			err = MANDOCERR_ESC_UNDEF;
533 		break;
534 	case ESCAPE_SPECIAL:
535 		if (mchars_spec2cp(buf + iarg, argl) >= 0)
536 			err = MANDOCERR_OK;
537 		else if (err == MANDOCERR_OK)
538 			err = MANDOCERR_ESC_UNKCHAR;
539 		break;
540 	default:
541 		break;
542 	}
543 	if (err != MANDOCERR_OK)
544 		mandoc_msg(err, ln, iesc, "%.*s", iend - iesc, buf + iesc);
545 	return rval;
546 }
547