1 /* $Id: roff_escape.c,v 1.15 2024/05/16 21:23:00 schwarze Exp $ */
2 /*
3 * Copyright (c) 2011, 2012, 2013, 2014, 2015, 2017, 2018, 2020, 2022
4 * Ingo Schwarze <schwarze@openbsd.org>
5 * Copyright (c) 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
6 *
7 * Permission to use, copy, modify, and distribute this software for any
8 * purpose with or without fee is hereby granted, provided that the above
9 * copyright notice and this permission notice appear in all copies.
10 *
11 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
12 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
14 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18 *
19 * Parser for roff(7) escape sequences.
20 * To be used by all mandoc(1) parsers and formatters.
21 */
22 #include <assert.h>
23 #include <ctype.h>
24 #include <limits.h>
25 #include <stdio.h>
26 #include <string.h>
27
28 #include "mandoc.h"
29 #include "roff.h"
30 #include "roff_int.h"
31
32 /*
33 * Traditional escape sequence interpreter for general use
34 * including in high-level formatters. This function does not issue
35 * diagnostics and is not usable for expansion in the roff(7) parser.
36 * It is documented in the mandoc_escape(3) manual page.
37 */
38 enum mandoc_esc
mandoc_escape(const char ** rendarg,const char ** rarg,int * rargl)39 mandoc_escape(const char **rendarg, const char **rarg, int *rargl)
40 {
41 int iarg, iendarg, iend;
42 enum mandoc_esc rval;
43
44 rval = roff_escape(--*rendarg, 0, 0,
45 NULL, NULL, &iarg, &iendarg, &iend);
46 assert(rval != ESCAPE_EXPAND);
47 if (rarg != NULL)
48 *rarg = *rendarg + iarg;
49 if (rargl != NULL)
50 *rargl = iendarg - iarg;
51 *rendarg += iend;
52 return rval;
53 }
54
55 /*
56 * Full-featured escape sequence parser.
57 * If it encounters a nested escape sequence that requires expansion
58 * by the parser and re-parsing, the positions of that inner escape
59 * sequence are returned in *resc ... *rend.
60 * Otherwise, *resc is set to aesc and the positions of the escape
61 * sequence starting at aesc are returned.
62 * Diagnostic messages are generated if and only if ln != 0,
63 * that is, if and only if called by roff_expand().
64 */
65 enum mandoc_esc
roff_escape(const char * buf,const int ln,const int aesc,int * resc,int * rnam,int * rarg,int * rendarg,int * rend)66 roff_escape(const char *buf, const int ln, const int aesc,
67 int *resc, int *rnam, int *rarg, int *rendarg, int *rend)
68 {
69 int iesc; /* index of leading escape char */
70 int inam; /* index of escape name */
71 int iarg; /* index beginning the argument */
72 int iendarg; /* index right after the argument */
73 int iend; /* index right after the sequence */
74 int sesc, snam, sarg, sendarg, send; /* for sub-escape */
75 int escterm; /* whether term is escaped */
76 int maxl; /* expected length of the argument */
77 int argl; /* actual length of the argument */
78 int c, i; /* for \[char...] parsing */
79 int valid_A; /* for \A parsing */
80 enum mandoc_esc rval; /* return value */
81 enum mandoc_esc stype; /* for sub-escape */
82 enum mandocerr err; /* diagnostic code */
83 char term; /* byte terminating the argument */
84
85 /*
86 * Treat "\E" just like "\";
87 * it only makes a difference in copy mode.
88 */
89
90 iesc = inam = aesc;
91 do {
92 inam++;
93 } while (buf[inam] == 'E');
94
95 /*
96 * Sort the following cases first by syntax category,
97 * then by escape sequence type, and finally by ASCII code.
98 */
99
100 iarg = iendarg = iend = inam + 1;
101 maxl = INT_MAX;
102 term = '\0';
103 err = MANDOCERR_OK;
104 switch (buf[inam]) {
105
106 /* Escape sequences taking no arguments at all. */
107
108 case '!':
109 case '?':
110 case 'r':
111 rval = ESCAPE_UNSUPP;
112 goto out;
113
114 case '%':
115 case '&':
116 case ')':
117 case ',':
118 case '/':
119 case '^':
120 case 'a':
121 case 'd':
122 case 't':
123 case 'u':
124 case '{':
125 case '|':
126 case '}':
127 rval = ESCAPE_IGNORE;
128 goto out;
129
130 case '\0':
131 iendarg = --iend;
132 /* FALLTHROUGH */
133 case '.':
134 case '\\':
135 default:
136 iarg--;
137 rval = ESCAPE_UNDEF;
138 goto out;
139
140 case ' ':
141 case '\'':
142 case '-':
143 case '0':
144 case ':':
145 case '_':
146 case '`':
147 case 'e':
148 case '~':
149 iarg--;
150 argl = 1;
151 rval = ESCAPE_SPECIAL;
152 goto out;
153 case 'p':
154 rval = ESCAPE_BREAK;
155 goto out;
156 case 'c':
157 rval = ESCAPE_NOSPACE;
158 goto out;
159 case 'z':
160 rval = ESCAPE_SKIPCHAR;
161 goto out;
162
163 /* Standard argument format. */
164
165 case '$':
166 case '*':
167 case 'V':
168 case 'g':
169 case 'n':
170 rval = ESCAPE_EXPAND;
171 break;
172 case 'F':
173 case 'M':
174 case 'O':
175 case 'Y':
176 case 'k':
177 case 'm':
178 rval = ESCAPE_IGNORE;
179 break;
180 case '(':
181 case '[':
182 rval = ESCAPE_SPECIAL;
183 iendarg = iend = --iarg;
184 break;
185 case 'f':
186 rval = ESCAPE_FONT;
187 break;
188
189 /* Quoted arguments */
190
191 case 'A':
192 case 'B':
193 case 'w':
194 rval = ESCAPE_EXPAND;
195 term = '\b';
196 break;
197 case 'D':
198 case 'H':
199 case 'L':
200 case 'R':
201 case 'S':
202 case 'X':
203 case 'Z':
204 case 'b':
205 case 'v':
206 case 'x':
207 rval = ESCAPE_IGNORE;
208 term = '\b';
209 break;
210 case 'C':
211 rval = ESCAPE_SPECIAL;
212 term = '\b';
213 break;
214 case 'N':
215 rval = ESCAPE_NUMBERED;
216 term = '\b';
217 break;
218 case 'h':
219 rval = ESCAPE_HORIZ;
220 term = '\b';
221 break;
222 case 'l':
223 rval = ESCAPE_HLINE;
224 term = '\b';
225 break;
226 case 'o':
227 rval = ESCAPE_OVERSTRIKE;
228 term = '\b';
229 break;
230
231 /* Sizes support both forms, with additional peculiarities. */
232
233 case 's':
234 rval = ESCAPE_IGNORE;
235 if (buf[iarg] == '+' || buf[iarg] == '-'||
236 buf[iarg] == ASCII_HYPH)
237 iarg++;
238 switch (buf[iarg]) {
239 case '(':
240 maxl = 2;
241 iarg++;
242 break;
243 case '[':
244 term = ']';
245 iarg++;
246 break;
247 case '\'':
248 term = '\'';
249 iarg++;
250 break;
251 case '1':
252 case '2':
253 case '3':
254 if (buf[iarg - 1] == 's' &&
255 isdigit((unsigned char)buf[iarg + 1])) {
256 maxl = 2;
257 break;
258 }
259 /* FALLTHROUGH */
260 default:
261 maxl = 1;
262 break;
263 }
264 iendarg = iend = iarg;
265 }
266
267 /* Decide how to end the argument. */
268
269 escterm = 0;
270 stype = ESCAPE_EXPAND;
271 if ((term == '\b' || (term == '\0' && maxl == INT_MAX)) &&
272 buf[iarg] == buf[iesc]) {
273 stype = roff_escape(buf, ln, iendarg,
274 &sesc, &snam, &sarg, &sendarg, &send);
275 if (stype == ESCAPE_EXPAND)
276 goto out_sub;
277 }
278
279 if (term == '\b') {
280 if (stype == ESCAPE_UNDEF)
281 iarg++;
282 if (stype != ESCAPE_EXPAND && stype != ESCAPE_UNDEF) {
283 if (strchr("BHLRSNhlvx", buf[inam]) != NULL &&
284 strchr(" ,.0DLOXYZ^abdhlortuvx|~",
285 buf[snam]) != NULL) {
286 err = MANDOCERR_ESC_DELIM;
287 iend = send;
288 iarg = iendarg = sesc;
289 goto out;
290 }
291 escterm = 1;
292 iarg = send;
293 term = buf[snam];
294 } else if (strchr("BDHLRSvxNhl", buf[inam]) != NULL &&
295 strchr(" %&()*+-./0123456789:<=>", buf[iarg]) != NULL) {
296 err = MANDOCERR_ESC_DELIM;
297 if (rval != ESCAPE_EXPAND)
298 rval = ESCAPE_ERROR;
299 if (buf[inam] != 'D') {
300 iendarg = iend = iarg + 1;
301 goto out;
302 }
303 }
304 if (term == '\b')
305 term = buf[iarg++];
306 } else if (term == '\0' && maxl == INT_MAX) {
307 if (buf[inam] == 'n' && (buf[iarg] == '+' || buf[iarg] == '-'))
308 iarg++;
309 switch (buf[iarg]) {
310 case '(':
311 maxl = 2;
312 iarg++;
313 break;
314 case '[':
315 if (buf[++iarg] == ' ') {
316 iendarg = iend = iarg + 1;
317 err = MANDOCERR_ESC_ARG;
318 rval = ESCAPE_ERROR;
319 goto out;
320 }
321 term = ']';
322 break;
323 default:
324 maxl = 1;
325 break;
326 }
327 }
328
329 /* Advance to the end of the argument. */
330
331 valid_A = 1;
332 iendarg = iarg;
333 while (maxl > 0) {
334 if (buf[iendarg] == '\0') {
335 err = MANDOCERR_ESC_INCOMPLETE;
336 if (rval != ESCAPE_EXPAND &&
337 rval != ESCAPE_OVERSTRIKE)
338 rval = ESCAPE_ERROR;
339 /* Usually, ignore an incomplete argument. */
340 if (strchr("Aow", buf[inam]) == NULL)
341 iendarg = iarg;
342 break;
343 }
344 if (escterm == 0 && buf[iendarg] == term) {
345 iend = iendarg + 1;
346 break;
347 }
348 if (buf[iendarg] == buf[iesc]) {
349 stype = roff_escape(buf, ln, iendarg,
350 &sesc, &snam, &sarg, &sendarg, &send);
351 if (stype == ESCAPE_EXPAND)
352 goto out_sub;
353 iend = send;
354 if (escterm == 1 &&
355 (buf[snam] == term || buf[inam] == 'N'))
356 break;
357 if (stype != ESCAPE_UNDEF)
358 valid_A = 0;
359 iendarg = send;
360 } else if (buf[inam] == 'N' &&
361 isdigit((unsigned char)buf[iendarg]) == 0) {
362 iend = iendarg + 1;
363 break;
364 } else {
365 if (buf[iendarg] == ' ' || buf[iendarg] == '\t')
366 valid_A = 0;
367 if (maxl != INT_MAX)
368 maxl--;
369 iend = ++iendarg;
370 }
371 }
372
373 /* Post-process depending on the content of the argument. */
374
375 argl = iendarg - iarg;
376 switch (buf[inam]) {
377 case '*':
378 if (resc == NULL && argl == 2 &&
379 buf[iarg] == '.' && buf[iarg + 1] == 'T')
380 rval = ESCAPE_DEVICE;
381 break;
382 case 'A':
383 if (valid_A == 0)
384 iendarg = iarg;
385 break;
386 case 'O':
387 switch (buf[iarg]) {
388 case '0':
389 rval = ESCAPE_UNSUPP;
390 break;
391 case '1':
392 case '2':
393 case '3':
394 case '4':
395 if (argl == 1)
396 rval = ESCAPE_IGNORE;
397 else {
398 err = MANDOCERR_ESC_ARG;
399 rval = ESCAPE_ERROR;
400 }
401 break;
402 case '5':
403 if (buf[iarg - 1] == '[')
404 rval = ESCAPE_UNSUPP;
405 else {
406 err = MANDOCERR_ESC_ARG;
407 rval = ESCAPE_ERROR;
408 }
409 break;
410 default:
411 err = MANDOCERR_ESC_ARG;
412 rval = ESCAPE_ERROR;
413 break;
414 }
415 break;
416 default:
417 break;
418 }
419
420 switch (rval) {
421 case ESCAPE_FONT:
422 rval = mandoc_font(buf + iarg, argl);
423 if (rval == ESCAPE_ERROR)
424 err = MANDOCERR_ESC_ARG;
425 break;
426
427 case ESCAPE_SPECIAL:
428 if (argl == 0) {
429 err = MANDOCERR_ESC_BADCHAR;
430 rval = ESCAPE_ERROR;
431 break;
432 }
433
434 /*
435 * The file chars.c only provides one common list of
436 * character names, but \[-] == \- is the only one of
437 * the characters with one-byte names that allows
438 * enclosing the name in brackets.
439 */
440
441 if (term != '\0' && argl == 1 && buf[iarg] != '-') {
442 err = MANDOCERR_ESC_BADCHAR;
443 rval = ESCAPE_ERROR;
444 break;
445 }
446
447 /* Treat \[char...] as an alias for \N'...'. */
448
449 if (buf[iarg] == 'c') {
450 if (argl < 6 || argl > 7 ||
451 strncmp(buf + iarg, "char", 4) != 0 ||
452 (int)strspn(buf + iarg + 4, "0123456789")
453 + 4 < argl)
454 break;
455 c = 0;
456 for (i = iarg; i < iendarg; i++)
457 c = 10 * c + (buf[i] - '0');
458 if (c < 0x21 || (c > 0x7e && c < 0xa0) || c > 0xff) {
459 err = MANDOCERR_ESC_BADCHAR;
460 break;
461 }
462 iarg += 4;
463 rval = ESCAPE_NUMBERED;
464 break;
465 }
466
467 /*
468 * Unicode escapes are defined in groff as \[u0000]
469 * to \[u10FFFF], where the contained value must be
470 * a valid Unicode codepoint.
471 */
472
473 if (buf[iarg] != 'u' || argl < 5 || argl > 7)
474 break;
475 if (argl == 7 && /* beyond the Unicode range */
476 (buf[iarg + 1] != '1' || buf[iarg + 2] != '0')) {
477 err = MANDOCERR_ESC_BADCHAR;
478 break;
479 }
480 if (argl == 6 && buf[iarg + 1] == '0') {
481 err = MANDOCERR_ESC_BADCHAR;
482 break;
483 }
484 if (argl == 5 && /* UTF-16 surrogate */
485 toupper((unsigned char)buf[iarg + 1]) == 'D' &&
486 strchr("89ABCDEFabcdef", buf[iarg + 2]) != NULL) {
487 err = MANDOCERR_ESC_BADCHAR;
488 break;
489 }
490 if ((int)strspn(buf + iarg + 1, "0123456789ABCDEFabcdef")
491 + 1 == argl)
492 rval = ESCAPE_UNICODE;
493 break;
494 default:
495 break;
496 }
497 goto out;
498
499 out_sub:
500 iesc = sesc;
501 inam = snam;
502 iarg = sarg;
503 iendarg = sendarg;
504 iend = send;
505 rval = ESCAPE_EXPAND;
506
507 out:
508 if (resc != NULL)
509 *resc = iesc;
510 if (rnam != NULL)
511 *rnam = inam;
512 if (rarg != NULL)
513 *rarg = iarg;
514 if (rendarg != NULL)
515 *rendarg = iendarg;
516 if (rend != NULL)
517 *rend = iend;
518 if (ln == 0)
519 return rval;
520
521 /*
522 * Diagnostic messages are only issued when called
523 * from the parser, not when called from the formatters.
524 */
525
526 switch (rval) {
527 case ESCAPE_UNSUPP:
528 err = MANDOCERR_ESC_UNSUPP;
529 break;
530 case ESCAPE_UNDEF:
531 if (buf[inam] != '\\' && buf[inam] != '.')
532 err = MANDOCERR_ESC_UNDEF;
533 break;
534 case ESCAPE_SPECIAL:
535 if (mchars_spec2cp(buf + iarg, argl) >= 0)
536 err = MANDOCERR_OK;
537 else if (err == MANDOCERR_OK)
538 err = MANDOCERR_ESC_UNKCHAR;
539 break;
540 default:
541 break;
542 }
543 if (err != MANDOCERR_OK)
544 mandoc_msg(err, ln, iesc, "%.*s", iend - iesc, buf + iesc);
545 return rval;
546 }
547