1 /* $Id: roff_escape.c,v 1.15 2024/05/16 21:23:00 schwarze Exp $ */ 2 /* 3 * Copyright (c) 2011, 2012, 2013, 2014, 2015, 2017, 2018, 2020, 2022 4 * Ingo Schwarze <schwarze@openbsd.org> 5 * Copyright (c) 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv> 6 * 7 * Permission to use, copy, modify, and distribute this software for any 8 * purpose with or without fee is hereby granted, provided that the above 9 * copyright notice and this permission notice appear in all copies. 10 * 11 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES 12 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 13 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR 14 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 15 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 16 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 17 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 18 * 19 * Parser for roff(7) escape sequences. 20 * To be used by all mandoc(1) parsers and formatters. 21 */ 22 #include <assert.h> 23 #include <ctype.h> 24 #include <limits.h> 25 #include <stdio.h> 26 #include <string.h> 27 28 #include "mandoc.h" 29 #include "roff.h" 30 #include "roff_int.h" 31 32 /* 33 * Traditional escape sequence interpreter for general use 34 * including in high-level formatters. This function does not issue 35 * diagnostics and is not usable for expansion in the roff(7) parser. 36 * It is documented in the mandoc_escape(3) manual page. 37 */ 38 enum mandoc_esc 39 mandoc_escape(const char **rendarg, const char **rarg, int *rargl) 40 { 41 int iarg, iendarg, iend; 42 enum mandoc_esc rval; 43 44 rval = roff_escape(--*rendarg, 0, 0, 45 NULL, NULL, &iarg, &iendarg, &iend); 46 assert(rval != ESCAPE_EXPAND); 47 if (rarg != NULL) 48 *rarg = *rendarg + iarg; 49 if (rargl != NULL) 50 *rargl = iendarg - iarg; 51 *rendarg += iend; 52 return rval; 53 } 54 55 /* 56 * Full-featured escape sequence parser. 57 * If it encounters a nested escape sequence that requires expansion 58 * by the parser and re-parsing, the positions of that inner escape 59 * sequence are returned in *resc ... *rend. 60 * Otherwise, *resc is set to aesc and the positions of the escape 61 * sequence starting at aesc are returned. 62 * Diagnostic messages are generated if and only if ln != 0, 63 * that is, if and only if called by roff_expand(). 64 */ 65 enum mandoc_esc 66 roff_escape(const char *buf, const int ln, const int aesc, 67 int *resc, int *rnam, int *rarg, int *rendarg, int *rend) 68 { 69 int iesc; /* index of leading escape char */ 70 int inam; /* index of escape name */ 71 int iarg; /* index beginning the argument */ 72 int iendarg; /* index right after the argument */ 73 int iend; /* index right after the sequence */ 74 int sesc, snam, sarg, sendarg, send; /* for sub-escape */ 75 int escterm; /* whether term is escaped */ 76 int maxl; /* expected length of the argument */ 77 int argl; /* actual length of the argument */ 78 int c, i; /* for \[char...] parsing */ 79 int valid_A; /* for \A parsing */ 80 enum mandoc_esc rval; /* return value */ 81 enum mandoc_esc stype; /* for sub-escape */ 82 enum mandocerr err; /* diagnostic code */ 83 char term; /* byte terminating the argument */ 84 85 /* 86 * Treat "\E" just like "\"; 87 * it only makes a difference in copy mode. 88 */ 89 90 iesc = inam = aesc; 91 do { 92 inam++; 93 } while (buf[inam] == 'E'); 94 95 /* 96 * Sort the following cases first by syntax category, 97 * then by escape sequence type, and finally by ASCII code. 98 */ 99 100 iarg = iendarg = iend = inam + 1; 101 maxl = INT_MAX; 102 term = '\0'; 103 err = MANDOCERR_OK; 104 switch (buf[inam]) { 105 106 /* Escape sequences taking no arguments at all. */ 107 108 case '!': 109 case '?': 110 case 'r': 111 rval = ESCAPE_UNSUPP; 112 goto out; 113 114 case '%': 115 case '&': 116 case ')': 117 case ',': 118 case '/': 119 case '^': 120 case 'a': 121 case 'd': 122 case 't': 123 case 'u': 124 case '{': 125 case '|': 126 case '}': 127 rval = ESCAPE_IGNORE; 128 goto out; 129 130 case '\0': 131 iendarg = --iend; 132 /* FALLTHROUGH */ 133 case '.': 134 case '\\': 135 default: 136 iarg--; 137 rval = ESCAPE_UNDEF; 138 goto out; 139 140 case ' ': 141 case '\'': 142 case '-': 143 case '0': 144 case ':': 145 case '_': 146 case '`': 147 case 'e': 148 case '~': 149 iarg--; 150 argl = 1; 151 rval = ESCAPE_SPECIAL; 152 goto out; 153 case 'p': 154 rval = ESCAPE_BREAK; 155 goto out; 156 case 'c': 157 rval = ESCAPE_NOSPACE; 158 goto out; 159 case 'z': 160 rval = ESCAPE_SKIPCHAR; 161 goto out; 162 163 /* Standard argument format. */ 164 165 case '$': 166 case '*': 167 case 'V': 168 case 'g': 169 case 'n': 170 rval = ESCAPE_EXPAND; 171 break; 172 case 'F': 173 case 'M': 174 case 'O': 175 case 'Y': 176 case 'k': 177 case 'm': 178 rval = ESCAPE_IGNORE; 179 break; 180 case '(': 181 case '[': 182 rval = ESCAPE_SPECIAL; 183 iendarg = iend = --iarg; 184 break; 185 case 'f': 186 rval = ESCAPE_FONT; 187 break; 188 189 /* Quoted arguments */ 190 191 case 'A': 192 case 'B': 193 case 'w': 194 rval = ESCAPE_EXPAND; 195 term = '\b'; 196 break; 197 case 'D': 198 case 'H': 199 case 'L': 200 case 'R': 201 case 'S': 202 case 'X': 203 case 'Z': 204 case 'b': 205 case 'v': 206 case 'x': 207 rval = ESCAPE_IGNORE; 208 term = '\b'; 209 break; 210 case 'C': 211 rval = ESCAPE_SPECIAL; 212 term = '\b'; 213 break; 214 case 'N': 215 rval = ESCAPE_NUMBERED; 216 term = '\b'; 217 break; 218 case 'h': 219 rval = ESCAPE_HORIZ; 220 term = '\b'; 221 break; 222 case 'l': 223 rval = ESCAPE_HLINE; 224 term = '\b'; 225 break; 226 case 'o': 227 rval = ESCAPE_OVERSTRIKE; 228 term = '\b'; 229 break; 230 231 /* Sizes support both forms, with additional peculiarities. */ 232 233 case 's': 234 rval = ESCAPE_IGNORE; 235 if (buf[iarg] == '+' || buf[iarg] == '-'|| 236 buf[iarg] == ASCII_HYPH) 237 iarg++; 238 switch (buf[iarg]) { 239 case '(': 240 maxl = 2; 241 iarg++; 242 break; 243 case '[': 244 term = ']'; 245 iarg++; 246 break; 247 case '\'': 248 term = '\''; 249 iarg++; 250 break; 251 case '1': 252 case '2': 253 case '3': 254 if (buf[iarg - 1] == 's' && 255 isdigit((unsigned char)buf[iarg + 1])) { 256 maxl = 2; 257 break; 258 } 259 /* FALLTHROUGH */ 260 default: 261 maxl = 1; 262 break; 263 } 264 iendarg = iend = iarg; 265 } 266 267 /* Decide how to end the argument. */ 268 269 escterm = 0; 270 stype = ESCAPE_EXPAND; 271 if ((term == '\b' || (term == '\0' && maxl == INT_MAX)) && 272 buf[iarg] == buf[iesc]) { 273 stype = roff_escape(buf, ln, iendarg, 274 &sesc, &snam, &sarg, &sendarg, &send); 275 if (stype == ESCAPE_EXPAND) 276 goto out_sub; 277 } 278 279 if (term == '\b') { 280 if (stype == ESCAPE_UNDEF) 281 iarg++; 282 if (stype != ESCAPE_EXPAND && stype != ESCAPE_UNDEF) { 283 if (strchr("BHLRSNhlvx", buf[inam]) != NULL && 284 strchr(" ,.0DLOXYZ^abdhlortuvx|~", 285 buf[snam]) != NULL) { 286 err = MANDOCERR_ESC_DELIM; 287 iend = send; 288 iarg = iendarg = sesc; 289 goto out; 290 } 291 escterm = 1; 292 iarg = send; 293 term = buf[snam]; 294 } else if (strchr("BDHLRSvxNhl", buf[inam]) != NULL && 295 strchr(" %&()*+-./0123456789:<=>", buf[iarg]) != NULL) { 296 err = MANDOCERR_ESC_DELIM; 297 if (rval != ESCAPE_EXPAND) 298 rval = ESCAPE_ERROR; 299 if (buf[inam] != 'D') { 300 iendarg = iend = iarg + 1; 301 goto out; 302 } 303 } 304 if (term == '\b') 305 term = buf[iarg++]; 306 } else if (term == '\0' && maxl == INT_MAX) { 307 if (buf[inam] == 'n' && (buf[iarg] == '+' || buf[iarg] == '-')) 308 iarg++; 309 switch (buf[iarg]) { 310 case '(': 311 maxl = 2; 312 iarg++; 313 break; 314 case '[': 315 if (buf[++iarg] == ' ') { 316 iendarg = iend = iarg + 1; 317 err = MANDOCERR_ESC_ARG; 318 rval = ESCAPE_ERROR; 319 goto out; 320 } 321 term = ']'; 322 break; 323 default: 324 maxl = 1; 325 break; 326 } 327 } 328 329 /* Advance to the end of the argument. */ 330 331 valid_A = 1; 332 iendarg = iarg; 333 while (maxl > 0) { 334 if (buf[iendarg] == '\0') { 335 err = MANDOCERR_ESC_INCOMPLETE; 336 if (rval != ESCAPE_EXPAND && 337 rval != ESCAPE_OVERSTRIKE) 338 rval = ESCAPE_ERROR; 339 /* Usually, ignore an incomplete argument. */ 340 if (strchr("Aow", buf[inam]) == NULL) 341 iendarg = iarg; 342 break; 343 } 344 if (escterm == 0 && buf[iendarg] == term) { 345 iend = iendarg + 1; 346 break; 347 } 348 if (buf[iendarg] == buf[iesc]) { 349 stype = roff_escape(buf, ln, iendarg, 350 &sesc, &snam, &sarg, &sendarg, &send); 351 if (stype == ESCAPE_EXPAND) 352 goto out_sub; 353 iend = send; 354 if (escterm == 1 && 355 (buf[snam] == term || buf[inam] == 'N')) 356 break; 357 if (stype != ESCAPE_UNDEF) 358 valid_A = 0; 359 iendarg = send; 360 } else if (buf[inam] == 'N' && 361 isdigit((unsigned char)buf[iendarg]) == 0) { 362 iend = iendarg + 1; 363 break; 364 } else { 365 if (buf[iendarg] == ' ' || buf[iendarg] == '\t') 366 valid_A = 0; 367 if (maxl != INT_MAX) 368 maxl--; 369 iend = ++iendarg; 370 } 371 } 372 373 /* Post-process depending on the content of the argument. */ 374 375 argl = iendarg - iarg; 376 switch (buf[inam]) { 377 case '*': 378 if (resc == NULL && argl == 2 && 379 buf[iarg] == '.' && buf[iarg + 1] == 'T') 380 rval = ESCAPE_DEVICE; 381 break; 382 case 'A': 383 if (valid_A == 0) 384 iendarg = iarg; 385 break; 386 case 'O': 387 switch (buf[iarg]) { 388 case '0': 389 rval = ESCAPE_UNSUPP; 390 break; 391 case '1': 392 case '2': 393 case '3': 394 case '4': 395 if (argl == 1) 396 rval = ESCAPE_IGNORE; 397 else { 398 err = MANDOCERR_ESC_ARG; 399 rval = ESCAPE_ERROR; 400 } 401 break; 402 case '5': 403 if (buf[iarg - 1] == '[') 404 rval = ESCAPE_UNSUPP; 405 else { 406 err = MANDOCERR_ESC_ARG; 407 rval = ESCAPE_ERROR; 408 } 409 break; 410 default: 411 err = MANDOCERR_ESC_ARG; 412 rval = ESCAPE_ERROR; 413 break; 414 } 415 break; 416 default: 417 break; 418 } 419 420 switch (rval) { 421 case ESCAPE_FONT: 422 rval = mandoc_font(buf + iarg, argl); 423 if (rval == ESCAPE_ERROR) 424 err = MANDOCERR_ESC_ARG; 425 break; 426 427 case ESCAPE_SPECIAL: 428 if (argl == 0) { 429 err = MANDOCERR_ESC_BADCHAR; 430 rval = ESCAPE_ERROR; 431 break; 432 } 433 434 /* 435 * The file chars.c only provides one common list of 436 * character names, but \[-] == \- is the only one of 437 * the characters with one-byte names that allows 438 * enclosing the name in brackets. 439 */ 440 441 if (term != '\0' && argl == 1 && buf[iarg] != '-') { 442 err = MANDOCERR_ESC_BADCHAR; 443 rval = ESCAPE_ERROR; 444 break; 445 } 446 447 /* Treat \[char...] as an alias for \N'...'. */ 448 449 if (buf[iarg] == 'c') { 450 if (argl < 6 || argl > 7 || 451 strncmp(buf + iarg, "char", 4) != 0 || 452 (int)strspn(buf + iarg + 4, "0123456789") 453 + 4 < argl) 454 break; 455 c = 0; 456 for (i = iarg; i < iendarg; i++) 457 c = 10 * c + (buf[i] - '0'); 458 if (c < 0x21 || (c > 0x7e && c < 0xa0) || c > 0xff) { 459 err = MANDOCERR_ESC_BADCHAR; 460 break; 461 } 462 iarg += 4; 463 rval = ESCAPE_NUMBERED; 464 break; 465 } 466 467 /* 468 * Unicode escapes are defined in groff as \[u0000] 469 * to \[u10FFFF], where the contained value must be 470 * a valid Unicode codepoint. 471 */ 472 473 if (buf[iarg] != 'u' || argl < 5 || argl > 7) 474 break; 475 if (argl == 7 && /* beyond the Unicode range */ 476 (buf[iarg + 1] != '1' || buf[iarg + 2] != '0')) { 477 err = MANDOCERR_ESC_BADCHAR; 478 break; 479 } 480 if (argl == 6 && buf[iarg + 1] == '0') { 481 err = MANDOCERR_ESC_BADCHAR; 482 break; 483 } 484 if (argl == 5 && /* UTF-16 surrogate */ 485 toupper((unsigned char)buf[iarg + 1]) == 'D' && 486 strchr("89ABCDEFabcdef", buf[iarg + 2]) != NULL) { 487 err = MANDOCERR_ESC_BADCHAR; 488 break; 489 } 490 if ((int)strspn(buf + iarg + 1, "0123456789ABCDEFabcdef") 491 + 1 == argl) 492 rval = ESCAPE_UNICODE; 493 break; 494 default: 495 break; 496 } 497 goto out; 498 499 out_sub: 500 iesc = sesc; 501 inam = snam; 502 iarg = sarg; 503 iendarg = sendarg; 504 iend = send; 505 rval = ESCAPE_EXPAND; 506 507 out: 508 if (resc != NULL) 509 *resc = iesc; 510 if (rnam != NULL) 511 *rnam = inam; 512 if (rarg != NULL) 513 *rarg = iarg; 514 if (rendarg != NULL) 515 *rendarg = iendarg; 516 if (rend != NULL) 517 *rend = iend; 518 if (ln == 0) 519 return rval; 520 521 /* 522 * Diagnostic messages are only issued when called 523 * from the parser, not when called from the formatters. 524 */ 525 526 switch (rval) { 527 case ESCAPE_UNSUPP: 528 err = MANDOCERR_ESC_UNSUPP; 529 break; 530 case ESCAPE_UNDEF: 531 if (buf[inam] != '\\' && buf[inam] != '.') 532 err = MANDOCERR_ESC_UNDEF; 533 break; 534 case ESCAPE_SPECIAL: 535 if (mchars_spec2cp(buf + iarg, argl) >= 0) 536 err = MANDOCERR_OK; 537 else if (err == MANDOCERR_OK) 538 err = MANDOCERR_ESC_UNKCHAR; 539 break; 540 default: 541 break; 542 } 543 if (err != MANDOCERR_OK) 544 mandoc_msg(err, ln, iesc, "%.*s", iend - iesc, buf + iesc); 545 return rval; 546 } 547