1 /* $Id: mandoc.c,v 1.92 2015/02/20 23:55:10 schwarze Exp $ */ 2 /* 3 * Copyright (c) 2008-2011, 2014 Kristaps Dzonsons <kristaps@bsd.lv> 4 * Copyright (c) 2011-2015 Ingo Schwarze <schwarze@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 #include "config.h" 19 20 #include <sys/types.h> 21 22 #include <assert.h> 23 #include <ctype.h> 24 #include <errno.h> 25 #include <limits.h> 26 #include <stdlib.h> 27 #include <stdio.h> 28 #include <string.h> 29 #include <time.h> 30 31 #include "mandoc.h" 32 #include "mandoc_aux.h" 33 #include "libmandoc.h" 34 35 #define DATESIZE 32 36 37 static int a2time(time_t *, const char *, const char *); 38 static char *time2a(time_t); 39 40 41 enum mandoc_esc 42 mandoc_escape(const char **end, const char **start, int *sz) 43 { 44 const char *local_start; 45 int local_sz; 46 char term; 47 enum mandoc_esc gly; 48 49 /* 50 * When the caller doesn't provide return storage, 51 * use local storage. 52 */ 53 54 if (NULL == start) 55 start = &local_start; 56 if (NULL == sz) 57 sz = &local_sz; 58 59 /* 60 * Beyond the backslash, at least one input character 61 * is part of the escape sequence. With one exception 62 * (see below), that character won't be returned. 63 */ 64 65 gly = ESCAPE_ERROR; 66 *start = ++*end; 67 *sz = 0; 68 term = '\0'; 69 70 switch ((*start)[-1]) { 71 /* 72 * First the glyphs. There are several different forms of 73 * these, but each eventually returns a substring of the glyph 74 * name. 75 */ 76 case '(': 77 gly = ESCAPE_SPECIAL; 78 *sz = 2; 79 break; 80 case '[': 81 gly = ESCAPE_SPECIAL; 82 term = ']'; 83 break; 84 case 'C': 85 if ('\'' != **start) 86 return(ESCAPE_ERROR); 87 *start = ++*end; 88 gly = ESCAPE_SPECIAL; 89 term = '\''; 90 break; 91 92 /* 93 * Escapes taking no arguments at all. 94 */ 95 case 'd': 96 /* FALLTHROUGH */ 97 case 'u': 98 return(ESCAPE_IGNORE); 99 100 /* 101 * The \z escape is supposed to output the following 102 * character without advancing the cursor position. 103 * Since we are mostly dealing with terminal mode, 104 * let us just skip the next character. 105 */ 106 case 'z': 107 return(ESCAPE_SKIPCHAR); 108 109 /* 110 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where 111 * 'X' is the trigger. These have opaque sub-strings. 112 */ 113 case 'F': 114 /* FALLTHROUGH */ 115 case 'g': 116 /* FALLTHROUGH */ 117 case 'k': 118 /* FALLTHROUGH */ 119 case 'M': 120 /* FALLTHROUGH */ 121 case 'm': 122 /* FALLTHROUGH */ 123 case 'n': 124 /* FALLTHROUGH */ 125 case 'V': 126 /* FALLTHROUGH */ 127 case 'Y': 128 gly = ESCAPE_IGNORE; 129 /* FALLTHROUGH */ 130 case 'f': 131 if (ESCAPE_ERROR == gly) 132 gly = ESCAPE_FONT; 133 switch (**start) { 134 case '(': 135 *start = ++*end; 136 *sz = 2; 137 break; 138 case '[': 139 *start = ++*end; 140 term = ']'; 141 break; 142 default: 143 *sz = 1; 144 break; 145 } 146 break; 147 148 /* 149 * These escapes are of the form \X'Y', where 'X' is the trigger 150 * and 'Y' is any string. These have opaque sub-strings. 151 * The \B and \w escapes are handled in roff.c, roff_res(). 152 */ 153 case 'A': 154 /* FALLTHROUGH */ 155 case 'b': 156 /* FALLTHROUGH */ 157 case 'D': 158 /* FALLTHROUGH */ 159 case 'R': 160 /* FALLTHROUGH */ 161 case 'X': 162 /* FALLTHROUGH */ 163 case 'Z': 164 gly = ESCAPE_IGNORE; 165 /* FALLTHROUGH */ 166 case 'o': 167 if (**start == '\0') 168 return(ESCAPE_ERROR); 169 if (gly == ESCAPE_ERROR) 170 gly = ESCAPE_OVERSTRIKE; 171 term = **start; 172 *start = ++*end; 173 break; 174 175 /* 176 * These escapes are of the form \X'N', where 'X' is the trigger 177 * and 'N' resolves to a numerical expression. 178 */ 179 case 'h': 180 /* FALLTHROUGH */ 181 case 'H': 182 /* FALLTHROUGH */ 183 case 'L': 184 /* FALLTHROUGH */ 185 case 'l': 186 /* FALLTHROUGH */ 187 case 'S': 188 /* FALLTHROUGH */ 189 case 'v': 190 /* FALLTHROUGH */ 191 case 'x': 192 if (strchr(" %&()*+-./0123456789:<=>", **start)) { 193 if ('\0' != **start) 194 ++*end; 195 return(ESCAPE_ERROR); 196 } 197 gly = ESCAPE_IGNORE; 198 term = **start; 199 *start = ++*end; 200 break; 201 202 /* 203 * Special handling for the numbered character escape. 204 * XXX Do any other escapes need similar handling? 205 */ 206 case 'N': 207 if ('\0' == **start) 208 return(ESCAPE_ERROR); 209 (*end)++; 210 if (isdigit((unsigned char)**start)) { 211 *sz = 1; 212 return(ESCAPE_IGNORE); 213 } 214 (*start)++; 215 while (isdigit((unsigned char)**end)) 216 (*end)++; 217 *sz = *end - *start; 218 if ('\0' != **end) 219 (*end)++; 220 return(ESCAPE_NUMBERED); 221 222 /* 223 * Sizes get a special category of their own. 224 */ 225 case 's': 226 gly = ESCAPE_IGNORE; 227 228 /* See +/- counts as a sign. */ 229 if ('+' == **end || '-' == **end || ASCII_HYPH == **end) 230 *start = ++*end; 231 232 switch (**end) { 233 case '(': 234 *start = ++*end; 235 *sz = 2; 236 break; 237 case '[': 238 *start = ++*end; 239 term = ']'; 240 break; 241 case '\'': 242 *start = ++*end; 243 term = '\''; 244 break; 245 case '3': 246 /* FALLTHROUGH */ 247 case '2': 248 /* FALLTHROUGH */ 249 case '1': 250 *sz = (*end)[-1] == 's' && 251 isdigit((unsigned char)(*end)[1]) ? 2 : 1; 252 break; 253 default: 254 *sz = 1; 255 break; 256 } 257 258 break; 259 260 /* 261 * Anything else is assumed to be a glyph. 262 * In this case, pass back the character after the backslash. 263 */ 264 default: 265 gly = ESCAPE_SPECIAL; 266 *start = --*end; 267 *sz = 1; 268 break; 269 } 270 271 assert(ESCAPE_ERROR != gly); 272 273 /* 274 * Read up to the terminating character, 275 * paying attention to nested escapes. 276 */ 277 278 if ('\0' != term) { 279 while (**end != term) { 280 switch (**end) { 281 case '\0': 282 return(ESCAPE_ERROR); 283 case '\\': 284 (*end)++; 285 if (ESCAPE_ERROR == 286 mandoc_escape(end, NULL, NULL)) 287 return(ESCAPE_ERROR); 288 break; 289 default: 290 (*end)++; 291 break; 292 } 293 } 294 *sz = (*end)++ - *start; 295 } else { 296 assert(*sz > 0); 297 if ((size_t)*sz > strlen(*start)) 298 return(ESCAPE_ERROR); 299 *end += *sz; 300 } 301 302 /* Run post-processors. */ 303 304 switch (gly) { 305 case ESCAPE_FONT: 306 if (2 == *sz) { 307 if ('C' == **start) { 308 /* 309 * Treat constant-width font modes 310 * just like regular font modes. 311 */ 312 (*start)++; 313 (*sz)--; 314 } else { 315 if ('B' == (*start)[0] && 'I' == (*start)[1]) 316 gly = ESCAPE_FONTBI; 317 break; 318 } 319 } else if (1 != *sz) 320 break; 321 322 switch (**start) { 323 case '3': 324 /* FALLTHROUGH */ 325 case 'B': 326 gly = ESCAPE_FONTBOLD; 327 break; 328 case '2': 329 /* FALLTHROUGH */ 330 case 'I': 331 gly = ESCAPE_FONTITALIC; 332 break; 333 case 'P': 334 gly = ESCAPE_FONTPREV; 335 break; 336 case '1': 337 /* FALLTHROUGH */ 338 case 'R': 339 gly = ESCAPE_FONTROMAN; 340 break; 341 } 342 break; 343 case ESCAPE_SPECIAL: 344 if (1 == *sz && 'c' == **start) 345 gly = ESCAPE_NOSPACE; 346 /* 347 * Unicode escapes are defined in groff as \[u0000] 348 * to \[u10FFFF], where the contained value must be 349 * a valid Unicode codepoint. Here, however, only 350 * check the length and range. 351 */ 352 if (**start != 'u' || *sz < 5 || *sz > 7) 353 break; 354 if (*sz == 7 && ((*start)[1] != '1' || (*start)[2] != '0')) 355 break; 356 if (*sz == 6 && (*start)[1] == '0') 357 break; 358 if ((int)strspn(*start + 1, "0123456789ABCDEFabcdef") 359 + 1 == *sz) 360 gly = ESCAPE_UNICODE; 361 break; 362 default: 363 break; 364 } 365 366 return(gly); 367 } 368 369 /* 370 * Parse a quoted or unquoted roff-style request or macro argument. 371 * Return a pointer to the parsed argument, which is either the original 372 * pointer or advanced by one byte in case the argument is quoted. 373 * NUL-terminate the argument in place. 374 * Collapse pairs of quotes inside quoted arguments. 375 * Advance the argument pointer to the next argument, 376 * or to the NUL byte terminating the argument line. 377 */ 378 char * 379 mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos) 380 { 381 char *start, *cp; 382 int quoted, pairs, white; 383 384 /* Quoting can only start with a new word. */ 385 start = *cpp; 386 quoted = 0; 387 if ('"' == *start) { 388 quoted = 1; 389 start++; 390 } 391 392 pairs = 0; 393 white = 0; 394 for (cp = start; '\0' != *cp; cp++) { 395 396 /* 397 * Move the following text left 398 * after quoted quotes and after "\\" and "\t". 399 */ 400 if (pairs) 401 cp[-pairs] = cp[0]; 402 403 if ('\\' == cp[0]) { 404 /* 405 * In copy mode, translate double to single 406 * backslashes and backslash-t to literal tabs. 407 */ 408 switch (cp[1]) { 409 case 't': 410 cp[0] = '\t'; 411 /* FALLTHROUGH */ 412 case '\\': 413 pairs++; 414 cp++; 415 break; 416 case ' ': 417 /* Skip escaped blanks. */ 418 if (0 == quoted) 419 cp++; 420 break; 421 default: 422 break; 423 } 424 } else if (0 == quoted) { 425 if (' ' == cp[0]) { 426 /* Unescaped blanks end unquoted args. */ 427 white = 1; 428 break; 429 } 430 } else if ('"' == cp[0]) { 431 if ('"' == cp[1]) { 432 /* Quoted quotes collapse. */ 433 pairs++; 434 cp++; 435 } else { 436 /* Unquoted quotes end quoted args. */ 437 quoted = 2; 438 break; 439 } 440 } 441 } 442 443 /* Quoted argument without a closing quote. */ 444 if (1 == quoted) 445 mandoc_msg(MANDOCERR_ARG_QUOTE, parse, ln, *pos, NULL); 446 447 /* NUL-terminate this argument and move to the next one. */ 448 if (pairs) 449 cp[-pairs] = '\0'; 450 if ('\0' != *cp) { 451 *cp++ = '\0'; 452 while (' ' == *cp) 453 cp++; 454 } 455 *pos += (int)(cp - start) + (quoted ? 1 : 0); 456 *cpp = cp; 457 458 if ('\0' == *cp && (white || ' ' == cp[-1])) 459 mandoc_msg(MANDOCERR_SPACE_EOL, parse, ln, *pos, NULL); 460 461 return(start); 462 } 463 464 static int 465 a2time(time_t *t, const char *fmt, const char *p) 466 { 467 struct tm tm; 468 char *pp; 469 470 memset(&tm, 0, sizeof(struct tm)); 471 472 pp = NULL; 473 #if HAVE_STRPTIME 474 pp = strptime(p, fmt, &tm); 475 #endif 476 if (NULL != pp && '\0' == *pp) { 477 *t = mktime(&tm); 478 return(1); 479 } 480 481 return(0); 482 } 483 484 static char * 485 time2a(time_t t) 486 { 487 struct tm *tm; 488 char *buf, *p; 489 size_t ssz; 490 int isz; 491 492 tm = localtime(&t); 493 if (tm == NULL) 494 return(NULL); 495 496 /* 497 * Reserve space: 498 * up to 9 characters for the month (September) + blank 499 * up to 2 characters for the day + comma + blank 500 * 4 characters for the year and a terminating '\0' 501 */ 502 p = buf = mandoc_malloc(10 + 4 + 4 + 1); 503 504 if (0 == (ssz = strftime(p, 10 + 1, "%B ", tm))) 505 goto fail; 506 p += (int)ssz; 507 508 if (-1 == (isz = snprintf(p, 4 + 1, "%d, ", tm->tm_mday))) 509 goto fail; 510 p += isz; 511 512 if (0 == strftime(p, 4 + 1, "%Y", tm)) 513 goto fail; 514 return(buf); 515 516 fail: 517 free(buf); 518 return(NULL); 519 } 520 521 char * 522 mandoc_normdate(struct mparse *parse, char *in, int ln, int pos) 523 { 524 char *out; 525 time_t t; 526 527 if (NULL == in || '\0' == *in || 528 0 == strcmp(in, "$" "Mdocdate$")) { 529 mandoc_msg(MANDOCERR_DATE_MISSING, parse, ln, pos, NULL); 530 time(&t); 531 } 532 else if (a2time(&t, "%Y-%m-%d", in)) 533 t = 0; 534 else if (!a2time(&t, "$" "Mdocdate: %b %d %Y $", in) && 535 !a2time(&t, "%b %d, %Y", in)) { 536 mandoc_msg(MANDOCERR_DATE_BAD, parse, ln, pos, in); 537 t = 0; 538 } 539 out = t ? time2a(t) : NULL; 540 return(out ? out : mandoc_strdup(in)); 541 } 542 543 int 544 mandoc_eos(const char *p, size_t sz) 545 { 546 const char *q; 547 int enclosed, found; 548 549 if (0 == sz) 550 return(0); 551 552 /* 553 * End-of-sentence recognition must include situations where 554 * some symbols, such as `)', allow prior EOS punctuation to 555 * propagate outward. 556 */ 557 558 enclosed = found = 0; 559 for (q = p + (int)sz - 1; q >= p; q--) { 560 switch (*q) { 561 case '\"': 562 /* FALLTHROUGH */ 563 case '\'': 564 /* FALLTHROUGH */ 565 case ']': 566 /* FALLTHROUGH */ 567 case ')': 568 if (0 == found) 569 enclosed = 1; 570 break; 571 case '.': 572 /* FALLTHROUGH */ 573 case '!': 574 /* FALLTHROUGH */ 575 case '?': 576 found = 1; 577 break; 578 default: 579 return(found && (!enclosed || isalnum((unsigned char)*q))); 580 } 581 } 582 583 return(found && !enclosed); 584 } 585 586 /* 587 * Convert a string to a long that may not be <0. 588 * If the string is invalid, or is less than 0, return -1. 589 */ 590 int 591 mandoc_strntoi(const char *p, size_t sz, int base) 592 { 593 char buf[32]; 594 char *ep; 595 long v; 596 597 if (sz > 31) 598 return(-1); 599 600 memcpy(buf, p, sz); 601 buf[(int)sz] = '\0'; 602 603 errno = 0; 604 v = strtol(buf, &ep, base); 605 606 if (buf[0] == '\0' || *ep != '\0') 607 return(-1); 608 609 if (v > INT_MAX) 610 v = INT_MAX; 611 if (v < INT_MIN) 612 v = INT_MIN; 613 614 return((int)v); 615 } 616