1 /* $Id: preconv.c,v 1.17 2018/12/13 11:55:47 schwarze Exp $ */ 2 /* 3 * Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv> 4 * Copyright (c) 2014 Ingo Schwarze <schwarze@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 #include "config.h" 19 20 #include <sys/types.h> 21 22 #include <assert.h> 23 #include <stdio.h> 24 #include <string.h> 25 26 #include "mandoc.h" 27 #include "roff.h" 28 #include "mandoc_parse.h" 29 #include "libmandoc.h" 30 31 int 32 preconv_encode(const struct buf *ib, size_t *ii, struct buf *ob, size_t *oi, 33 int *filenc) 34 { 35 const unsigned char *cu; 36 int nby; 37 unsigned int accum; 38 39 cu = (const unsigned char *)ib->buf + *ii; 40 assert(*cu & 0x80); 41 42 if ( ! (*filenc & MPARSE_UTF8)) 43 goto latin; 44 45 nby = 1; 46 while (nby < 5 && *cu & (1 << (7 - nby))) 47 nby++; 48 49 switch (nby) { 50 case 2: 51 accum = *cu & 0x1f; 52 if (accum < 0x02) /* Obfuscated ASCII. */ 53 goto latin; 54 break; 55 case 3: 56 accum = *cu & 0x0f; 57 break; 58 case 4: 59 accum = *cu & 0x07; 60 if (accum > 0x04) /* Beyond Unicode. */ 61 goto latin; 62 break; 63 default: /* Bad sequence header. */ 64 goto latin; 65 } 66 67 cu++; 68 switch (nby) { 69 case 3: 70 if ((accum == 0x00 && ! (*cu & 0x20)) || /* Use 2-byte. */ 71 (accum == 0x0d && *cu & 0x20)) /* Surrogates. */ 72 goto latin; 73 break; 74 case 4: 75 if ((accum == 0x00 && ! (*cu & 0x30)) || /* Use 3-byte. */ 76 (accum == 0x04 && *cu & 0x30)) /* Beyond Unicode. */ 77 goto latin; 78 break; 79 default: 80 break; 81 } 82 83 while (--nby) { 84 if ((*cu & 0xc0) != 0x80) /* Invalid continuation. */ 85 goto latin; 86 accum <<= 6; 87 accum += *cu & 0x3f; 88 cu++; 89 } 90 91 assert(accum > 0x7f); 92 assert(accum < 0x110000); 93 assert(accum < 0xd800 || accum > 0xdfff); 94 95 *oi += snprintf(ob->buf + *oi, 11, "\\[u%.4X]", accum); 96 *ii = (const char *)cu - ib->buf; 97 *filenc &= ~MPARSE_LATIN1; 98 return 1; 99 100 latin: 101 if ( ! (*filenc & MPARSE_LATIN1)) 102 return 0; 103 104 *oi += snprintf(ob->buf + *oi, 11, 105 "\\[u%.4X]", (unsigned char)ib->buf[(*ii)++]); 106 107 *filenc &= ~MPARSE_UTF8; 108 return 1; 109 } 110 111 int 112 preconv_cue(const struct buf *b, size_t offset) 113 { 114 const char *ln, *eoln, *eoph; 115 size_t sz, phsz; 116 117 ln = b->buf + offset; 118 sz = b->sz - offset; 119 120 /* Look for the end-of-line. */ 121 122 if (NULL == (eoln = memchr(ln, '\n', sz))) 123 eoln = ln + sz; 124 125 /* Check if we have the correct header/trailer. */ 126 127 if ((sz = (size_t)(eoln - ln)) < 10 || 128 memcmp(ln, ".\\\" -*-", 7) || memcmp(eoln - 3, "-*-", 3)) 129 return MPARSE_UTF8 | MPARSE_LATIN1; 130 131 /* Move after the header and adjust for the trailer. */ 132 133 ln += 7; 134 sz -= 10; 135 136 while (sz > 0) { 137 while (sz > 0 && ' ' == *ln) { 138 ln++; 139 sz--; 140 } 141 if (0 == sz) 142 break; 143 144 /* Find the end-of-phrase marker (or eoln). */ 145 146 if (NULL == (eoph = memchr(ln, ';', sz))) 147 eoph = eoln - 3; 148 else 149 eoph++; 150 151 /* Only account for the "coding" phrase. */ 152 153 if ((phsz = eoph - ln) < 7 || 154 strncasecmp(ln, "coding:", 7)) { 155 sz -= phsz; 156 ln += phsz; 157 continue; 158 } 159 160 sz -= 7; 161 ln += 7; 162 163 while (sz > 0 && ' ' == *ln) { 164 ln++; 165 sz--; 166 } 167 if (0 == sz) 168 return 0; 169 170 /* Check us against known encodings. */ 171 172 if (phsz > 4 && !strncasecmp(ln, "utf-8", 5)) 173 return MPARSE_UTF8; 174 if (phsz > 10 && !strncasecmp(ln, "iso-latin-1", 11)) 175 return MPARSE_LATIN1; 176 return 0; 177 } 178 return MPARSE_UTF8 | MPARSE_LATIN1; 179 } 180