1 /* $Id: preconv.c,v 1.16 2017/02/18 13:43:52 schwarze Exp $ */ 2 /* 3 * Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv> 4 * Copyright (c) 2014 Ingo Schwarze <schwarze@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 #include "config.h" 19 20 #include <sys/types.h> 21 22 #include <assert.h> 23 #include <stdio.h> 24 #include <string.h> 25 #include "mandoc.h" 26 #include "libmandoc.h" 27 28 int 29 preconv_encode(const struct buf *ib, size_t *ii, struct buf *ob, size_t *oi, 30 int *filenc) 31 { 32 const unsigned char *cu; 33 int nby; 34 unsigned int accum; 35 36 cu = (const unsigned char *)ib->buf + *ii; 37 assert(*cu & 0x80); 38 39 if ( ! (*filenc & MPARSE_UTF8)) 40 goto latin; 41 42 nby = 1; 43 while (nby < 5 && *cu & (1 << (7 - nby))) 44 nby++; 45 46 switch (nby) { 47 case 2: 48 accum = *cu & 0x1f; 49 if (accum < 0x02) /* Obfuscated ASCII. */ 50 goto latin; 51 break; 52 case 3: 53 accum = *cu & 0x0f; 54 break; 55 case 4: 56 accum = *cu & 0x07; 57 if (accum > 0x04) /* Beyond Unicode. */ 58 goto latin; 59 break; 60 default: /* Bad sequence header. */ 61 goto latin; 62 } 63 64 cu++; 65 switch (nby) { 66 case 3: 67 if ((accum == 0x00 && ! (*cu & 0x20)) || /* Use 2-byte. */ 68 (accum == 0x0d && *cu & 0x20)) /* Surrogates. */ 69 goto latin; 70 break; 71 case 4: 72 if ((accum == 0x00 && ! (*cu & 0x30)) || /* Use 3-byte. */ 73 (accum == 0x04 && *cu & 0x30)) /* Beyond Unicode. */ 74 goto latin; 75 break; 76 default: 77 break; 78 } 79 80 while (--nby) { 81 if ((*cu & 0xc0) != 0x80) /* Invalid continuation. */ 82 goto latin; 83 accum <<= 6; 84 accum += *cu & 0x3f; 85 cu++; 86 } 87 88 assert(accum > 0x7f); 89 assert(accum < 0x110000); 90 assert(accum < 0xd800 || accum > 0xdfff); 91 92 *oi += snprintf(ob->buf + *oi, 11, "\\[u%.4X]", accum); 93 *ii = (const char *)cu - ib->buf; 94 *filenc &= ~MPARSE_LATIN1; 95 return 1; 96 97 latin: 98 if ( ! (*filenc & MPARSE_LATIN1)) 99 return 0; 100 101 *oi += snprintf(ob->buf + *oi, 11, 102 "\\[u%.4X]", (unsigned char)ib->buf[(*ii)++]); 103 104 *filenc &= ~MPARSE_UTF8; 105 return 1; 106 } 107 108 int 109 preconv_cue(const struct buf *b, size_t offset) 110 { 111 const char *ln, *eoln, *eoph; 112 size_t sz, phsz; 113 114 ln = b->buf + offset; 115 sz = b->sz - offset; 116 117 /* Look for the end-of-line. */ 118 119 if (NULL == (eoln = memchr(ln, '\n', sz))) 120 eoln = ln + sz; 121 122 /* Check if we have the correct header/trailer. */ 123 124 if ((sz = (size_t)(eoln - ln)) < 10 || 125 memcmp(ln, ".\\\" -*-", 7) || memcmp(eoln - 3, "-*-", 3)) 126 return MPARSE_UTF8 | MPARSE_LATIN1; 127 128 /* Move after the header and adjust for the trailer. */ 129 130 ln += 7; 131 sz -= 10; 132 133 while (sz > 0) { 134 while (sz > 0 && ' ' == *ln) { 135 ln++; 136 sz--; 137 } 138 if (0 == sz) 139 break; 140 141 /* Find the end-of-phrase marker (or eoln). */ 142 143 if (NULL == (eoph = memchr(ln, ';', sz))) 144 eoph = eoln - 3; 145 else 146 eoph++; 147 148 /* Only account for the "coding" phrase. */ 149 150 if ((phsz = eoph - ln) < 7 || 151 strncasecmp(ln, "coding:", 7)) { 152 sz -= phsz; 153 ln += phsz; 154 continue; 155 } 156 157 sz -= 7; 158 ln += 7; 159 160 while (sz > 0 && ' ' == *ln) { 161 ln++; 162 sz--; 163 } 164 if (0 == sz) 165 return 0; 166 167 /* Check us against known encodings. */ 168 169 if (phsz > 4 && !strncasecmp(ln, "utf-8", 5)) 170 return MPARSE_UTF8; 171 if (phsz > 10 && !strncasecmp(ln, "iso-latin-1", 11)) 172 return MPARSE_LATIN1; 173 return 0; 174 } 175 return MPARSE_UTF8 | MPARSE_LATIN1; 176 } 177