/*	$Id: preconv.c,v 1.6 2013/06/02 03:52:21 schwarze Exp $ */
/*
 * Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#ifdef HAVE_MMAP
#include <sys/stat.h>
#include <sys/mman.h>
#endif

#include <assert.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <unistd.h>

/*
 * The read_whole_file() and resize_buf() functions are copied from
 * read.c, including all dependency code.
 */

/* Input encodings understood by this program; indexes into encs[]. */
enum	enc {
	ENC_UTF_8, /* UTF-8 */
	ENC_US_ASCII, /* US-ASCII */
	ENC_LATIN_1, /* Latin-1 */
	ENC__MAX
};

/* The complete input, plus the offset at which conversion starts. */
struct	buf {
	char		 *buf; /* binary input buffer */
	size_t		  sz; /* size of binary buffer */
	size_t		  offs; /* starting buffer offset */
};

/* Pairs an encoding name with the routine that converts from it. */
struct	encode {
	const char	 *name;
	int		(*conv)(const struct buf *);
};

static	int	 cue_enc(const struct buf *, size_t *, enum enc *);
static	int	 conv_latin_1(const struct buf *);
static	int	 conv_us_ascii(const struct buf *);
static	int	 conv_utf_8(const struct buf *);
static	int	 read_whole_file(const char *, int,
			struct buf *, int *);
static	void	 resize_buf(struct buf *, size_t);
static	void	 usage(void);

static	const struct encode encs[ENC__MAX] = {
	{ "utf-8", conv_utf_8 }, /* ENC_UTF_8 */
	{ "us-ascii", conv_us_ascii }, /* ENC_US_ASCII */
	{ "latin-1", conv_latin_1 }, /* ENC_LATIN_1 */
};

static	const char	 *progname;

/*
 * Print the command-line synopsis to standard error.
 */
static void
usage(void)
{

	fprintf(stderr, "usage: %s "
			"[-D enc] "
			"[-e ENC] "
			"[file]\n", progname);
}

/*
 * Convert Latin-1 input, starting at b->offs, to standard output.
 * Always returns 1: every byte value is acceptable.
 */
static int
conv_latin_1(const struct buf *b)
{
	size_t		 i;
	unsigned char	 cu;

	/*
	 * Latin-1 falls into the first 256 code-points of Unicode, so
	 * there's no need for any sort of translation.  Just make the
	 * 8-bit characters use the Unicode escape.
	 * Note that binary values 128 <= v < 160 are passed through
	 * unmodified to mandoc.
	 */

	for (i = b->offs; i < b->sz; i++) {
		cu = (unsigned char)b->buf[i];
		if (cu < 160U)
			putchar(cu);
		else
			printf("\\[u%.4X]", (unsigned int)cu);
	}

	return(1);
}

/*
 * Copy US-ASCII input, starting at b->offs, to standard output.
 * Always returns 1.
 */
static int
conv_us_ascii(const struct buf *b)
{

	/*
	 * US-ASCII has no conversion since it falls into the first 128
	 * bytes of Unicode.
	 * Honour the starting offset like the other converters do
	 * instead of always writing from the beginning of the buffer.
	 */

	if (b->offs < b->sz)
		fwrite(b->buf + b->offs, 1, b->sz - b->offs, stdout);

	return(1);
}

/*
 * Convert UTF-8 input, starting at b->offs, to standard output.
 * ASCII bytes are copied; every multi-byte sequence is written as a
 * \[uXXXX] escape.
 * Returns 1 on success and 0 as soon as a malformed sequence is seen;
 * output written before the bad sequence is not retracted.
 */
static int
conv_utf_8(const struct buf *b)
{
	int		 state; /* continuation bytes still expected */
	unsigned int	 accum; /* code point being assembled */
	unsigned int	 min; /* smallest value legal for this length */
	size_t		 i;
	unsigned char	 cu;

	state = 0;
	accum = 0U;
	min = 0U;

	for (i = b->offs; i < b->sz; i++) {
		cu = (unsigned char)b->buf[i];

		if (state) {
			/* Continuation bytes must look like 10xxxxxx. */

			if (0x80 != (cu & 0xC0))
				return(0);

			/*
			 * The code point is assembled with shifts, which
			 * work on values and not on memory layout, so the
			 * result is already in native byte order and must
			 * not be byte-swapped on big-endian machines.
			 */

			accum |= (unsigned int)(cu & 0x3F) << (--state * 6);
			if (state)
				continue;

			/*
			 * Reject overlong forms, UTF-16 surrogates, and
			 * values beyond the Unicode range.
			 */

			if (accum < min || accum > 0x10FFFFU ||
			    (accum >= 0xD800U && accum <= 0xDFFFU))
				return(0);

			printf("\\[u%.4X]", accum);
			accum = 0U;
			continue;
		}

		if (cu < 0x80) {
			putchar(cu);
			continue;
		}

		/*
		 * Entering a UTF-8 sequence: derive the number of
		 * continuation bytes from the lead byte, accepting
		 * only legitimate bit patterns.
		 */

		if (cu >= 0xF0 && cu <= 0xF4) {
			accum = (unsigned int)(cu & 0x07) << 18;
			min = 0x10000U;
			state = 3;
		} else if (cu >= 0xE0 && cu <= 0xEF) {
			accum = (unsigned int)(cu & 0x0F) << 12;
			min = 0x800U;
			state = 2;
		} else if (cu >= 0xC2 && cu <= 0xDF) {
			accum = (unsigned int)(cu & 0x1F) << 6;
			min = 0x80U;
			state = 1;
		} else {
			/* Stray continuation byte or bad lead byte. */
			return(0);
		}
	}

	/* A sequence cut short by the end of input is malformed. */

	return(0 == state);
}

/*
 * Grow the buffer: to "initial" bytes if it is at most half that size,
 * otherwise to twice its current size.
 * Exits the program if memory cannot be allocated.
 */
static void
resize_buf(struct buf *buf, size_t initial)
{
	void		*p;

	buf->sz = buf->sz > initial / 2 ?
		2 * buf->sz : initial;

	p = realloc(buf->buf, buf->sz);
	if (NULL == p) {
		perror(NULL);
		exit(EXIT_FAILURE);
	}
	buf->buf = p;
}

/*
 * Read the whole content of the descriptor fd into fb.
 * The name f is used for error messages only.
 * On success, return 1 and set *with_mmap to 1 if fb->buf is a memory
 * mapping (to be released with munmap) or to 0 if it was allocated
 * (to be released with free).
 * On failure, print a message to standard error and return 0.
 */
static int
read_whole_file(const char *f, int fd,
		struct buf *fb, int *with_mmap)
{
	size_t		 off;
	ssize_t		 ssz;

#ifdef HAVE_MMAP
	struct stat	 st;

	if (-1 == fstat(fd, &st)) {
		perror(f);
		return(0);
	}

	/*
	 * If we're a regular file, try just reading in the whole entry
	 * via mmap().  This is faster than reading it into blocks, and
	 * since each file is only a few bytes to begin with, I'm not
	 * concerned that this is going to tank any machines.
	 */

	if (S_ISREG(st.st_mode)) {
		if (st.st_size >= (1U << 31)) {
			fprintf(stderr, "%s: input too large\n", f);
			return(0);
		}

		/* An empty file cannot be mapped; use read() below. */

		if (st.st_size > 0) {
			fb->sz = (size_t)st.st_size;
			fb->buf = mmap(NULL, fb->sz, PROT_READ,
					MAP_SHARED, fd, 0);
			if (fb->buf != MAP_FAILED) {
				*with_mmap = 1;
				return(1);
			}
		}
	}
#endif

	/*
	 * If this isn't a regular file (like, say, stdin), or if the
	 * mapping failed, then we must go the old way and just read
	 * things in bit by bit.
	 */

	*with_mmap = 0;
	off = 0;
	fb->sz = 0;
	fb->buf = NULL;
	for (;;) {
		if (off == fb->sz && fb->sz == (1U << 31)) {
			fprintf(stderr, "%s: input too large\n", f);
			break;
		}

		if (off == fb->sz)
			resize_buf(fb, 65536);

		ssz = read(fd, fb->buf + off, fb->sz - off);
		if (ssz == 0) {
			/* End of file: trim to the bytes actually read. */
			fb->sz = off;
			return(1);
		}
		if (ssz == -1) {
			perror(f);
			break;
		}
		off += (size_t)ssz;
	}

	free(fb->buf);
	fb->buf = NULL;
	return(0);
}

/*
 * Inspect the line starting at *offs for a cue of the form
 *	.\" -*- [phrase;]... coding: NAME[;] ... -*-
 * and advance *offs to the start of the following line.
 * Returns -1 if no newline is left in the buffer, 0 if the line holds
 * no "coding:" phrase, and 1 if it does.  In the last case, *enc is
 * set to the encoding, or to ENC__MAX if the name is not known.
 */
static int
cue_enc(const struct buf *b, size_t *offs, enum enc *enc)
{
	const char	*ln, *eoln, *eoph;
	size_t		 sz, phsz, vsz, nsz;
	int		 i;

	ln = b->buf + *offs;
	sz = b->sz - *offs;

	/* Look for the end-of-line. */

	if (NULL == (eoln = memchr(ln, '\n', sz)))
		return(-1);

	/* Set next-line marker. */

	*offs = (size_t)((eoln + 1) - b->buf);

	/* Check if we have the correct header/trailer. */

	if ((sz = (size_t)(eoln - ln)) < 10 ||
	    memcmp(ln, ".\\\" -*-", 7) ||
	    memcmp(eoln - 3, "-*-", 3))
		return(0);

	/* Move after the header and adjust for the trailer. */

	ln += 7;
	sz -= 10;

	while (sz > 0) {
		while (sz > 0 && ' ' == *ln) {
			ln++;
			sz--;
		}
		if (0 == sz)
			break;

		/* Find the end-of-phrase marker (or eoln). */

		if (NULL == (eoph = memchr(ln, ';', sz)))
			eoph = eoln - 3;
		else
			eoph++;

		/* Only account for the "coding" phrase. */

		if ((phsz = (size_t)(eoph - ln)) < 7 ||
		    strncasecmp(ln, "coding:", 7)) {
			sz -= phsz;
			ln += phsz;
			continue;
		}

		sz -= 7;
		ln += 7;

		while (sz > 0 && ' ' == *ln) {
			ln++;
			sz--;
		}
		if (0 == sz)
			break;

		/*
		 * Measure the value: it ends at a blank, at the
		 * semicolon, or at the end of the phrase.
		 */

		vsz = 0;
		while (ln + vsz < eoph &&
		    ' ' != ln[vsz] && ';' != ln[vsz])
			vsz++;

		/*
		 * Check us against known encodings, never comparing
		 * more bytes than the value holds.  A prefix match is
		 * kept, so a suffixed value such as "utf-8-unix" still
		 * selects utf-8.
		 */

		for (i = 0; i < (int)ENC__MAX; i++) {
			nsz = strlen(encs[i].name);
			if (vsz < nsz)
				continue;
			if (strncasecmp(ln, encs[i].name, nsz))
				continue;

			*enc = (enum enc)i;
			return(1);
		}

		/* Unknown encoding. */

		*enc = ENC__MAX;
		return(1);
	}

	return(0);
}

/*
 * Read a file (or standard input), determine its encoding, and write
 * it to standard output with non-ASCII characters replaced by
 * \[uXXXX] escapes.
 * The encoding is taken, in order of preference, from the -e option,
 * a UTF-8 byte order mark, a "-*-" cue on one of the first two lines,
 * the -D option, and finally defaults to Latin-1.
 */
int
main(int argc, char *argv[])
{
	static const unsigned char bom[3] = { 0xEF, 0xBB, 0xBF };
	int		 i, ch, map, fd, rc;
	struct buf	 b;
	const char	*fn;
	enum enc	 enc, def;
	size_t		 offs;
	/*
	 * Declared here in addition to <unistd.h> because strict
	 * ISO C compilation modes may hide the POSIX getopt interface.
	 */
	extern int	 optind;
	extern char	*optarg;
	extern int	 getopt(int, char * const [], const char *);

	progname = strrchr(argv[0], '/');
	if (progname == NULL)
		progname = argv[0];
	else
		++progname;

	fn = "<stdin>";
	fd = STDIN_FILENO;
	rc = EXIT_FAILURE;
	enc = def = ENC__MAX;
	map = 0;

	memset(&b, 0, sizeof(struct buf));

	while (-1 != (ch = getopt(argc, argv, "D:e:rdvh")))
		switch (ch) {
		case ('D'):
			/* FALLTHROUGH */
		case ('e'):
			for (i = 0; i < (int)ENC__MAX; i++)
				if (0 == strcasecmp(optarg, encs[i].name))
					break;

			if ((int)ENC__MAX == i) {
				fprintf(stderr, "%s: Bad encoding\n", optarg);
				return(EXIT_FAILURE);
			}

			if ('D' == ch)
				def = (enum enc)i;
			else
				enc = (enum enc)i;
			break;
		case ('r'):
			/* FALLTHROUGH */
		case ('d'):
			/* FALLTHROUGH */
		case ('v'):
			/* Compatibility with GNU preconv. */
			break;
		case ('h'):
			/* Compatibility with GNU preconv. */
			/* FALLTHROUGH */
		default:
			usage();
			return(EXIT_FAILURE);
		}

	argc -= optind;
	argv += optind;

	/*
	 * Open and read the first argument on the command-line.
	 * If we don't have one, we default to stdin.
	 */

	if (argc > 0) {
		fn = *argv;
		fd = open(fn, O_RDONLY, 0);
		if (-1 == fd) {
			perror(fn);
			return(EXIT_FAILURE);
		}
	}

	if ( ! read_whole_file(fn, fd, &b, &map))
		goto out;

	/*
	 * Try to read the UTF-8 BOM.
	 * The BOM is three bytes long, so input of exactly three
	 * bytes can consist of nothing but the BOM.
	 */

	if (ENC__MAX == enc && b.sz >= sizeof(bom) &&
	    0 == memcmp(b.buf, bom, sizeof(bom))) {
		b.offs = sizeof(bom);
		enc = ENC_UTF_8;
	}

	/* Try reading from the "-*-" cue on the first two lines. */

	if (ENC__MAX == enc) {
		offs = b.offs;
		if (0 == cue_enc(&b, &offs, &enc))
			(void)cue_enc(&b, &offs, &enc);
	}

	/*
	 * No encoding has been detected.
	 * Thus, we either fall into our default encoder, if specified,
	 * or use Latin-1 if all else fails.
	 */

	if (ENC__MAX == enc)
		enc = ENC__MAX == def ? ENC_LATIN_1 : def;

	if ( ! (*encs[(int)enc].conv)(&b)) {
		fprintf(stderr, "%s: Bad encoding\n", fn);
		goto out;
	}

	/* Do not report success if the output could not be written. */

	if (ferror(stdout) || EOF == fflush(stdout)) {
		perror("<stdout>");
		goto out;
	}

	rc = EXIT_SUCCESS;
out:
#ifdef HAVE_MMAP
	if (map)
		munmap(b.buf, b.sz);
	else
#endif
		free(b.buf);

	if (fd > STDIN_FILENO)
		close(fd);

	return(rc);
}