/*	$Id: preconv.c,v 1.5 2011/07/24 18:15:14 kristaps Exp $ */
/*
 * Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#ifdef HAVE_MMAP
#include <sys/stat.h>
#include <sys/mman.h>
#endif

#include <assert.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <unistd.h>

/*
 * The read_whole_file() and resize_buf() functions are copied from
 * read.c, including all dependency code (MAP_FILE, etc.).
 */

#ifndef MAP_FILE
#define	MAP_FILE	0
#endif

/*
 * Supported input encodings.
 * ENC__MAX doubles as the "not (yet) known" marker.
 */
enum	enc {
	ENC_UTF_8, /* UTF-8 */
	ENC_US_ASCII, /* US-ASCII */
	ENC_LATIN_1, /* Latin-1 */
	ENC__MAX
};

struct	buf {
	char		 *buf; /* binary input buffer */
	size_t		  sz; /* size of binary buffer */
	size_t		  offs; /* starting buffer offset */
};

/*
 * One entry per "enum enc" value: the name accepted on the command
 * line and in the "-*-" cue, and the routine that writes the converted
 * buffer to standard output (returns 1 on success, 0 on bad input).
 */
struct	encode {
	const char	 *name;
	int		(*conv)(const struct buf *);
};

static	int	 cue_enc(const struct buf *, size_t *, enum enc *);
static	int	 conv_latin_1(const struct buf *);
static	int	 conv_us_ascii(const struct buf *);
static	int	 conv_utf_8(const struct buf *);
static	int	 read_whole_file(const char *, int,
			struct buf *, int *);
static	void	 resize_buf(struct buf *, size_t);
static	void	 usage(void);

static	const struct encode encs[ENC__MAX] = {
	{ "utf-8", conv_utf_8 }, /* ENC_UTF_8 */
	{ "us-ascii", conv_us_ascii }, /* ENC_US_ASCII */
	{ "latin-1", conv_latin_1 }, /* ENC_LATIN_1 */
};

static	const char	 *progname;

/*
 * Print the command-line synopsis to standard error.
 */
static void
usage(void)
{

	fprintf(stderr, "usage: %s "
			"[-D enc] "
			"[-e ENC] "
			"[file]\n", progname);
}

/*
 * Write the buffer, starting at its offset, to standard output,
 * interpreting it as Latin-1.
 * Always returns 1: every byte value is acceptable.
 */
static int
conv_latin_1(const struct buf *b)
{
	const unsigned char	*src;
	size_t			 i;
	unsigned char		 cu;

	src = (const unsigned char *)b->buf;

	/*
	 * Latin-1 falls into the first 256 code-points of Unicode, so
	 * there's no need for any sort of translation.  Just make the
	 * 8-bit characters use the Unicode escape.
	 * Note that binary values 128 <= v < 160 are passed through
	 * unmodified to mandoc.
	 */

	for (i = b->offs; i < b->sz; i++) {
		cu = src[i];
		if (cu < 160U)
			putchar(cu);
		else
			printf("\\[u%.4X]", cu);
	}

	return(1);
}

/*
 * Write the buffer, starting at its offset, to standard output
 * without any translation.
 * Always returns 1.
 */
static int
conv_us_ascii(const struct buf *b)
{

	/*
	 * US-ASCII has no conversion since it falls into the first 128
	 * bytes of Unicode.
	 * Honour the starting offset like the other converters do.
	 */

	if (b->offs < b->sz)
		fwrite(b->buf + b->offs, 1, b->sz - b->offs, stdout);
	return(1);
}

/*
 * Write the buffer, starting at its offset, to standard output,
 * decoding it as UTF-8: ASCII bytes are copied, every multi-byte
 * sequence is emitted as a \[uXXXX] escape.
 * Returns 1 on success or 0 as soon as an ill-formed sequence is
 * found (output produced before that point has already been written).
 */
static int
conv_utf_8(const struct buf *b)
{
	const unsigned char	*src;
	unsigned int		 accum, floor;
	size_t			 i;
	unsigned char		 cu;
	int			 state;

	src = (const unsigned char *)b->buf;
	state = 0;	/* continuation bytes still expected */
	accum = 0U;	/* code point being assembled */
	floor = 0U;	/* smallest value the current length may encode */

	for (i = b->offs; i < b->sz; i++) {
		cu = src[i];

		if (state > 0) {
			/* Continuation bytes must look like 10xxxxxx. */

			if ((cu & 0xC0) != 0x80)
				return(0);

			/*
			 * The code point is assembled arithmetically,
			 * so it is a plain host integer and needs no
			 * byte-order adjustment on any architecture.
			 */

			accum = (accum << 6) | (cu & 0x3FU);
			if (--state > 0)
				continue;

			/* Overlong form of a smaller code point. */

			if (accum < floor)
				return(0);

			/* UTF-16 surrogates are not characters. */

			if (accum >= 0xD800U && accum <= 0xDFFFU)
				return(0);

			/* Beyond the Unicode code space. */

			if (accum > 0x10FFFFU)
				return(0);

			printf("\\[u%.4X]", accum);
			accum = 0U;
			continue;
		}

		if (cu < 0x80) {
			putchar(cu);
			continue;
		}

		/*
		 * Entering a UTF-8 sequence: derive the number of
		 * continuation bytes from the lead byte and accept only
		 * legitimate lead bytes.  0x80-0xC1 (stray continuation
		 * bytes and overlong 2-byte leaders) and 0xF5-0xFF are
		 * never valid.
		 */

		if (cu >= 0xC2 && cu <= 0xDF) {
			state = 1;
			accum = cu & 0x1FU;
			floor = 0x80U;
		} else if (cu >= 0xE0 && cu <= 0xEF) {
			state = 2;
			accum = cu & 0x0FU;
			floor = 0x800U;
		} else if (cu >= 0xF0 && cu <= 0xF4) {
			state = 3;
			accum = cu & 0x07U;
			floor = 0x10000U;
		} else {
			/* Bad sequence bit mask. */
			return(0);
		}
	}

	if (0 != state) {
		/* Input ended in the middle of a sequence. */
		return(0);
	}

	return(1);
}

/*
 * Grow the buffer: to "initial" bytes if it is currently no larger
 * than half of that, otherwise to twice its current size.
 * Prints a message and exits the process if memory is exhausted.
 */
static void
resize_buf(struct buf *buf, size_t initial)
{

	buf->sz = buf->sz > initial / 2 ?
		2 * buf->sz : initial;

	buf->buf = realloc(buf->buf, buf->sz);
	if (NULL == buf->buf) {
		perror(NULL);
		exit(EXIT_FAILURE);
	}
}

/*
 * Read the complete contents of descriptor "fd" into "fb".
 * "f" is the name used in diagnostics.
 * On success, returns 1 and sets *with_mmap to 1 if fb->buf is a
 * mapping (to be released with munmap) or to 0 if it was allocated
 * (to be released with free).
 * On failure, prints a diagnostic and returns 0.
 */
static int
read_whole_file(const char *f, int fd,
		struct buf *fb, int *with_mmap)
{
	size_t		 off;
	ssize_t		 ssz;

#ifdef	HAVE_MMAP
	struct stat	 st;
	if (-1 == fstat(fd, &st)) {
		perror(f);
		return(0);
	}

	/*
	 * If we're a regular file, try just reading in the whole entry
	 * via mmap().  This is faster than reading it into blocks, and
	 * since each file is only a few bytes to begin with, I'm not
	 * concerned that this is going to tank any machines.
	 */

	if (S_ISREG(st.st_mode) && st.st_size >= (1U << 31)) {
		fprintf(stderr, "%s: input too large\n", f);
		return(0);
	}

	if (S_ISREG(st.st_mode)) {
		*with_mmap = 1;
		fb->sz = (size_t)st.st_size;
		fb->buf = mmap(NULL, fb->sz, PROT_READ,
				MAP_FILE|MAP_SHARED, fd, 0);
		if (fb->buf != MAP_FAILED)
			return(1);
		/* Mapping failed: fall back to read() below. */
	}
#endif

	/*
	 * If this isn't a regular file (like, say, stdin), then we must
	 * go the old way and just read things in bit by bit.
	 */

	*with_mmap = 0;
	off = 0;
	fb->sz = 0;
	fb->buf = NULL;
	for (;;) {
		if (off == fb->sz && fb->sz == (1U << 31)) {
			fprintf(stderr, "%s: input too large\n", f);
			break;
		}

		if (off == fb->sz)
			resize_buf(fb, 65536);

		ssz = read(fd, fb->buf + off, fb->sz - off);
		if (ssz == 0) {
			/* End of file: trim to what was actually read. */
			fb->sz = off;
			return(1);
		}
		if (ssz == -1) {
			perror(f);
			break;
		}
		off += (size_t)ssz;
	}

	free(fb->buf);
	fb->buf = NULL;
	return(0);
}

/*
 * Inspect the line of "b" starting at *offs for an encoding cue of
 * the form
 *	.\" -*- [phrase;]... coding: NAME [;phrase]... -*-
 * and advance *offs to the start of the following line.
 * Returns -1 if no complete (newline-terminated) line is left,
 * 0 if the line carries no "coding:" phrase, or 1 if it does; in the
 * last case *enc is set to the named encoding, or to ENC__MAX if the
 * name is not one we know.
 */
static int
cue_enc(const struct buf *b, size_t *offs, enum enc *enc)
{
	const char	*ln, *eoln, *eoph;
	size_t		 sz, phsz, nsz;
	int		 i;

	/* Nothing left to look at. */

	if (*offs >= b->sz)
		return(-1);

	ln = b->buf + *offs;
	sz = b->sz - *offs;

	/* Look for the end-of-line. */

	if (NULL == (eoln = memchr(ln, '\n', sz)))
		return(-1);

	/* Set next-line marker. */

	*offs = (size_t)((eoln + 1) - b->buf);

	/* Check if we have the correct header/trailer. */

	if ((sz = (size_t)(eoln - ln)) < 10 ||
			memcmp(ln, ".\\\" -*-", 7) ||
			memcmp(eoln - 3, "-*-", 3))
		return(0);

	/*
	 * Move after the header and adjust for the trailer.
	 * From here on, ln + sz always points at the trailer.
	 */

	ln += 7;
	sz -= 10;

	while (sz > 0) {
		while (sz > 0 && ' ' == *ln) {
			ln++;
			sz--;
		}
		if (0 == sz)
			break;

		/* Find the end-of-phrase marker (or eoln). */

		if (NULL == (eoph = memchr(ln, ';', sz)))
			eoph = eoln - 3;
		else
			eoph++;

		/* Only account for the "coding" phrase. */

		if ((phsz = (size_t)(eoph - ln)) < 7 ||
				strncasecmp(ln, "coding:", 7)) {
			sz -= phsz;
			ln += phsz;
			continue;
		}

		sz -= 7;
		ln += 7;

		while (sz > 0 && ' ' == *ln) {
			ln++;
			sz--;
		}
		if (0 == sz)
			break;

		/*
		 * What is left of the phrase after the keyword and its
		 * padding; a name can only match if it fits in there.
		 */

		phsz = (size_t)(eoph - ln);

		/* Check us against known encodings. */

		for (i = 0; i < (int)ENC__MAX; i++) {
			nsz = strlen(encs[i].name);
			if (phsz < nsz)
				continue;
			if (strncasecmp(ln, encs[i].name, nsz))
				continue;

			*enc = (enum enc)i;
			return(1);
		}

		/* Unknown encoding. */

		*enc = ENC__MAX;
		return(1);
	}

	return(0);
}

/*
 * Read a file (or standard input), determine its encoding and write
 * it to standard output with non-ASCII characters replaced by
 * \[uXXXX] escapes.
 * The encoding is taken, in order of preference, from -e, a UTF-8
 * byte-order mark, a "-*-" cue on one of the first two lines, -D,
 * and finally defaults to Latin-1.
 */
int
main(int argc, char *argv[])
{
	static const unsigned char bom[3] = { 0xEF, 0xBB, 0xBF };
	int		 i, ch, map, fd, rc;
	struct buf	 b;
	const char	*fn;
	enum enc	 enc, def;
	size_t		 offs;
	extern int	 optind;
	extern char	*optarg;

	progname = strrchr(argv[0], '/');
	if (progname == NULL)
		progname = argv[0];
	else
		++progname;

	fn = "<stdin>";
	fd = STDIN_FILENO;
	rc = EXIT_FAILURE;
	enc = def = ENC__MAX;
	map = 0;

	memset(&b, 0, sizeof(struct buf));

	while (-1 != (ch = getopt(argc, argv, "D:e:rdvh")))
		switch (ch) {
		case ('D'):
			/* FALLTHROUGH */
		case ('e'):
			for (i = 0; i < (int)ENC__MAX; i++) {
				if (strcasecmp(optarg, encs[i].name))
					continue;
				break;
			}
			if (i < (int)ENC__MAX) {
				if ('D' == ch)
					def = (enum enc)i;
				else
					enc = (enum enc)i;
				break;
			}

			fprintf(stderr, "%s: Bad encoding\n", optarg);
			return(EXIT_FAILURE);
		case ('r'):
			/* FALLTHROUGH */
		case ('d'):
			/* FALLTHROUGH */
		case ('v'):
			/* Compatibility with GNU preconv. */
			break;
		case ('h'):
			/* Compatibility with GNU preconv. */
			/* FALLTHROUGH */
		default:
			usage();
			return(EXIT_FAILURE);
		}

	argc -= optind;
	argv += optind;

	/*
	 * Open and read the first argument on the command-line.
	 * If we don't have one, we default to stdin.
	 */

	if (argc > 0) {
		fn = *argv;
		fd = open(fn, O_RDONLY, 0);
		if (-1 == fd) {
			perror(fn);
			return(EXIT_FAILURE);
		}
	}

	if ( ! read_whole_file(fn, fd, &b, &map))
		goto out;

	/*
	 * Try to read the UTF-8 BOM.
	 * A file consisting of nothing but the BOM counts, too.
	 */

	if (ENC__MAX == enc)
		if (b.sz >= 3 && 0 == memcmp(b.buf, bom, 3)) {
			b.offs = 3;
			enc = ENC_UTF_8;
		}

	/* Try reading from the "-*-" cue on the first two lines. */

	if (ENC__MAX == enc) {
		offs = b.offs;
		ch = cue_enc(&b, &offs, &enc);
		if (0 == ch)
			ch = cue_enc(&b, &offs, &enc);
	}

	/*
	 * No encoding has been detected.
	 * Thus, we either fall into our default encoder, if specified,
	 * or use Latin-1 if all else fails.
	 */

	if (ENC__MAX == enc)
		enc = ENC__MAX == def ? ENC_LATIN_1 : def;

	if ( ! (*encs[(int)enc].conv)(&b)) {
		fprintf(stderr, "%s: Bad encoding\n", fn);
		goto out;
	}

	rc = EXIT_SUCCESS;
out:
#ifdef	HAVE_MMAP
	if (map)
		munmap(b.buf, b.sz);
	else
#endif
		free(b.buf);

	if (fd > STDIN_FILENO)
		close(fd);

	return(rc);
}