1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2003 Ryuichiro Imura 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29 /* 30 * kiconv(3) requires shared linked, and reduce module size 31 * when statically linked. 32 */ 33 34 #ifdef PIC 35 36 /* 37 * Why do we need quirks? 38 * Since each vendors has their own Unicode mapping rules, 39 * we need some quirks until iconv(3) supports them. 40 * We can define Microsoft mappings here. 41 * 42 * For example, the eucJP and Unocode mapping rule is based on 43 * the JIS standard. Since Microsoft uses cp932 for Unicode mapping 44 * which is not truly based on the JIS standard, reading a file 45 * system created by Microsoft Windows family using eucJP/Unicode 46 * mapping rule will cause a problem. That's why we define eucJP-ms here. 47 * The eucJP-ms has been defined by The Open Group Japan Vendor Council. 48 * 49 * Well, Apple Mac OS also has their own Unicode mappings, 50 * but we won't require these quirks here, because HFS doesn't have 51 * Unicode and HFS+ has decomposed Unicode which can not be 52 * handled by this xlat16 converter. 53 */ 54 55 #include <sys/types.h> 56 #include <sys/iconv.h> 57 58 #include <stdio.h> 59 #include <string.h> 60 61 #include "quirks.h" 62 63 /* 64 * All lists of quirk character set 65 */ 66 static struct { 67 int vendor; /* reserved for non MS mapping */ 68 const char *base_codeset, *quirk_codeset; 69 } quirk_list[] = { 70 { KICONV_VENDOR_MICSFT, "eucJP", "eucJP-ms" }, 71 { KICONV_VENDOR_MICSFT, "EUC-JP", "eucJP-ms" }, 72 { KICONV_VENDOR_MICSFT, "SJIS", "SJIS-ms" }, 73 { KICONV_VENDOR_MICSFT, "Shift_JIS", "SJIS-ms" }, 74 { KICONV_VENDOR_MICSFT, "Big5", "Big5-ms" } 75 }; 76 77 /* 78 * The character list to replace for Japanese MS-Windows. 79 */ 80 static struct quirk_replace_list quirk_jis_cp932[] = { 81 { 0x00a2, 0xffe0 }, /* Cent Sign, Fullwidth Cent Sign */ 82 { 0x00a3, 0xffe1 }, /* Pound Sign, Fullwidth Pound Sign */ 83 { 0x00ac, 0xffe2 }, /* Not Sign, Fullwidth Not Sign */ 84 { 0x2016, 0x2225 }, /* Double Vertical Line, Parallel To */ 85 { 0x203e, 0x007e }, /* Overline, Tilde */ 86 { 0x2212, 0xff0d }, /* Minus Sign, Fullwidth Hyphenminus */ 87 { 0x301c, 0xff5e } /* Wave Dash, Fullwidth Tilde */ 88 }; 89 90 /* 91 * All entries of quirks 92 */ 93 #define NumOf(n) (sizeof((n)) / sizeof((n)[0])) 94 static struct { 95 const char *quirk_codeset, *iconv_codeset, *pair_codeset; 96 struct quirk_replace_list (*replace_list)[]; 97 size_t num_of_replaces; 98 } quirk_table[] = { 99 { 100 "eucJP-ms", "eucJP", ENCODING_UNICODE, 101 (struct quirk_replace_list (*)[])&quirk_jis_cp932, 102 NumOf(quirk_jis_cp932) 103 }, 104 { 105 "SJIS-ms", "CP932", ENCODING_UNICODE, 106 /* XXX - quirk_replace_list should be NULL */ 107 (struct quirk_replace_list (*)[])&quirk_jis_cp932, 108 NumOf(quirk_jis_cp932) 109 }, 110 { 111 "Big5-ms", "CP950", ENCODING_UNICODE, 112 NULL, 0 113 } 114 }; 115 116 117 const char * 118 kiconv_quirkcs(const char* base, int vendor) 119 { 120 size_t i; 121 122 /* 123 * We should compare codeset names ignoring case here, 124 * so that quirk could be used for all of the user input 125 * patterns. 126 */ 127 for (i = 0; i < NumOf(quirk_list); i++) 128 if (quirk_list[i].vendor == vendor && 129 strcasecmp(quirk_list[i].base_codeset, base) == 0) 130 return (quirk_list[i].quirk_codeset); 131 132 return (base); 133 } 134 135 /* 136 * Internal Functions 137 */ 138 const char * 139 search_quirk(const char *given_codeset, 140 const char *pair_codeset, 141 struct quirk_replace_list **replace_list, 142 size_t *num_of_replaces) 143 { 144 size_t i; 145 146 *replace_list = NULL; 147 *num_of_replaces = 0; 148 for (i = 0; i < NumOf(quirk_table); i++) 149 if (strcmp(quirk_table[i].quirk_codeset, given_codeset) == 0) { 150 if (strcmp(quirk_table[i].pair_codeset, pair_codeset) == 0) { 151 *replace_list = *quirk_table[i].replace_list; 152 *num_of_replaces = quirk_table[i].num_of_replaces; 153 } 154 return (quirk_table[i].iconv_codeset); 155 } 156 157 return (given_codeset); 158 } 159 160 uint16_t 161 quirk_vendor2unix(uint16_t c, struct quirk_replace_list *replace_list, size_t num) 162 { 163 size_t i; 164 165 for (i = 0; i < num; i++) 166 if (replace_list[i].vendor_code == c) 167 return (replace_list[i].standard_code); 168 169 return (c); 170 } 171 172 uint16_t 173 quirk_unix2vendor(uint16_t c, struct quirk_replace_list *replace_list, size_t num) 174 { 175 size_t i; 176 177 for (i = 0; i < num; i++) 178 if (replace_list[i].standard_code == c) 179 return (replace_list[i].vendor_code); 180 181 return (c); 182 } 183 184 #else /* statically linked */ 185 186 #include <sys/types.h> 187 #include <sys/iconv.h> 188 189 const char * 190 kiconv_quirkcs(const char* base __unused, int vendor __unused) 191 { 192 193 return (base); 194 } 195 196 #endif /* PIC */ 197