1 //===-- TextEncoding.cpp - Text encoding conversion class ---------*- C++ -*-=// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// 9 /// \file 10 /// This file provides utility classes to convert between different character 11 /// encodings. 12 /// 13 //===----------------------------------------------------------------------===// 14 15 #include "llvm/Support/TextEncoding.h" 16 #include "llvm/ADT/SmallString.h" 17 #include "llvm/ADT/SmallVector.h" 18 #include "llvm/ADT/StringExtras.h" 19 #include "llvm/Support/ConvertEBCDIC.h" 20 #include <system_error> 21 22 #if HAVE_ICU 23 #include <unicode/ucnv.h> 24 #elif HAVE_ICONV 25 #include <iconv.h> 26 #endif 27 28 using namespace llvm; 29 30 // Normalize the charset name with the charset alias matching algorithm proposed 31 // in https://www.unicode.org/reports/tr22/tr22-8.html#Charset_Alias_Matching. 32 static void normalizeCharSetName(StringRef CSName, 33 SmallVectorImpl<char> &Normalized) { 34 bool PrevDigit = false; 35 for (auto Ch : CSName) { 36 if (isAlnum(Ch)) { 37 Ch = toLower(Ch); 38 if (Ch != '0' || PrevDigit) { 39 PrevDigit = isDigit(Ch); 40 Normalized.push_back(Ch); 41 } 42 } 43 } 44 } 45 46 // Maps the encoding name to enum constant if possible. 47 static std::optional<TextEncoding> getKnownEncoding(StringRef Name) { 48 SmallString<16> Normalized; 49 normalizeCharSetName(Name, Normalized); 50 if (Normalized.equals("utf8")) 51 return TextEncoding::UTF8; 52 if (Normalized.equals("ibm1047")) 53 return TextEncoding::IBM1047; 54 return std::nullopt; 55 } 56 57 LLVM_ATTRIBUTE_UNUSED static void 58 HandleOverflow(size_t &Capacity, char *&Output, size_t &OutputLength, 59 SmallVectorImpl<char> &Result) { 60 // No space left in output buffer. Double the size of the underlying 61 // memory in the SmallVectorImpl, adjust pointer and length and continue 62 // the conversion. 63 Capacity = 64 (Capacity < Result.max_size() / 2) ? 2 * Capacity : Result.max_size(); 65 Result.resize(0); 66 Result.resize_for_overwrite(Capacity); 67 Output = static_cast<char *>(Result.data()); 68 OutputLength = Capacity; 69 } 70 71 namespace { 72 enum ConversionType { 73 UTF8ToIBM1047, 74 IBM1047ToUTF8, 75 }; 76 77 // Support conversion between EBCDIC 1047 and UTF-8. This class uses 78 // built-in translation tables that allow for translation between the 79 // aforementioned encodings. The use of tables for conversion is only 80 // possible because EBCDIC 1047 is a single-byte, stateless encoding; other 81 // encodings are not supported. 82 class TextEncodingConverterTable final 83 : public details::TextEncodingConverterImplBase { 84 const ConversionType ConvType; 85 86 public: 87 TextEncodingConverterTable(ConversionType ConvType) : ConvType(ConvType) {} 88 89 std::error_code convertString(StringRef Source, 90 SmallVectorImpl<char> &Result) override; 91 92 void reset() override {} 93 }; 94 95 std::error_code 96 TextEncodingConverterTable::convertString(StringRef Source, 97 SmallVectorImpl<char> &Result) { 98 switch (ConvType) { 99 case IBM1047ToUTF8: 100 ConverterEBCDIC::convertToUTF8(Source, Result); 101 return std::error_code(); 102 case UTF8ToIBM1047: 103 return ConverterEBCDIC::convertToEBCDIC(Source, Result); 104 } 105 llvm_unreachable("Invalid ConvType!"); 106 return std::error_code(); 107 } 108 109 #if HAVE_ICU 110 struct UConverterDeleter { 111 void operator()(UConverter *Converter) const { 112 if (Converter) 113 ucnv_close(Converter); 114 } 115 }; 116 using UConverterUniquePtr = std::unique_ptr<UConverter, UConverterDeleter>; 117 118 class TextEncodingConverterICU final 119 : public details::TextEncodingConverterImplBase { 120 UConverterUniquePtr FromConvDesc; 121 UConverterUniquePtr ToConvDesc; 122 123 public: 124 TextEncodingConverterICU(UConverterUniquePtr FromConverter, 125 UConverterUniquePtr ToConverter) 126 : FromConvDesc(std::move(FromConverter)), 127 ToConvDesc(std::move(ToConverter)) {} 128 129 std::error_code convertString(StringRef Source, 130 SmallVectorImpl<char> &Result) override; 131 132 void reset() override; 133 }; 134 135 // TODO: The current implementation discards the partial result and restarts the 136 // conversion from the beginning if there is a conversion error due to 137 // insufficient buffer size. In the future, it would better to save the partial 138 // result and resume the conversion for the remaining string. 139 // TODO: Improve translation of ICU errors to error_code 140 std::error_code 141 TextEncodingConverterICU::convertString(StringRef Source, 142 SmallVectorImpl<char> &Result) { 143 // Setup the input in case it has no backing data. 144 size_t InputLength = Source.size(); 145 const char *In = InputLength ? const_cast<char *>(Source.data()) : ""; 146 147 // Setup the output. We directly write into the SmallVector. 148 size_t Capacity = Result.capacity(); 149 size_t OutputLength = Capacity; 150 Result.resize_for_overwrite(Capacity); 151 char *Output; 152 UErrorCode EC = U_ZERO_ERROR; 153 154 ucnv_setToUCallBack(&*FromConvDesc, UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL, 155 &EC); 156 ucnv_setFromUCallBack(&*ToConvDesc, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, 157 NULL, &EC); 158 assert(U_SUCCESS(EC)); 159 160 do { 161 EC = U_ZERO_ERROR; 162 const char *Input = In; 163 164 Output = InputLength ? static_cast<char *>(Result.data()) : nullptr; 165 ucnv_convertEx(&*ToConvDesc, &*FromConvDesc, &Output, Result.end(), &Input, 166 In + InputLength, /*pivotStart=*/NULL, 167 /*pivotSource=*/NULL, /*pivotTarget=*/NULL, 168 /*pivotLimit=*/NULL, /*reset=*/true, 169 /*flush=*/true, &EC); 170 if (U_FAILURE(EC)) { 171 if (EC == U_BUFFER_OVERFLOW_ERROR) { 172 if (Capacity < Result.max_size()) { 173 HandleOverflow(Capacity, Output, OutputLength, Result); 174 continue; 175 } else 176 return std::error_code(E2BIG, std::generic_category()); 177 } 178 // Some other error occured. 179 Result.resize(Output - Result.data()); 180 return std::error_code(EILSEQ, std::generic_category()); 181 } 182 break; 183 } while (true); 184 185 Result.resize(Output - Result.data()); 186 return std::error_code(); 187 } 188 189 void TextEncodingConverterICU::reset() { 190 ucnv_reset(&*FromConvDesc); 191 ucnv_reset(&*ToConvDesc); 192 } 193 194 #elif HAVE_ICONV 195 class TextEncodingConverterIconv final 196 : public details::TextEncodingConverterImplBase { 197 class UniqueIconvT { 198 iconv_t ConvDesc; 199 200 public: 201 operator iconv_t() const { return ConvDesc; } 202 UniqueIconvT(iconv_t CD) : ConvDesc(CD) {} 203 ~UniqueIconvT() { 204 if (ConvDesc != (iconv_t)-1) { 205 iconv_close(ConvDesc); 206 ConvDesc = (iconv_t)-1; 207 } 208 } 209 UniqueIconvT(UniqueIconvT &&Other) : ConvDesc(Other.ConvDesc) { 210 Other.ConvDesc = (iconv_t)-1; 211 } 212 UniqueIconvT &operator=(UniqueIconvT &&Other) { 213 if (&Other != this) { 214 ConvDesc = Other.ConvDesc; 215 Other.ConvDesc = (iconv_t)-1; 216 } 217 return *this; 218 } 219 }; 220 UniqueIconvT ConvDesc; 221 222 public: 223 TextEncodingConverterIconv(UniqueIconvT ConvDesc) 224 : ConvDesc(std::move(ConvDesc)) {} 225 226 std::error_code convertString(StringRef Source, 227 SmallVectorImpl<char> &Result) override; 228 229 void reset() override; 230 }; 231 232 // TODO: The current implementation discards the partial result and restarts the 233 // conversion from the beginning if there is a conversion error due to 234 // insufficient buffer size. In the future, it would better to save the partial 235 // result and resume the conversion for the remaining string. 236 std::error_code 237 TextEncodingConverterIconv::convertString(StringRef Source, 238 SmallVectorImpl<char> &Result) { 239 // Setup the output. We directly write into the SmallVector. 240 size_t Capacity = Result.capacity(); 241 char *Output = static_cast<char *>(Result.data()); 242 size_t OutputLength = Capacity; 243 Result.resize_for_overwrite(Capacity); 244 245 size_t Ret; 246 // Handle errors returned from iconv(). 247 auto HandleError = [&Capacity, &Output, &OutputLength, &Result, 248 this](size_t Ret) { 249 if (Ret == static_cast<size_t>(-1)) { 250 // An error occured. Check if we can gracefully handle it. 251 if (errno == E2BIG && Capacity < Result.max_size()) { 252 HandleOverflow(Capacity, Output, OutputLength, Result); 253 // Reset converter 254 reset(); 255 return std::error_code(); 256 } else { 257 // Some other error occured. 258 Result.resize(Output - Result.data()); 259 return std::error_code(errno, std::generic_category()); 260 } 261 } else { 262 // A positive return value indicates that some characters were converted 263 // in a nonreversible way, that is, replaced with a SUB symbol. Returning 264 // an error in this case makes sure that both conversion routines behave 265 // in the same way. 266 return std::make_error_code(std::errc::illegal_byte_sequence); 267 } 268 }; 269 270 do { 271 // Setup the input. Use nullptr to reset iconv state if input length is 272 // zero. 273 size_t InputLength = Source.size(); 274 char *Input = const_cast<char *>(InputLength ? Source.data() : ""); 275 Ret = iconv(ConvDesc, &Input, &InputLength, &Output, &OutputLength); 276 if (Ret != 0) { 277 if (auto EC = HandleError(Ret)) 278 return EC; 279 continue; 280 } 281 // Flush the converter 282 Ret = iconv(ConvDesc, nullptr, nullptr, &Output, &OutputLength); 283 if (Ret != 0) { 284 if (auto EC = HandleError(Ret)) 285 return EC; 286 continue; 287 } 288 break; 289 } while (true); 290 291 // Re-adjust size to actual size. 292 Result.resize(Output - Result.data()); 293 return std::error_code(); 294 } 295 296 inline void TextEncodingConverterIconv::reset() { 297 iconv(ConvDesc, nullptr, nullptr, nullptr, nullptr); 298 } 299 300 #endif // HAVE_ICONV 301 } // namespace 302 303 ErrorOr<TextEncodingConverter> 304 TextEncodingConverter::create(TextEncoding CPFrom, TextEncoding CPTo) { 305 306 // Text encodings should be distinct. 307 if (CPFrom == CPTo) 308 return std::make_error_code(std::errc::invalid_argument); 309 310 ConversionType Conversion; 311 if (CPFrom == TextEncoding::UTF8 && CPTo == TextEncoding::IBM1047) 312 Conversion = UTF8ToIBM1047; 313 else if (CPFrom == TextEncoding::IBM1047 && CPTo == TextEncoding::UTF8) 314 Conversion = IBM1047ToUTF8; 315 else 316 return std::make_error_code(std::errc::invalid_argument); 317 318 return TextEncodingConverter( 319 std::make_unique<TextEncodingConverterTable>(Conversion)); 320 } 321 322 ErrorOr<TextEncodingConverter> TextEncodingConverter::create(StringRef From, 323 StringRef To) { 324 std::optional<TextEncoding> FromEncoding = getKnownEncoding(From); 325 std::optional<TextEncoding> ToEncoding = getKnownEncoding(To); 326 if (FromEncoding && ToEncoding) { 327 ErrorOr<TextEncodingConverter> Converter = 328 create(*FromEncoding, *ToEncoding); 329 if (Converter) 330 return Converter; 331 } 332 #if HAVE_ICU 333 UErrorCode EC = U_ZERO_ERROR; 334 UConverterUniquePtr FromConvDesc(ucnv_open(From.str().c_str(), &EC)); 335 if (U_FAILURE(EC)) 336 return std::make_error_code(std::errc::invalid_argument); 337 338 UConverterUniquePtr ToConvDesc(ucnv_open(To.str().c_str(), &EC)); 339 if (U_FAILURE(EC)) 340 return std::make_error_code(std::errc::invalid_argument); 341 342 auto Converter = std::make_unique<TextEncodingConverterICU>( 343 std::move(FromConvDesc), std::move(ToConvDesc)); 344 return TextEncodingConverter(std::move(Converter)); 345 #elif HAVE_ICONV 346 iconv_t ConvDesc = iconv_open(To.str().c_str(), From.str().c_str()); 347 if (ConvDesc == (iconv_t)-1) 348 return std::make_error_code(std::errc::invalid_argument); 349 return TextEncodingConverter( 350 std::make_unique<TextEncodingConverterIconv>(ConvDesc)); 351 #else 352 return std::make_error_code(std::errc::invalid_argument); 353 #endif 354 } 355