xref: /freebsd/contrib/llvm-project/llvm/lib/Support/TextEncoding.cpp (revision 700637cbb5e582861067a11aaca4d053546871d2)
1 //===-- TextEncoding.cpp - Text encoding conversion class ---------*- C++ -*-=//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// This file provides utility classes to convert between different character
11 /// encodings.
12 ///
13 //===----------------------------------------------------------------------===//
14 
15 #include "llvm/Support/TextEncoding.h"
16 #include "llvm/ADT/SmallString.h"
17 #include "llvm/ADT/SmallVector.h"
18 #include "llvm/ADT/StringExtras.h"
19 #include "llvm/Support/ConvertEBCDIC.h"
20 #include <system_error>
21 
22 #if HAVE_ICU
23 #include <unicode/ucnv.h>
24 #elif HAVE_ICONV
25 #include <iconv.h>
26 #endif
27 
28 using namespace llvm;
29 
30 // Normalize the charset name with the charset alias matching algorithm proposed
31 // in https://www.unicode.org/reports/tr22/tr22-8.html#Charset_Alias_Matching.
normalizeCharSetName(StringRef CSName,SmallVectorImpl<char> & Normalized)32 static void normalizeCharSetName(StringRef CSName,
33                                  SmallVectorImpl<char> &Normalized) {
34   bool PrevDigit = false;
35   for (auto Ch : CSName) {
36     if (isAlnum(Ch)) {
37       Ch = toLower(Ch);
38       if (Ch != '0' || PrevDigit) {
39         PrevDigit = isDigit(Ch);
40         Normalized.push_back(Ch);
41       }
42     }
43   }
44 }
45 
46 // Maps the encoding name to enum constant if possible.
getKnownEncoding(StringRef Name)47 static std::optional<TextEncoding> getKnownEncoding(StringRef Name) {
48   SmallString<16> Normalized;
49   normalizeCharSetName(Name, Normalized);
50   if (Normalized.equals("utf8"))
51     return TextEncoding::UTF8;
52   if (Normalized.equals("ibm1047"))
53     return TextEncoding::IBM1047;
54   return std::nullopt;
55 }
56 
57 LLVM_ATTRIBUTE_UNUSED static void
HandleOverflow(size_t & Capacity,char * & Output,size_t & OutputLength,SmallVectorImpl<char> & Result)58 HandleOverflow(size_t &Capacity, char *&Output, size_t &OutputLength,
59                SmallVectorImpl<char> &Result) {
60   // No space left in output buffer. Double the size of the underlying
61   // memory in the SmallVectorImpl, adjust pointer and length and continue
62   // the conversion.
63   Capacity =
64       (Capacity < Result.max_size() / 2) ? 2 * Capacity : Result.max_size();
65   Result.resize(0);
66   Result.resize_for_overwrite(Capacity);
67   Output = static_cast<char *>(Result.data());
68   OutputLength = Capacity;
69 }
70 
71 namespace {
// Direction of a conversion performed with the built-in translation tables.
enum ConversionType {
  // Input is UTF-8, output is IBM-1047.
  UTF8ToIBM1047,
  // Input is IBM-1047, output is UTF-8.
  IBM1047ToUTF8,
};
76 
77 // Support conversion between EBCDIC 1047 and UTF-8. This class uses
78 // built-in translation tables that allow for translation between the
79 // aforementioned encodings. The use of tables for conversion is only
80 // possible because EBCDIC 1047 is a single-byte, stateless encoding; other
81 // encodings are not supported.
82 class TextEncodingConverterTable final
83     : public details::TextEncodingConverterImplBase {
84   const ConversionType ConvType;
85 
86 public:
TextEncodingConverterTable(ConversionType ConvType)87   TextEncodingConverterTable(ConversionType ConvType) : ConvType(ConvType) {}
88 
89   std::error_code convertString(StringRef Source,
90                                 SmallVectorImpl<char> &Result) override;
91 
reset()92   void reset() override {}
93 };
94 
95 std::error_code
convertString(StringRef Source,SmallVectorImpl<char> & Result)96 TextEncodingConverterTable::convertString(StringRef Source,
97                                           SmallVectorImpl<char> &Result) {
98   switch (ConvType) {
99   case IBM1047ToUTF8:
100     ConverterEBCDIC::convertToUTF8(Source, Result);
101     return std::error_code();
102   case UTF8ToIBM1047:
103     return ConverterEBCDIC::convertToEBCDIC(Source, Result);
104   }
105   llvm_unreachable("Invalid ConvType!");
106   return std::error_code();
107 }
108 
109 #if HAVE_ICU
110 struct UConverterDeleter {
operator ()__anonf264d0e80111::UConverterDeleter111   void operator()(UConverter *Converter) const {
112     if (Converter)
113       ucnv_close(Converter);
114   }
115 };
116 using UConverterUniquePtr = std::unique_ptr<UConverter, UConverterDeleter>;
117 
118 class TextEncodingConverterICU final
119     : public details::TextEncodingConverterImplBase {
120   UConverterUniquePtr FromConvDesc;
121   UConverterUniquePtr ToConvDesc;
122 
123 public:
TextEncodingConverterICU(UConverterUniquePtr FromConverter,UConverterUniquePtr ToConverter)124   TextEncodingConverterICU(UConverterUniquePtr FromConverter,
125                            UConverterUniquePtr ToConverter)
126       : FromConvDesc(std::move(FromConverter)),
127         ToConvDesc(std::move(ToConverter)) {}
128 
129   std::error_code convertString(StringRef Source,
130                                 SmallVectorImpl<char> &Result) override;
131 
132   void reset() override;
133 };
134 
135 // TODO: The current implementation discards the partial result and restarts the
136 // conversion from the beginning if there is a conversion error due to
137 // insufficient buffer size. In the future, it would better to save the partial
138 // result and resume the conversion for the remaining string.
139 // TODO: Improve translation of ICU errors to error_code
140 std::error_code
convertString(StringRef Source,SmallVectorImpl<char> & Result)141 TextEncodingConverterICU::convertString(StringRef Source,
142                                         SmallVectorImpl<char> &Result) {
143   // Setup the input in case it has no backing data.
144   size_t InputLength = Source.size();
145   const char *In = InputLength ? const_cast<char *>(Source.data()) : "";
146 
147   // Setup the output. We directly write into the SmallVector.
148   size_t Capacity = Result.capacity();
149   size_t OutputLength = Capacity;
150   Result.resize_for_overwrite(Capacity);
151   char *Output;
152   UErrorCode EC = U_ZERO_ERROR;
153 
154   ucnv_setToUCallBack(&*FromConvDesc, UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL,
155                       &EC);
156   ucnv_setFromUCallBack(&*ToConvDesc, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL,
157                         NULL, &EC);
158   assert(U_SUCCESS(EC));
159 
160   do {
161     EC = U_ZERO_ERROR;
162     const char *Input = In;
163 
164     Output = InputLength ? static_cast<char *>(Result.data()) : nullptr;
165     ucnv_convertEx(&*ToConvDesc, &*FromConvDesc, &Output, Result.end(), &Input,
166                    In + InputLength, /*pivotStart=*/NULL,
167                    /*pivotSource=*/NULL, /*pivotTarget=*/NULL,
168                    /*pivotLimit=*/NULL, /*reset=*/true,
169                    /*flush=*/true, &EC);
170     if (U_FAILURE(EC)) {
171       if (EC == U_BUFFER_OVERFLOW_ERROR) {
172         if (Capacity < Result.max_size()) {
173           HandleOverflow(Capacity, Output, OutputLength, Result);
174           continue;
175         } else
176           return std::error_code(E2BIG, std::generic_category());
177       }
178       // Some other error occured.
179       Result.resize(Output - Result.data());
180       return std::error_code(EILSEQ, std::generic_category());
181     }
182     break;
183   } while (true);
184 
185   Result.resize(Output - Result.data());
186   return std::error_code();
187 }
188 
reset()189 void TextEncodingConverterICU::reset() {
190   ucnv_reset(&*FromConvDesc);
191   ucnv_reset(&*ToConvDesc);
192 }
193 
194 #elif HAVE_ICONV
195 class TextEncodingConverterIconv final
196     : public details::TextEncodingConverterImplBase {
197   class UniqueIconvT {
198     iconv_t ConvDesc;
199 
200   public:
operator iconv_t() const201     operator iconv_t() const { return ConvDesc; }
UniqueIconvT(iconv_t CD)202     UniqueIconvT(iconv_t CD) : ConvDesc(CD) {}
~UniqueIconvT()203     ~UniqueIconvT() {
204       if (ConvDesc != (iconv_t)-1) {
205         iconv_close(ConvDesc);
206         ConvDesc = (iconv_t)-1;
207       }
208     }
UniqueIconvT(UniqueIconvT && Other)209     UniqueIconvT(UniqueIconvT &&Other) : ConvDesc(Other.ConvDesc) {
210       Other.ConvDesc = (iconv_t)-1;
211     }
operator =(UniqueIconvT && Other)212     UniqueIconvT &operator=(UniqueIconvT &&Other) {
213       if (&Other != this) {
214         ConvDesc = Other.ConvDesc;
215         Other.ConvDesc = (iconv_t)-1;
216       }
217       return *this;
218     }
219   };
220   UniqueIconvT ConvDesc;
221 
222 public:
TextEncodingConverterIconv(UniqueIconvT ConvDesc)223   TextEncodingConverterIconv(UniqueIconvT ConvDesc)
224       : ConvDesc(std::move(ConvDesc)) {}
225 
226   std::error_code convertString(StringRef Source,
227                                 SmallVectorImpl<char> &Result) override;
228 
229   void reset() override;
230 };
231 
232 // TODO: The current implementation discards the partial result and restarts the
233 // conversion from the beginning if there is a conversion error due to
234 // insufficient buffer size. In the future, it would better to save the partial
235 // result and resume the conversion for the remaining string.
236 std::error_code
convertString(StringRef Source,SmallVectorImpl<char> & Result)237 TextEncodingConverterIconv::convertString(StringRef Source,
238                                           SmallVectorImpl<char> &Result) {
239   // Setup the output. We directly write into the SmallVector.
240   size_t Capacity = Result.capacity();
241   char *Output = static_cast<char *>(Result.data());
242   size_t OutputLength = Capacity;
243   Result.resize_for_overwrite(Capacity);
244 
245   size_t Ret;
246   // Handle errors returned from iconv().
247   auto HandleError = [&Capacity, &Output, &OutputLength, &Result,
248                       this](size_t Ret) {
249     if (Ret == static_cast<size_t>(-1)) {
250       // An error occured. Check if we can gracefully handle it.
251       if (errno == E2BIG && Capacity < Result.max_size()) {
252         HandleOverflow(Capacity, Output, OutputLength, Result);
253         // Reset converter
254         reset();
255         return std::error_code();
256       } else {
257         // Some other error occured.
258         Result.resize(Output - Result.data());
259         return std::error_code(errno, std::generic_category());
260       }
261     } else {
262       // A positive return value indicates that some characters were converted
263       // in a nonreversible way, that is, replaced with a SUB symbol. Returning
264       // an error in this case makes sure that both conversion routines behave
265       // in the same way.
266       return std::make_error_code(std::errc::illegal_byte_sequence);
267     }
268   };
269 
270   do {
271     // Setup the input. Use nullptr to reset iconv state if input length is
272     // zero.
273     size_t InputLength = Source.size();
274     char *Input = const_cast<char *>(InputLength ? Source.data() : "");
275     Ret = iconv(ConvDesc, &Input, &InputLength, &Output, &OutputLength);
276     if (Ret != 0) {
277       if (auto EC = HandleError(Ret))
278         return EC;
279       continue;
280     }
281     // Flush the converter
282     Ret = iconv(ConvDesc, nullptr, nullptr, &Output, &OutputLength);
283     if (Ret != 0) {
284       if (auto EC = HandleError(Ret))
285         return EC;
286       continue;
287     }
288     break;
289   } while (true);
290 
291   // Re-adjust size to actual size.
292   Result.resize(Output - Result.data());
293   return std::error_code();
294 }
295 
reset()296 inline void TextEncodingConverterIconv::reset() {
297   iconv(ConvDesc, nullptr, nullptr, nullptr, nullptr);
298 }
299 
300 #endif // HAVE_ICONV
301 } // namespace
302 
303 ErrorOr<TextEncodingConverter>
create(TextEncoding CPFrom,TextEncoding CPTo)304 TextEncodingConverter::create(TextEncoding CPFrom, TextEncoding CPTo) {
305 
306   // Text encodings should be distinct.
307   if (CPFrom == CPTo)
308     return std::make_error_code(std::errc::invalid_argument);
309 
310   ConversionType Conversion;
311   if (CPFrom == TextEncoding::UTF8 && CPTo == TextEncoding::IBM1047)
312     Conversion = UTF8ToIBM1047;
313   else if (CPFrom == TextEncoding::IBM1047 && CPTo == TextEncoding::UTF8)
314     Conversion = IBM1047ToUTF8;
315   else
316     return std::make_error_code(std::errc::invalid_argument);
317 
318   return TextEncodingConverter(
319       std::make_unique<TextEncodingConverterTable>(Conversion));
320 }
321 
create(StringRef From,StringRef To)322 ErrorOr<TextEncodingConverter> TextEncodingConverter::create(StringRef From,
323                                                              StringRef To) {
324   std::optional<TextEncoding> FromEncoding = getKnownEncoding(From);
325   std::optional<TextEncoding> ToEncoding = getKnownEncoding(To);
326   if (FromEncoding && ToEncoding) {
327     ErrorOr<TextEncodingConverter> Converter =
328         create(*FromEncoding, *ToEncoding);
329     if (Converter)
330       return Converter;
331   }
332 #if HAVE_ICU
333   UErrorCode EC = U_ZERO_ERROR;
334   UConverterUniquePtr FromConvDesc(ucnv_open(From.str().c_str(), &EC));
335   if (U_FAILURE(EC))
336     return std::make_error_code(std::errc::invalid_argument);
337 
338   UConverterUniquePtr ToConvDesc(ucnv_open(To.str().c_str(), &EC));
339   if (U_FAILURE(EC))
340     return std::make_error_code(std::errc::invalid_argument);
341 
342   auto Converter = std::make_unique<TextEncodingConverterICU>(
343       std::move(FromConvDesc), std::move(ToConvDesc));
344   return TextEncodingConverter(std::move(Converter));
345 #elif HAVE_ICONV
346   iconv_t ConvDesc = iconv_open(To.str().c_str(), From.str().c_str());
347   if (ConvDesc == (iconv_t)-1)
348     return std::make_error_code(std::errc::invalid_argument);
349   return TextEncodingConverter(
350       std::make_unique<TextEncodingConverterIconv>(ConvDesc));
351 #else
352   return std::make_error_code(std::errc::invalid_argument);
353 #endif
354 }
355