1*0b57cec5SDimitry Andric //===--- ClangCommentHTMLNamedCharacterReferenceEmitter.cpp -----------------=//
2*0b57cec5SDimitry Andric //
3*0b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4*0b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
5*0b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6*0b57cec5SDimitry Andric //
7*0b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
8*0b57cec5SDimitry Andric //
9*0b57cec5SDimitry Andric // This tablegen backend emits an efficient function to translate HTML named
10*0b57cec5SDimitry Andric // character references to UTF-8 sequences.
11*0b57cec5SDimitry Andric //
12*0b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
13*0b57cec5SDimitry Andric 
14*0b57cec5SDimitry Andric #include "llvm/ADT/SmallString.h"
15*0b57cec5SDimitry Andric #include "llvm/Support/ConvertUTF.h"
16*0b57cec5SDimitry Andric #include "llvm/TableGen/Error.h"
17*0b57cec5SDimitry Andric #include "llvm/TableGen/Record.h"
18*0b57cec5SDimitry Andric #include "llvm/TableGen/StringMatcher.h"
19*0b57cec5SDimitry Andric #include "llvm/TableGen/TableGenBackend.h"
20*0b57cec5SDimitry Andric #include <vector>
21*0b57cec5SDimitry Andric 
22*0b57cec5SDimitry Andric using namespace llvm;
23*0b57cec5SDimitry Andric 
24*0b57cec5SDimitry Andric /// Convert a code point to the corresponding UTF-8 sequence represented
25*0b57cec5SDimitry Andric /// as a C string literal.
26*0b57cec5SDimitry Andric ///
27*0b57cec5SDimitry Andric /// \returns true on success.
28*0b57cec5SDimitry Andric static bool translateCodePointToUTF8(unsigned CodePoint,
29*0b57cec5SDimitry Andric                                      SmallVectorImpl<char> &CLiteral) {
30*0b57cec5SDimitry Andric   char Translated[UNI_MAX_UTF8_BYTES_PER_CODE_POINT];
31*0b57cec5SDimitry Andric   char *TranslatedPtr = Translated;
32*0b57cec5SDimitry Andric   if (!ConvertCodePointToUTF8(CodePoint, TranslatedPtr))
33*0b57cec5SDimitry Andric     return false;
34*0b57cec5SDimitry Andric 
35*0b57cec5SDimitry Andric   StringRef UTF8(Translated, TranslatedPtr - Translated);
36*0b57cec5SDimitry Andric 
37*0b57cec5SDimitry Andric   raw_svector_ostream OS(CLiteral);
38*0b57cec5SDimitry Andric   OS << "\"";
39*0b57cec5SDimitry Andric   for (size_t i = 0, e = UTF8.size(); i != e; ++i) {
40*0b57cec5SDimitry Andric     OS << "\\x";
41*0b57cec5SDimitry Andric     OS.write_hex(static_cast<unsigned char>(UTF8[i]));
42*0b57cec5SDimitry Andric   }
43*0b57cec5SDimitry Andric   OS << "\"";
44*0b57cec5SDimitry Andric 
45*0b57cec5SDimitry Andric   return true;
46*0b57cec5SDimitry Andric }
47*0b57cec5SDimitry Andric 
48*0b57cec5SDimitry Andric namespace clang {
49*0b57cec5SDimitry Andric void EmitClangCommentHTMLNamedCharacterReferences(RecordKeeper &Records,
50*0b57cec5SDimitry Andric                                                   raw_ostream &OS) {
51*0b57cec5SDimitry Andric   std::vector<Record *> Tags = Records.getAllDerivedDefinitions("NCR");
52*0b57cec5SDimitry Andric   std::vector<StringMatcher::StringPair> NameToUTF8;
53*0b57cec5SDimitry Andric   SmallString<32> CLiteral;
54*0b57cec5SDimitry Andric   for (std::vector<Record *>::iterator I = Tags.begin(), E = Tags.end();
55*0b57cec5SDimitry Andric        I != E; ++I) {
56*0b57cec5SDimitry Andric     Record &Tag = **I;
57*0b57cec5SDimitry Andric     std::string Spelling = Tag.getValueAsString("Spelling");
58*0b57cec5SDimitry Andric     uint64_t CodePoint = Tag.getValueAsInt("CodePoint");
59*0b57cec5SDimitry Andric     CLiteral.clear();
60*0b57cec5SDimitry Andric     CLiteral.append("return ");
61*0b57cec5SDimitry Andric     if (!translateCodePointToUTF8(CodePoint, CLiteral)) {
62*0b57cec5SDimitry Andric       SrcMgr.PrintMessage(Tag.getLoc().front(),
63*0b57cec5SDimitry Andric                           SourceMgr::DK_Error,
64*0b57cec5SDimitry Andric                           Twine("invalid code point"));
65*0b57cec5SDimitry Andric       continue;
66*0b57cec5SDimitry Andric     }
67*0b57cec5SDimitry Andric     CLiteral.append(";");
68*0b57cec5SDimitry Andric 
69*0b57cec5SDimitry Andric     StringMatcher::StringPair Match(Spelling, CLiteral.str());
70*0b57cec5SDimitry Andric     NameToUTF8.push_back(Match);
71*0b57cec5SDimitry Andric   }
72*0b57cec5SDimitry Andric 
73*0b57cec5SDimitry Andric   emitSourceFileHeader("HTML named character reference to UTF-8 "
74*0b57cec5SDimitry Andric                        "translation", OS);
75*0b57cec5SDimitry Andric 
76*0b57cec5SDimitry Andric   OS << "StringRef translateHTMLNamedCharacterReferenceToUTF8(\n"
77*0b57cec5SDimitry Andric         "                                             StringRef Name) {\n";
78*0b57cec5SDimitry Andric   StringMatcher("Name", NameToUTF8, OS).Emit();
79*0b57cec5SDimitry Andric   OS << "  return StringRef();\n"
80*0b57cec5SDimitry Andric      << "}\n\n";
81*0b57cec5SDimitry Andric }
82*0b57cec5SDimitry Andric 
83*0b57cec5SDimitry Andric } // end namespace clang
84*0b57cec5SDimitry Andric 
85