// xref: /freebsd/contrib/llvm-project/llvm/lib/Support/JSON.cpp (revision a521f2116473fbd8c09db395518f060a27d02334)
//=== JSON.cpp - JSON value, parsing and serialization - C++ -----------*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===---------------------------------------------------------------------===//
8 
#include "llvm/Support/JSON.h"
#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/Format.h"
#include <cctype>
#include <cerrno>
13 
14 namespace llvm {
15 namespace json {
16 
17 Value &Object::operator[](const ObjectKey &K) {
18   return try_emplace(K, nullptr).first->getSecond();
19 }
20 Value &Object::operator[](ObjectKey &&K) {
21   return try_emplace(std::move(K), nullptr).first->getSecond();
22 }
23 Value *Object::get(StringRef K) {
24   auto I = find(K);
25   if (I == end())
26     return nullptr;
27   return &I->second;
28 }
29 const Value *Object::get(StringRef K) const {
30   auto I = find(K);
31   if (I == end())
32     return nullptr;
33   return &I->second;
34 }
35 llvm::Optional<std::nullptr_t> Object::getNull(StringRef K) const {
36   if (auto *V = get(K))
37     return V->getAsNull();
38   return llvm::None;
39 }
40 llvm::Optional<bool> Object::getBoolean(StringRef K) const {
41   if (auto *V = get(K))
42     return V->getAsBoolean();
43   return llvm::None;
44 }
45 llvm::Optional<double> Object::getNumber(StringRef K) const {
46   if (auto *V = get(K))
47     return V->getAsNumber();
48   return llvm::None;
49 }
50 llvm::Optional<int64_t> Object::getInteger(StringRef K) const {
51   if (auto *V = get(K))
52     return V->getAsInteger();
53   return llvm::None;
54 }
55 llvm::Optional<llvm::StringRef> Object::getString(StringRef K) const {
56   if (auto *V = get(K))
57     return V->getAsString();
58   return llvm::None;
59 }
60 const json::Object *Object::getObject(StringRef K) const {
61   if (auto *V = get(K))
62     return V->getAsObject();
63   return nullptr;
64 }
65 json::Object *Object::getObject(StringRef K) {
66   if (auto *V = get(K))
67     return V->getAsObject();
68   return nullptr;
69 }
70 const json::Array *Object::getArray(StringRef K) const {
71   if (auto *V = get(K))
72     return V->getAsArray();
73   return nullptr;
74 }
75 json::Array *Object::getArray(StringRef K) {
76   if (auto *V = get(K))
77     return V->getAsArray();
78   return nullptr;
79 }
80 bool operator==(const Object &LHS, const Object &RHS) {
81   if (LHS.size() != RHS.size())
82     return false;
83   for (const auto &L : LHS) {
84     auto R = RHS.find(L.first);
85     if (R == RHS.end() || L.second != R->second)
86       return false;
87   }
88   return true;
89 }
90 
91 Array::Array(std::initializer_list<Value> Elements) {
92   V.reserve(Elements.size());
93   for (const Value &V : Elements) {
94     emplace_back(nullptr);
95     back().moveFrom(std::move(V));
96   }
97 }
98 
99 Value::Value(std::initializer_list<Value> Elements)
100     : Value(json::Array(Elements)) {}
101 
102 void Value::copyFrom(const Value &M) {
103   Type = M.Type;
104   switch (Type) {
105   case T_Null:
106   case T_Boolean:
107   case T_Double:
108   case T_Integer:
109     memcpy(Union.buffer, M.Union.buffer, sizeof(Union.buffer));
110     break;
111   case T_StringRef:
112     create<StringRef>(M.as<StringRef>());
113     break;
114   case T_String:
115     create<std::string>(M.as<std::string>());
116     break;
117   case T_Object:
118     create<json::Object>(M.as<json::Object>());
119     break;
120   case T_Array:
121     create<json::Array>(M.as<json::Array>());
122     break;
123   }
124 }
125 
126 void Value::moveFrom(const Value &&M) {
127   Type = M.Type;
128   switch (Type) {
129   case T_Null:
130   case T_Boolean:
131   case T_Double:
132   case T_Integer:
133     memcpy(Union.buffer, M.Union.buffer, sizeof(Union.buffer));
134     break;
135   case T_StringRef:
136     create<StringRef>(M.as<StringRef>());
137     break;
138   case T_String:
139     create<std::string>(std::move(M.as<std::string>()));
140     M.Type = T_Null;
141     break;
142   case T_Object:
143     create<json::Object>(std::move(M.as<json::Object>()));
144     M.Type = T_Null;
145     break;
146   case T_Array:
147     create<json::Array>(std::move(M.as<json::Array>()));
148     M.Type = T_Null;
149     break;
150   }
151 }
152 
153 void Value::destroy() {
154   switch (Type) {
155   case T_Null:
156   case T_Boolean:
157   case T_Double:
158   case T_Integer:
159     break;
160   case T_StringRef:
161     as<StringRef>().~StringRef();
162     break;
163   case T_String:
164     as<std::string>().~basic_string();
165     break;
166   case T_Object:
167     as<json::Object>().~Object();
168     break;
169   case T_Array:
170     as<json::Array>().~Array();
171     break;
172   }
173 }
174 
175 bool operator==(const Value &L, const Value &R) {
176   if (L.kind() != R.kind())
177     return false;
178   switch (L.kind()) {
179   case Value::Null:
180     return *L.getAsNull() == *R.getAsNull();
181   case Value::Boolean:
182     return *L.getAsBoolean() == *R.getAsBoolean();
183   case Value::Number:
184     // Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=323
185     // The same integer must convert to the same double, per the standard.
186     // However we see 64-vs-80-bit precision comparisons with gcc-7 -O3 -m32.
187     // So we avoid floating point promotion for exact comparisons.
188     if (L.Type == Value::T_Integer || R.Type == Value::T_Integer)
189       return L.getAsInteger() == R.getAsInteger();
190     return *L.getAsNumber() == *R.getAsNumber();
191   case Value::String:
192     return *L.getAsString() == *R.getAsString();
193   case Value::Array:
194     return *L.getAsArray() == *R.getAsArray();
195   case Value::Object:
196     return *L.getAsObject() == *R.getAsObject();
197   }
198   llvm_unreachable("Unknown value kind");
199 }
200 
201 namespace {
202 // Simple recursive-descent JSON parser.
203 class Parser {
204 public:
205   Parser(StringRef JSON)
206       : Start(JSON.begin()), P(JSON.begin()), End(JSON.end()) {}
207 
208   bool checkUTF8() {
209     size_t ErrOffset;
210     if (isUTF8(StringRef(Start, End - Start), &ErrOffset))
211       return true;
212     P = Start + ErrOffset; // For line/column calculation.
213     return parseError("Invalid UTF-8 sequence");
214   }
215 
216   bool parseValue(Value &Out);
217 
218   bool assertEnd() {
219     eatWhitespace();
220     if (P == End)
221       return true;
222     return parseError("Text after end of document");
223   }
224 
225   Error takeError() {
226     assert(Err);
227     return std::move(*Err);
228   }
229 
230 private:
231   void eatWhitespace() {
232     while (P != End && (*P == ' ' || *P == '\r' || *P == '\n' || *P == '\t'))
233       ++P;
234   }
235 
236   // On invalid syntax, parseX() functions return false and set Err.
237   bool parseNumber(char First, Value &Out);
238   bool parseString(std::string &Out);
239   bool parseUnicode(std::string &Out);
240   bool parseError(const char *Msg); // always returns false
241 
242   char next() { return P == End ? 0 : *P++; }
243   char peek() { return P == End ? 0 : *P; }
244   static bool isNumber(char C) {
245     return C == '0' || C == '1' || C == '2' || C == '3' || C == '4' ||
246            C == '5' || C == '6' || C == '7' || C == '8' || C == '9' ||
247            C == 'e' || C == 'E' || C == '+' || C == '-' || C == '.';
248   }
249 
250   Optional<Error> Err;
251   const char *Start, *P, *End;
252 };
253 
254 bool Parser::parseValue(Value &Out) {
255   eatWhitespace();
256   if (P == End)
257     return parseError("Unexpected EOF");
258   switch (char C = next()) {
259   // Bare null/true/false are easy - first char identifies them.
260   case 'n':
261     Out = nullptr;
262     return (next() == 'u' && next() == 'l' && next() == 'l') ||
263            parseError("Invalid JSON value (null?)");
264   case 't':
265     Out = true;
266     return (next() == 'r' && next() == 'u' && next() == 'e') ||
267            parseError("Invalid JSON value (true?)");
268   case 'f':
269     Out = false;
270     return (next() == 'a' && next() == 'l' && next() == 's' && next() == 'e') ||
271            parseError("Invalid JSON value (false?)");
272   case '"': {
273     std::string S;
274     if (parseString(S)) {
275       Out = std::move(S);
276       return true;
277     }
278     return false;
279   }
280   case '[': {
281     Out = Array{};
282     Array &A = *Out.getAsArray();
283     eatWhitespace();
284     if (peek() == ']') {
285       ++P;
286       return true;
287     }
288     for (;;) {
289       A.emplace_back(nullptr);
290       if (!parseValue(A.back()))
291         return false;
292       eatWhitespace();
293       switch (next()) {
294       case ',':
295         eatWhitespace();
296         continue;
297       case ']':
298         return true;
299       default:
300         return parseError("Expected , or ] after array element");
301       }
302     }
303   }
304   case '{': {
305     Out = Object{};
306     Object &O = *Out.getAsObject();
307     eatWhitespace();
308     if (peek() == '}') {
309       ++P;
310       return true;
311     }
312     for (;;) {
313       if (next() != '"')
314         return parseError("Expected object key");
315       std::string K;
316       if (!parseString(K))
317         return false;
318       eatWhitespace();
319       if (next() != ':')
320         return parseError("Expected : after object key");
321       eatWhitespace();
322       if (!parseValue(O[std::move(K)]))
323         return false;
324       eatWhitespace();
325       switch (next()) {
326       case ',':
327         eatWhitespace();
328         continue;
329       case '}':
330         return true;
331       default:
332         return parseError("Expected , or } after object property");
333       }
334     }
335   }
336   default:
337     if (isNumber(C))
338       return parseNumber(C, Out);
339     return parseError("Invalid JSON value");
340   }
341 }
342 
343 bool Parser::parseNumber(char First, Value &Out) {
344   // Read the number into a string. (Must be null-terminated for strto*).
345   SmallString<24> S;
346   S.push_back(First);
347   while (isNumber(peek()))
348     S.push_back(next());
349   char *End;
350   // Try first to parse as integer, and if so preserve full 64 bits.
351   // strtoll returns long long >= 64 bits, so check it's in range too.
352   auto I = std::strtoll(S.c_str(), &End, 10);
353   if (End == S.end() && I >= std::numeric_limits<int64_t>::min() &&
354       I <= std::numeric_limits<int64_t>::max()) {
355     Out = int64_t(I);
356     return true;
357   }
358   // If it's not an integer
359   Out = std::strtod(S.c_str(), &End);
360   return End == S.end() || parseError("Invalid JSON value (number?)");
361 }
362 
363 bool Parser::parseString(std::string &Out) {
364   // leading quote was already consumed.
365   for (char C = next(); C != '"'; C = next()) {
366     if (LLVM_UNLIKELY(P == End))
367       return parseError("Unterminated string");
368     if (LLVM_UNLIKELY((C & 0x1f) == C))
369       return parseError("Control character in string");
370     if (LLVM_LIKELY(C != '\\')) {
371       Out.push_back(C);
372       continue;
373     }
374     // Handle escape sequence.
375     switch (C = next()) {
376     case '"':
377     case '\\':
378     case '/':
379       Out.push_back(C);
380       break;
381     case 'b':
382       Out.push_back('\b');
383       break;
384     case 'f':
385       Out.push_back('\f');
386       break;
387     case 'n':
388       Out.push_back('\n');
389       break;
390     case 'r':
391       Out.push_back('\r');
392       break;
393     case 't':
394       Out.push_back('\t');
395       break;
396     case 'u':
397       if (!parseUnicode(Out))
398         return false;
399       break;
400     default:
401       return parseError("Invalid escape sequence");
402     }
403   }
404   return true;
405 }
406 
// Appends the UTF-8 encoding of the codepoint Rune (< 0x110000) to Out.
static void encodeUtf8(uint32_t Rune, std::string &Out) {
  // Appends a continuation byte (10xxxxxx) carrying the low 6 bits of Bits.
  auto Continuation = [&Out](uint32_t Bits) {
    Out.push_back(static_cast<char>(static_cast<uint8_t>(0x80 | (Bits & 0x3F))));
  };
  // Appends a lead byte: the length marker combined with the top payload bits.
  auto Lead = [&Out](uint32_t Marker, uint32_t Bits) {
    Out.push_back(static_cast<char>(static_cast<uint8_t>(Marker | Bits)));
  };

  if (Rune < 0x80) {
    // One byte: plain ASCII.
    Out.push_back(static_cast<char>(Rune & 0x7F));
    return;
  }
  if (Rune < 0x800) {
    // Two bytes: 5 + 6 payload bits.
    Lead(0xC0, Rune >> 6);
    Continuation(Rune);
    return;
  }
  if (Rune < 0x10000) {
    // Three bytes: 4 + 6 + 6 payload bits.
    Lead(0xE0, Rune >> 12);
    Continuation(Rune >> 6);
    Continuation(Rune);
    return;
  }
  if (Rune < 0x110000) {
    // Four bytes: 3 + 6 + 6 + 6 payload bits.
    Lead(0xF0, Rune >> 18);
    Continuation(Rune >> 12);
    Continuation(Rune >> 6);
    Continuation(Rune);
    return;
  }
  llvm_unreachable("Invalid codepoint");
}
435 
436 // Parse a UTF-16 \uNNNN escape sequence. "\u" has already been consumed.
437 // May parse several sequential escapes to ensure proper surrogate handling.
438 // We do not use ConvertUTF.h, it can't accept and replace unpaired surrogates.
439 // These are invalid Unicode but valid JSON (RFC 8259, section 8.2).
440 bool Parser::parseUnicode(std::string &Out) {
441   // Invalid UTF is not a JSON error (RFC 8529§8.2). It gets replaced by U+FFFD.
442   auto Invalid = [&] { Out.append(/* UTF-8 */ {'\xef', '\xbf', '\xbd'}); };
443   // Decodes 4 hex digits from the stream into Out, returns false on error.
444   auto Parse4Hex = [this](uint16_t &Out) -> bool {
445     Out = 0;
446     char Bytes[] = {next(), next(), next(), next()};
447     for (unsigned char C : Bytes) {
448       if (!std::isxdigit(C))
449         return parseError("Invalid \\u escape sequence");
450       Out <<= 4;
451       Out |= (C > '9') ? (C & ~0x20) - 'A' + 10 : (C - '0');
452     }
453     return true;
454   };
455   uint16_t First; // UTF-16 code unit from the first \u escape.
456   if (!Parse4Hex(First))
457     return false;
458 
459   // We loop to allow proper surrogate-pair error handling.
460   while (true) {
461     // Case 1: the UTF-16 code unit is already a codepoint in the BMP.
462     if (LLVM_LIKELY(First < 0xD800 || First >= 0xE000)) {
463       encodeUtf8(First, Out);
464       return true;
465     }
466 
467     // Case 2: it's an (unpaired) trailing surrogate.
468     if (LLVM_UNLIKELY(First >= 0xDC00)) {
469       Invalid();
470       return true;
471     }
472 
473     // Case 3: it's a leading surrogate. We expect a trailing one next.
474     // Case 3a: there's no trailing \u escape. Don't advance in the stream.
475     if (LLVM_UNLIKELY(P + 2 > End || *P != '\\' || *(P + 1) != 'u')) {
476       Invalid(); // Leading surrogate was unpaired.
477       return true;
478     }
479     P += 2;
480     uint16_t Second;
481     if (!Parse4Hex(Second))
482       return false;
483     // Case 3b: there was another \u escape, but it wasn't a trailing surrogate.
484     if (LLVM_UNLIKELY(Second < 0xDC00 || Second >= 0xE000)) {
485       Invalid();      // Leading surrogate was unpaired.
486       First = Second; // Second escape still needs to be processed.
487       continue;
488     }
489     // Case 3c: a valid surrogate pair encoding an astral codepoint.
490     encodeUtf8(0x10000 | ((First - 0xD800) << 10) | (Second - 0xDC00), Out);
491     return true;
492   }
493 }
494 
495 bool Parser::parseError(const char *Msg) {
496   int Line = 1;
497   const char *StartOfLine = Start;
498   for (const char *X = Start; X < P; ++X) {
499     if (*X == 0x0A) {
500       ++Line;
501       StartOfLine = X + 1;
502     }
503   }
504   Err.emplace(
505       std::make_unique<ParseError>(Msg, Line, P - StartOfLine, P - Start));
506   return false;
507 }
508 } // namespace
509 
510 Expected<Value> parse(StringRef JSON) {
511   Parser P(JSON);
512   Value E = nullptr;
513   if (P.checkUTF8())
514     if (P.parseValue(E))
515       if (P.assertEnd())
516         return std::move(E);
517   return P.takeError();
518 }
519 char ParseError::ID = 0;
520 
521 static std::vector<const Object::value_type *> sortedElements(const Object &O) {
522   std::vector<const Object::value_type *> Elements;
523   for (const auto &E : O)
524     Elements.push_back(&E);
525   llvm::sort(Elements,
526              [](const Object::value_type *L, const Object::value_type *R) {
527                return L->first < R->first;
528              });
529   return Elements;
530 }
531 
532 bool isUTF8(llvm::StringRef S, size_t *ErrOffset) {
533   // Fast-path for ASCII, which is valid UTF-8.
534   if (LLVM_LIKELY(isASCII(S)))
535     return true;
536 
537   const UTF8 *Data = reinterpret_cast<const UTF8 *>(S.data()), *Rest = Data;
538   if (LLVM_LIKELY(isLegalUTF8String(&Rest, Data + S.size())))
539     return true;
540 
541   if (ErrOffset)
542     *ErrOffset = Rest - Data;
543   return false;
544 }
545 
546 std::string fixUTF8(llvm::StringRef S) {
547   // This isn't particularly efficient, but is only for error-recovery.
548   std::vector<UTF32> Codepoints(S.size()); // 1 codepoint per byte suffices.
549   const UTF8 *In8 = reinterpret_cast<const UTF8 *>(S.data());
550   UTF32 *Out32 = Codepoints.data();
551   ConvertUTF8toUTF32(&In8, In8 + S.size(), &Out32, Out32 + Codepoints.size(),
552                      lenientConversion);
553   Codepoints.resize(Out32 - Codepoints.data());
554   std::string Res(4 * Codepoints.size(), 0); // 4 bytes per codepoint suffice
555   const UTF32 *In32 = Codepoints.data();
556   UTF8 *Out8 = reinterpret_cast<UTF8 *>(&Res[0]);
557   ConvertUTF32toUTF8(&In32, In32 + Codepoints.size(), &Out8, Out8 + Res.size(),
558                      strictConversion);
559   Res.resize(reinterpret_cast<char *>(Out8) - Res.data());
560   return Res;
561 }
562 
563 static void quote(llvm::raw_ostream &OS, llvm::StringRef S) {
564   OS << '\"';
565   for (unsigned char C : S) {
566     if (C == 0x22 || C == 0x5C)
567       OS << '\\';
568     if (C >= 0x20) {
569       OS << C;
570       continue;
571     }
572     OS << '\\';
573     switch (C) {
574     // A few characters are common enough to make short escapes worthwhile.
575     case '\t':
576       OS << 't';
577       break;
578     case '\n':
579       OS << 'n';
580       break;
581     case '\r':
582       OS << 'r';
583       break;
584     default:
585       OS << 'u';
586       llvm::write_hex(OS, C, llvm::HexPrintStyle::Lower, 4);
587       break;
588     }
589   }
590   OS << '\"';
591 }
592 
593 void llvm::json::OStream::value(const Value &V) {
594   switch (V.kind()) {
595   case Value::Null:
596     valueBegin();
597     OS << "null";
598     return;
599   case Value::Boolean:
600     valueBegin();
601     OS << (*V.getAsBoolean() ? "true" : "false");
602     return;
603   case Value::Number:
604     valueBegin();
605     if (V.Type == Value::T_Integer)
606       OS << *V.getAsInteger();
607     else
608       OS << format("%.*g", std::numeric_limits<double>::max_digits10,
609                    *V.getAsNumber());
610     return;
611   case Value::String:
612     valueBegin();
613     quote(OS, *V.getAsString());
614     return;
615   case Value::Array:
616     return array([&] {
617       for (const Value &E : *V.getAsArray())
618         value(E);
619     });
620   case Value::Object:
621     return object([&] {
622       for (const Object::value_type *E : sortedElements(*V.getAsObject()))
623         attribute(E->first, E->second);
624     });
625   }
626 }
627 
628 void llvm::json::OStream::valueBegin() {
629   assert(Stack.back().Ctx != Object && "Only attributes allowed here");
630   if (Stack.back().HasValue) {
631     assert(Stack.back().Ctx != Singleton && "Only one value allowed here");
632     OS << ',';
633   }
634   if (Stack.back().Ctx == Array)
635     newline();
636   Stack.back().HasValue = true;
637 }
638 
639 void llvm::json::OStream::newline() {
640   if (IndentSize) {
641     OS.write('\n');
642     OS.indent(Indent);
643   }
644 }
645 
646 void llvm::json::OStream::arrayBegin() {
647   valueBegin();
648   Stack.emplace_back();
649   Stack.back().Ctx = Array;
650   Indent += IndentSize;
651   OS << '[';
652 }
653 
654 void llvm::json::OStream::arrayEnd() {
655   assert(Stack.back().Ctx == Array);
656   Indent -= IndentSize;
657   if (Stack.back().HasValue)
658     newline();
659   OS << ']';
660   Stack.pop_back();
661   assert(!Stack.empty());
662 }
663 
664 void llvm::json::OStream::objectBegin() {
665   valueBegin();
666   Stack.emplace_back();
667   Stack.back().Ctx = Object;
668   Indent += IndentSize;
669   OS << '{';
670 }
671 
672 void llvm::json::OStream::objectEnd() {
673   assert(Stack.back().Ctx == Object);
674   Indent -= IndentSize;
675   if (Stack.back().HasValue)
676     newline();
677   OS << '}';
678   Stack.pop_back();
679   assert(!Stack.empty());
680 }
681 
682 void llvm::json::OStream::attributeBegin(llvm::StringRef Key) {
683   assert(Stack.back().Ctx == Object);
684   if (Stack.back().HasValue)
685     OS << ',';
686   newline();
687   Stack.back().HasValue = true;
688   Stack.emplace_back();
689   Stack.back().Ctx = Singleton;
690   if (LLVM_LIKELY(isUTF8(Key))) {
691     quote(OS, Key);
692   } else {
693     assert(false && "Invalid UTF-8 in attribute key");
694     quote(OS, fixUTF8(Key));
695   }
696   OS.write(':');
697   if (IndentSize)
698     OS.write(' ');
699 }
700 
701 void llvm::json::OStream::attributeEnd() {
702   assert(Stack.back().Ctx == Singleton);
703   assert(Stack.back().HasValue && "Attribute must have a value");
704   Stack.pop_back();
705   assert(Stack.back().Ctx == Object);
706 }
707 
708 } // namespace json
709 } // namespace llvm
710 
711 void llvm::format_provider<llvm::json::Value>::format(
712     const llvm::json::Value &E, raw_ostream &OS, StringRef Options) {
713   unsigned IndentAmount = 0;
714   if (!Options.empty() && Options.getAsInteger(/*Radix=*/10, IndentAmount))
715     llvm_unreachable("json::Value format options should be an integer");
716   json::OStream(OS, IndentAmount).value(E);
717 }
718 
719