xref: /freebsd/contrib/llvm-project/llvm/lib/Support/JSON.cpp (revision 29f37e9bcc67d5d94c9d6bbbcf2717e16bf25c4e)
1  //=== JSON.cpp - JSON value, parsing and serialization - C++ -----------*-===//
2  //
3  // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  // See https://llvm.org/LICENSE.txt for license information.
5  // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  //
7  //===---------------------------------------------------------------------===//
8  
#include "llvm/Support/JSON.h"
#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/Format.h"
#include <cctype>
#include <cerrno>
13  
14  namespace llvm {
15  namespace json {
16  
17  Value &Object::operator[](const ObjectKey &K) {
18    return try_emplace(K, nullptr).first->getSecond();
19  }
20  Value &Object::operator[](ObjectKey &&K) {
21    return try_emplace(std::move(K), nullptr).first->getSecond();
22  }
23  Value *Object::get(StringRef K) {
24    auto I = find(K);
25    if (I == end())
26      return nullptr;
27    return &I->second;
28  }
29  const Value *Object::get(StringRef K) const {
30    auto I = find(K);
31    if (I == end())
32      return nullptr;
33    return &I->second;
34  }
35  llvm::Optional<std::nullptr_t> Object::getNull(StringRef K) const {
36    if (auto *V = get(K))
37      return V->getAsNull();
38    return llvm::None;
39  }
40  llvm::Optional<bool> Object::getBoolean(StringRef K) const {
41    if (auto *V = get(K))
42      return V->getAsBoolean();
43    return llvm::None;
44  }
45  llvm::Optional<double> Object::getNumber(StringRef K) const {
46    if (auto *V = get(K))
47      return V->getAsNumber();
48    return llvm::None;
49  }
50  llvm::Optional<int64_t> Object::getInteger(StringRef K) const {
51    if (auto *V = get(K))
52      return V->getAsInteger();
53    return llvm::None;
54  }
55  llvm::Optional<llvm::StringRef> Object::getString(StringRef K) const {
56    if (auto *V = get(K))
57      return V->getAsString();
58    return llvm::None;
59  }
60  const json::Object *Object::getObject(StringRef K) const {
61    if (auto *V = get(K))
62      return V->getAsObject();
63    return nullptr;
64  }
65  json::Object *Object::getObject(StringRef K) {
66    if (auto *V = get(K))
67      return V->getAsObject();
68    return nullptr;
69  }
70  const json::Array *Object::getArray(StringRef K) const {
71    if (auto *V = get(K))
72      return V->getAsArray();
73    return nullptr;
74  }
75  json::Array *Object::getArray(StringRef K) {
76    if (auto *V = get(K))
77      return V->getAsArray();
78    return nullptr;
79  }
80  bool operator==(const Object &LHS, const Object &RHS) {
81    if (LHS.size() != RHS.size())
82      return false;
83    for (const auto &L : LHS) {
84      auto R = RHS.find(L.first);
85      if (R == RHS.end() || L.second != R->second)
86        return false;
87    }
88    return true;
89  }
90  
91  Array::Array(std::initializer_list<Value> Elements) {
92    V.reserve(Elements.size());
93    for (const Value &V : Elements) {
94      emplace_back(nullptr);
95      back().moveFrom(std::move(V));
96    }
97  }
98  
99  Value::Value(std::initializer_list<Value> Elements)
100      : Value(json::Array(Elements)) {}
101  
102  void Value::copyFrom(const Value &M) {
103    Type = M.Type;
104    switch (Type) {
105    case T_Null:
106    case T_Boolean:
107    case T_Double:
108    case T_Integer:
109      memcpy(Union.buffer, M.Union.buffer, sizeof(Union.buffer));
110      break;
111    case T_StringRef:
112      create<StringRef>(M.as<StringRef>());
113      break;
114    case T_String:
115      create<std::string>(M.as<std::string>());
116      break;
117    case T_Object:
118      create<json::Object>(M.as<json::Object>());
119      break;
120    case T_Array:
121      create<json::Array>(M.as<json::Array>());
122      break;
123    }
124  }
125  
126  void Value::moveFrom(const Value &&M) {
127    Type = M.Type;
128    switch (Type) {
129    case T_Null:
130    case T_Boolean:
131    case T_Double:
132    case T_Integer:
133      memcpy(Union.buffer, M.Union.buffer, sizeof(Union.buffer));
134      break;
135    case T_StringRef:
136      create<StringRef>(M.as<StringRef>());
137      break;
138    case T_String:
139      create<std::string>(std::move(M.as<std::string>()));
140      M.Type = T_Null;
141      break;
142    case T_Object:
143      create<json::Object>(std::move(M.as<json::Object>()));
144      M.Type = T_Null;
145      break;
146    case T_Array:
147      create<json::Array>(std::move(M.as<json::Array>()));
148      M.Type = T_Null;
149      break;
150    }
151  }
152  
153  void Value::destroy() {
154    switch (Type) {
155    case T_Null:
156    case T_Boolean:
157    case T_Double:
158    case T_Integer:
159      break;
160    case T_StringRef:
161      as<StringRef>().~StringRef();
162      break;
163    case T_String:
164      as<std::string>().~basic_string();
165      break;
166    case T_Object:
167      as<json::Object>().~Object();
168      break;
169    case T_Array:
170      as<json::Array>().~Array();
171      break;
172    }
173  }
174  
175  bool operator==(const Value &L, const Value &R) {
176    if (L.kind() != R.kind())
177      return false;
178    switch (L.kind()) {
179    case Value::Null:
180      return *L.getAsNull() == *R.getAsNull();
181    case Value::Boolean:
182      return *L.getAsBoolean() == *R.getAsBoolean();
183    case Value::Number:
184      // Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=323
185      // The same integer must convert to the same double, per the standard.
186      // However we see 64-vs-80-bit precision comparisons with gcc-7 -O3 -m32.
187      // So we avoid floating point promotion for exact comparisons.
188      if (L.Type == Value::T_Integer || R.Type == Value::T_Integer)
189        return L.getAsInteger() == R.getAsInteger();
190      return *L.getAsNumber() == *R.getAsNumber();
191    case Value::String:
192      return *L.getAsString() == *R.getAsString();
193    case Value::Array:
194      return *L.getAsArray() == *R.getAsArray();
195    case Value::Object:
196      return *L.getAsObject() == *R.getAsObject();
197    }
198    llvm_unreachable("Unknown value kind");
199  }
200  
201  namespace {
202  // Simple recursive-descent JSON parser.
203  class Parser {
204  public:
205    Parser(StringRef JSON)
206        : Start(JSON.begin()), P(JSON.begin()), End(JSON.end()) {}
207  
208    bool checkUTF8() {
209      size_t ErrOffset;
210      if (isUTF8(StringRef(Start, End - Start), &ErrOffset))
211        return true;
212      P = Start + ErrOffset; // For line/column calculation.
213      return parseError("Invalid UTF-8 sequence");
214    }
215  
216    bool parseValue(Value &Out);
217  
218    bool assertEnd() {
219      eatWhitespace();
220      if (P == End)
221        return true;
222      return parseError("Text after end of document");
223    }
224  
225    Error takeError() {
226      assert(Err);
227      return std::move(*Err);
228    }
229  
230  private:
231    void eatWhitespace() {
232      while (P != End && (*P == ' ' || *P == '\r' || *P == '\n' || *P == '\t'))
233        ++P;
234    }
235  
236    // On invalid syntax, parseX() functions return false and set Err.
237    bool parseNumber(char First, Value &Out);
238    bool parseString(std::string &Out);
239    bool parseUnicode(std::string &Out);
240    bool parseError(const char *Msg); // always returns false
241  
242    char next() { return P == End ? 0 : *P++; }
243    char peek() { return P == End ? 0 : *P; }
244    static bool isNumber(char C) {
245      return C == '0' || C == '1' || C == '2' || C == '3' || C == '4' ||
246             C == '5' || C == '6' || C == '7' || C == '8' || C == '9' ||
247             C == 'e' || C == 'E' || C == '+' || C == '-' || C == '.';
248    }
249  
250    Optional<Error> Err;
251    const char *Start, *P, *End;
252  };
253  
254  bool Parser::parseValue(Value &Out) {
255    eatWhitespace();
256    if (P == End)
257      return parseError("Unexpected EOF");
258    switch (char C = next()) {
259    // Bare null/true/false are easy - first char identifies them.
260    case 'n':
261      Out = nullptr;
262      return (next() == 'u' && next() == 'l' && next() == 'l') ||
263             parseError("Invalid JSON value (null?)");
264    case 't':
265      Out = true;
266      return (next() == 'r' && next() == 'u' && next() == 'e') ||
267             parseError("Invalid JSON value (true?)");
268    case 'f':
269      Out = false;
270      return (next() == 'a' && next() == 'l' && next() == 's' && next() == 'e') ||
271             parseError("Invalid JSON value (false?)");
272    case '"': {
273      std::string S;
274      if (parseString(S)) {
275        Out = std::move(S);
276        return true;
277      }
278      return false;
279    }
280    case '[': {
281      Out = Array{};
282      Array &A = *Out.getAsArray();
283      eatWhitespace();
284      if (peek() == ']') {
285        ++P;
286        return true;
287      }
288      for (;;) {
289        A.emplace_back(nullptr);
290        if (!parseValue(A.back()))
291          return false;
292        eatWhitespace();
293        switch (next()) {
294        case ',':
295          eatWhitespace();
296          continue;
297        case ']':
298          return true;
299        default:
300          return parseError("Expected , or ] after array element");
301        }
302      }
303    }
304    case '{': {
305      Out = Object{};
306      Object &O = *Out.getAsObject();
307      eatWhitespace();
308      if (peek() == '}') {
309        ++P;
310        return true;
311      }
312      for (;;) {
313        if (next() != '"')
314          return parseError("Expected object key");
315        std::string K;
316        if (!parseString(K))
317          return false;
318        eatWhitespace();
319        if (next() != ':')
320          return parseError("Expected : after object key");
321        eatWhitespace();
322        if (!parseValue(O[std::move(K)]))
323          return false;
324        eatWhitespace();
325        switch (next()) {
326        case ',':
327          eatWhitespace();
328          continue;
329        case '}':
330          return true;
331        default:
332          return parseError("Expected , or } after object property");
333        }
334      }
335    }
336    default:
337      if (isNumber(C))
338        return parseNumber(C, Out);
339      return parseError("Invalid JSON value");
340    }
341  }
342  
343  bool Parser::parseNumber(char First, Value &Out) {
344    // Read the number into a string. (Must be null-terminated for strto*).
345    SmallString<24> S;
346    S.push_back(First);
347    while (isNumber(peek()))
348      S.push_back(next());
349    char *End;
350    // Try first to parse as integer, and if so preserve full 64 bits.
351    // strtoll returns long long >= 64 bits, so check it's in range too.
352    auto I = std::strtoll(S.c_str(), &End, 10);
353    if (End == S.end() && I >= std::numeric_limits<int64_t>::min() &&
354        I <= std::numeric_limits<int64_t>::max()) {
355      Out = int64_t(I);
356      return true;
357    }
358    // If it's not an integer
359    Out = std::strtod(S.c_str(), &End);
360    return End == S.end() || parseError("Invalid JSON value (number?)");
361  }
362  
363  bool Parser::parseString(std::string &Out) {
364    // leading quote was already consumed.
365    for (char C = next(); C != '"'; C = next()) {
366      if (LLVM_UNLIKELY(P == End))
367        return parseError("Unterminated string");
368      if (LLVM_UNLIKELY((C & 0x1f) == C))
369        return parseError("Control character in string");
370      if (LLVM_LIKELY(C != '\\')) {
371        Out.push_back(C);
372        continue;
373      }
374      // Handle escape sequence.
375      switch (C = next()) {
376      case '"':
377      case '\\':
378      case '/':
379        Out.push_back(C);
380        break;
381      case 'b':
382        Out.push_back('\b');
383        break;
384      case 'f':
385        Out.push_back('\f');
386        break;
387      case 'n':
388        Out.push_back('\n');
389        break;
390      case 'r':
391        Out.push_back('\r');
392        break;
393      case 't':
394        Out.push_back('\t');
395        break;
396      case 'u':
397        if (!parseUnicode(Out))
398          return false;
399        break;
400      default:
401        return parseError("Invalid escape sequence");
402      }
403    }
404    return true;
405  }
406  
407  static void encodeUtf8(uint32_t Rune, std::string &Out) {
408    if (Rune < 0x80) {
409      Out.push_back(Rune & 0x7F);
410    } else if (Rune < 0x800) {
411      uint8_t FirstByte = 0xC0 | ((Rune & 0x7C0) >> 6);
412      uint8_t SecondByte = 0x80 | (Rune & 0x3F);
413      Out.push_back(FirstByte);
414      Out.push_back(SecondByte);
415    } else if (Rune < 0x10000) {
416      uint8_t FirstByte = 0xE0 | ((Rune & 0xF000) >> 12);
417      uint8_t SecondByte = 0x80 | ((Rune & 0xFC0) >> 6);
418      uint8_t ThirdByte = 0x80 | (Rune & 0x3F);
419      Out.push_back(FirstByte);
420      Out.push_back(SecondByte);
421      Out.push_back(ThirdByte);
422    } else if (Rune < 0x110000) {
423      uint8_t FirstByte = 0xF0 | ((Rune & 0x1F0000) >> 18);
424      uint8_t SecondByte = 0x80 | ((Rune & 0x3F000) >> 12);
425      uint8_t ThirdByte = 0x80 | ((Rune & 0xFC0) >> 6);
426      uint8_t FourthByte = 0x80 | (Rune & 0x3F);
427      Out.push_back(FirstByte);
428      Out.push_back(SecondByte);
429      Out.push_back(ThirdByte);
430      Out.push_back(FourthByte);
431    } else {
432      llvm_unreachable("Invalid codepoint");
433    }
434  }
435  
436  // Parse a UTF-16 \uNNNN escape sequence. "\u" has already been consumed.
437  // May parse several sequential escapes to ensure proper surrogate handling.
438  // We do not use ConvertUTF.h, it can't accept and replace unpaired surrogates.
439  // These are invalid Unicode but valid JSON (RFC 8259, section 8.2).
440  bool Parser::parseUnicode(std::string &Out) {
441    // Invalid UTF is not a JSON error (RFC 8529§8.2). It gets replaced by U+FFFD.
442    auto Invalid = [&] { Out.append(/* UTF-8 */ {'\xef', '\xbf', '\xbd'}); };
443    // Decodes 4 hex digits from the stream into Out, returns false on error.
444    auto Parse4Hex = [this](uint16_t &Out) -> bool {
445      Out = 0;
446      char Bytes[] = {next(), next(), next(), next()};
447      for (unsigned char C : Bytes) {
448        if (!std::isxdigit(C))
449          return parseError("Invalid \\u escape sequence");
450        Out <<= 4;
451        Out |= (C > '9') ? (C & ~0x20) - 'A' + 10 : (C - '0');
452      }
453      return true;
454    };
455    uint16_t First; // UTF-16 code unit from the first \u escape.
456    if (!Parse4Hex(First))
457      return false;
458  
459    // We loop to allow proper surrogate-pair error handling.
460    while (true) {
461      // Case 1: the UTF-16 code unit is already a codepoint in the BMP.
462      if (LLVM_LIKELY(First < 0xD800 || First >= 0xE000)) {
463        encodeUtf8(First, Out);
464        return true;
465      }
466  
467      // Case 2: it's an (unpaired) trailing surrogate.
468      if (LLVM_UNLIKELY(First >= 0xDC00)) {
469        Invalid();
470        return true;
471      }
472  
473      // Case 3: it's a leading surrogate. We expect a trailing one next.
474      // Case 3a: there's no trailing \u escape. Don't advance in the stream.
475      if (LLVM_UNLIKELY(P + 2 > End || *P != '\\' || *(P + 1) != 'u')) {
476        Invalid(); // Leading surrogate was unpaired.
477        return true;
478      }
479      P += 2;
480      uint16_t Second;
481      if (!Parse4Hex(Second))
482        return false;
483      // Case 3b: there was another \u escape, but it wasn't a trailing surrogate.
484      if (LLVM_UNLIKELY(Second < 0xDC00 || Second >= 0xE000)) {
485        Invalid();      // Leading surrogate was unpaired.
486        First = Second; // Second escape still needs to be processed.
487        continue;
488      }
489      // Case 3c: a valid surrogate pair encoding an astral codepoint.
490      encodeUtf8(0x10000 | ((First - 0xD800) << 10) | (Second - 0xDC00), Out);
491      return true;
492    }
493  }
494  
495  bool Parser::parseError(const char *Msg) {
496    int Line = 1;
497    const char *StartOfLine = Start;
498    for (const char *X = Start; X < P; ++X) {
499      if (*X == 0x0A) {
500        ++Line;
501        StartOfLine = X + 1;
502      }
503    }
504    Err.emplace(
505        std::make_unique<ParseError>(Msg, Line, P - StartOfLine, P - Start));
506    return false;
507  }
508  } // namespace
509  
510  Expected<Value> parse(StringRef JSON) {
511    Parser P(JSON);
512    Value E = nullptr;
513    if (P.checkUTF8())
514      if (P.parseValue(E))
515        if (P.assertEnd())
516          return std::move(E);
517    return P.takeError();
518  }
519  char ParseError::ID = 0;
520  
521  static std::vector<const Object::value_type *> sortedElements(const Object &O) {
522    std::vector<const Object::value_type *> Elements;
523    for (const auto &E : O)
524      Elements.push_back(&E);
525    llvm::sort(Elements,
526               [](const Object::value_type *L, const Object::value_type *R) {
527                 return L->first < R->first;
528               });
529    return Elements;
530  }
531  
532  bool isUTF8(llvm::StringRef S, size_t *ErrOffset) {
533    // Fast-path for ASCII, which is valid UTF-8.
534    if (LLVM_LIKELY(isASCII(S)))
535      return true;
536  
537    const UTF8 *Data = reinterpret_cast<const UTF8 *>(S.data()), *Rest = Data;
538    if (LLVM_LIKELY(isLegalUTF8String(&Rest, Data + S.size())))
539      return true;
540  
541    if (ErrOffset)
542      *ErrOffset = Rest - Data;
543    return false;
544  }
545  
546  std::string fixUTF8(llvm::StringRef S) {
547    // This isn't particularly efficient, but is only for error-recovery.
548    std::vector<UTF32> Codepoints(S.size()); // 1 codepoint per byte suffices.
549    const UTF8 *In8 = reinterpret_cast<const UTF8 *>(S.data());
550    UTF32 *Out32 = Codepoints.data();
551    ConvertUTF8toUTF32(&In8, In8 + S.size(), &Out32, Out32 + Codepoints.size(),
552                       lenientConversion);
553    Codepoints.resize(Out32 - Codepoints.data());
554    std::string Res(4 * Codepoints.size(), 0); // 4 bytes per codepoint suffice
555    const UTF32 *In32 = Codepoints.data();
556    UTF8 *Out8 = reinterpret_cast<UTF8 *>(&Res[0]);
557    ConvertUTF32toUTF8(&In32, In32 + Codepoints.size(), &Out8, Out8 + Res.size(),
558                       strictConversion);
559    Res.resize(reinterpret_cast<char *>(Out8) - Res.data());
560    return Res;
561  }
562  
563  static void quote(llvm::raw_ostream &OS, llvm::StringRef S) {
564    OS << '\"';
565    for (unsigned char C : S) {
566      if (C == 0x22 || C == 0x5C)
567        OS << '\\';
568      if (C >= 0x20) {
569        OS << C;
570        continue;
571      }
572      OS << '\\';
573      switch (C) {
574      // A few characters are common enough to make short escapes worthwhile.
575      case '\t':
576        OS << 't';
577        break;
578      case '\n':
579        OS << 'n';
580        break;
581      case '\r':
582        OS << 'r';
583        break;
584      default:
585        OS << 'u';
586        llvm::write_hex(OS, C, llvm::HexPrintStyle::Lower, 4);
587        break;
588      }
589    }
590    OS << '\"';
591  }
592  
593  void llvm::json::OStream::value(const Value &V) {
594    switch (V.kind()) {
595    case Value::Null:
596      valueBegin();
597      OS << "null";
598      return;
599    case Value::Boolean:
600      valueBegin();
601      OS << (*V.getAsBoolean() ? "true" : "false");
602      return;
603    case Value::Number:
604      valueBegin();
605      if (V.Type == Value::T_Integer)
606        OS << *V.getAsInteger();
607      else
608        OS << format("%.*g", std::numeric_limits<double>::max_digits10,
609                     *V.getAsNumber());
610      return;
611    case Value::String:
612      valueBegin();
613      quote(OS, *V.getAsString());
614      return;
615    case Value::Array:
616      return array([&] {
617        for (const Value &E : *V.getAsArray())
618          value(E);
619      });
620    case Value::Object:
621      return object([&] {
622        for (const Object::value_type *E : sortedElements(*V.getAsObject()))
623          attribute(E->first, E->second);
624      });
625    }
626  }
627  
628  void llvm::json::OStream::valueBegin() {
629    assert(Stack.back().Ctx != Object && "Only attributes allowed here");
630    if (Stack.back().HasValue) {
631      assert(Stack.back().Ctx != Singleton && "Only one value allowed here");
632      OS << ',';
633    }
634    if (Stack.back().Ctx == Array)
635      newline();
636    Stack.back().HasValue = true;
637  }
638  
639  void llvm::json::OStream::newline() {
640    if (IndentSize) {
641      OS.write('\n');
642      OS.indent(Indent);
643    }
644  }
645  
646  void llvm::json::OStream::arrayBegin() {
647    valueBegin();
648    Stack.emplace_back();
649    Stack.back().Ctx = Array;
650    Indent += IndentSize;
651    OS << '[';
652  }
653  
654  void llvm::json::OStream::arrayEnd() {
655    assert(Stack.back().Ctx == Array);
656    Indent -= IndentSize;
657    if (Stack.back().HasValue)
658      newline();
659    OS << ']';
660    Stack.pop_back();
661    assert(!Stack.empty());
662  }
663  
664  void llvm::json::OStream::objectBegin() {
665    valueBegin();
666    Stack.emplace_back();
667    Stack.back().Ctx = Object;
668    Indent += IndentSize;
669    OS << '{';
670  }
671  
672  void llvm::json::OStream::objectEnd() {
673    assert(Stack.back().Ctx == Object);
674    Indent -= IndentSize;
675    if (Stack.back().HasValue)
676      newline();
677    OS << '}';
678    Stack.pop_back();
679    assert(!Stack.empty());
680  }
681  
682  void llvm::json::OStream::attributeBegin(llvm::StringRef Key) {
683    assert(Stack.back().Ctx == Object);
684    if (Stack.back().HasValue)
685      OS << ',';
686    newline();
687    Stack.back().HasValue = true;
688    Stack.emplace_back();
689    Stack.back().Ctx = Singleton;
690    if (LLVM_LIKELY(isUTF8(Key))) {
691      quote(OS, Key);
692    } else {
693      assert(false && "Invalid UTF-8 in attribute key");
694      quote(OS, fixUTF8(Key));
695    }
696    OS.write(':');
697    if (IndentSize)
698      OS.write(' ');
699  }
700  
701  void llvm::json::OStream::attributeEnd() {
702    assert(Stack.back().Ctx == Singleton);
703    assert(Stack.back().HasValue && "Attribute must have a value");
704    Stack.pop_back();
705    assert(Stack.back().Ctx == Object);
706  }
707  
708  } // namespace json
709  } // namespace llvm
710  
711  void llvm::format_provider<llvm::json::Value>::format(
712      const llvm::json::Value &E, raw_ostream &OS, StringRef Options) {
713    unsigned IndentAmount = 0;
714    if (!Options.empty() && Options.getAsInteger(/*Radix=*/10, IndentAmount))
715      llvm_unreachable("json::Value format options should be an integer");
716    json::OStream(OS, IndentAmount).value(E);
717  }
718  
719