1 //=== JSON.cpp - JSON value, parsing and serialization - C++ -----------*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===---------------------------------------------------------------------===// 8 9 #include "llvm/Support/JSON.h" 10 #include "llvm/Support/ConvertUTF.h" 11 #include "llvm/Support/Format.h" 12 #include <cctype> 13 14 namespace llvm { 15 namespace json { 16 17 Value &Object::operator[](const ObjectKey &K) { 18 return try_emplace(K, nullptr).first->getSecond(); 19 } 20 Value &Object::operator[](ObjectKey &&K) { 21 return try_emplace(std::move(K), nullptr).first->getSecond(); 22 } 23 Value *Object::get(StringRef K) { 24 auto I = find(K); 25 if (I == end()) 26 return nullptr; 27 return &I->second; 28 } 29 const Value *Object::get(StringRef K) const { 30 auto I = find(K); 31 if (I == end()) 32 return nullptr; 33 return &I->second; 34 } 35 llvm::Optional<std::nullptr_t> Object::getNull(StringRef K) const { 36 if (auto *V = get(K)) 37 return V->getAsNull(); 38 return llvm::None; 39 } 40 llvm::Optional<bool> Object::getBoolean(StringRef K) const { 41 if (auto *V = get(K)) 42 return V->getAsBoolean(); 43 return llvm::None; 44 } 45 llvm::Optional<double> Object::getNumber(StringRef K) const { 46 if (auto *V = get(K)) 47 return V->getAsNumber(); 48 return llvm::None; 49 } 50 llvm::Optional<int64_t> Object::getInteger(StringRef K) const { 51 if (auto *V = get(K)) 52 return V->getAsInteger(); 53 return llvm::None; 54 } 55 llvm::Optional<llvm::StringRef> Object::getString(StringRef K) const { 56 if (auto *V = get(K)) 57 return V->getAsString(); 58 return llvm::None; 59 } 60 const json::Object *Object::getObject(StringRef K) const { 61 if (auto *V = get(K)) 62 return V->getAsObject(); 63 return nullptr; 64 } 65 json::Object *Object::getObject(StringRef K) { 66 if (auto *V = get(K)) 67 return V->getAsObject(); 68 return nullptr; 69 } 70 const json::Array *Object::getArray(StringRef K) const { 71 if (auto *V = get(K)) 72 return V->getAsArray(); 73 return nullptr; 74 } 75 json::Array *Object::getArray(StringRef K) { 76 if (auto *V = get(K)) 77 return V->getAsArray(); 78 return nullptr; 79 } 80 bool operator==(const Object &LHS, const Object &RHS) { 81 if (LHS.size() != RHS.size()) 82 return false; 83 for (const auto &L : LHS) { 84 auto R = RHS.find(L.first); 85 if (R == RHS.end() || L.second != R->second) 86 return false; 87 } 88 return true; 89 } 90 91 Array::Array(std::initializer_list<Value> Elements) { 92 V.reserve(Elements.size()); 93 for (const Value &V : Elements) { 94 emplace_back(nullptr); 95 back().moveFrom(std::move(V)); 96 } 97 } 98 99 Value::Value(std::initializer_list<Value> Elements) 100 : Value(json::Array(Elements)) {} 101 102 void Value::copyFrom(const Value &M) { 103 Type = M.Type; 104 switch (Type) { 105 case T_Null: 106 case T_Boolean: 107 case T_Double: 108 case T_Integer: 109 memcpy(Union.buffer, M.Union.buffer, sizeof(Union.buffer)); 110 break; 111 case T_StringRef: 112 create<StringRef>(M.as<StringRef>()); 113 break; 114 case T_String: 115 create<std::string>(M.as<std::string>()); 116 break; 117 case T_Object: 118 create<json::Object>(M.as<json::Object>()); 119 break; 120 case T_Array: 121 create<json::Array>(M.as<json::Array>()); 122 break; 123 } 124 } 125 126 void Value::moveFrom(const Value &&M) { 127 Type = M.Type; 128 switch (Type) { 129 case T_Null: 130 case T_Boolean: 131 case T_Double: 132 case T_Integer: 133 memcpy(Union.buffer, M.Union.buffer, sizeof(Union.buffer)); 134 break; 135 case T_StringRef: 136 create<StringRef>(M.as<StringRef>()); 137 break; 138 case T_String: 139 create<std::string>(std::move(M.as<std::string>())); 140 M.Type = T_Null; 141 break; 142 case T_Object: 143 create<json::Object>(std::move(M.as<json::Object>())); 144 M.Type = T_Null; 145 break; 146 case T_Array: 147 create<json::Array>(std::move(M.as<json::Array>())); 148 M.Type = T_Null; 149 break; 150 } 151 } 152 153 void Value::destroy() { 154 switch (Type) { 155 case T_Null: 156 case T_Boolean: 157 case T_Double: 158 case T_Integer: 159 break; 160 case T_StringRef: 161 as<StringRef>().~StringRef(); 162 break; 163 case T_String: 164 as<std::string>().~basic_string(); 165 break; 166 case T_Object: 167 as<json::Object>().~Object(); 168 break; 169 case T_Array: 170 as<json::Array>().~Array(); 171 break; 172 } 173 } 174 175 bool operator==(const Value &L, const Value &R) { 176 if (L.kind() != R.kind()) 177 return false; 178 switch (L.kind()) { 179 case Value::Null: 180 return *L.getAsNull() == *R.getAsNull(); 181 case Value::Boolean: 182 return *L.getAsBoolean() == *R.getAsBoolean(); 183 case Value::Number: 184 // Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=323 185 // The same integer must convert to the same double, per the standard. 186 // However we see 64-vs-80-bit precision comparisons with gcc-7 -O3 -m32. 187 // So we avoid floating point promotion for exact comparisons. 188 if (L.Type == Value::T_Integer || R.Type == Value::T_Integer) 189 return L.getAsInteger() == R.getAsInteger(); 190 return *L.getAsNumber() == *R.getAsNumber(); 191 case Value::String: 192 return *L.getAsString() == *R.getAsString(); 193 case Value::Array: 194 return *L.getAsArray() == *R.getAsArray(); 195 case Value::Object: 196 return *L.getAsObject() == *R.getAsObject(); 197 } 198 llvm_unreachable("Unknown value kind"); 199 } 200 201 namespace { 202 // Simple recursive-descent JSON parser. 203 class Parser { 204 public: 205 Parser(StringRef JSON) 206 : Start(JSON.begin()), P(JSON.begin()), End(JSON.end()) {} 207 208 bool checkUTF8() { 209 size_t ErrOffset; 210 if (isUTF8(StringRef(Start, End - Start), &ErrOffset)) 211 return true; 212 P = Start + ErrOffset; // For line/column calculation. 213 return parseError("Invalid UTF-8 sequence"); 214 } 215 216 bool parseValue(Value &Out); 217 218 bool assertEnd() { 219 eatWhitespace(); 220 if (P == End) 221 return true; 222 return parseError("Text after end of document"); 223 } 224 225 Error takeError() { 226 assert(Err); 227 return std::move(*Err); 228 } 229 230 private: 231 void eatWhitespace() { 232 while (P != End && (*P == ' ' || *P == '\r' || *P == '\n' || *P == '\t')) 233 ++P; 234 } 235 236 // On invalid syntax, parseX() functions return false and set Err. 237 bool parseNumber(char First, Value &Out); 238 bool parseString(std::string &Out); 239 bool parseUnicode(std::string &Out); 240 bool parseError(const char *Msg); // always returns false 241 242 char next() { return P == End ? 0 : *P++; } 243 char peek() { return P == End ? 0 : *P; } 244 static bool isNumber(char C) { 245 return C == '0' || C == '1' || C == '2' || C == '3' || C == '4' || 246 C == '5' || C == '6' || C == '7' || C == '8' || C == '9' || 247 C == 'e' || C == 'E' || C == '+' || C == '-' || C == '.'; 248 } 249 250 Optional<Error> Err; 251 const char *Start, *P, *End; 252 }; 253 254 bool Parser::parseValue(Value &Out) { 255 eatWhitespace(); 256 if (P == End) 257 return parseError("Unexpected EOF"); 258 switch (char C = next()) { 259 // Bare null/true/false are easy - first char identifies them. 260 case 'n': 261 Out = nullptr; 262 return (next() == 'u' && next() == 'l' && next() == 'l') || 263 parseError("Invalid JSON value (null?)"); 264 case 't': 265 Out = true; 266 return (next() == 'r' && next() == 'u' && next() == 'e') || 267 parseError("Invalid JSON value (true?)"); 268 case 'f': 269 Out = false; 270 return (next() == 'a' && next() == 'l' && next() == 's' && next() == 'e') || 271 parseError("Invalid JSON value (false?)"); 272 case '"': { 273 std::string S; 274 if (parseString(S)) { 275 Out = std::move(S); 276 return true; 277 } 278 return false; 279 } 280 case '[': { 281 Out = Array{}; 282 Array &A = *Out.getAsArray(); 283 eatWhitespace(); 284 if (peek() == ']') { 285 ++P; 286 return true; 287 } 288 for (;;) { 289 A.emplace_back(nullptr); 290 if (!parseValue(A.back())) 291 return false; 292 eatWhitespace(); 293 switch (next()) { 294 case ',': 295 eatWhitespace(); 296 continue; 297 case ']': 298 return true; 299 default: 300 return parseError("Expected , or ] after array element"); 301 } 302 } 303 } 304 case '{': { 305 Out = Object{}; 306 Object &O = *Out.getAsObject(); 307 eatWhitespace(); 308 if (peek() == '}') { 309 ++P; 310 return true; 311 } 312 for (;;) { 313 if (next() != '"') 314 return parseError("Expected object key"); 315 std::string K; 316 if (!parseString(K)) 317 return false; 318 eatWhitespace(); 319 if (next() != ':') 320 return parseError("Expected : after object key"); 321 eatWhitespace(); 322 if (!parseValue(O[std::move(K)])) 323 return false; 324 eatWhitespace(); 325 switch (next()) { 326 case ',': 327 eatWhitespace(); 328 continue; 329 case '}': 330 return true; 331 default: 332 return parseError("Expected , or } after object property"); 333 } 334 } 335 } 336 default: 337 if (isNumber(C)) 338 return parseNumber(C, Out); 339 return parseError("Invalid JSON value"); 340 } 341 } 342 343 bool Parser::parseNumber(char First, Value &Out) { 344 // Read the number into a string. (Must be null-terminated for strto*). 345 SmallString<24> S; 346 S.push_back(First); 347 while (isNumber(peek())) 348 S.push_back(next()); 349 char *End; 350 // Try first to parse as integer, and if so preserve full 64 bits. 351 // strtoll returns long long >= 64 bits, so check it's in range too. 352 auto I = std::strtoll(S.c_str(), &End, 10); 353 if (End == S.end() && I >= std::numeric_limits<int64_t>::min() && 354 I <= std::numeric_limits<int64_t>::max()) { 355 Out = int64_t(I); 356 return true; 357 } 358 // If it's not an integer 359 Out = std::strtod(S.c_str(), &End); 360 return End == S.end() || parseError("Invalid JSON value (number?)"); 361 } 362 363 bool Parser::parseString(std::string &Out) { 364 // leading quote was already consumed. 365 for (char C = next(); C != '"'; C = next()) { 366 if (LLVM_UNLIKELY(P == End)) 367 return parseError("Unterminated string"); 368 if (LLVM_UNLIKELY((C & 0x1f) == C)) 369 return parseError("Control character in string"); 370 if (LLVM_LIKELY(C != '\\')) { 371 Out.push_back(C); 372 continue; 373 } 374 // Handle escape sequence. 375 switch (C = next()) { 376 case '"': 377 case '\\': 378 case '/': 379 Out.push_back(C); 380 break; 381 case 'b': 382 Out.push_back('\b'); 383 break; 384 case 'f': 385 Out.push_back('\f'); 386 break; 387 case 'n': 388 Out.push_back('\n'); 389 break; 390 case 'r': 391 Out.push_back('\r'); 392 break; 393 case 't': 394 Out.push_back('\t'); 395 break; 396 case 'u': 397 if (!parseUnicode(Out)) 398 return false; 399 break; 400 default: 401 return parseError("Invalid escape sequence"); 402 } 403 } 404 return true; 405 } 406 407 static void encodeUtf8(uint32_t Rune, std::string &Out) { 408 if (Rune < 0x80) { 409 Out.push_back(Rune & 0x7F); 410 } else if (Rune < 0x800) { 411 uint8_t FirstByte = 0xC0 | ((Rune & 0x7C0) >> 6); 412 uint8_t SecondByte = 0x80 | (Rune & 0x3F); 413 Out.push_back(FirstByte); 414 Out.push_back(SecondByte); 415 } else if (Rune < 0x10000) { 416 uint8_t FirstByte = 0xE0 | ((Rune & 0xF000) >> 12); 417 uint8_t SecondByte = 0x80 | ((Rune & 0xFC0) >> 6); 418 uint8_t ThirdByte = 0x80 | (Rune & 0x3F); 419 Out.push_back(FirstByte); 420 Out.push_back(SecondByte); 421 Out.push_back(ThirdByte); 422 } else if (Rune < 0x110000) { 423 uint8_t FirstByte = 0xF0 | ((Rune & 0x1F0000) >> 18); 424 uint8_t SecondByte = 0x80 | ((Rune & 0x3F000) >> 12); 425 uint8_t ThirdByte = 0x80 | ((Rune & 0xFC0) >> 6); 426 uint8_t FourthByte = 0x80 | (Rune & 0x3F); 427 Out.push_back(FirstByte); 428 Out.push_back(SecondByte); 429 Out.push_back(ThirdByte); 430 Out.push_back(FourthByte); 431 } else { 432 llvm_unreachable("Invalid codepoint"); 433 } 434 } 435 436 // Parse a UTF-16 \uNNNN escape sequence. "\u" has already been consumed. 437 // May parse several sequential escapes to ensure proper surrogate handling. 438 // We do not use ConvertUTF.h, it can't accept and replace unpaired surrogates. 439 // These are invalid Unicode but valid JSON (RFC 8259, section 8.2). 440 bool Parser::parseUnicode(std::string &Out) { 441 // Invalid UTF is not a JSON error (RFC 8529§8.2). It gets replaced by U+FFFD. 442 auto Invalid = [&] { Out.append(/* UTF-8 */ {'\xef', '\xbf', '\xbd'}); }; 443 // Decodes 4 hex digits from the stream into Out, returns false on error. 444 auto Parse4Hex = [this](uint16_t &Out) -> bool { 445 Out = 0; 446 char Bytes[] = {next(), next(), next(), next()}; 447 for (unsigned char C : Bytes) { 448 if (!std::isxdigit(C)) 449 return parseError("Invalid \\u escape sequence"); 450 Out <<= 4; 451 Out |= (C > '9') ? (C & ~0x20) - 'A' + 10 : (C - '0'); 452 } 453 return true; 454 }; 455 uint16_t First; // UTF-16 code unit from the first \u escape. 456 if (!Parse4Hex(First)) 457 return false; 458 459 // We loop to allow proper surrogate-pair error handling. 460 while (true) { 461 // Case 1: the UTF-16 code unit is already a codepoint in the BMP. 462 if (LLVM_LIKELY(First < 0xD800 || First >= 0xE000)) { 463 encodeUtf8(First, Out); 464 return true; 465 } 466 467 // Case 2: it's an (unpaired) trailing surrogate. 468 if (LLVM_UNLIKELY(First >= 0xDC00)) { 469 Invalid(); 470 return true; 471 } 472 473 // Case 3: it's a leading surrogate. We expect a trailing one next. 474 // Case 3a: there's no trailing \u escape. Don't advance in the stream. 475 if (LLVM_UNLIKELY(P + 2 > End || *P != '\\' || *(P + 1) != 'u')) { 476 Invalid(); // Leading surrogate was unpaired. 477 return true; 478 } 479 P += 2; 480 uint16_t Second; 481 if (!Parse4Hex(Second)) 482 return false; 483 // Case 3b: there was another \u escape, but it wasn't a trailing surrogate. 484 if (LLVM_UNLIKELY(Second < 0xDC00 || Second >= 0xE000)) { 485 Invalid(); // Leading surrogate was unpaired. 486 First = Second; // Second escape still needs to be processed. 487 continue; 488 } 489 // Case 3c: a valid surrogate pair encoding an astral codepoint. 490 encodeUtf8(0x10000 | ((First - 0xD800) << 10) | (Second - 0xDC00), Out); 491 return true; 492 } 493 } 494 495 bool Parser::parseError(const char *Msg) { 496 int Line = 1; 497 const char *StartOfLine = Start; 498 for (const char *X = Start; X < P; ++X) { 499 if (*X == 0x0A) { 500 ++Line; 501 StartOfLine = X + 1; 502 } 503 } 504 Err.emplace( 505 std::make_unique<ParseError>(Msg, Line, P - StartOfLine, P - Start)); 506 return false; 507 } 508 } // namespace 509 510 Expected<Value> parse(StringRef JSON) { 511 Parser P(JSON); 512 Value E = nullptr; 513 if (P.checkUTF8()) 514 if (P.parseValue(E)) 515 if (P.assertEnd()) 516 return std::move(E); 517 return P.takeError(); 518 } 519 char ParseError::ID = 0; 520 521 static std::vector<const Object::value_type *> sortedElements(const Object &O) { 522 std::vector<const Object::value_type *> Elements; 523 for (const auto &E : O) 524 Elements.push_back(&E); 525 llvm::sort(Elements, 526 [](const Object::value_type *L, const Object::value_type *R) { 527 return L->first < R->first; 528 }); 529 return Elements; 530 } 531 532 bool isUTF8(llvm::StringRef S, size_t *ErrOffset) { 533 // Fast-path for ASCII, which is valid UTF-8. 534 if (LLVM_LIKELY(isASCII(S))) 535 return true; 536 537 const UTF8 *Data = reinterpret_cast<const UTF8 *>(S.data()), *Rest = Data; 538 if (LLVM_LIKELY(isLegalUTF8String(&Rest, Data + S.size()))) 539 return true; 540 541 if (ErrOffset) 542 *ErrOffset = Rest - Data; 543 return false; 544 } 545 546 std::string fixUTF8(llvm::StringRef S) { 547 // This isn't particularly efficient, but is only for error-recovery. 548 std::vector<UTF32> Codepoints(S.size()); // 1 codepoint per byte suffices. 549 const UTF8 *In8 = reinterpret_cast<const UTF8 *>(S.data()); 550 UTF32 *Out32 = Codepoints.data(); 551 ConvertUTF8toUTF32(&In8, In8 + S.size(), &Out32, Out32 + Codepoints.size(), 552 lenientConversion); 553 Codepoints.resize(Out32 - Codepoints.data()); 554 std::string Res(4 * Codepoints.size(), 0); // 4 bytes per codepoint suffice 555 const UTF32 *In32 = Codepoints.data(); 556 UTF8 *Out8 = reinterpret_cast<UTF8 *>(&Res[0]); 557 ConvertUTF32toUTF8(&In32, In32 + Codepoints.size(), &Out8, Out8 + Res.size(), 558 strictConversion); 559 Res.resize(reinterpret_cast<char *>(Out8) - Res.data()); 560 return Res; 561 } 562 563 static void quote(llvm::raw_ostream &OS, llvm::StringRef S) { 564 OS << '\"'; 565 for (unsigned char C : S) { 566 if (C == 0x22 || C == 0x5C) 567 OS << '\\'; 568 if (C >= 0x20) { 569 OS << C; 570 continue; 571 } 572 OS << '\\'; 573 switch (C) { 574 // A few characters are common enough to make short escapes worthwhile. 575 case '\t': 576 OS << 't'; 577 break; 578 case '\n': 579 OS << 'n'; 580 break; 581 case '\r': 582 OS << 'r'; 583 break; 584 default: 585 OS << 'u'; 586 llvm::write_hex(OS, C, llvm::HexPrintStyle::Lower, 4); 587 break; 588 } 589 } 590 OS << '\"'; 591 } 592 593 void llvm::json::OStream::value(const Value &V) { 594 switch (V.kind()) { 595 case Value::Null: 596 valueBegin(); 597 OS << "null"; 598 return; 599 case Value::Boolean: 600 valueBegin(); 601 OS << (*V.getAsBoolean() ? "true" : "false"); 602 return; 603 case Value::Number: 604 valueBegin(); 605 if (V.Type == Value::T_Integer) 606 OS << *V.getAsInteger(); 607 else 608 OS << format("%.*g", std::numeric_limits<double>::max_digits10, 609 *V.getAsNumber()); 610 return; 611 case Value::String: 612 valueBegin(); 613 quote(OS, *V.getAsString()); 614 return; 615 case Value::Array: 616 return array([&] { 617 for (const Value &E : *V.getAsArray()) 618 value(E); 619 }); 620 case Value::Object: 621 return object([&] { 622 for (const Object::value_type *E : sortedElements(*V.getAsObject())) 623 attribute(E->first, E->second); 624 }); 625 } 626 } 627 628 void llvm::json::OStream::valueBegin() { 629 assert(Stack.back().Ctx != Object && "Only attributes allowed here"); 630 if (Stack.back().HasValue) { 631 assert(Stack.back().Ctx != Singleton && "Only one value allowed here"); 632 OS << ','; 633 } 634 if (Stack.back().Ctx == Array) 635 newline(); 636 Stack.back().HasValue = true; 637 } 638 639 void llvm::json::OStream::newline() { 640 if (IndentSize) { 641 OS.write('\n'); 642 OS.indent(Indent); 643 } 644 } 645 646 void llvm::json::OStream::arrayBegin() { 647 valueBegin(); 648 Stack.emplace_back(); 649 Stack.back().Ctx = Array; 650 Indent += IndentSize; 651 OS << '['; 652 } 653 654 void llvm::json::OStream::arrayEnd() { 655 assert(Stack.back().Ctx == Array); 656 Indent -= IndentSize; 657 if (Stack.back().HasValue) 658 newline(); 659 OS << ']'; 660 Stack.pop_back(); 661 assert(!Stack.empty()); 662 } 663 664 void llvm::json::OStream::objectBegin() { 665 valueBegin(); 666 Stack.emplace_back(); 667 Stack.back().Ctx = Object; 668 Indent += IndentSize; 669 OS << '{'; 670 } 671 672 void llvm::json::OStream::objectEnd() { 673 assert(Stack.back().Ctx == Object); 674 Indent -= IndentSize; 675 if (Stack.back().HasValue) 676 newline(); 677 OS << '}'; 678 Stack.pop_back(); 679 assert(!Stack.empty()); 680 } 681 682 void llvm::json::OStream::attributeBegin(llvm::StringRef Key) { 683 assert(Stack.back().Ctx == Object); 684 if (Stack.back().HasValue) 685 OS << ','; 686 newline(); 687 Stack.back().HasValue = true; 688 Stack.emplace_back(); 689 Stack.back().Ctx = Singleton; 690 if (LLVM_LIKELY(isUTF8(Key))) { 691 quote(OS, Key); 692 } else { 693 assert(false && "Invalid UTF-8 in attribute key"); 694 quote(OS, fixUTF8(Key)); 695 } 696 OS.write(':'); 697 if (IndentSize) 698 OS.write(' '); 699 } 700 701 void llvm::json::OStream::attributeEnd() { 702 assert(Stack.back().Ctx == Singleton); 703 assert(Stack.back().HasValue && "Attribute must have a value"); 704 Stack.pop_back(); 705 assert(Stack.back().Ctx == Object); 706 } 707 708 } // namespace json 709 } // namespace llvm 710 711 void llvm::format_provider<llvm::json::Value>::format( 712 const llvm::json::Value &E, raw_ostream &OS, StringRef Options) { 713 unsigned IndentAmount = 0; 714 if (!Options.empty() && Options.getAsInteger(/*Radix=*/10, IndentAmount)) 715 llvm_unreachable("json::Value format options should be an integer"); 716 json::OStream(OS, IndentAmount).value(E); 717 } 718 719