1 //===-- StringPrinter.cpp -------------------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "lldb/DataFormatters/StringPrinter.h" 10 11 #include "lldb/Core/Debugger.h" 12 #include "lldb/Core/ValueObject.h" 13 #include "lldb/Target/Language.h" 14 #include "lldb/Target/Process.h" 15 #include "lldb/Target/Target.h" 16 #include "lldb/Utility/Status.h" 17 18 #include "llvm/ADT/StringExtras.h" 19 #include "llvm/Support/ConvertUTF.h" 20 21 #include <cctype> 22 #include <locale> 23 #include <memory> 24 25 using namespace lldb; 26 using namespace lldb_private; 27 using namespace lldb_private::formatters; 28 using GetPrintableElementType = StringPrinter::GetPrintableElementType; 29 using StringElementType = StringPrinter::StringElementType; 30 31 /// DecodedCharBuffer stores the decoded contents of a single character. It 32 /// avoids managing memory on the heap by copying decoded bytes into an in-line 33 /// buffer. 34 class DecodedCharBuffer { 35 public: 36 DecodedCharBuffer(std::nullptr_t) {} 37 38 DecodedCharBuffer(const uint8_t *bytes, size_t size) : m_size(size) { 39 if (size > MaxLength) 40 llvm_unreachable("unsupported length"); 41 memcpy(m_data, bytes, size); 42 } 43 44 DecodedCharBuffer(const char *bytes, size_t size) 45 : DecodedCharBuffer(reinterpret_cast<const uint8_t *>(bytes), size) {} 46 47 const uint8_t *GetBytes() const { return m_data; } 48 49 size_t GetSize() const { return m_size; } 50 51 private: 52 static constexpr unsigned MaxLength = 16; 53 54 size_t m_size = 0; 55 uint8_t m_data[MaxLength] = {0}; 56 }; 57 58 using EscapingHelper = 59 std::function<DecodedCharBuffer(uint8_t *, uint8_t *, uint8_t *&)>; 60 61 // we define this for all values of type but only implement it for those we 62 // care about that's good because we get linker errors for any unsupported type 63 template <StringElementType type> 64 static DecodedCharBuffer 65 GetPrintableImpl(uint8_t *buffer, uint8_t *buffer_end, uint8_t *&next, 66 StringPrinter::EscapeStyle escape_style); 67 68 // Mimic isprint() for Unicode codepoints. 69 static bool isprint32(char32_t codepoint) { 70 if (codepoint <= 0x1F || codepoint == 0x7F) // C0 71 { 72 return false; 73 } 74 if (codepoint >= 0x80 && codepoint <= 0x9F) // C1 75 { 76 return false; 77 } 78 if (codepoint == 0x2028 || codepoint == 0x2029) // line/paragraph separators 79 { 80 return false; 81 } 82 if (codepoint == 0x200E || codepoint == 0x200F || 83 (codepoint >= 0x202A && 84 codepoint <= 0x202E)) // bidirectional text control 85 { 86 return false; 87 } 88 if (codepoint >= 0xFFF9 && 89 codepoint <= 0xFFFF) // interlinears and generally specials 90 { 91 return false; 92 } 93 return true; 94 } 95 96 DecodedCharBuffer attemptASCIIEscape(llvm::UTF32 c, 97 StringPrinter::EscapeStyle escape_style) { 98 const bool is_swift_escape_style = 99 escape_style == StringPrinter::EscapeStyle::Swift; 100 switch (c) { 101 case 0: 102 return {"\\0", 2}; 103 case '\a': 104 return {"\\a", 2}; 105 case '\b': 106 if (is_swift_escape_style) 107 return nullptr; 108 return {"\\b", 2}; 109 case '\f': 110 if (is_swift_escape_style) 111 return nullptr; 112 return {"\\f", 2}; 113 case '\n': 114 return {"\\n", 2}; 115 case '\r': 116 return {"\\r", 2}; 117 case '\t': 118 return {"\\t", 2}; 119 case '\v': 120 if (is_swift_escape_style) 121 return nullptr; 122 return {"\\v", 2}; 123 case '\"': 124 return {"\\\"", 2}; 125 case '\'': 126 if (is_swift_escape_style) 127 return {"\\'", 2}; 128 return nullptr; 129 case '\\': 130 return {"\\\\", 2}; 131 } 132 return nullptr; 133 } 134 135 template <> 136 DecodedCharBuffer GetPrintableImpl<StringElementType::ASCII>( 137 uint8_t *buffer, uint8_t *buffer_end, uint8_t *&next, 138 StringPrinter::EscapeStyle escape_style) { 139 // The ASCII helper always advances 1 byte at a time. 140 next = buffer + 1; 141 142 DecodedCharBuffer retval = attemptASCIIEscape(*buffer, escape_style); 143 if (retval.GetSize()) 144 return retval; 145 146 // Use llvm's locale-independent isPrint(char), instead of the libc 147 // implementation which may give different results on different platforms. 148 if (llvm::isPrint(*buffer)) 149 return {buffer, 1}; 150 151 unsigned escaped_len; 152 constexpr unsigned max_buffer_size = 7; 153 uint8_t data[max_buffer_size]; 154 switch (escape_style) { 155 case StringPrinter::EscapeStyle::CXX: 156 // Prints 4 characters, then a \0 terminator. 157 escaped_len = snprintf((char *)data, max_buffer_size, "\\x%02x", *buffer); 158 break; 159 case StringPrinter::EscapeStyle::Swift: 160 // Prints up to 6 characters, then a \0 terminator. 161 escaped_len = snprintf((char *)data, max_buffer_size, "\\u{%x}", *buffer); 162 break; 163 } 164 lldbassert(escaped_len > 0 && "unknown string escape style"); 165 return {data, escaped_len}; 166 } 167 168 template <> 169 DecodedCharBuffer GetPrintableImpl<StringElementType::UTF8>( 170 uint8_t *buffer, uint8_t *buffer_end, uint8_t *&next, 171 StringPrinter::EscapeStyle escape_style) { 172 // If the utf8 encoded length is invalid (i.e., not in the closed interval 173 // [1;4]), or if there aren't enough bytes to print, or if the subsequence 174 // isn't valid utf8, fall back to printing an ASCII-escaped subsequence. 175 if (!llvm::isLegalUTF8Sequence(buffer, buffer_end)) 176 return GetPrintableImpl<StringElementType::ASCII>(buffer, buffer_end, next, 177 escape_style); 178 179 // Convert the valid utf8 sequence to a utf32 codepoint. This cannot fail. 180 llvm::UTF32 codepoint = 0; 181 const llvm::UTF8 *buffer_for_conversion = buffer; 182 llvm::ConversionResult result = llvm::convertUTF8Sequence( 183 &buffer_for_conversion, buffer_end, &codepoint, llvm::strictConversion); 184 assert(result == llvm::conversionOK && 185 "Failed to convert legal utf8 sequence"); 186 UNUSED_IF_ASSERT_DISABLED(result); 187 188 // The UTF8 helper always advances by the utf8 encoded length. 189 const unsigned utf8_encoded_len = buffer_for_conversion - buffer; 190 next = buffer + utf8_encoded_len; 191 192 DecodedCharBuffer retval = attemptASCIIEscape(codepoint, escape_style); 193 if (retval.GetSize()) 194 return retval; 195 if (isprint32(codepoint)) 196 return {buffer, utf8_encoded_len}; 197 198 unsigned escaped_len; 199 constexpr unsigned max_buffer_size = 13; 200 uint8_t data[max_buffer_size]; 201 switch (escape_style) { 202 case StringPrinter::EscapeStyle::CXX: 203 // Prints 10 characters, then a \0 terminator. 204 escaped_len = snprintf((char *)data, max_buffer_size, "\\U%08x", codepoint); 205 break; 206 case StringPrinter::EscapeStyle::Swift: 207 // Prints up to 12 characters, then a \0 terminator. 208 escaped_len = snprintf((char *)data, max_buffer_size, "\\u{%x}", codepoint); 209 break; 210 } 211 lldbassert(escaped_len > 0 && "unknown string escape style"); 212 return {data, escaped_len}; 213 } 214 215 // Given a sequence of bytes, this function returns: a sequence of bytes to 216 // actually print out + a length the following unscanned position of the buffer 217 // is in next 218 static DecodedCharBuffer GetPrintable(StringElementType type, uint8_t *buffer, 219 uint8_t *buffer_end, uint8_t *&next, 220 StringPrinter::EscapeStyle escape_style) { 221 if (!buffer || buffer >= buffer_end) 222 return {nullptr}; 223 224 switch (type) { 225 case StringElementType::ASCII: 226 return GetPrintableImpl<StringElementType::ASCII>(buffer, buffer_end, next, 227 escape_style); 228 case StringElementType::UTF8: 229 return GetPrintableImpl<StringElementType::UTF8>(buffer, buffer_end, next, 230 escape_style); 231 default: 232 return {nullptr}; 233 } 234 } 235 236 static EscapingHelper 237 GetDefaultEscapingHelper(GetPrintableElementType elem_type, 238 StringPrinter::EscapeStyle escape_style) { 239 switch (elem_type) { 240 case GetPrintableElementType::UTF8: 241 case GetPrintableElementType::ASCII: 242 return [escape_style, elem_type](uint8_t *buffer, uint8_t *buffer_end, 243 uint8_t *&next) -> DecodedCharBuffer { 244 return GetPrintable(elem_type == GetPrintableElementType::UTF8 245 ? StringElementType::UTF8 246 : StringElementType::ASCII, 247 buffer, buffer_end, next, escape_style); 248 }; 249 } 250 llvm_unreachable("bad element type"); 251 } 252 253 /// Read a string encoded in accordance with \tparam SourceDataType from a 254 /// host-side LLDB buffer, then pretty-print it to a stream using \p style. 255 template <typename SourceDataType> 256 static bool DumpEncodedBufferToStream( 257 GetPrintableElementType style, 258 llvm::ConversionResult (*ConvertFunction)(const SourceDataType **, 259 const SourceDataType *, 260 llvm::UTF8 **, llvm::UTF8 *, 261 llvm::ConversionFlags), 262 const StringPrinter::ReadBufferAndDumpToStreamOptions &dump_options) { 263 assert(dump_options.GetStream() && "need a Stream to print the string to"); 264 Stream &stream(*dump_options.GetStream()); 265 if (dump_options.GetPrefixToken() != nullptr) 266 stream.Printf("%s", dump_options.GetPrefixToken()); 267 if (dump_options.GetQuote() != 0) 268 stream.Printf("%c", dump_options.GetQuote()); 269 auto data(dump_options.GetData()); 270 auto source_size(dump_options.GetSourceSize()); 271 if (data.GetByteSize() && data.GetDataStart() && data.GetDataEnd()) { 272 const int bufferSPSize = data.GetByteSize(); 273 if (dump_options.GetSourceSize() == 0) { 274 const int origin_encoding = 8 * sizeof(SourceDataType); 275 source_size = bufferSPSize / (origin_encoding / 4); 276 } 277 278 const SourceDataType *data_ptr = 279 (const SourceDataType *)data.GetDataStart(); 280 const SourceDataType *data_end_ptr = data_ptr + source_size; 281 282 const bool zero_is_terminator = dump_options.GetBinaryZeroIsTerminator(); 283 284 if (zero_is_terminator) { 285 while (data_ptr < data_end_ptr) { 286 if (!*data_ptr) { 287 data_end_ptr = data_ptr; 288 break; 289 } 290 data_ptr++; 291 } 292 293 data_ptr = (const SourceDataType *)data.GetDataStart(); 294 } 295 296 lldb::WritableDataBufferSP utf8_data_buffer_sp; 297 llvm::UTF8 *utf8_data_ptr = nullptr; 298 llvm::UTF8 *utf8_data_end_ptr = nullptr; 299 300 if (ConvertFunction) { 301 utf8_data_buffer_sp = 302 std::make_shared<DataBufferHeap>(4 * bufferSPSize, 0); 303 utf8_data_ptr = (llvm::UTF8 *)utf8_data_buffer_sp->GetBytes(); 304 utf8_data_end_ptr = utf8_data_ptr + utf8_data_buffer_sp->GetByteSize(); 305 ConvertFunction(&data_ptr, data_end_ptr, &utf8_data_ptr, 306 utf8_data_end_ptr, llvm::lenientConversion); 307 if (!zero_is_terminator) 308 utf8_data_end_ptr = utf8_data_ptr; 309 // needed because the ConvertFunction will change the value of the 310 // data_ptr. 311 utf8_data_ptr = 312 (llvm::UTF8 *)utf8_data_buffer_sp->GetBytes(); 313 } else { 314 // just copy the pointers - the cast is necessary to make the compiler 315 // happy but this should only happen if we are reading UTF8 data 316 utf8_data_ptr = const_cast<llvm::UTF8 *>( 317 reinterpret_cast<const llvm::UTF8 *>(data_ptr)); 318 utf8_data_end_ptr = const_cast<llvm::UTF8 *>( 319 reinterpret_cast<const llvm::UTF8 *>(data_end_ptr)); 320 } 321 322 const bool escape_non_printables = dump_options.GetEscapeNonPrintables(); 323 EscapingHelper escaping_callback; 324 if (escape_non_printables) 325 escaping_callback = 326 GetDefaultEscapingHelper(style, dump_options.GetEscapeStyle()); 327 328 // since we tend to accept partial data (and even partially malformed data) 329 // we might end up with no NULL terminator before the end_ptr hence we need 330 // to take a slower route and ensure we stay within boundaries 331 for (; utf8_data_ptr < utf8_data_end_ptr;) { 332 if (zero_is_terminator && !*utf8_data_ptr) 333 break; 334 335 if (escape_non_printables) { 336 uint8_t *next_data = nullptr; 337 auto printable = 338 escaping_callback(utf8_data_ptr, utf8_data_end_ptr, next_data); 339 auto printable_bytes = printable.GetBytes(); 340 auto printable_size = printable.GetSize(); 341 342 // We failed to figure out how to print this string. 343 if (!printable_bytes || !next_data) 344 return false; 345 346 for (unsigned c = 0; c < printable_size; c++) 347 stream.Printf("%c", *(printable_bytes + c)); 348 utf8_data_ptr = (uint8_t *)next_data; 349 } else { 350 stream.Printf("%c", *utf8_data_ptr); 351 utf8_data_ptr++; 352 } 353 } 354 } 355 if (dump_options.GetQuote() != 0) 356 stream.Printf("%c", dump_options.GetQuote()); 357 if (dump_options.GetSuffixToken() != nullptr) 358 stream.Printf("%s", dump_options.GetSuffixToken()); 359 if (dump_options.GetIsTruncated()) 360 stream.Printf("..."); 361 return true; 362 } 363 364 lldb_private::formatters::StringPrinter::ReadStringAndDumpToStreamOptions:: 365 ReadStringAndDumpToStreamOptions(ValueObject &valobj) 366 : ReadStringAndDumpToStreamOptions() { 367 SetEscapeNonPrintables( 368 valobj.GetTargetSP()->GetDebugger().GetEscapeNonPrintables()); 369 } 370 371 lldb_private::formatters::StringPrinter::ReadBufferAndDumpToStreamOptions:: 372 ReadBufferAndDumpToStreamOptions(ValueObject &valobj) 373 : ReadBufferAndDumpToStreamOptions() { 374 SetEscapeNonPrintables( 375 valobj.GetTargetSP()->GetDebugger().GetEscapeNonPrintables()); 376 } 377 378 lldb_private::formatters::StringPrinter::ReadBufferAndDumpToStreamOptions:: 379 ReadBufferAndDumpToStreamOptions( 380 const ReadStringAndDumpToStreamOptions &options) 381 : ReadBufferAndDumpToStreamOptions() { 382 SetStream(options.GetStream()); 383 SetPrefixToken(options.GetPrefixToken()); 384 SetSuffixToken(options.GetSuffixToken()); 385 SetQuote(options.GetQuote()); 386 SetEscapeNonPrintables(options.GetEscapeNonPrintables()); 387 SetBinaryZeroIsTerminator(options.GetBinaryZeroIsTerminator()); 388 SetEscapeStyle(options.GetEscapeStyle()); 389 } 390 391 namespace lldb_private { 392 393 namespace formatters { 394 395 template <typename SourceDataType> 396 static bool ReadEncodedBufferAndDumpToStream( 397 StringElementType elem_type, 398 const StringPrinter::ReadStringAndDumpToStreamOptions &options, 399 llvm::ConversionResult (*ConvertFunction)(const SourceDataType **, 400 const SourceDataType *, 401 llvm::UTF8 **, llvm::UTF8 *, 402 llvm::ConversionFlags)) { 403 assert(options.GetStream() && "need a Stream to print the string to"); 404 if (!options.GetStream()) 405 return false; 406 407 if (options.GetLocation() == 0 || 408 options.GetLocation() == LLDB_INVALID_ADDRESS) 409 return false; 410 411 lldb::TargetSP target_sp = options.GetTargetSP(); 412 if (!target_sp) 413 return false; 414 415 constexpr int type_width = sizeof(SourceDataType); 416 constexpr int origin_encoding = 8 * type_width; 417 if (origin_encoding != 8 && origin_encoding != 16 && origin_encoding != 32) 418 return false; 419 // If not UTF8 or ASCII, conversion to UTF8 is necessary. 420 if (origin_encoding != 8 && !ConvertFunction) 421 return false; 422 423 bool needs_zero_terminator = options.GetNeedsZeroTermination(); 424 425 bool is_truncated = false; 426 const auto max_size = target_sp->GetMaximumSizeOfStringSummary(); 427 428 uint32_t sourceSize; 429 if (elem_type == StringElementType::ASCII && !options.GetSourceSize()) { 430 // FIXME: The NSString formatter sets HasSourceSize(true) when the size is 431 // actually unknown, as well as SetBinaryZeroIsTerminator(false). IIUC the 432 // C++ formatter also sets SetBinaryZeroIsTerminator(false) when it doesn't 433 // mean to. I don't see how this makes sense: we should fix the formatters. 434 // 435 // Until then, the behavior that's expected for ASCII strings with unknown 436 // lengths is to read up to the max size and then null-terminate. Do that. 437 sourceSize = max_size; 438 needs_zero_terminator = true; 439 } else if (options.HasSourceSize()) { 440 sourceSize = options.GetSourceSize(); 441 if (!options.GetIgnoreMaxLength()) { 442 if (sourceSize > max_size) { 443 sourceSize = max_size; 444 is_truncated = true; 445 } 446 } 447 } else { 448 sourceSize = max_size; 449 needs_zero_terminator = true; 450 } 451 452 const int bufferSPSize = sourceSize * type_width; 453 lldb::WritableDataBufferSP buffer_sp(new DataBufferHeap(bufferSPSize, 0)); 454 455 // Check if we got bytes. We never get any bytes if we have an empty 456 // string, but we still continue so that we end up actually printing 457 // an empty string (""). 458 if (sourceSize != 0 && !buffer_sp->GetBytes()) 459 return false; 460 461 Status error; 462 char *buffer = reinterpret_cast<char *>(buffer_sp->GetBytes()); 463 464 if (elem_type == StringElementType::ASCII) 465 target_sp->ReadCStringFromMemory(options.GetLocation(), buffer, 466 bufferSPSize, error); 467 else if (needs_zero_terminator) 468 target_sp->ReadStringFromMemory(options.GetLocation(), buffer, 469 bufferSPSize, error, type_width); 470 else 471 target_sp->ReadMemory(options.GetLocation(), buffer, bufferSPSize, error); 472 if (error.Fail()) { 473 options.GetStream()->Printf("unable to read data"); 474 return true; 475 } 476 477 StringPrinter::ReadBufferAndDumpToStreamOptions dump_options(options); 478 dump_options.SetData( 479 DataExtractor(buffer_sp, target_sp->GetArchitecture().GetByteOrder(), 480 target_sp->GetArchitecture().GetAddressByteSize())); 481 dump_options.SetSourceSize(sourceSize); 482 dump_options.SetIsTruncated(is_truncated); 483 dump_options.SetNeedsZeroTermination(needs_zero_terminator); 484 if (needs_zero_terminator) 485 dump_options.SetBinaryZeroIsTerminator(true); 486 487 GetPrintableElementType print_style = (elem_type == StringElementType::ASCII) 488 ? GetPrintableElementType::ASCII 489 : GetPrintableElementType::UTF8; 490 return DumpEncodedBufferToStream(print_style, ConvertFunction, dump_options); 491 } 492 493 template <> 494 bool StringPrinter::ReadStringAndDumpToStream<StringElementType::UTF8>( 495 const ReadStringAndDumpToStreamOptions &options) { 496 return ReadEncodedBufferAndDumpToStream<llvm::UTF8>(StringElementType::UTF8, 497 options, nullptr); 498 } 499 500 template <> 501 bool StringPrinter::ReadStringAndDumpToStream<StringElementType::UTF16>( 502 const ReadStringAndDumpToStreamOptions &options) { 503 return ReadEncodedBufferAndDumpToStream<llvm::UTF16>( 504 StringElementType::UTF16, options, llvm::ConvertUTF16toUTF8); 505 } 506 507 template <> 508 bool StringPrinter::ReadStringAndDumpToStream<StringElementType::UTF32>( 509 const ReadStringAndDumpToStreamOptions &options) { 510 return ReadEncodedBufferAndDumpToStream<llvm::UTF32>( 511 StringElementType::UTF32, options, llvm::ConvertUTF32toUTF8); 512 } 513 514 template <> 515 bool StringPrinter::ReadStringAndDumpToStream<StringElementType::ASCII>( 516 const ReadStringAndDumpToStreamOptions &options) { 517 return ReadEncodedBufferAndDumpToStream<char>(StringElementType::ASCII, 518 options, nullptr); 519 } 520 521 template <> 522 bool StringPrinter::ReadBufferAndDumpToStream<StringElementType::UTF8>( 523 const ReadBufferAndDumpToStreamOptions &options) { 524 return DumpEncodedBufferToStream<llvm::UTF8>(GetPrintableElementType::UTF8, 525 nullptr, options); 526 } 527 528 template <> 529 bool StringPrinter::ReadBufferAndDumpToStream<StringElementType::UTF16>( 530 const ReadBufferAndDumpToStreamOptions &options) { 531 return DumpEncodedBufferToStream(GetPrintableElementType::UTF8, 532 llvm::ConvertUTF16toUTF8, options); 533 } 534 535 template <> 536 bool StringPrinter::ReadBufferAndDumpToStream<StringElementType::UTF32>( 537 const ReadBufferAndDumpToStreamOptions &options) { 538 return DumpEncodedBufferToStream(GetPrintableElementType::UTF8, 539 llvm::ConvertUTF32toUTF8, options); 540 } 541 542 template <> 543 bool StringPrinter::ReadBufferAndDumpToStream<StringElementType::ASCII>( 544 const ReadBufferAndDumpToStreamOptions &options) { 545 // Treat ASCII the same as UTF8. 546 // 547 // FIXME: This is probably not the right thing to do (well, it's debatable). 548 // If an ASCII-encoded string happens to contain a sequence of invalid bytes 549 // that forms a valid UTF8 character, we'll print out that character. This is 550 // good if you're playing fast and loose with encodings (probably good for 551 // std::string users), but maybe not so good if you care about your string 552 // formatter respecting the semantics of your selected string encoding. In 553 // the latter case you'd want to see the character byte sequence ('\x..'), not 554 // the UTF8 character itself. 555 return ReadBufferAndDumpToStream<StringElementType::UTF8>(options); 556 } 557 558 } // namespace formatters 559 560 } // namespace lldb_private 561