1 //===-- Utilities to convert integral values to string ----------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // Converts an integer to a string.
10 //
11 // By default, the string is written as decimal to an internal buffer and
12 // accessed via the 'view' method.
13 //
14 // IntegerToString<int> buffer(42);
15 // cpp::string_view view = buffer.view();
16 //
17 // The buffer is allocated on the stack and its size is so that the conversion
18 // always succeeds.
19 //
20 // It is also possible to write the data to a preallocated buffer, but this may
21 // fail.
22 //
23 // char buffer[8];
24 // if (auto maybe_view = IntegerToString<int>::write_to_span(buffer, 42)) {
25 // cpp::string_view view = *maybe_view;
26 // }
27 //
28 // The first template parameter is the type of the integer.
29 // The second template parameter defines how the integer is formatted.
30 // Available default are 'radix::Bin', 'radix::Oct', 'radix::Dec' and
31 // 'radix::Hex'.
32 //
33 // For 'radix::Bin', 'radix::Oct' and 'radix::Hex' the value is always
34 // interpreted as a positive type but 'radix::Dec' will honor negative values.
35 // e.g.,
36 //
37 // IntegerToString<int8_t>(-1) // "-1"
38 // IntegerToString<int8_t, radix::Dec>(-1) // "-1"
39 // IntegerToString<int8_t, radix::Bin>(-1) // "11111111"
40 // IntegerToString<int8_t, radix::Oct>(-1) // "377"
41 // IntegerToString<int8_t, radix::Hex>(-1) // "ff"
42 //
43 // Additionnally, the format can be changed by navigating the subtypes:
44 // - WithPrefix : Adds "0b", "0", "0x" for binary, octal and hexadecimal
45 // - WithWidth<XX> : Pad string to XX characters filling leading digits with 0
46 // - Uppercase : Use uppercase letters (only for HexString)
47 // - WithSign : Prepend '+' for positive values (only for DecString)
48 //
49 // Examples
50 // --------
51 // IntegerToString<int8_t, radix::Dec::WithWidth<2>::WithSign>(0) : "+00"
52 // IntegerToString<int8_t, radix::Dec::WithWidth<2>::WithSign>(-1) : "-01"
53 // IntegerToString<uint8_t, radix::Hex::WithPrefix::Uppercase>(255) : "0xFF"
54 // IntegerToString<uint8_t, radix::Hex::WithWidth<4>::Uppercase>(255) : "00FF"
55 //===----------------------------------------------------------------------===//
56
57 #ifndef LLVM_LIBC_SRC___SUPPORT_INTEGER_TO_STRING_H
58 #define LLVM_LIBC_SRC___SUPPORT_INTEGER_TO_STRING_H
59
60 #include <stdint.h>
61
62 #include "src/__support/CPP/algorithm.h" // max
63 #include "src/__support/CPP/array.h"
64 #include "src/__support/CPP/bit.h"
65 #include "src/__support/CPP/limits.h"
66 #include "src/__support/CPP/optional.h"
67 #include "src/__support/CPP/span.h"
68 #include "src/__support/CPP/string_view.h"
69 #include "src/__support/CPP/type_traits.h"
70 #include "src/__support/big_int.h" // make_integral_or_big_int_unsigned_t
71 #include "src/__support/common.h"
72 #include "src/__support/ctype_utils.h"
73 #include "src/__support/macros/config.h"
74
75 namespace LIBC_NAMESPACE_DECL {
76
77 namespace details {
78
79 template <uint8_t base, bool prefix = false, bool force_sign = false,
80 bool is_uppercase = false, size_t min_digits = 1>
81 struct Fmt {
82 static constexpr uint8_t BASE = base;
83 static constexpr size_t MIN_DIGITS = min_digits;
84 static constexpr bool IS_UPPERCASE = is_uppercase;
85 static constexpr bool PREFIX = prefix;
86 static constexpr char FORCE_SIGN = force_sign;
87
88 using WithPrefix = Fmt<BASE, true, FORCE_SIGN, IS_UPPERCASE, MIN_DIGITS>;
89 using WithSign = Fmt<BASE, PREFIX, true, IS_UPPERCASE, MIN_DIGITS>;
90 using Uppercase = Fmt<BASE, PREFIX, FORCE_SIGN, true, MIN_DIGITS>;
91 template <size_t value>
92 using WithWidth = Fmt<BASE, PREFIX, FORCE_SIGN, IS_UPPERCASE, value>;
93
94 // Invariants
95 static constexpr uint8_t NUMERICAL_DIGITS = 10;
96 static constexpr uint8_t ALPHA_DIGITS = 26;
97 static constexpr uint8_t MAX_DIGIT = NUMERICAL_DIGITS + ALPHA_DIGITS;
98 static_assert(BASE > 1 && BASE <= MAX_DIGIT);
99 static_assert(!IS_UPPERCASE || BASE > 10, "Uppercase is only for radix > 10");
100 static_assert(!FORCE_SIGN || BASE == 10, "WithSign is only for radix == 10");
101 static_assert(!PREFIX || (BASE == 2 || BASE == 8 || BASE == 16),
102 "WithPrefix is only for radix == 2, 8 or 16");
103 };
104
105 // Move this to a separate header since it might be useful elsewhere.
106 template <bool forward> class StringBufferWriterImpl {
107 cpp::span<char> buffer;
108 size_t index = 0;
109 bool out_of_range = false;
110
location()111 LIBC_INLINE size_t location() const {
112 return forward ? index : buffer.size() - 1 - index;
113 }
114
115 public:
116 StringBufferWriterImpl(const StringBufferWriterImpl &) = delete;
StringBufferWriterImpl(cpp::span<char> buffer)117 StringBufferWriterImpl(cpp::span<char> buffer) : buffer(buffer) {}
118
size()119 LIBC_INLINE size_t size() const { return index; }
remainder_size()120 LIBC_INLINE size_t remainder_size() const { return buffer.size() - size(); }
empty()121 LIBC_INLINE bool empty() const { return size() == 0; }
full()122 LIBC_INLINE bool full() const { return size() == buffer.size(); }
ok()123 LIBC_INLINE bool ok() const { return !out_of_range; }
124
push(char c)125 LIBC_INLINE StringBufferWriterImpl &push(char c) {
126 if (ok()) {
127 if (!full()) {
128 buffer[location()] = c;
129 ++index;
130 } else {
131 out_of_range = true;
132 }
133 }
134 return *this;
135 }
136
remainder_span()137 LIBC_INLINE cpp::span<char> remainder_span() const {
138 return forward ? buffer.last(remainder_size())
139 : buffer.first(remainder_size());
140 }
141
buffer_span()142 LIBC_INLINE cpp::span<char> buffer_span() const {
143 return forward ? buffer.first(size()) : buffer.last(size());
144 }
145
buffer_view()146 LIBC_INLINE cpp::string_view buffer_view() const {
147 const auto s = buffer_span();
148 return {s.data(), s.size()};
149 }
150 };
151
152 using StringBufferWriter = StringBufferWriterImpl<true>;
153 using BackwardStringBufferWriter = StringBufferWriterImpl<false>;
154
155 } // namespace details
156
157 namespace radix {
158
159 using Bin = details::Fmt<2>;
160 using Oct = details::Fmt<8>;
161 using Dec = details::Fmt<10>;
162 using Hex = details::Fmt<16>;
163 template <size_t radix> using Custom = details::Fmt<radix>;
164
165 } // namespace radix
166
167 // Extract the low-order decimal digit from a value of integer type T. The
168 // returned value is the digit itself, from 0 to 9. The input value is passed
169 // by reference, and modified by dividing by 10, so that iterating this
170 // function extracts all the digits of the original number one at a time from
171 // low to high.
172 template <typename T>
173 LIBC_INLINE cpp::enable_if_t<cpp::is_integral_v<T>, uint8_t>
extract_decimal_digit(T & value)174 extract_decimal_digit(T &value) {
175 const uint8_t digit(static_cast<uint8_t>(value % 10));
176 // For built-in integer types, we assume that an adequately fast division is
177 // available. If hardware division isn't implemented, then with a divisor
178 // known at compile time the compiler might be able to generate an optimized
179 // sequence instead.
180 value /= 10;
181 return digit;
182 }
183
184 // A specialization of extract_decimal_digit for the BigInt type in big_int.h,
185 // avoiding the use of general-purpose BigInt division which is very slow.
186 template <typename T>
187 LIBC_INLINE cpp::enable_if_t<is_big_int_v<T>, uint8_t>
extract_decimal_digit(T & value)188 extract_decimal_digit(T &value) {
189 // There are two essential ways you can turn n into (n/10,n%10). One is
190 // ordinary integer division. The other is a modular-arithmetic approach in
191 // which you first compute n%10 by bit twiddling, then subtract it off to get
192 // a value that is definitely a multiple of 10. Then you divide that by 10 in
193 // two steps: shift right to divide off a factor of 2, and then divide off a
194 // factor of 5 by multiplying by the modular inverse of 5 mod 2^BITS. (That
195 // last step only works if you know there's no remainder, which is why you
196 // had to subtract off the output digit first.)
197 //
198 // Either approach can be made to work in linear time. This code uses the
199 // modular-arithmetic technique, because the other approach either does a lot
200 // of integer divisions (requiring a fast hardware divider), or else uses a
201 // "multiply by an approximation to the reciprocal" technique which depends
202 // on careful error analysis which might go wrong in an untested edge case.
203
204 using Word = typename T::word_type;
205
206 // Find the remainder (value % 10). We do this by breaking up the input
207 // integer into chunks of size WORD_SIZE/2, so that the sum of them doesn't
208 // overflow a Word. Then we sum all the half-words times 6, except the bottom
209 // one, which is added to that sum without scaling.
210 //
211 // Why 6? Because you can imagine that the original number had the form
212 //
213 // halfwords[0] + K*halfwords[1] + K^2*halfwords[2] + ...
214 //
215 // where K = 2^(WORD_SIZE/2). Since WORD_SIZE is expected to be a multiple of
216 // 8, that makes WORD_SIZE/2 a multiple of 4, so that K is a power of 16. And
217 // all powers of 16 (larger than 1) are congruent to 6 mod 10, by induction:
218 // 16 itself is, and 6^2=36 is also congruent to 6.
219 Word acc_remainder = 0;
220 constexpr Word HALFWORD_BITS = T::WORD_SIZE / 2;
221 constexpr Word HALFWORD_MASK = ((Word(1) << HALFWORD_BITS) - 1);
222 // Sum both halves of all words except the low one.
223 for (size_t i = 1; i < T::WORD_COUNT; i++) {
224 acc_remainder += value.val[i] >> HALFWORD_BITS;
225 acc_remainder += value.val[i] & HALFWORD_MASK;
226 }
227 // Add the high half of the low word. Then we have everything that needs to
228 // be multiplied by 6, so do that.
229 acc_remainder += value.val[0] >> HALFWORD_BITS;
230 acc_remainder *= 6;
231 // Having multiplied it by 6, add the lowest half-word, and then reduce mod
232 // 10 by normal integer division to finish.
233 acc_remainder += value.val[0] & HALFWORD_MASK;
234 uint8_t digit = static_cast<uint8_t>(acc_remainder % 10);
235
236 // Now we have the output digit. Subtract it from the input value, and shift
237 // right to divide by 2.
238 value -= digit;
239 value >>= 1;
240
241 // Now all that's left is to multiply by the inverse of 5 mod 2^BITS. No
242 // matter what the value of BITS, the inverse of 5 has the very convenient
243 // form 0xCCCC...CCCD, with as many C hex digits in the middle as necessary.
244 //
245 // We could construct a second BigInt with all words 0xCCCCCCCCCCCCCCCC,
246 // increment the bottom word, and call a general-purpose multiply function.
247 // But we can do better, by taking advantage of the regularity: we can do
248 // this particular operation in linear time, whereas a general multiplier
249 // would take superlinear time (quadratic in small cases).
250 //
251 // To begin with, instead of computing n*0xCCCC...CCCD, we'll compute
252 // n*0xCCCC...CCCC and then add it to the original n. Then all the words of
253 // the multiplier have the same value 0xCCCCCCCCCCCCCCCC, which I'll just
254 // denote as C. If we also write t = 2^WORD_SIZE, and imagine (as an example)
255 // that the input number has three words x,y,z with x being the low word,
256 // then we're computing
257 //
258 // (x + y t + z t^2) * (C + C t + C t^2)
259 //
260 // = x C + y C t + z C t^2
261 // + x C t + y C t^2 + z C t^3
262 // + x C t^2 + y C t^3 + z C t^4
263 //
264 // but we're working mod t^3, so the high-order terms vanish and this becomes
265 //
266 // x C + y C t + z C t^2
267 // + x C t + y C t^2
268 // + x C t^2
269 //
270 // = x C + (x+y) C t + (x+y+z) C t^2
271 //
272 // So all you have to do is to work from the low word of the integer upwards,
273 // accumulating C times the sum of all the words you've seen so far to get
274 // x*C, (x+y)*C, (x+y+z)*C and so on. In each step you add another product to
275 // the accumulator, and add the accumulator to the corresponding word of the
276 // original number (so that we end up with value*CCCD, not just value*CCCC).
277 //
278 // If you do that literally, then your accumulator has to be three words
279 // wide, because the sum of words can overflow into a second word, and
280 // multiplying by C adds another word. But we can do slightly better by
281 // breaking each product word*C up into a bottom half and a top half. If we
282 // write x*C = xl + xh*t, and similarly for y and z, then our sum becomes
283 //
284 // (xl + xh t) + (yl + yh t) t + (zl + zh t) t^2
285 // + (xl + xh t) t + (yl + yh t) t^2
286 // + (xl + xh t) t^2
287 //
288 // and if you expand out again, collect terms, and discard t^3 terms, you get
289 //
290 // (xl)
291 // + (xl + xh + yl) t
292 // + (xl + xh + yl + yh + zl) t^2
293 //
294 // in which each coefficient is the sum of all the low words of the products
295 // up to _and including_ the current word, plus all the high words up to but
296 // _not_ including the current word. So now you only have to retain two words
297 // of sum instead of three.
298 //
299 // We do this entire procedure in a single in-place pass over the input
300 // number, reading each word to make its product with C and then adding the
301 // low word of the accumulator to it.
302 constexpr Word C = Word(-1) / 5 * 4; // calculate 0xCCCC as 4/5 of 0xFFFF
303 Word acc_lo = 0, acc_hi = 0; // accumulator of all the half-products so far
304 Word carry_bit, carry_word = 0;
305
306 for (size_t i = 0; i < T::WORD_COUNT; i++) {
307 // Make the two-word product of C with the current input word.
308 multiword::DoubleWide<Word> product = multiword::mul2(C, value.val[i]);
309
310 // Add the low half of the product to our accumulator, but not yet the high
311 // half.
312 acc_lo = add_with_carry<Word>(acc_lo, product[0], 0, carry_bit);
313 acc_hi += carry_bit;
314
315 // Now the accumulator contains exactly the value we need to add to the
316 // current input word. Add it, plus any carries from lower words, and make
317 // a new word of carry data to propagate into the next iteration.
318 value.val[i] = add_with_carry<Word>(value.val[i], carry_word, 0, carry_bit);
319 carry_word = acc_hi + carry_bit;
320 value.val[i] = add_with_carry<Word>(value.val[i], acc_lo, 0, carry_bit);
321 carry_word += carry_bit;
322
323 // Now add the high half of the current product to our accumulator.
324 acc_lo = add_with_carry<Word>(acc_lo, product[1], 0, carry_bit);
325 acc_hi += carry_bit;
326 }
327
328 return digit;
329 }
330
331 // See file header for documentation.
332 template <typename T, typename Fmt = radix::Dec> class IntegerToString {
333 static_assert(cpp::is_integral_v<T> || is_big_int_v<T>);
334
compute_buffer_size()335 LIBC_INLINE static constexpr size_t compute_buffer_size() {
336 constexpr auto MAX_DIGITS = []() -> size_t {
337 // We size the string buffer for base 10 using an approximation algorithm:
338 //
339 // size = ceil(sizeof(T) * 5 / 2)
340 //
341 // If sizeof(T) is 1, then size is 3 (actually need 3)
342 // If sizeof(T) is 2, then size is 5 (actually need 5)
343 // If sizeof(T) is 4, then size is 10 (actually need 10)
344 // If sizeof(T) is 8, then size is 20 (actually need 20)
345 // If sizeof(T) is 16, then size is 40 (actually need 39)
346 //
347 // NOTE: The ceil operation is actually implemented as
348 // floor(((sizeof(T) * 5) + 1) / 2)
349 // where floor operation is just integer division.
350 //
351 // This estimation grows slightly faster than the actual value, but the
352 // overhead is small enough to tolerate.
353 if constexpr (Fmt::BASE == 10)
354 return ((sizeof(T) * 5) + 1) / 2;
355 // For other bases, we approximate by rounding down to the nearest power
356 // of two base, since the space needed is easy to calculate and it won't
357 // overestimate by too much.
358 constexpr auto FLOOR_LOG_2 = [](size_t num) -> size_t {
359 size_t i = 0;
360 for (; num > 1; num /= 2)
361 ++i;
362 return i;
363 };
364 constexpr size_t BITS_PER_DIGIT = FLOOR_LOG_2(Fmt::BASE);
365 return ((sizeof(T) * 8 + (BITS_PER_DIGIT - 1)) / BITS_PER_DIGIT);
366 };
367 constexpr size_t DIGIT_SIZE = cpp::max(MAX_DIGITS(), Fmt::MIN_DIGITS);
368 constexpr size_t SIGN_SIZE = Fmt::BASE == 10 ? 1 : 0;
369 constexpr size_t PREFIX_SIZE = Fmt::PREFIX ? 2 : 0;
370 return DIGIT_SIZE + SIGN_SIZE + PREFIX_SIZE;
371 }
372
373 static constexpr size_t BUFFER_SIZE = compute_buffer_size();
374 static_assert(BUFFER_SIZE > 0);
375
376 // An internal stateless structure that handles the number formatting logic.
377 struct IntegerWriter {
378 static_assert(cpp::is_integral_v<T> || is_big_int_v<T>);
379 using UNSIGNED_T = make_integral_or_big_int_unsigned_t<T>;
380
digit_charIntegerWriter381 LIBC_INLINE static char digit_char(uint8_t digit) {
382 const int result = internal::int_to_b36_char(digit);
383 return static_cast<char>(Fmt::IS_UPPERCASE ? internal::toupper(result)
384 : result);
385 }
386
387 LIBC_INLINE static void
write_unsigned_numberIntegerWriter388 write_unsigned_number(UNSIGNED_T value,
389 details::BackwardStringBufferWriter &sink) {
390 for (; sink.ok() && value != 0; value /= Fmt::BASE) {
391 const uint8_t digit(static_cast<uint8_t>(value % Fmt::BASE));
392 sink.push(digit_char(digit));
393 }
394 }
395
396 LIBC_INLINE static void
write_unsigned_number_decIntegerWriter397 write_unsigned_number_dec(UNSIGNED_T value,
398 details::BackwardStringBufferWriter &sink) {
399 while (sink.ok() && value != 0) {
400 const uint8_t digit = extract_decimal_digit(value);
401 sink.push(digit_char(digit));
402 }
403 }
404
405 // Returns the absolute value of 'value' as 'UNSIGNED_T'.
absIntegerWriter406 LIBC_INLINE static UNSIGNED_T abs(T value) {
407 if (cpp::is_unsigned_v<T> || value >= 0)
408 return static_cast<UNSIGNED_T>(value); // already of the right sign.
409
410 // Signed integers are asymmetric (e.g., int8_t ∈ [-128, 127]).
411 // Thus negating the type's minimum value would overflow.
412 // From C++20 on, signed types are guaranteed to be represented as 2's
413 // complement. We take advantage of this representation and negate the
414 // value by using the exact same bit representation, e.g.,
415 // binary : 0b1000'0000
416 // int8_t : -128
417 // uint8_t: 128
418
419 // Note: the compiler can completely optimize out the two branches and
420 // replace them by a simple negate instruction.
421 // https://godbolt.org/z/hE7zahT9W
422 if (value == cpp::numeric_limits<T>::min()) {
423 return cpp::bit_cast<UNSIGNED_T>(value);
424 } else {
425 return static_cast<UNSIGNED_T>(
426 -value); // legal and representable both as T and UNSIGNED_T.`
427 }
428 }
429
writeIntegerWriter430 LIBC_INLINE static void write(T value,
431 details::BackwardStringBufferWriter &sink) {
432 if constexpr (Fmt::BASE == 10) {
433 write_unsigned_number_dec(abs(value), sink);
434 } else {
435 write_unsigned_number(static_cast<UNSIGNED_T>(value), sink);
436 }
437 // width
438 while (sink.ok() && sink.size() < Fmt::MIN_DIGITS)
439 sink.push('0');
440 // sign
441 if constexpr (Fmt::BASE == 10) {
442 if (value < 0)
443 sink.push('-');
444 else if (Fmt::FORCE_SIGN)
445 sink.push('+');
446 }
447 // prefix
448 if constexpr (Fmt::PREFIX) {
449 if constexpr (Fmt::BASE == 2) {
450 sink.push('b');
451 sink.push('0');
452 }
453 if constexpr (Fmt::BASE == 16) {
454 sink.push('x');
455 sink.push('0');
456 }
457 if constexpr (Fmt::BASE == 8) {
458 const cpp::string_view written = sink.buffer_view();
459 if (written.empty() || written.front() != '0')
460 sink.push('0');
461 }
462 }
463 }
464 };
465
466 cpp::array<char, BUFFER_SIZE> array;
467 size_t written = 0;
468
469 public:
470 IntegerToString(const IntegerToString &) = delete;
IntegerToString(T value)471 IntegerToString(T value) {
472 details::BackwardStringBufferWriter writer(array);
473 IntegerWriter::write(value, writer);
474 written = writer.size();
475 }
476
477 [[nodiscard]] LIBC_INLINE static cpp::optional<cpp::string_view>
format_to(cpp::span<char> buffer,T value)478 format_to(cpp::span<char> buffer, T value) {
479 details::BackwardStringBufferWriter writer(buffer);
480 IntegerWriter::write(value, writer);
481 if (writer.ok())
482 return cpp::string_view(buffer.data() + buffer.size() - writer.size(),
483 writer.size());
484 return cpp::nullopt;
485 }
486
buffer_size()487 LIBC_INLINE static constexpr size_t buffer_size() { return BUFFER_SIZE; }
488
size()489 LIBC_INLINE size_t size() const { return written; }
490 LIBC_INLINE cpp::string_view view() && = delete;
view()491 LIBC_INLINE cpp::string_view view() const & {
492 return cpp::string_view(array.data() + array.size() - size(), size());
493 }
494 };
495
496 } // namespace LIBC_NAMESPACE_DECL
497
498 #endif // LLVM_LIBC_SRC___SUPPORT_INTEGER_TO_STRING_H
499