1 //===--- LiteralSupport.h ---------------------------------------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file defines the NumericLiteralParser, CharLiteralParser, and 10 // StringLiteralParser interfaces. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #ifndef LLVM_CLANG_LEX_LITERALSUPPORT_H 15 #define LLVM_CLANG_LEX_LITERALSUPPORT_H 16 17 #include "clang/Basic/CharInfo.h" 18 #include "clang/Basic/LLVM.h" 19 #include "clang/Basic/TokenKinds.h" 20 #include "llvm/ADT/APFloat.h" 21 #include "llvm/ADT/ArrayRef.h" 22 #include "llvm/ADT/SmallString.h" 23 #include "llvm/ADT/StringRef.h" 24 #include "llvm/Support/DataTypes.h" 25 26 namespace clang { 27 28 class DiagnosticsEngine; 29 class Preprocessor; 30 class Token; 31 class SourceLocation; 32 class TargetInfo; 33 class SourceManager; 34 class LangOptions; 35 36 /// Copy characters from Input to Buf, expanding any UCNs. 37 void expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input); 38 39 /// Return true if the token corresponds to a function local predefined macro, 40 /// which expands to a string literal, that can be concatenated with other 41 /// string literals (only in Microsoft mode). 42 bool isFunctionLocalStringLiteralMacro(tok::TokenKind K, const LangOptions &LO); 43 44 /// Return true if the token is a string literal, or a function local 45 /// predefined macro, which expands to a string literal. 46 bool tokenIsLikeStringLiteral(const Token &Tok, const LangOptions &LO); 47 48 /// NumericLiteralParser - This performs strict semantic analysis of the content 49 /// of a ppnumber, classifying it as either integer, floating, or erroneous, 50 /// determines the radix of the value and can convert it to a useful value. 51 class NumericLiteralParser { 52 const SourceManager &SM; 53 const LangOptions &LangOpts; 54 DiagnosticsEngine &Diags; 55 56 const char *const ThisTokBegin; 57 const char *const ThisTokEnd; 58 const char *DigitsBegin, *SuffixBegin; // markers 59 const char *s; // cursor 60 61 unsigned radix; 62 63 bool saw_exponent, saw_period, saw_ud_suffix, saw_fixed_point_suffix; 64 65 SmallString<32> UDSuffixBuf; 66 67 public: 68 NumericLiteralParser(StringRef TokSpelling, SourceLocation TokLoc, 69 const SourceManager &SM, const LangOptions &LangOpts, 70 const TargetInfo &Target, DiagnosticsEngine &Diags); 71 bool hadError : 1; 72 bool isUnsigned : 1; 73 bool isLong : 1; // This is *not* set for long long. 74 bool isLongLong : 1; 75 bool isSizeT : 1; // 1z, 1uz (C++23) 76 bool isHalf : 1; // 1.0h 77 bool isFloat : 1; // 1.0f 78 bool isImaginary : 1; // 1.0i 79 bool isFloat16 : 1; // 1.0f16 80 bool isFloat128 : 1; // 1.0q 81 bool isFract : 1; // 1.0hr/r/lr/uhr/ur/ulr 82 bool isAccum : 1; // 1.0hk/k/lk/uhk/uk/ulk 83 bool isBitInt : 1; // 1wb, 1uwb (C23) or 1__wb, 1__uwb (Clang extension in C++ 84 // mode) 85 uint8_t MicrosoftInteger; // Microsoft suffix extension i8, i16, i32, or i64. 86 87 isFixedPointLiteral()88 bool isFixedPointLiteral() const { 89 return (saw_period || saw_exponent) && saw_fixed_point_suffix; 90 } 91 isIntegerLiteral()92 bool isIntegerLiteral() const { 93 return !saw_period && !saw_exponent && !isFixedPointLiteral(); 94 } isFloatingLiteral()95 bool isFloatingLiteral() const { 96 return (saw_period || saw_exponent) && !isFixedPointLiteral(); 97 } 98 hasUDSuffix()99 bool hasUDSuffix() const { 100 return saw_ud_suffix; 101 } getUDSuffix()102 StringRef getUDSuffix() const { 103 assert(saw_ud_suffix); 104 return UDSuffixBuf; 105 } getUDSuffixOffset()106 unsigned getUDSuffixOffset() const { 107 assert(saw_ud_suffix); 108 return SuffixBegin - ThisTokBegin; 109 } 110 111 static bool isValidUDSuffix(const LangOptions &LangOpts, StringRef Suffix); 112 getRadix()113 unsigned getRadix() const { return radix; } 114 115 /// GetIntegerValue - Convert this numeric literal value to an APInt that 116 /// matches Val's input width. If there is an overflow (i.e., if the unsigned 117 /// value read is larger than the APInt's bits will hold), set Val to the low 118 /// bits of the result and return true. Otherwise, return false. 119 bool GetIntegerValue(llvm::APInt &Val); 120 121 /// Convert this numeric literal to a floating value, using the specified 122 /// APFloat fltSemantics (specifying float, double, etc) and rounding mode. 123 llvm::APFloat::opStatus GetFloatValue(llvm::APFloat &Result, 124 llvm::RoundingMode RM); 125 126 /// GetFixedPointValue - Convert this numeric literal value into a 127 /// scaled integer that represents this value. Returns true if an overflow 128 /// occurred when calculating the integral part of the scaled integer or 129 /// calculating the digit sequence of the exponent. 130 bool GetFixedPointValue(llvm::APInt &StoreVal, unsigned Scale); 131 132 /// Get the digits that comprise the literal. This excludes any prefix or 133 /// suffix associated with the literal. getLiteralDigits()134 StringRef getLiteralDigits() const { 135 assert(!hadError && "cannot reliably get the literal digits with an error"); 136 return StringRef(DigitsBegin, SuffixBegin - DigitsBegin); 137 } 138 139 private: 140 141 void ParseNumberStartingWithZero(SourceLocation TokLoc); 142 void ParseDecimalOrOctalCommon(SourceLocation TokLoc); 143 isDigitSeparator(char C)144 static bool isDigitSeparator(char C) { return C == '\''; } 145 146 /// Determine whether the sequence of characters [Start, End) contains 147 /// any real digits (not digit separators). containsDigits(const char * Start,const char * End)148 bool containsDigits(const char *Start, const char *End) { 149 return Start != End && (Start + 1 != End || !isDigitSeparator(Start[0])); 150 } 151 152 enum CheckSeparatorKind { CSK_BeforeDigits, CSK_AfterDigits }; 153 154 /// Ensure that we don't have a digit separator here. 155 void checkSeparator(SourceLocation TokLoc, const char *Pos, 156 CheckSeparatorKind IsAfterDigits); 157 158 /// SkipHexDigits - Read and skip over any hex digits, up to End. 159 /// Return a pointer to the first non-hex digit or End. SkipHexDigits(const char * ptr)160 const char *SkipHexDigits(const char *ptr) { 161 while (ptr != ThisTokEnd && (isHexDigit(*ptr) || isDigitSeparator(*ptr))) 162 ptr++; 163 return ptr; 164 } 165 166 /// SkipOctalDigits - Read and skip over any octal digits, up to End. 167 /// Return a pointer to the first non-hex digit or End. SkipOctalDigits(const char * ptr)168 const char *SkipOctalDigits(const char *ptr) { 169 while (ptr != ThisTokEnd && 170 ((*ptr >= '0' && *ptr <= '7') || isDigitSeparator(*ptr))) 171 ptr++; 172 return ptr; 173 } 174 175 /// SkipDigits - Read and skip over any digits, up to End. 176 /// Return a pointer to the first non-hex digit or End. SkipDigits(const char * ptr)177 const char *SkipDigits(const char *ptr) { 178 while (ptr != ThisTokEnd && (isDigit(*ptr) || isDigitSeparator(*ptr))) 179 ptr++; 180 return ptr; 181 } 182 183 /// SkipBinaryDigits - Read and skip over any binary digits, up to End. 184 /// Return a pointer to the first non-binary digit or End. SkipBinaryDigits(const char * ptr)185 const char *SkipBinaryDigits(const char *ptr) { 186 while (ptr != ThisTokEnd && 187 (*ptr == '0' || *ptr == '1' || isDigitSeparator(*ptr))) 188 ptr++; 189 return ptr; 190 } 191 192 }; 193 194 /// CharLiteralParser - Perform interpretation and semantic analysis of a 195 /// character literal. 196 class CharLiteralParser { 197 uint64_t Value; 198 tok::TokenKind Kind; 199 bool IsMultiChar; 200 bool HadError; 201 SmallString<32> UDSuffixBuf; 202 unsigned UDSuffixOffset; 203 public: 204 CharLiteralParser(const char *begin, const char *end, 205 SourceLocation Loc, Preprocessor &PP, 206 tok::TokenKind kind); 207 hadError()208 bool hadError() const { return HadError; } isOrdinary()209 bool isOrdinary() const { return Kind == tok::char_constant; } isWide()210 bool isWide() const { return Kind == tok::wide_char_constant; } isUTF8()211 bool isUTF8() const { return Kind == tok::utf8_char_constant; } isUTF16()212 bool isUTF16() const { return Kind == tok::utf16_char_constant; } isUTF32()213 bool isUTF32() const { return Kind == tok::utf32_char_constant; } isMultiChar()214 bool isMultiChar() const { return IsMultiChar; } getValue()215 uint64_t getValue() const { return Value; } getUDSuffix()216 StringRef getUDSuffix() const { return UDSuffixBuf; } getUDSuffixOffset()217 unsigned getUDSuffixOffset() const { 218 assert(!UDSuffixBuf.empty() && "no ud-suffix"); 219 return UDSuffixOffset; 220 } 221 }; 222 223 enum class StringLiteralEvalMethod { 224 Evaluated, 225 Unevaluated, 226 }; 227 228 /// StringLiteralParser - This decodes string escape characters and performs 229 /// wide string analysis and Translation Phase #6 (concatenation of string 230 /// literals) (C99 5.1.1.2p1). 231 class StringLiteralParser { 232 const SourceManager &SM; 233 const LangOptions &Features; 234 const TargetInfo &Target; 235 DiagnosticsEngine *Diags; 236 237 unsigned MaxTokenLength; 238 unsigned SizeBound; 239 unsigned CharByteWidth; 240 tok::TokenKind Kind; 241 SmallString<512> ResultBuf; 242 char *ResultPtr; // cursor 243 SmallString<32> UDSuffixBuf; 244 unsigned UDSuffixToken; 245 unsigned UDSuffixOffset; 246 StringLiteralEvalMethod EvalMethod; 247 248 public: 249 StringLiteralParser(ArrayRef<Token> StringToks, Preprocessor &PP, 250 StringLiteralEvalMethod StringMethod = 251 StringLiteralEvalMethod::Evaluated); 252 StringLiteralParser(ArrayRef<Token> StringToks, const SourceManager &sm, 253 const LangOptions &features, const TargetInfo &target, 254 DiagnosticsEngine *diags = nullptr) SM(sm)255 : SM(sm), Features(features), Target(target), Diags(diags), 256 MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown), 257 ResultPtr(ResultBuf.data()), 258 EvalMethod(StringLiteralEvalMethod::Evaluated), hadError(false), 259 Pascal(false) { 260 init(StringToks); 261 } 262 263 bool hadError; 264 bool Pascal; 265 GetString()266 StringRef GetString() const { 267 return StringRef(ResultBuf.data(), GetStringLength()); 268 } GetStringLength()269 unsigned GetStringLength() const { return ResultPtr-ResultBuf.data(); } 270 GetNumStringChars()271 unsigned GetNumStringChars() const { 272 return GetStringLength() / CharByteWidth; 273 } 274 /// getOffsetOfStringByte - This function returns the offset of the 275 /// specified byte of the string data represented by Token. This handles 276 /// advancing over escape sequences in the string. 277 /// 278 /// If the Diagnostics pointer is non-null, then this will do semantic 279 /// checking of the string literal and emit errors and warnings. 280 unsigned getOffsetOfStringByte(const Token &TheTok, unsigned ByteNo) const; 281 isOrdinary()282 bool isOrdinary() const { return Kind == tok::string_literal; } isWide()283 bool isWide() const { return Kind == tok::wide_string_literal; } isUTF8()284 bool isUTF8() const { return Kind == tok::utf8_string_literal; } isUTF16()285 bool isUTF16() const { return Kind == tok::utf16_string_literal; } isUTF32()286 bool isUTF32() const { return Kind == tok::utf32_string_literal; } isPascal()287 bool isPascal() const { return Pascal; } isUnevaluated()288 bool isUnevaluated() const { 289 return EvalMethod == StringLiteralEvalMethod::Unevaluated; 290 } 291 getUDSuffix()292 StringRef getUDSuffix() const { return UDSuffixBuf; } 293 294 /// Get the index of a token containing a ud-suffix. getUDSuffixToken()295 unsigned getUDSuffixToken() const { 296 assert(!UDSuffixBuf.empty() && "no ud-suffix"); 297 return UDSuffixToken; 298 } 299 /// Get the spelling offset of the first byte of the ud-suffix. getUDSuffixOffset()300 unsigned getUDSuffixOffset() const { 301 assert(!UDSuffixBuf.empty() && "no ud-suffix"); 302 return UDSuffixOffset; 303 } 304 305 static bool isValidUDSuffix(const LangOptions &LangOpts, StringRef Suffix); 306 307 private: 308 void init(ArrayRef<Token> StringToks); 309 bool CopyStringFragment(const Token &Tok, const char *TokBegin, 310 StringRef Fragment); 311 void DiagnoseLexingError(SourceLocation Loc); 312 }; 313 314 } // end namespace clang 315 316 #endif 317