xref: /freebsd/contrib/llvm-project/clang/lib/AST/CommentLexer.cpp (revision 700637cbb5e582861067a11aaca4d053546871d2)
1 //===--- CommentLexer.cpp -------------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "clang/AST/CommentLexer.h"
10 #include "clang/AST/CommentCommandTraits.h"
11 #include "clang/Basic/CharInfo.h"
12 #include "clang/Basic/DiagnosticComment.h"
13 #include "llvm/ADT/StringExtras.h"
14 #include "llvm/ADT/StringSwitch.h"
15 #include "llvm/Support/ConvertUTF.h"
16 #include "llvm/Support/ErrorHandling.h"
17 
18 namespace clang {
19 namespace comments {
20 
dump(const Lexer & L,const SourceManager & SM) const21 void Token::dump(const Lexer &L, const SourceManager &SM) const {
22   llvm::errs() << "comments::Token Kind=" << Kind << " ";
23   Loc.print(llvm::errs(), SM);
24   llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
25 }
26 
/// Returns true if \p C may appear in the name part of a named HTML character
/// reference (the text between '&' and ';'); this is exactly what isLetter()
/// accepts.
static inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
  return isLetter(C);
}
30 
/// Returns true if \p C may appear in the numeric part of a decimal HTML
/// character reference ("&#NNN;"); this is exactly what isDigit() accepts.
static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
  return isDigit(C);
}
34 
/// Returns true if \p C may appear in the numeric part of a hexadecimal HTML
/// character reference ("&#xHHH;"); this is exactly what isHexDigit() accepts.
static inline bool isHTMLHexCharacterReferenceCharacter(char C) {
  return isHexDigit(C);
}
38 
convertCodePointToUTF8(llvm::BumpPtrAllocator & Allocator,unsigned CodePoint)39 static inline StringRef convertCodePointToUTF8(
40                                       llvm::BumpPtrAllocator &Allocator,
41                                       unsigned CodePoint) {
42   char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
43   char *ResolvedPtr = Resolved;
44   if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
45     return StringRef(Resolved, ResolvedPtr - Resolved);
46   else
47     return StringRef();
48 }
49 
50 namespace {
51 
52 #include "clang/AST/CommentHTMLTags.inc"
53 #include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
54 
55 } // end anonymous namespace
56 
resolveHTMLNamedCharacterReference(StringRef Name) const57 StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
58   // Fast path, first check a few most widely used named character references.
59   return llvm::StringSwitch<StringRef>(Name)
60       .Case("amp", "&")
61       .Case("lt", "<")
62       .Case("gt", ">")
63       .Case("quot", "\"")
64       .Case("apos", "\'")
65       // Slow path.
66       .Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
67 }
68 
resolveHTMLDecimalCharacterReference(StringRef Name) const69 StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
70   unsigned CodePoint = 0;
71   for (unsigned i = 0, e = Name.size(); i != e; ++i) {
72     assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
73     CodePoint *= 10;
74     CodePoint += Name[i] - '0';
75   }
76   return convertCodePointToUTF8(Allocator, CodePoint);
77 }
78 
resolveHTMLHexCharacterReference(StringRef Name) const79 StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
80   unsigned CodePoint = 0;
81   for (unsigned i = 0, e = Name.size(); i != e; ++i) {
82     CodePoint *= 16;
83     const char C = Name[i];
84     assert(isHTMLHexCharacterReferenceCharacter(C));
85     CodePoint += llvm::hexDigitValue(C);
86   }
87   return convertCodePointToUTF8(Allocator, CodePoint);
88 }
89 
skipLineStartingDecorations()90 void Lexer::skipLineStartingDecorations() {
91   // This function should be called only for C comments
92   assert(CommentState == LCS_InsideCComment);
93 
94   if (BufferPtr == CommentEnd)
95     return;
96 
97   const char *NewBufferPtr = BufferPtr;
98   while (isHorizontalWhitespace(*NewBufferPtr))
99     if (++NewBufferPtr == CommentEnd)
100       return;
101   if (*NewBufferPtr == '*')
102     BufferPtr = NewBufferPtr + 1;
103 }
104 
105 namespace {
106 /// Returns pointer to the first newline character in the string.
findNewline(const char * BufferPtr,const char * BufferEnd)107 const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
108   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
109     if (isVerticalWhitespace(*BufferPtr))
110       return BufferPtr;
111   }
112   return BufferEnd;
113 }
114 
/// Steps over one line terminator ("\n", "\r" or "\r\n") starting at
/// \p BufferPtr.
///
/// \returns the position just past the terminator, or \p BufferPtr unchanged
/// when it already equals \p BufferEnd.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  const char First = *BufferPtr++;
  if (First == '\n')
    return BufferPtr;

  // Anything other than LF must be CR, optionally followed by LF.
  assert(First == '\r');
  if (BufferPtr != BufferEnd && *BufferPtr == '\n')
    ++BufferPtr;
  return BufferPtr;
}
129 
skipNamedCharacterReference(const char * BufferPtr,const char * BufferEnd)130 const char *skipNamedCharacterReference(const char *BufferPtr,
131                                         const char *BufferEnd) {
132   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
133     if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
134       return BufferPtr;
135   }
136   return BufferEnd;
137 }
138 
skipDecimalCharacterReference(const char * BufferPtr,const char * BufferEnd)139 const char *skipDecimalCharacterReference(const char *BufferPtr,
140                                           const char *BufferEnd) {
141   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
142     if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
143       return BufferPtr;
144   }
145   return BufferEnd;
146 }
147 
skipHexCharacterReference(const char * BufferPtr,const char * BufferEnd)148 const char *skipHexCharacterReference(const char *BufferPtr,
149                                       const char *BufferEnd) {
150   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
151     if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
152       return BufferPtr;
153   }
154   return BufferEnd;
155 }
156 
/// Returns true if \p C can begin an HTML identifier (tag or attribute name);
/// this is exactly what isLetter() accepts.
bool isHTMLIdentifierStartingCharacter(char C) {
  return isLetter(C);
}
160 
/// Returns true if \p C can appear inside an HTML identifier; this is exactly
/// what isAlphanumeric() accepts.
bool isHTMLIdentifierCharacter(char C) {
  return isAlphanumeric(C);
}
164 
skipHTMLIdentifier(const char * BufferPtr,const char * BufferEnd)165 const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
166   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
167     if (!isHTMLIdentifierCharacter(*BufferPtr))
168       return BufferPtr;
169   }
170   return BufferEnd;
171 }
172 
/// Scans an HTML attribute value enclosed in single or double quotes.
/// \p BufferPtr must point at the opening quote.  A quote character directly
/// preceded by a backslash does not terminate the string.
///
/// \returns a pointer to the closing quote, or \p BufferEnd if the string is
/// unterminated.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char *const Open = BufferPtr;
  const char Quote = *Open;
  assert(Quote == '\"' || Quote == '\'');

  for (const char *Cur = Open + 1; Cur != BufferEnd; ++Cur) {
    if (*Cur != Quote)
      continue;
    // A backslash right before the quote escapes it.
    if (Cur[-1] == '\\')
      continue;
    return Cur;
  }
  return BufferEnd;
}
190 
skipWhitespace(const char * BufferPtr,const char * BufferEnd)191 const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
192   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
193     if (!isWhitespace(*BufferPtr))
194       return BufferPtr;
195   }
196   return BufferEnd;
197 }
198 
skipHorizontalWhitespace(const char * BufferPtr,const char * BufferEnd)199 const char *skipHorizontalWhitespace(const char *BufferPtr,
200                                      const char *BufferEnd) {
201   for (; BufferPtr != BufferEnd; ++BufferPtr) {
202     if (!isHorizontalWhitespace(*BufferPtr))
203       return BufferPtr;
204   }
205   return BufferEnd;
206 }
207 
/// Returns true if every character in [BufferPtr, BufferEnd) is whitespace,
/// i.e. skipWhitespace() runs all the way to \p BufferEnd.  An empty range
/// therefore counts as whitespace.
bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
  return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
}
211 
/// Returns true if \p C can begin a command name (the text following '\\' or
/// '@'); this is exactly what isLetter() accepts.
bool isCommandNameStartCharacter(char C) {
  return isLetter(C);
}
215 
/// Returns true if \p C can appear inside a command name; this is exactly what
/// isAlphanumeric() accepts.
bool isCommandNameCharacter(char C) {
  return isAlphanumeric(C);
}
219 
skipCommandName(const char * BufferPtr,const char * BufferEnd)220 const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
221   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
222     if (!isCommandNameCharacter(*BufferPtr))
223       return BufferPtr;
224   }
225   return BufferEnd;
226 }
227 
228 /// Return the one past end pointer for BCPL comments.
229 /// Handles newlines escaped with backslash or trigraph for backslahs.
findBCPLCommentEnd(const char * BufferPtr,const char * BufferEnd)230 const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
231   const char *CurPtr = BufferPtr;
232   while (CurPtr != BufferEnd) {
233     while (!isVerticalWhitespace(*CurPtr)) {
234       CurPtr++;
235       if (CurPtr == BufferEnd)
236         return BufferEnd;
237     }
238     // We found a newline, check if it is escaped.
239     const char *EscapePtr = CurPtr - 1;
240     while(isHorizontalWhitespace(*EscapePtr))
241       EscapePtr--;
242 
243     if (*EscapePtr == '\\' ||
244         (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
245          EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
246       // We found an escaped newline.
247       CurPtr = skipNewline(CurPtr, BufferEnd);
248     } else
249       return CurPtr; // Not an escaped newline.
250   }
251   return BufferEnd;
252 }
253 
254 /// Return the one past end pointer for C comments.
255 /// Very dumb, does not handle escaped newlines or trigraphs.
findCCommentEnd(const char * BufferPtr,const char * BufferEnd)256 const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
257   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
258     if (*BufferPtr == '*') {
259       assert(BufferPtr + 1 != BufferEnd);
260       if (*(BufferPtr + 1) == '/')
261         return BufferPtr;
262     }
263   }
264   llvm_unreachable("buffer end hit before '*/' was seen");
265 }
266 
267 } // end anonymous namespace
268 
formTokenWithChars(Token & Result,const char * TokEnd,tok::TokenKind Kind)269 void Lexer::formTokenWithChars(Token &Result, const char *TokEnd,
270                                tok::TokenKind Kind) {
271   const unsigned TokLen = TokEnd - BufferPtr;
272   Result.setLocation(getSourceLocation(BufferPtr));
273   Result.setKind(Kind);
274   Result.setLength(TokLen);
275 #ifndef NDEBUG
276   Result.TextPtr = "<UNSET>";
277   Result.IntVal = 7;
278 #endif
279   BufferPtr = TokEnd;
280 }
281 
skipTextToken()282 const char *Lexer::skipTextToken() {
283   const char *TokenPtr = BufferPtr;
284   assert(TokenPtr < CommentEnd);
285   StringRef TokStartSymbols = ParseCommands ? "\n\r\\@\"&<" : "\n\r";
286 
287 again:
288   size_t End =
289       StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of(TokStartSymbols);
290   if (End == StringRef::npos)
291     return CommentEnd;
292 
293   // Doxygen doesn't recognize any commands in a one-line double quotation.
294   // If we don't find an ending quotation mark, we pretend it never began.
295   if (*(TokenPtr + End) == '\"') {
296     TokenPtr += End + 1;
297     End = StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of("\n\r\"");
298     if (End != StringRef::npos && *(TokenPtr + End) == '\"')
299       TokenPtr += End + 1;
300     goto again;
301   }
302   return TokenPtr + End;
303 }
304 
/// Lexes one token from inside a comment body (BufferPtr must be before
/// CommentEnd).
///
/// When ParseCommands is false only text and newline tokens are produced.
/// Otherwise the current State selects a verbatim/HTML sub-lexer, and in
/// LS_Normal the first character decides between a command or escape sequence
/// ('\\' or '@'), an HTML character reference ('&'), an HTML tag ('<') and
/// plain text or newline.
void Lexer::lexCommentText(Token &T) {
  assert(CommentState == LCS_InsideBCPLComment ||
         CommentState == LCS_InsideCComment);

  // Handles lexing non-command text, i.e. text and newline.
  auto HandleNonCommandToken = [&]() -> void {
    assert(State == LS_Normal);

    const char *TokenPtr = BufferPtr;
    assert(TokenPtr < CommentEnd);
    switch (*TokenPtr) {
      case '\n':
      case '\r':
          TokenPtr = skipNewline(TokenPtr, CommentEnd);
          formTokenWithChars(T, TokenPtr, tok::newline);

          // In C comments the next line may begin with a '*' decoration that
          // is not part of the text.
          if (CommentState == LCS_InsideCComment)
            skipLineStartingDecorations();
          return;

      default:
        return formTextToken(T, skipTextToken());
    }
  };

  if (!ParseCommands)
    return HandleNonCommandToken();

  // Any state other than LS_Normal is handled by its dedicated sub-lexer.
  switch (State) {
  case LS_Normal:
    break;
  case LS_VerbatimBlockFirstLine:
    lexVerbatimBlockFirstLine(T);
    return;
  case LS_VerbatimBlockBody:
    lexVerbatimBlockBody(T);
    return;
  case LS_VerbatimLineText:
    lexVerbatimLineText(T);
    return;
  case LS_HTMLStartTag:
    lexHTMLStartTag(T);
    return;
  case LS_HTMLEndTag:
    lexHTMLEndTag(T);
    return;
  }

  assert(State == LS_Normal);
  const char *TokenPtr = BufferPtr;
  assert(TokenPtr < CommentEnd);
  switch(*TokenPtr) {
    case '\\':
    case '@': {
      // Commands that start with a backslash and commands that start with
      // 'at' have equivalent semantics.  But we keep information about the
      // exact syntax in AST for comments.
      tok::TokenKind CommandKind =
          (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
      TokenPtr++;
      // A lone marker at the very end of the comment is just text.
      if (TokenPtr == CommentEnd) {
        formTextToken(T, TokenPtr);
        return;
      }
      char C = *TokenPtr;
      switch (C) {
      default:
        break;

      case '\\': case '@': case '&': case '$':
      case '#':  case '<': case '>': case '%':
      case '\"': case '.': case ':':
        // This is one of \\ \@ \& \$ etc escape sequences.
        TokenPtr++;
        if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
          // This is the \:: escape sequence.
          TokenPtr++;
        }
        // The token covers the marker too, but its text is only the escaped
        // character(s).
        StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
        formTokenWithChars(T, TokenPtr, tok::text);
        T.setText(UnescapedText);
        return;
      }

      // Don't make zero-length commands.
      if (!isCommandNameStartCharacter(*TokenPtr)) {
        formTextToken(T, TokenPtr);
        return;
      }

      TokenPtr = skipCommandName(TokenPtr, CommentEnd);
      // Length of the command name, excluding the leading marker.
      unsigned Length = TokenPtr - (BufferPtr + 1);

      // Hardcoded support for lexing LaTeX formula commands
      // \f$ \f( \f) \f[ \f] \f{ \f} as a single command.
      if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
        C = *TokenPtr;
        if (C == '$' || C == '(' || C == ')' || C == '[' || C == ']' ||
            C == '{' || C == '}') {
          TokenPtr++;
          Length++;
        }
      }

      StringRef CommandName(BufferPtr + 1, Length);

      const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
      if (!Info) {
        if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
          // A typo correction exists: warn with a fix-it and carry on as if
          // the corrected command had been written.
          StringRef CorrectedName = Info->Name;
          SourceLocation Loc = getSourceLocation(BufferPtr);
          SourceLocation EndLoc = getSourceLocation(TokenPtr);
          SourceRange FullRange = SourceRange(Loc, EndLoc);
          SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc);
          Diag(Loc, diag::warn_correct_comment_command_name)
            << FullRange << CommandName << CorrectedName
            << FixItHint::CreateReplacement(CommandRange, CorrectedName);
        } else {
          // No correction available: emit an unknown-command token and warn.
          formTokenWithChars(T, TokenPtr, tok::unknown_command);
          T.setUnknownCommandName(CommandName);
          Diag(T.getLocation(), diag::warn_unknown_comment_command_name)
              << SourceRange(T.getLocation(), T.getEndLocation());
          return;
        }
      }
      // Verbatim commands switch the lexer into a dedicated state.
      if (Info->IsVerbatimBlockCommand) {
        setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
        return;
      }
      if (Info->IsVerbatimLineCommand) {
        setupAndLexVerbatimLine(T, TokenPtr, Info);
        return;
      }
      formTokenWithChars(T, TokenPtr, CommandKind);
      T.setCommandID(Info->getID());
      return;
    }

    case '&':
      lexHTMLCharacterReference(T);
      return;

    case '<': {
      TokenPtr++;
      // A '<' at the very end of the comment is just text.
      if (TokenPtr == CommentEnd) {
        formTextToken(T, TokenPtr);
        return;
      }
      const char C = *TokenPtr;
      if (isHTMLIdentifierStartingCharacter(C))
        setupAndLexHTMLStartTag(T);
      else if (C == '/')
        setupAndLexHTMLEndTag(T);
      else
        formTextToken(T, TokenPtr);
      return;
    }

    default:
      return HandleNonCommandToken();
  }
}
467 
setupAndLexVerbatimBlock(Token & T,const char * TextBegin,char Marker,const CommandInfo * Info)468 void Lexer::setupAndLexVerbatimBlock(Token &T,
469                                      const char *TextBegin,
470                                      char Marker, const CommandInfo *Info) {
471   assert(Info->IsVerbatimBlockCommand);
472 
473   VerbatimBlockEndCommandName.clear();
474   VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
475   VerbatimBlockEndCommandName.append(Info->EndCommandName);
476 
477   formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
478   T.setVerbatimBlockID(Info->getID());
479 
480   // If there is a newline following the verbatim opening command, skip the
481   // newline so that we don't create an tok::verbatim_block_line with empty
482   // text content.
483   if (BufferPtr != CommentEnd &&
484       isVerticalWhitespace(*BufferPtr)) {
485     BufferPtr = skipNewline(BufferPtr, CommentEnd);
486     State = LS_VerbatimBlockBody;
487     return;
488   }
489 
490   State = LS_VerbatimBlockFirstLine;
491 }
492 
lexVerbatimBlockFirstLine(Token & T)493 void Lexer::lexVerbatimBlockFirstLine(Token &T) {
494 again:
495   assert(BufferPtr < CommentEnd);
496 
497   // FIXME: It would be better to scan the text once, finding either the block
498   // end command or newline.
499   //
500   // Extract current line.
501   const char *Newline = findNewline(BufferPtr, CommentEnd);
502   StringRef Line(BufferPtr, Newline - BufferPtr);
503 
504   // Look for end command in current line.
505   size_t Pos = Line.find(VerbatimBlockEndCommandName);
506   const char *TextEnd;
507   const char *NextLine;
508   if (Pos == StringRef::npos) {
509     // Current line is completely verbatim.
510     TextEnd = Newline;
511     NextLine = skipNewline(Newline, CommentEnd);
512   } else if (Pos == 0) {
513     // Current line contains just an end command.
514     const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
515     StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
516     formTokenWithChars(T, End, tok::verbatim_block_end);
517     T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
518     State = LS_Normal;
519     return;
520   } else {
521     // There is some text, followed by end command.  Extract text first.
522     TextEnd = BufferPtr + Pos;
523     NextLine = TextEnd;
524     // If there is only whitespace before end command, skip whitespace.
525     if (isWhitespace(BufferPtr, TextEnd)) {
526       BufferPtr = TextEnd;
527       goto again;
528     }
529   }
530 
531   StringRef Text(BufferPtr, TextEnd - BufferPtr);
532   formTokenWithChars(T, NextLine, tok::verbatim_block_line);
533   T.setVerbatimBlockText(Text);
534 
535   State = LS_VerbatimBlockBody;
536 }
537 
lexVerbatimBlockBody(Token & T)538 void Lexer::lexVerbatimBlockBody(Token &T) {
539   assert(State == LS_VerbatimBlockBody);
540 
541   if (CommentState == LCS_InsideCComment)
542     skipLineStartingDecorations();
543 
544   if (BufferPtr == CommentEnd) {
545     formTokenWithChars(T, BufferPtr, tok::verbatim_block_line);
546     T.setVerbatimBlockText("");
547     return;
548   }
549 
550   lexVerbatimBlockFirstLine(T);
551 }
552 
/// Emits the tok::verbatim_line_name token for the verbatim line command that
/// ends at \p TextBegin and switches to LS_VerbatimLineText so that the next
/// token lexed is the rest of the line.
void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
                                    const CommandInfo *Info) {
  assert(Info->IsVerbatimLineCommand);
  formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
  T.setVerbatimLineID(Info->getID());

  State = LS_VerbatimLineText;
}
561 
lexVerbatimLineText(Token & T)562 void Lexer::lexVerbatimLineText(Token &T) {
563   assert(State == LS_VerbatimLineText);
564 
565   // Extract current line.
566   const char *Newline = findNewline(BufferPtr, CommentEnd);
567   StringRef Text(BufferPtr, Newline - BufferPtr);
568   formTokenWithChars(T, Newline, tok::verbatim_line_text);
569   T.setVerbatimLineText(Text);
570 
571   State = LS_Normal;
572 }
573 
lexHTMLCharacterReference(Token & T)574 void Lexer::lexHTMLCharacterReference(Token &T) {
575   const char *TokenPtr = BufferPtr;
576   assert(*TokenPtr == '&');
577   TokenPtr++;
578   if (TokenPtr == CommentEnd) {
579     formTextToken(T, TokenPtr);
580     return;
581   }
582   const char *NamePtr;
583   bool isNamed = false;
584   bool isDecimal = false;
585   char C = *TokenPtr;
586   if (isHTMLNamedCharacterReferenceCharacter(C)) {
587     NamePtr = TokenPtr;
588     TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
589     isNamed = true;
590   } else if (C == '#') {
591     TokenPtr++;
592     if (TokenPtr == CommentEnd) {
593       formTextToken(T, TokenPtr);
594       return;
595     }
596     C = *TokenPtr;
597     if (isHTMLDecimalCharacterReferenceCharacter(C)) {
598       NamePtr = TokenPtr;
599       TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
600       isDecimal = true;
601     } else if (C == 'x' || C == 'X') {
602       TokenPtr++;
603       NamePtr = TokenPtr;
604       TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
605     } else {
606       formTextToken(T, TokenPtr);
607       return;
608     }
609   } else {
610     formTextToken(T, TokenPtr);
611     return;
612   }
613   if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
614       *TokenPtr != ';') {
615     formTextToken(T, TokenPtr);
616     return;
617   }
618   StringRef Name(NamePtr, TokenPtr - NamePtr);
619   TokenPtr++; // Skip semicolon.
620   StringRef Resolved;
621   if (isNamed)
622     Resolved = resolveHTMLNamedCharacterReference(Name);
623   else if (isDecimal)
624     Resolved = resolveHTMLDecimalCharacterReference(Name);
625   else
626     Resolved = resolveHTMLHexCharacterReference(Name);
627 
628   if (Resolved.empty()) {
629     formTextToken(T, TokenPtr);
630     return;
631   }
632   formTokenWithChars(T, TokenPtr, tok::text);
633   T.setText(Resolved);
634 }
635 
setupAndLexHTMLStartTag(Token & T)636 void Lexer::setupAndLexHTMLStartTag(Token &T) {
637   assert(BufferPtr[0] == '<' &&
638          isHTMLIdentifierStartingCharacter(BufferPtr[1]));
639   const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
640   StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
641   if (!isHTMLTagName(Name)) {
642     formTextToken(T, TagNameEnd);
643     return;
644   }
645 
646   formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
647   T.setHTMLTagStartName(Name);
648 
649   BufferPtr = skipHorizontalWhitespace(BufferPtr, CommentEnd);
650   if (BufferPtr == CommentEnd) { // in BCPL comments
651     State = LS_HTMLStartTag;
652     return;
653   }
654 
655   const char C = *BufferPtr;
656   if (BufferPtr != CommentEnd &&
657       (C == '>' || C == '/' || isVerticalWhitespace(C) ||
658        isHTMLIdentifierStartingCharacter(C)))
659     State = LS_HTMLStartTag;
660 }
661 
lexHTMLStartTag(Token & T)662 void Lexer::lexHTMLStartTag(Token &T) {
663   assert(State == LS_HTMLStartTag);
664 
665   // Skip leading whitespace and comment decorations
666   while (isVerticalWhitespace(*BufferPtr)) {
667     BufferPtr = skipNewline(BufferPtr, CommentEnd);
668 
669     if (CommentState == LCS_InsideCComment)
670       skipLineStartingDecorations();
671 
672     BufferPtr = skipHorizontalWhitespace(BufferPtr, CommentEnd);
673     if (BufferPtr == CommentEnd) {
674       // HTML starting tags must be defined in a single comment block.
675       // It's likely a user-error where they forgot to terminate the comment.
676       State = LS_Normal;
677       // Since at least one newline was skipped and one token needs to be lexed,
678       // return a newline.
679       formTokenWithChars(T, BufferPtr, tok::newline);
680       return;
681     }
682   }
683 
684   const char *TokenPtr = BufferPtr;
685   char C = *TokenPtr;
686   if (isHTMLIdentifierCharacter(C)) {
687     TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
688     StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
689     formTokenWithChars(T, TokenPtr, tok::html_ident);
690     T.setHTMLIdent(Ident);
691   } else {
692     switch (C) {
693     case '=':
694       TokenPtr++;
695       formTokenWithChars(T, TokenPtr, tok::html_equals);
696       break;
697     case '\"':
698     case '\'': {
699       const char *OpenQuote = TokenPtr;
700       TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
701       const char *ClosingQuote = TokenPtr;
702       if (TokenPtr != CommentEnd) // Skip closing quote.
703         TokenPtr++;
704       formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
705       T.setHTMLQuotedString(StringRef(OpenQuote + 1,
706                                       ClosingQuote - (OpenQuote + 1)));
707       break;
708     }
709     case '>':
710       TokenPtr++;
711       formTokenWithChars(T, TokenPtr, tok::html_greater);
712       State = LS_Normal;
713       return;
714     case '/':
715       TokenPtr++;
716       if (TokenPtr != CommentEnd && *TokenPtr == '>') {
717         TokenPtr++;
718         formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
719       } else
720         formTextToken(T, TokenPtr);
721 
722       State = LS_Normal;
723       return;
724     }
725   }
726 
727   // Now look ahead and return to normal state if we don't see any HTML tokens
728   // ahead.
729   BufferPtr = skipHorizontalWhitespace(BufferPtr, CommentEnd);
730   if (BufferPtr == CommentEnd) {
731     return;
732   }
733 
734   C = *BufferPtr;
735   if (!isHTMLIdentifierStartingCharacter(C) && !isVerticalWhitespace(C) &&
736       C != '=' && C != '\"' && C != '\'' && C != '>' && C != '/') {
737     State = LS_Normal;
738     return;
739   }
740 }
741 
setupAndLexHTMLEndTag(Token & T)742 void Lexer::setupAndLexHTMLEndTag(Token &T) {
743   assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
744 
745   const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
746   const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
747   StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
748   if (!isHTMLTagName(Name)) {
749     formTextToken(T, TagNameEnd);
750     return;
751   }
752 
753   const char *End = skipWhitespace(TagNameEnd, CommentEnd);
754 
755   formTokenWithChars(T, End, tok::html_end_tag);
756   T.setHTMLTagEndName(Name);
757 
758   if (BufferPtr != CommentEnd && *BufferPtr == '>')
759     State = LS_HTMLEndTag;
760 }
761 
/// Lexes the '>' that closes an HTML end tag as a tok::html_greater token and
/// returns the lexer to LS_Normal.  BufferPtr must point at that '>'.
void Lexer::lexHTMLEndTag(Token &T) {
  assert(BufferPtr != CommentEnd && *BufferPtr == '>');

  formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
  State = LS_Normal;
}
768 
/// Creates a lexer over the characters in [BufferStart, BufferEnd).
///
/// Lexing starts at \p BufferStart in the LCS_BeforeComment / LS_Normal
/// states.  \p ParseCommands is stored and consulted by the text lexer to
/// decide whether commands and HTML are recognized or only text and newlines.
Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
             const CommandTraits &Traits, SourceLocation FileLoc,
             const char *BufferStart, const char *BufferEnd, bool ParseCommands)
    : Allocator(Allocator), Diags(Diags), Traits(Traits),
      BufferStart(BufferStart), BufferEnd(BufferEnd), BufferPtr(BufferStart),
      FileLoc(FileLoc), ParseCommands(ParseCommands),
      CommentState(LCS_BeforeComment), State(LS_Normal) {}
776 
/// Produces the next token in \p T.
///
/// Driven by CommentState: before a comment the opening "//" or "/*" and the
/// optional Doxygen markers are skipped and the comment's end is located;
/// inside a comment lexCommentText() does the work; at the end of a comment
/// and between comments newline tokens are synthesized as described in the
/// body.  A tok::eof token is produced once the buffer is exhausted.
void Lexer::lex(Token &T) {
again:
  switch (CommentState) {
  case LCS_BeforeComment:
    if (BufferPtr == BufferEnd) {
      formTokenWithChars(T, BufferPtr, tok::eof);
      return;
    }

    assert(*BufferPtr == '/');
    BufferPtr++; // Skip first slash.
    switch(*BufferPtr) {
    case '/': { // BCPL comment.
      BufferPtr++; // Skip second slash.

      if (BufferPtr != BufferEnd) {
        // Skip Doxygen magic marker, if it is present.
        // It might be missing because of a typo //< or /*<, or because we
        // merged this non-Doxygen comment into a bunch of Doxygen comments
        // around it: /** ... */ /* ... */ /** ... */
        const char C = *BufferPtr;
        if (C == '/' || C == '!')
          BufferPtr++;
      }

      // Skip less-than symbol that marks trailing comments.
      // Skip it even if the comment is not a Doxygen one, because //< and /*<
      // are frequent typos.
      if (BufferPtr != BufferEnd && *BufferPtr == '<')
        BufferPtr++;

      CommentState = LCS_InsideBCPLComment;
      // Verbatim blocks and HTML start tags may continue across consecutive
      // BCPL comments, so those states are kept; anything else is reset.
      switch (State) {
      case LS_VerbatimBlockFirstLine:
      case LS_VerbatimBlockBody:
        break;
      case LS_HTMLStartTag:
        BufferPtr = skipHorizontalWhitespace(BufferPtr, BufferEnd);
        break;
      default:
        State = LS_Normal;
        break;
      }
      CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
      goto again;
    }
    case '*': { // C comment.
      BufferPtr++; // Skip star.

      // Skip Doxygen magic marker.
      // A '*' directly followed by '/' is the comment terminator, not a
      // marker.
      const char C = *BufferPtr;
      if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
        BufferPtr++;

      // Skip less-than symbol that marks trailing comments.
      if (BufferPtr != BufferEnd && *BufferPtr == '<')
        BufferPtr++;

      CommentState = LCS_InsideCComment;
      State = LS_Normal;
      CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
      goto again;
    }
    default:
      llvm_unreachable("second character of comment should be '/' or '*'");
    }

  case LCS_BetweenComments: {
    // Consecutive comments are extracted only if there is only whitespace
    // between them.  So we can search for the start of the next comment.
    const char *EndWhitespace = BufferPtr;
    while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
      EndWhitespace++;

    // When lexing the start of an HTML tag (i.e. going through the attributes)
    // there won't be any newlines generated.
    if (State == LS_HTMLStartTag && EndWhitespace != BufferEnd) {
      CommentState = LCS_BeforeComment;
      BufferPtr = EndWhitespace;
      goto again;
    }

    // Turn any whitespace between comments (and there is only whitespace
    // between them -- guaranteed by comment extraction) into a newline.  We
    // have two newlines between C comments in total (first one was synthesized
    // after a comment).
    formTokenWithChars(T, EndWhitespace, tok::newline);

    CommentState = LCS_BeforeComment;
    break;
  }

  case LCS_InsideBCPLComment:
  case LCS_InsideCComment:
    if (BufferPtr != CommentEnd) {
      lexCommentText(T);
      break;
    } else {
      // Skip C comment closing sequence.
      if (CommentState == LCS_InsideCComment) {
        assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
        BufferPtr += 2;
        assert(BufferPtr <= BufferEnd);

        // When lexing the start of an HTML tag (i.e. going through the
        // attributes) there won't be any newlines generated - whitespace still
        // needs to be skipped.
        if (State == LS_HTMLStartTag && BufferPtr != BufferEnd) {
          CommentState = LCS_BetweenComments;
          goto again;
        }

        // Synthesize a newline just after the C comment, regardless of
        // whether there is actually a newline.
        formTokenWithChars(T, BufferPtr, tok::newline);

        CommentState = LCS_BetweenComments;
        break;
      } else {
        // Don't synthesize a newline after a BCPL comment.
        CommentState = LCS_BetweenComments;
        goto again;
      }
    }
  }
}
903 
getSpelling(const Token & Tok,const SourceManager & SourceMgr) const904 StringRef Lexer::getSpelling(const Token &Tok,
905                              const SourceManager &SourceMgr) const {
906   SourceLocation Loc = Tok.getLocation();
907   FileIDAndOffset LocInfo = SourceMgr.getDecomposedLoc(Loc);
908 
909   bool InvalidTemp = false;
910   StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
911   if (InvalidTemp)
912     return StringRef();
913 
914   const char *Begin = File.data() + LocInfo.second;
915   return StringRef(Begin, Tok.getLength());
916 }
917 
918 } // end namespace comments
919 } // end namespace clang
920