xref: /freebsd/contrib/llvm-project/clang/lib/AST/CommentLexer.cpp (revision e64fe029e9d3ce476e77a478318e0c3cd201ff08)
1 //===--- CommentLexer.cpp -------------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "clang/AST/CommentLexer.h"
10 #include "clang/AST/CommentCommandTraits.h"
11 #include "clang/AST/CommentDiagnostic.h"
12 #include "clang/Basic/CharInfo.h"
13 #include "llvm/ADT/StringExtras.h"
14 #include "llvm/ADT/StringSwitch.h"
15 #include "llvm/Support/ConvertUTF.h"
16 #include "llvm/Support/ErrorHandling.h"
17 
18 namespace clang {
19 namespace comments {
20 
21 void Token::dump(const Lexer &L, const SourceManager &SM) const {
22   llvm::errs() << "comments::Token Kind=" << Kind << " ";
23   Loc.print(llvm::errs(), SM);
24   llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
25 }
26 
/// Returns true if \p C may appear in the name of an HTML named character
/// reference (the text between '&' and ';'), i.e. if isLetter() accepts it.
static inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
  return isLetter(C);
}
30 
/// Returns true if \p C may appear in the digits of a decimal character
/// reference (the text between "&#" and ';'), i.e. if isDigit() accepts it.
static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
  return isDigit(C);
}
34 
/// Returns true if \p C may appear in the digits of a hexadecimal character
/// reference (the text after "&#x" / "&#X"), i.e. if isHexDigit() accepts it.
static inline bool isHTMLHexCharacterReferenceCharacter(char C) {
  return isHexDigit(C);
}
38 
39 static inline StringRef convertCodePointToUTF8(
40                                       llvm::BumpPtrAllocator &Allocator,
41                                       unsigned CodePoint) {
42   char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
43   char *ResolvedPtr = Resolved;
44   if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
45     return StringRef(Resolved, ResolvedPtr - Resolved);
46   else
47     return StringRef();
48 }
49 
50 namespace {
51 
52 #include "clang/AST/CommentHTMLTags.inc"
53 #include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
54 
55 } // end anonymous namespace
56 
57 StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
58   // Fast path, first check a few most widely used named character references.
59   return llvm::StringSwitch<StringRef>(Name)
60       .Case("amp", "&")
61       .Case("lt", "<")
62       .Case("gt", ">")
63       .Case("quot", "\"")
64       .Case("apos", "\'")
65       // Slow path.
66       .Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
67 }
68 
69 StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
70   unsigned CodePoint = 0;
71   for (unsigned i = 0, e = Name.size(); i != e; ++i) {
72     assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
73     CodePoint *= 10;
74     CodePoint += Name[i] - '0';
75   }
76   return convertCodePointToUTF8(Allocator, CodePoint);
77 }
78 
79 StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
80   unsigned CodePoint = 0;
81   for (unsigned i = 0, e = Name.size(); i != e; ++i) {
82     CodePoint *= 16;
83     const char C = Name[i];
84     assert(isHTMLHexCharacterReferenceCharacter(C));
85     CodePoint += llvm::hexDigitValue(C);
86   }
87   return convertCodePointToUTF8(Allocator, CodePoint);
88 }
89 
90 void Lexer::skipLineStartingDecorations() {
91   // This function should be called only for C comments
92   assert(CommentState == LCS_InsideCComment);
93 
94   if (BufferPtr == CommentEnd)
95     return;
96 
97   const char *NewBufferPtr = BufferPtr;
98   while (isHorizontalWhitespace(*NewBufferPtr))
99     if (++NewBufferPtr == CommentEnd)
100       return;
101   if (*NewBufferPtr == '*')
102     BufferPtr = NewBufferPtr + 1;
103 }
104 
105 namespace {
106 /// Returns pointer to the first newline character in the string.
107 const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
108   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
109     if (isVerticalWhitespace(*BufferPtr))
110       return BufferPtr;
111   }
112   return BufferEnd;
113 }
114 
/// Step over one line terminator ("\n", "\r" or "\r\n") at \p BufferPtr.
///
/// \returns the position just past the terminator; \p BufferPtr itself when
/// it already equals \p BufferEnd.  The first character must be '\n' or '\r'.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  const char First = *BufferPtr;
  ++BufferPtr;
  if (First == '\n')
    return BufferPtr;

  assert(First == '\r');
  // A '\n' directly after '\r' belongs to the same terminator.
  if (BufferPtr != BufferEnd && *BufferPtr == '\n')
    ++BufferPtr;
  return BufferPtr;
}
129 
130 const char *skipNamedCharacterReference(const char *BufferPtr,
131                                         const char *BufferEnd) {
132   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
133     if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
134       return BufferPtr;
135   }
136   return BufferEnd;
137 }
138 
139 const char *skipDecimalCharacterReference(const char *BufferPtr,
140                                           const char *BufferEnd) {
141   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
142     if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
143       return BufferPtr;
144   }
145   return BufferEnd;
146 }
147 
148 const char *skipHexCharacterReference(const char *BufferPtr,
149                                       const char *BufferEnd) {
150   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
151     if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
152       return BufferPtr;
153   }
154   return BufferEnd;
155 }
156 
/// Returns true if \p C can begin an HTML identifier (a tag or attribute
/// name as lexed here), i.e. if isLetter() accepts it.
bool isHTMLIdentifierStartingCharacter(char C) {
  return isLetter(C);
}
160 
/// Returns true if \p C can appear inside an HTML identifier, i.e. if
/// isAlphanumeric() accepts it.
bool isHTMLIdentifierCharacter(char C) {
  return isAlphanumeric(C);
}
164 
165 const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
166   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
167     if (!isHTMLIdentifierCharacter(*BufferPtr))
168       return BufferPtr;
169   }
170   return BufferEnd;
171 }
172 
/// Skip an HTML string quoted in single or double quotes.  \p BufferPtr must
/// point at the opening quote.  A quote character directly preceded by a
/// backslash does not terminate the string.
///
/// Returns a pointer to the closing quote, or \p BufferEnd if the string is
/// not terminated within the buffer.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char Quote = *BufferPtr;
  assert(Quote == '\"' || Quote == '\'');

  const char *Cursor = BufferPtr + 1;
  while (Cursor != BufferEnd) {
    // Cursor[-1] is always in range: Cursor starts one past the open quote.
    if (*Cursor == Quote && Cursor[-1] != '\\')
      return Cursor;
    ++Cursor;
  }
  return BufferEnd;
}
190 
191 const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
192   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
193     if (!isWhitespace(*BufferPtr))
194       return BufferPtr;
195   }
196   return BufferEnd;
197 }
198 
/// Returns true if every character in [BufferPtr, BufferEnd) is whitespace.
/// An empty range yields true, because skipWhitespace() then returns
/// \p BufferEnd immediately.
bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
  return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
}
202 
/// Returns true if \p C can begin a comment command name (the text after
/// '\\' or '@'), i.e. if isLetter() accepts it.
bool isCommandNameStartCharacter(char C) {
  return isLetter(C);
}
206 
/// Returns true if \p C can appear inside a comment command name, i.e. if
/// isAlphanumeric() accepts it.
bool isCommandNameCharacter(char C) {
  return isAlphanumeric(C);
}
210 
211 const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
212   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
213     if (!isCommandNameCharacter(*BufferPtr))
214       return BufferPtr;
215   }
216   return BufferEnd;
217 }
218 
219 /// Return the one past end pointer for BCPL comments.
220 /// Handles newlines escaped with backslash or trigraph for backslahs.
221 const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
222   const char *CurPtr = BufferPtr;
223   while (CurPtr != BufferEnd) {
224     while (!isVerticalWhitespace(*CurPtr)) {
225       CurPtr++;
226       if (CurPtr == BufferEnd)
227         return BufferEnd;
228     }
229     // We found a newline, check if it is escaped.
230     const char *EscapePtr = CurPtr - 1;
231     while(isHorizontalWhitespace(*EscapePtr))
232       EscapePtr--;
233 
234     if (*EscapePtr == '\\' ||
235         (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
236          EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
237       // We found an escaped newline.
238       CurPtr = skipNewline(CurPtr, BufferEnd);
239     } else
240       return CurPtr; // Not an escaped newline.
241   }
242   return BufferEnd;
243 }
244 
245 /// Return the one past end pointer for C comments.
246 /// Very dumb, does not handle escaped newlines or trigraphs.
247 const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
248   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
249     if (*BufferPtr == '*') {
250       assert(BufferPtr + 1 != BufferEnd);
251       if (*(BufferPtr + 1) == '/')
252         return BufferPtr;
253     }
254   }
255   llvm_unreachable("buffer end hit before '*/' was seen");
256 }
257 
258 } // end anonymous namespace
259 
260 void Lexer::formTokenWithChars(Token &Result, const char *TokEnd,
261                                tok::TokenKind Kind) {
262   const unsigned TokLen = TokEnd - BufferPtr;
263   Result.setLocation(getSourceLocation(BufferPtr));
264   Result.setKind(Kind);
265   Result.setLength(TokLen);
266 #ifndef NDEBUG
267   Result.TextPtr = "<UNSET>";
268   Result.IntVal = 7;
269 #endif
270   BufferPtr = TokEnd;
271 }
272 
273 const char *Lexer::skipTextToken() {
274   const char *TokenPtr = BufferPtr;
275   assert(TokenPtr < CommentEnd);
276   StringRef TokStartSymbols = ParseCommands ? "\n\r\\@\"&<" : "\n\r";
277 
278 again:
279   size_t End =
280       StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of(TokStartSymbols);
281   if (End == StringRef::npos)
282     return CommentEnd;
283 
284   // Doxygen doesn't recognize any commands in a one-line double quotation.
285   // If we don't find an ending quotation mark, we pretend it never began.
286   if (*(TokenPtr + End) == '\"') {
287     TokenPtr += End + 1;
288     End = StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of("\n\r\"");
289     if (End != StringRef::npos && *(TokenPtr + End) == '\"')
290       TokenPtr += End + 1;
291     goto again;
292   }
293   return TokenPtr + End;
294 }
295 
/// Lex one token from the body of the current comment into \p T.
///
/// Must be called while inside a BCPL or C comment with BufferPtr strictly
/// before CommentEnd.  When commands are not parsed, only text and newline
/// tokens are produced.  Otherwise the lexer first defers to the handler for
/// the current sub-state (verbatim block/line, HTML tag); in the normal state
/// it dispatches on the first character: '\\' or '@' starts a command or an
/// escape sequence, '&' an HTML character reference, '<' an HTML tag, and
/// anything else is plain text or a newline.
void Lexer::lexCommentText(Token &T) {
  assert(CommentState == LCS_InsideBCPLComment ||
         CommentState == LCS_InsideCComment);

  // Handles lexing non-command text, i.e. text and newline.
  auto HandleNonCommandToken = [&]() -> void {
    assert(State == LS_Normal);

    const char *TokenPtr = BufferPtr;
    assert(TokenPtr < CommentEnd);
    switch (*TokenPtr) {
      case '\n':
      case '\r':
          TokenPtr = skipNewline(TokenPtr, CommentEnd);
          formTokenWithChars(T, TokenPtr, tok::newline);

          // In a C comment the next line may start with " * " decoration;
          // drop it so it does not show up as text.
          if (CommentState == LCS_InsideCComment)
            skipLineStartingDecorations();
          return;

      default:
        return formTextToken(T, skipTextToken());
    }
  };

  if (!ParseCommands)
    return HandleNonCommandToken();

  // Any state other than LS_Normal has a dedicated lexing routine.
  switch (State) {
  case LS_Normal:
    break;
  case LS_VerbatimBlockFirstLine:
    lexVerbatimBlockFirstLine(T);
    return;
  case LS_VerbatimBlockBody:
    lexVerbatimBlockBody(T);
    return;
  case LS_VerbatimLineText:
    lexVerbatimLineText(T);
    return;
  case LS_HTMLStartTag:
    lexHTMLStartTag(T);
    return;
  case LS_HTMLEndTag:
    lexHTMLEndTag(T);
    return;
  }

  assert(State == LS_Normal);
  const char *TokenPtr = BufferPtr;
  assert(TokenPtr < CommentEnd);
  switch(*TokenPtr) {
    case '\\':
    case '@': {
      // Commands that start with a backslash and commands that start with
      // 'at' have equivalent semantics.  But we keep information about the
      // exact syntax in AST for comments.
      tok::TokenKind CommandKind =
          (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
      TokenPtr++;
      // A lone marker at the very end of the comment is just text.
      if (TokenPtr == CommentEnd) {
        formTextToken(T, TokenPtr);
        return;
      }
      char C = *TokenPtr;
      switch (C) {
      default:
        break;

      case '\\': case '@': case '&': case '$':
      case '#':  case '<': case '>': case '%':
      case '\"': case '.': case ':':
        // This is one of \\ \@ \& \$ etc escape sequences.
        TokenPtr++;
        if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
          // This is the \:: escape sequence.
          TokenPtr++;
        }
        // The token spans the marker too, but its text is only the escaped
        // character(s) after it.
        StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
        formTokenWithChars(T, TokenPtr, tok::text);
        T.setText(UnescapedText);
        return;
      }

      // Don't make zero-length commands.
      if (!isCommandNameStartCharacter(*TokenPtr)) {
        formTextToken(T, TokenPtr);
        return;
      }

      TokenPtr = skipCommandName(TokenPtr, CommentEnd);
      // Length of the command name, excluding the leading marker.
      unsigned Length = TokenPtr - (BufferPtr + 1);

      // Hardcoded support for lexing LaTeX formula commands
      // \f$ \f( \f) \f[ \f] \f{ \f} as a single command.
      if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
        C = *TokenPtr;
        if (C == '$' || C == '(' || C == ')' || C == '[' || C == ']' ||
            C == '{' || C == '}') {
          TokenPtr++;
          Length++;
        }
      }

      StringRef CommandName(BufferPtr + 1, Length);

      const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
      if (!Info) {
        if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
          // The name is unknown but a correction was found: warn with a
          // fix-it and continue lexing as if the corrected command had been
          // written.
          StringRef CorrectedName = Info->Name;
          SourceLocation Loc = getSourceLocation(BufferPtr);
          SourceLocation EndLoc = getSourceLocation(TokenPtr);
          SourceRange FullRange = SourceRange(Loc, EndLoc);
          SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc);
          Diag(Loc, diag::warn_correct_comment_command_name)
            << FullRange << CommandName << CorrectedName
            << FixItHint::CreateReplacement(CommandRange, CorrectedName);
        } else {
          // No such command and no plausible correction: emit an
          // unknown_command token and warn.
          formTokenWithChars(T, TokenPtr, tok::unknown_command);
          T.setUnknownCommandName(CommandName);
          Diag(T.getLocation(), diag::warn_unknown_comment_command_name)
              << SourceRange(T.getLocation(), T.getEndLocation());
          return;
        }
      }
      // Verbatim commands switch the lexer into a dedicated sub-state.
      if (Info->IsVerbatimBlockCommand) {
        setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
        return;
      }
      if (Info->IsVerbatimLineCommand) {
        setupAndLexVerbatimLine(T, TokenPtr, Info);
        return;
      }
      formTokenWithChars(T, TokenPtr, CommandKind);
      T.setCommandID(Info->getID());
      return;
    }

    case '&':
      lexHTMLCharacterReference(T);
      return;

    case '<': {
      TokenPtr++;
      // A lone '<' at the very end of the comment is just text.
      if (TokenPtr == CommentEnd) {
        formTextToken(T, TokenPtr);
        return;
      }
      const char C = *TokenPtr;
      if (isHTMLIdentifierStartingCharacter(C))
        setupAndLexHTMLStartTag(T);
      else if (C == '/')
        setupAndLexHTMLEndTag(T);
      else
        formTextToken(T, TokenPtr);
      return;
    }

    default:
      return HandleNonCommandToken();
  }
}
458 
459 void Lexer::setupAndLexVerbatimBlock(Token &T,
460                                      const char *TextBegin,
461                                      char Marker, const CommandInfo *Info) {
462   assert(Info->IsVerbatimBlockCommand);
463 
464   VerbatimBlockEndCommandName.clear();
465   VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
466   VerbatimBlockEndCommandName.append(Info->EndCommandName);
467 
468   formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
469   T.setVerbatimBlockID(Info->getID());
470 
471   // If there is a newline following the verbatim opening command, skip the
472   // newline so that we don't create an tok::verbatim_block_line with empty
473   // text content.
474   if (BufferPtr != CommentEnd &&
475       isVerticalWhitespace(*BufferPtr)) {
476     BufferPtr = skipNewline(BufferPtr, CommentEnd);
477     State = LS_VerbatimBlockBody;
478     return;
479   }
480 
481   State = LS_VerbatimBlockFirstLine;
482 }
483 
484 void Lexer::lexVerbatimBlockFirstLine(Token &T) {
485 again:
486   assert(BufferPtr < CommentEnd);
487 
488   // FIXME: It would be better to scan the text once, finding either the block
489   // end command or newline.
490   //
491   // Extract current line.
492   const char *Newline = findNewline(BufferPtr, CommentEnd);
493   StringRef Line(BufferPtr, Newline - BufferPtr);
494 
495   // Look for end command in current line.
496   size_t Pos = Line.find(VerbatimBlockEndCommandName);
497   const char *TextEnd;
498   const char *NextLine;
499   if (Pos == StringRef::npos) {
500     // Current line is completely verbatim.
501     TextEnd = Newline;
502     NextLine = skipNewline(Newline, CommentEnd);
503   } else if (Pos == 0) {
504     // Current line contains just an end command.
505     const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
506     StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
507     formTokenWithChars(T, End, tok::verbatim_block_end);
508     T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
509     State = LS_Normal;
510     return;
511   } else {
512     // There is some text, followed by end command.  Extract text first.
513     TextEnd = BufferPtr + Pos;
514     NextLine = TextEnd;
515     // If there is only whitespace before end command, skip whitespace.
516     if (isWhitespace(BufferPtr, TextEnd)) {
517       BufferPtr = TextEnd;
518       goto again;
519     }
520   }
521 
522   StringRef Text(BufferPtr, TextEnd - BufferPtr);
523   formTokenWithChars(T, NextLine, tok::verbatim_block_line);
524   T.setVerbatimBlockText(Text);
525 
526   State = LS_VerbatimBlockBody;
527 }
528 
529 void Lexer::lexVerbatimBlockBody(Token &T) {
530   assert(State == LS_VerbatimBlockBody);
531 
532   if (CommentState == LCS_InsideCComment)
533     skipLineStartingDecorations();
534 
535   if (BufferPtr == CommentEnd) {
536     formTokenWithChars(T, BufferPtr, tok::verbatim_block_line);
537     T.setVerbatimBlockText("");
538     return;
539   }
540 
541   lexVerbatimBlockFirstLine(T);
542 }
543 
/// Emit the verbatim_line_name token for a verbatim line command and switch
/// the lexer to LS_VerbatimLineText so that the next token is the rest of the
/// line.
///
/// \param TextBegin first character after the command name; the token ends
/// there.
/// \param Info description of the verbatim line command.
void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
                                    const CommandInfo *Info) {
  assert(Info->IsVerbatimLineCommand);
  formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
  T.setVerbatimLineID(Info->getID());

  State = LS_VerbatimLineText;
}
552 
553 void Lexer::lexVerbatimLineText(Token &T) {
554   assert(State == LS_VerbatimLineText);
555 
556   // Extract current line.
557   const char *Newline = findNewline(BufferPtr, CommentEnd);
558   StringRef Text(BufferPtr, Newline - BufferPtr);
559   formTokenWithChars(T, Newline, tok::verbatim_line_text);
560   T.setVerbatimLineText(Text);
561 
562   State = LS_Normal;
563 }
564 
565 void Lexer::lexHTMLCharacterReference(Token &T) {
566   const char *TokenPtr = BufferPtr;
567   assert(*TokenPtr == '&');
568   TokenPtr++;
569   if (TokenPtr == CommentEnd) {
570     formTextToken(T, TokenPtr);
571     return;
572   }
573   const char *NamePtr;
574   bool isNamed = false;
575   bool isDecimal = false;
576   char C = *TokenPtr;
577   if (isHTMLNamedCharacterReferenceCharacter(C)) {
578     NamePtr = TokenPtr;
579     TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
580     isNamed = true;
581   } else if (C == '#') {
582     TokenPtr++;
583     if (TokenPtr == CommentEnd) {
584       formTextToken(T, TokenPtr);
585       return;
586     }
587     C = *TokenPtr;
588     if (isHTMLDecimalCharacterReferenceCharacter(C)) {
589       NamePtr = TokenPtr;
590       TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
591       isDecimal = true;
592     } else if (C == 'x' || C == 'X') {
593       TokenPtr++;
594       NamePtr = TokenPtr;
595       TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
596     } else {
597       formTextToken(T, TokenPtr);
598       return;
599     }
600   } else {
601     formTextToken(T, TokenPtr);
602     return;
603   }
604   if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
605       *TokenPtr != ';') {
606     formTextToken(T, TokenPtr);
607     return;
608   }
609   StringRef Name(NamePtr, TokenPtr - NamePtr);
610   TokenPtr++; // Skip semicolon.
611   StringRef Resolved;
612   if (isNamed)
613     Resolved = resolveHTMLNamedCharacterReference(Name);
614   else if (isDecimal)
615     Resolved = resolveHTMLDecimalCharacterReference(Name);
616   else
617     Resolved = resolveHTMLHexCharacterReference(Name);
618 
619   if (Resolved.empty()) {
620     formTextToken(T, TokenPtr);
621     return;
622   }
623   formTokenWithChars(T, TokenPtr, tok::text);
624   T.setText(Resolved);
625 }
626 
627 void Lexer::setupAndLexHTMLStartTag(Token &T) {
628   assert(BufferPtr[0] == '<' &&
629          isHTMLIdentifierStartingCharacter(BufferPtr[1]));
630   const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
631   StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
632   if (!isHTMLTagName(Name)) {
633     formTextToken(T, TagNameEnd);
634     return;
635   }
636 
637   formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
638   T.setHTMLTagStartName(Name);
639 
640   BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
641 
642   const char C = *BufferPtr;
643   if (BufferPtr != CommentEnd &&
644       (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
645     State = LS_HTMLStartTag;
646 }
647 
648 void Lexer::lexHTMLStartTag(Token &T) {
649   assert(State == LS_HTMLStartTag);
650 
651   const char *TokenPtr = BufferPtr;
652   char C = *TokenPtr;
653   if (isHTMLIdentifierCharacter(C)) {
654     TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
655     StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
656     formTokenWithChars(T, TokenPtr, tok::html_ident);
657     T.setHTMLIdent(Ident);
658   } else {
659     switch (C) {
660     case '=':
661       TokenPtr++;
662       formTokenWithChars(T, TokenPtr, tok::html_equals);
663       break;
664     case '\"':
665     case '\'': {
666       const char *OpenQuote = TokenPtr;
667       TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
668       const char *ClosingQuote = TokenPtr;
669       if (TokenPtr != CommentEnd) // Skip closing quote.
670         TokenPtr++;
671       formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
672       T.setHTMLQuotedString(StringRef(OpenQuote + 1,
673                                       ClosingQuote - (OpenQuote + 1)));
674       break;
675     }
676     case '>':
677       TokenPtr++;
678       formTokenWithChars(T, TokenPtr, tok::html_greater);
679       State = LS_Normal;
680       return;
681     case '/':
682       TokenPtr++;
683       if (TokenPtr != CommentEnd && *TokenPtr == '>') {
684         TokenPtr++;
685         formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
686       } else
687         formTextToken(T, TokenPtr);
688 
689       State = LS_Normal;
690       return;
691     }
692   }
693 
694   // Now look ahead and return to normal state if we don't see any HTML tokens
695   // ahead.
696   BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
697   if (BufferPtr == CommentEnd) {
698     State = LS_Normal;
699     return;
700   }
701 
702   C = *BufferPtr;
703   if (!isHTMLIdentifierStartingCharacter(C) &&
704       C != '=' && C != '\"' && C != '\'' && C != '>' && C != '/') {
705     State = LS_Normal;
706     return;
707   }
708 }
709 
710 void Lexer::setupAndLexHTMLEndTag(Token &T) {
711   assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
712 
713   const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
714   const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
715   StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
716   if (!isHTMLTagName(Name)) {
717     formTextToken(T, TagNameEnd);
718     return;
719   }
720 
721   const char *End = skipWhitespace(TagNameEnd, CommentEnd);
722 
723   formTokenWithChars(T, End, tok::html_end_tag);
724   T.setHTMLTagEndName(Name);
725 
726   if (BufferPtr != CommentEnd && *BufferPtr == '>')
727     State = LS_HTMLEndTag;
728 }
729 
/// Lex the '>' that closes an HTML end tag as an html_greater token and
/// return to the normal state.  BufferPtr must point at that '>'.
void Lexer::lexHTMLEndTag(Token &T) {
  assert(BufferPtr != CommentEnd && *BufferPtr == '>');

  formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
  State = LS_Normal;
}
736 
/// Create a lexer over the text in [BufferStart, BufferEnd).
///
/// Lexing starts at \p BufferStart in the "before comment" state with the
/// normal sub-state.  \p ParseCommands selects whether commands and HTML are
/// recognized or only text and newline tokens are produced.
///
/// NOTE(review): CommentEnd is not set in this initializer list; lex()
/// assigns it when it enters a comment.  Confirm whether the class
/// declaration gives it a default value.
Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
             const CommandTraits &Traits, SourceLocation FileLoc,
             const char *BufferStart, const char *BufferEnd, bool ParseCommands)
    : Allocator(Allocator), Diags(Diags), Traits(Traits),
      BufferStart(BufferStart), BufferEnd(BufferEnd), BufferPtr(BufferStart),
      FileLoc(FileLoc), ParseCommands(ParseCommands),
      CommentState(LCS_BeforeComment), State(LS_Normal) {}
744 
/// Produce the next token into \p T.
///
/// Drives a small state machine over CommentState:
///  - before a comment: emit eof at the end of the buffer, otherwise consume
///    the comment opener and optional Doxygen markers, compute CommentEnd and
///    continue inside the comment;
///  - inside a comment: lex comment text, or at CommentEnd leave the comment
///    (emitting a synthesized newline after a C comment);
///  - between comments: turn everything up to the next '/' into one newline
///    token.
void Lexer::lex(Token &T) {
again:
  switch (CommentState) {
  case LCS_BeforeComment:
    if (BufferPtr == BufferEnd) {
      formTokenWithChars(T, BufferPtr, tok::eof);
      return;
    }

    assert(*BufferPtr == '/');
    BufferPtr++; // Skip first slash.
    switch(*BufferPtr) {
    case '/': { // BCPL comment.
      BufferPtr++; // Skip second slash.

      if (BufferPtr != BufferEnd) {
        // Skip Doxygen magic marker, if it is present.
        // It might be missing because of a typo //< or /*<, or because we
        // merged this non-Doxygen comment into a bunch of Doxygen comments
        // around it: /** ... */ /* ... */ /** ... */
        const char C = *BufferPtr;
        if (C == '/' || C == '!')
          BufferPtr++;
      }

      // Skip less-than symbol that marks trailing comments.
      // Skip it even if the comment is not a Doxygen one, because //< and /*<
      // are frequent typos.
      if (BufferPtr != BufferEnd && *BufferPtr == '<')
        BufferPtr++;

      CommentState = LCS_InsideBCPLComment;
      // A verbatim block state is kept across consecutive BCPL comments;
      // every other sub-state is reset.
      if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
        State = LS_Normal;
      CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
      goto again;
    }
    case '*': { // C comment.
      BufferPtr++; // Skip star.

      // Skip Doxygen magic marker.  A '*' directly followed by '/' is the
      // comment terminator, not a marker.
      const char C = *BufferPtr;
      if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
        BufferPtr++;

      // Skip less-than symbol that marks trailing comments.
      if (BufferPtr != BufferEnd && *BufferPtr == '<')
        BufferPtr++;

      CommentState = LCS_InsideCComment;
      State = LS_Normal;
      CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
      goto again;
    }
    default:
      llvm_unreachable("second character of comment should be '/' or '*'");
    }

  case LCS_BetweenComments: {
    // Consecutive comments are extracted only if there is only whitespace
    // between them.  So we can search for the start of the next comment.
    const char *EndWhitespace = BufferPtr;
    while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
      EndWhitespace++;

    // Turn any whitespace between comments (and there is only whitespace
    // between them -- guaranteed by comment extraction) into a newline.  We
    // have two newlines between C comments in total (first one was synthesized
    // after a comment).
    formTokenWithChars(T, EndWhitespace, tok::newline);

    CommentState = LCS_BeforeComment;
    break;
  }

  case LCS_InsideBCPLComment:
  case LCS_InsideCComment:
    if (BufferPtr != CommentEnd) {
      lexCommentText(T);
      break;
    } else {
      // Skip C comment closing sequence.
      if (CommentState == LCS_InsideCComment) {
        assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
        BufferPtr += 2;
        assert(BufferPtr <= BufferEnd);

        // Synthesize a newline just after the C comment, regardless of
        // whether there is actually a newline.
        formTokenWithChars(T, BufferPtr, tok::newline);

        CommentState = LCS_BetweenComments;
        break;
      } else {
        // Don't synthesize a newline after a BCPL comment.
        CommentState = LCS_BetweenComments;
        goto again;
      }
    }
  }
}
846 
847 StringRef Lexer::getSpelling(const Token &Tok,
848                              const SourceManager &SourceMgr) const {
849   SourceLocation Loc = Tok.getLocation();
850   std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
851 
852   bool InvalidTemp = false;
853   StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
854   if (InvalidTemp)
855     return StringRef();
856 
857   const char *Begin = File.data() + LocInfo.second;
858   return StringRef(Begin, Tok.getLength());
859 }
860 
861 } // end namespace comments
862 } // end namespace clang
863