xref: /aosp_15_r20/external/clang/include/clang/AST/CommentLexer.h (revision 67e74705e28f6214e480b399dd47ea732279e315)
1*67e74705SXin Li //===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===//
2*67e74705SXin Li //
3*67e74705SXin Li //                     The LLVM Compiler Infrastructure
4*67e74705SXin Li //
5*67e74705SXin Li // This file is distributed under the University of Illinois Open Source
6*67e74705SXin Li // License. See LICENSE.TXT for details.
7*67e74705SXin Li //
8*67e74705SXin Li //===----------------------------------------------------------------------===//
9*67e74705SXin Li //
//  This file defines the lexer for structured (documentation) comments and
//  the supporting token class.
11*67e74705SXin Li //
12*67e74705SXin Li //===----------------------------------------------------------------------===//
13*67e74705SXin Li 
14*67e74705SXin Li #ifndef LLVM_CLANG_AST_COMMENTLEXER_H
15*67e74705SXin Li #define LLVM_CLANG_AST_COMMENTLEXER_H
16*67e74705SXin Li 
17*67e74705SXin Li #include "clang/Basic/Diagnostic.h"
18*67e74705SXin Li #include "clang/Basic/SourceManager.h"
19*67e74705SXin Li #include "llvm/ADT/SmallString.h"
20*67e74705SXin Li #include "llvm/ADT/SmallVector.h"
21*67e74705SXin Li #include "llvm/ADT/StringRef.h"
22*67e74705SXin Li #include "llvm/Support/Allocator.h"
23*67e74705SXin Li #include "llvm/Support/raw_ostream.h"
24*67e74705SXin Li 
25*67e74705SXin Li namespace clang {
26*67e74705SXin Li namespace comments {
27*67e74705SXin Li 
28*67e74705SXin Li class Lexer;
29*67e74705SXin Li class TextTokenRetokenizer;
30*67e74705SXin Li struct CommandInfo;
31*67e74705SXin Li class CommandTraits;
32*67e74705SXin Li 
namespace tok {
/// Kinds of tokens that the comment lexer can produce.
///
/// This is a plain (unscoped) enum: enumerators are numbered consecutively
/// starting at 0 in declaration order, so their order must not change.
enum TokenKind {
  eof,
  newline,
  text,
  unknown_command,   // Command that does not have an ID.
  backslash_command, // Command with an ID, that used backslash marker.
  at_command,        // Command with an ID, that used 'at' marker.
  verbatim_block_begin, // Carries a verbatim block ID.
  verbatim_block_line,  // Carries the text of one verbatim block line.
  verbatim_block_end,   // Carries a verbatim block ID.
  verbatim_line_name,   // Carries a verbatim line ID.
  verbatim_line_text,   // Carries the text of a verbatim line.
  html_start_tag,     // <tag
  html_ident,         // attr
  html_equals,        // =
  html_quoted_string, // "blah\"blah" or 'blah\'blah'
  html_greater,       // >
  html_slash_greater, // />
  html_end_tag        // </tag
};
} // end namespace tok
55*67e74705SXin Li 
56*67e74705SXin Li /// \brief Comment token.
57*67e74705SXin Li class Token {
58*67e74705SXin Li   friend class Lexer;
59*67e74705SXin Li   friend class TextTokenRetokenizer;
60*67e74705SXin Li 
61*67e74705SXin Li   /// The location of the token.
62*67e74705SXin Li   SourceLocation Loc;
63*67e74705SXin Li 
64*67e74705SXin Li   /// The actual kind of the token.
65*67e74705SXin Li   tok::TokenKind Kind;
66*67e74705SXin Li 
67*67e74705SXin Li   /// Length of the token spelling in comment.  Can be 0 for synthenized
68*67e74705SXin Li   /// tokens.
69*67e74705SXin Li   unsigned Length;
70*67e74705SXin Li 
71*67e74705SXin Li   /// Contains text value associated with a token.
72*67e74705SXin Li   const char *TextPtr;
73*67e74705SXin Li 
74*67e74705SXin Li   /// Integer value associated with a token.
75*67e74705SXin Li   ///
76*67e74705SXin Li   /// If the token is a konwn command, contains command ID and TextPtr is
77*67e74705SXin Li   /// unused (command spelling can be found with CommandTraits).  Otherwise,
78*67e74705SXin Li   /// contains the length of the string that starts at TextPtr.
79*67e74705SXin Li   unsigned IntVal;
80*67e74705SXin Li 
81*67e74705SXin Li public:
getLocation()82*67e74705SXin Li   SourceLocation getLocation() const LLVM_READONLY { return Loc; }
setLocation(SourceLocation SL)83*67e74705SXin Li   void setLocation(SourceLocation SL) { Loc = SL; }
84*67e74705SXin Li 
getEndLocation()85*67e74705SXin Li   SourceLocation getEndLocation() const LLVM_READONLY {
86*67e74705SXin Li     if (Length == 0 || Length == 1)
87*67e74705SXin Li       return Loc;
88*67e74705SXin Li     return Loc.getLocWithOffset(Length - 1);
89*67e74705SXin Li   }
90*67e74705SXin Li 
getKind()91*67e74705SXin Li   tok::TokenKind getKind() const LLVM_READONLY { return Kind; }
setKind(tok::TokenKind K)92*67e74705SXin Li   void setKind(tok::TokenKind K) { Kind = K; }
93*67e74705SXin Li 
is(tok::TokenKind K)94*67e74705SXin Li   bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; }
isNot(tok::TokenKind K)95*67e74705SXin Li   bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; }
96*67e74705SXin Li 
getLength()97*67e74705SXin Li   unsigned getLength() const LLVM_READONLY { return Length; }
setLength(unsigned L)98*67e74705SXin Li   void setLength(unsigned L) { Length = L; }
99*67e74705SXin Li 
getText()100*67e74705SXin Li   StringRef getText() const LLVM_READONLY {
101*67e74705SXin Li     assert(is(tok::text));
102*67e74705SXin Li     return StringRef(TextPtr, IntVal);
103*67e74705SXin Li   }
104*67e74705SXin Li 
setText(StringRef Text)105*67e74705SXin Li   void setText(StringRef Text) {
106*67e74705SXin Li     assert(is(tok::text));
107*67e74705SXin Li     TextPtr = Text.data();
108*67e74705SXin Li     IntVal = Text.size();
109*67e74705SXin Li   }
110*67e74705SXin Li 
getUnknownCommandName()111*67e74705SXin Li   StringRef getUnknownCommandName() const LLVM_READONLY {
112*67e74705SXin Li     assert(is(tok::unknown_command));
113*67e74705SXin Li     return StringRef(TextPtr, IntVal);
114*67e74705SXin Li   }
115*67e74705SXin Li 
setUnknownCommandName(StringRef Name)116*67e74705SXin Li   void setUnknownCommandName(StringRef Name) {
117*67e74705SXin Li     assert(is(tok::unknown_command));
118*67e74705SXin Li     TextPtr = Name.data();
119*67e74705SXin Li     IntVal = Name.size();
120*67e74705SXin Li   }
121*67e74705SXin Li 
getCommandID()122*67e74705SXin Li   unsigned getCommandID() const LLVM_READONLY {
123*67e74705SXin Li     assert(is(tok::backslash_command) || is(tok::at_command));
124*67e74705SXin Li     return IntVal;
125*67e74705SXin Li   }
126*67e74705SXin Li 
setCommandID(unsigned ID)127*67e74705SXin Li   void setCommandID(unsigned ID) {
128*67e74705SXin Li     assert(is(tok::backslash_command) || is(tok::at_command));
129*67e74705SXin Li     IntVal = ID;
130*67e74705SXin Li   }
131*67e74705SXin Li 
getVerbatimBlockID()132*67e74705SXin Li   unsigned getVerbatimBlockID() const LLVM_READONLY {
133*67e74705SXin Li     assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
134*67e74705SXin Li     return IntVal;
135*67e74705SXin Li   }
136*67e74705SXin Li 
setVerbatimBlockID(unsigned ID)137*67e74705SXin Li   void setVerbatimBlockID(unsigned ID) {
138*67e74705SXin Li     assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
139*67e74705SXin Li     IntVal = ID;
140*67e74705SXin Li   }
141*67e74705SXin Li 
getVerbatimBlockText()142*67e74705SXin Li   StringRef getVerbatimBlockText() const LLVM_READONLY {
143*67e74705SXin Li     assert(is(tok::verbatim_block_line));
144*67e74705SXin Li     return StringRef(TextPtr, IntVal);
145*67e74705SXin Li   }
146*67e74705SXin Li 
setVerbatimBlockText(StringRef Text)147*67e74705SXin Li   void setVerbatimBlockText(StringRef Text) {
148*67e74705SXin Li     assert(is(tok::verbatim_block_line));
149*67e74705SXin Li     TextPtr = Text.data();
150*67e74705SXin Li     IntVal = Text.size();
151*67e74705SXin Li   }
152*67e74705SXin Li 
getVerbatimLineID()153*67e74705SXin Li   unsigned getVerbatimLineID() const LLVM_READONLY {
154*67e74705SXin Li     assert(is(tok::verbatim_line_name));
155*67e74705SXin Li     return IntVal;
156*67e74705SXin Li   }
157*67e74705SXin Li 
setVerbatimLineID(unsigned ID)158*67e74705SXin Li   void setVerbatimLineID(unsigned ID) {
159*67e74705SXin Li     assert(is(tok::verbatim_line_name));
160*67e74705SXin Li     IntVal = ID;
161*67e74705SXin Li   }
162*67e74705SXin Li 
getVerbatimLineText()163*67e74705SXin Li   StringRef getVerbatimLineText() const LLVM_READONLY {
164*67e74705SXin Li     assert(is(tok::verbatim_line_text));
165*67e74705SXin Li     return StringRef(TextPtr, IntVal);
166*67e74705SXin Li   }
167*67e74705SXin Li 
setVerbatimLineText(StringRef Text)168*67e74705SXin Li   void setVerbatimLineText(StringRef Text) {
169*67e74705SXin Li     assert(is(tok::verbatim_line_text));
170*67e74705SXin Li     TextPtr = Text.data();
171*67e74705SXin Li     IntVal = Text.size();
172*67e74705SXin Li   }
173*67e74705SXin Li 
getHTMLTagStartName()174*67e74705SXin Li   StringRef getHTMLTagStartName() const LLVM_READONLY {
175*67e74705SXin Li     assert(is(tok::html_start_tag));
176*67e74705SXin Li     return StringRef(TextPtr, IntVal);
177*67e74705SXin Li   }
178*67e74705SXin Li 
setHTMLTagStartName(StringRef Name)179*67e74705SXin Li   void setHTMLTagStartName(StringRef Name) {
180*67e74705SXin Li     assert(is(tok::html_start_tag));
181*67e74705SXin Li     TextPtr = Name.data();
182*67e74705SXin Li     IntVal = Name.size();
183*67e74705SXin Li   }
184*67e74705SXin Li 
getHTMLIdent()185*67e74705SXin Li   StringRef getHTMLIdent() const LLVM_READONLY {
186*67e74705SXin Li     assert(is(tok::html_ident));
187*67e74705SXin Li     return StringRef(TextPtr, IntVal);
188*67e74705SXin Li   }
189*67e74705SXin Li 
setHTMLIdent(StringRef Name)190*67e74705SXin Li   void setHTMLIdent(StringRef Name) {
191*67e74705SXin Li     assert(is(tok::html_ident));
192*67e74705SXin Li     TextPtr = Name.data();
193*67e74705SXin Li     IntVal = Name.size();
194*67e74705SXin Li   }
195*67e74705SXin Li 
getHTMLQuotedString()196*67e74705SXin Li   StringRef getHTMLQuotedString() const LLVM_READONLY {
197*67e74705SXin Li     assert(is(tok::html_quoted_string));
198*67e74705SXin Li     return StringRef(TextPtr, IntVal);
199*67e74705SXin Li   }
200*67e74705SXin Li 
setHTMLQuotedString(StringRef Str)201*67e74705SXin Li   void setHTMLQuotedString(StringRef Str) {
202*67e74705SXin Li     assert(is(tok::html_quoted_string));
203*67e74705SXin Li     TextPtr = Str.data();
204*67e74705SXin Li     IntVal = Str.size();
205*67e74705SXin Li   }
206*67e74705SXin Li 
getHTMLTagEndName()207*67e74705SXin Li   StringRef getHTMLTagEndName() const LLVM_READONLY {
208*67e74705SXin Li     assert(is(tok::html_end_tag));
209*67e74705SXin Li     return StringRef(TextPtr, IntVal);
210*67e74705SXin Li   }
211*67e74705SXin Li 
setHTMLTagEndName(StringRef Name)212*67e74705SXin Li   void setHTMLTagEndName(StringRef Name) {
213*67e74705SXin Li     assert(is(tok::html_end_tag));
214*67e74705SXin Li     TextPtr = Name.data();
215*67e74705SXin Li     IntVal = Name.size();
216*67e74705SXin Li   }
217*67e74705SXin Li 
218*67e74705SXin Li   void dump(const Lexer &L, const SourceManager &SM) const;
219*67e74705SXin Li };
220*67e74705SXin Li 
221*67e74705SXin Li /// \brief Comment lexer.
222*67e74705SXin Li class Lexer {
223*67e74705SXin Li private:
224*67e74705SXin Li   Lexer(const Lexer &) = delete;
225*67e74705SXin Li   void operator=(const Lexer &) = delete;
226*67e74705SXin Li 
227*67e74705SXin Li   /// Allocator for strings that are semantic values of tokens and have to be
228*67e74705SXin Li   /// computed (for example, resolved decimal character references).
229*67e74705SXin Li   llvm::BumpPtrAllocator &Allocator;
230*67e74705SXin Li 
231*67e74705SXin Li   DiagnosticsEngine &Diags;
232*67e74705SXin Li 
233*67e74705SXin Li   const CommandTraits &Traits;
234*67e74705SXin Li 
235*67e74705SXin Li   const char *const BufferStart;
236*67e74705SXin Li   const char *const BufferEnd;
237*67e74705SXin Li   SourceLocation FileLoc;
238*67e74705SXin Li 
239*67e74705SXin Li   const char *BufferPtr;
240*67e74705SXin Li 
241*67e74705SXin Li   /// One past end pointer for the current comment.  For BCPL comments points
242*67e74705SXin Li   /// to newline or BufferEnd, for C comments points to star in '*/'.
243*67e74705SXin Li   const char *CommentEnd;
244*67e74705SXin Li 
245*67e74705SXin Li   enum LexerCommentState {
246*67e74705SXin Li     LCS_BeforeComment,
247*67e74705SXin Li     LCS_InsideBCPLComment,
248*67e74705SXin Li     LCS_InsideCComment,
249*67e74705SXin Li     LCS_BetweenComments
250*67e74705SXin Li   };
251*67e74705SXin Li 
252*67e74705SXin Li   /// Low-level lexer state, track if we are inside or outside of comment.
253*67e74705SXin Li   LexerCommentState CommentState;
254*67e74705SXin Li 
255*67e74705SXin Li   enum LexerState {
256*67e74705SXin Li     /// Lexing normal comment text
257*67e74705SXin Li     LS_Normal,
258*67e74705SXin Li 
259*67e74705SXin Li     /// Finished lexing verbatim block beginning command, will lex first body
260*67e74705SXin Li     /// line.
261*67e74705SXin Li     LS_VerbatimBlockFirstLine,
262*67e74705SXin Li 
263*67e74705SXin Li     /// Lexing verbatim block body line-by-line, skipping line-starting
264*67e74705SXin Li     /// decorations.
265*67e74705SXin Li     LS_VerbatimBlockBody,
266*67e74705SXin Li 
267*67e74705SXin Li     /// Finished lexing verbatim line beginning command, will lex text (one
268*67e74705SXin Li     /// line).
269*67e74705SXin Li     LS_VerbatimLineText,
270*67e74705SXin Li 
271*67e74705SXin Li     /// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes.
272*67e74705SXin Li     LS_HTMLStartTag,
273*67e74705SXin Li 
274*67e74705SXin Li     /// Finished lexing \verbatim </TAG \endverbatim part, lexing '>'.
275*67e74705SXin Li     LS_HTMLEndTag
276*67e74705SXin Li   };
277*67e74705SXin Li 
278*67e74705SXin Li   /// Current lexing mode.
279*67e74705SXin Li   LexerState State;
280*67e74705SXin Li 
281*67e74705SXin Li   /// If State is LS_VerbatimBlock, contains the name of verbatim end
282*67e74705SXin Li   /// command, including command marker.
283*67e74705SXin Li   SmallString<16> VerbatimBlockEndCommandName;
284*67e74705SXin Li 
285*67e74705SXin Li   /// Given a character reference name (e.g., "lt"), return the character that
286*67e74705SXin Li   /// it stands for (e.g., "<").
287*67e74705SXin Li   StringRef resolveHTMLNamedCharacterReference(StringRef Name) const;
288*67e74705SXin Li 
289*67e74705SXin Li   /// Given a Unicode codepoint as base-10 integer, return the character.
290*67e74705SXin Li   StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const;
291*67e74705SXin Li 
292*67e74705SXin Li   /// Given a Unicode codepoint as base-16 integer, return the character.
293*67e74705SXin Li   StringRef resolveHTMLHexCharacterReference(StringRef Name) const;
294*67e74705SXin Li 
295*67e74705SXin Li   void formTokenWithChars(Token &Result, const char *TokEnd,
296*67e74705SXin Li                           tok::TokenKind Kind);
297*67e74705SXin Li 
formTextToken(Token & Result,const char * TokEnd)298*67e74705SXin Li   void formTextToken(Token &Result, const char *TokEnd) {
299*67e74705SXin Li     StringRef Text(BufferPtr, TokEnd - BufferPtr);
300*67e74705SXin Li     formTokenWithChars(Result, TokEnd, tok::text);
301*67e74705SXin Li     Result.setText(Text);
302*67e74705SXin Li   }
303*67e74705SXin Li 
getSourceLocation(const char * Loc)304*67e74705SXin Li   SourceLocation getSourceLocation(const char *Loc) const {
305*67e74705SXin Li     assert(Loc >= BufferStart && Loc <= BufferEnd &&
306*67e74705SXin Li            "Location out of range for this buffer!");
307*67e74705SXin Li 
308*67e74705SXin Li     const unsigned CharNo = Loc - BufferStart;
309*67e74705SXin Li     return FileLoc.getLocWithOffset(CharNo);
310*67e74705SXin Li   }
311*67e74705SXin Li 
Diag(SourceLocation Loc,unsigned DiagID)312*67e74705SXin Li   DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID) {
313*67e74705SXin Li     return Diags.Report(Loc, DiagID);
314*67e74705SXin Li   }
315*67e74705SXin Li 
316*67e74705SXin Li   /// Eat string matching regexp \code \s*\* \endcode.
317*67e74705SXin Li   void skipLineStartingDecorations();
318*67e74705SXin Li 
319*67e74705SXin Li   /// Lex stuff inside comments.  CommentEnd should be set correctly.
320*67e74705SXin Li   void lexCommentText(Token &T);
321*67e74705SXin Li 
322*67e74705SXin Li   void setupAndLexVerbatimBlock(Token &T,
323*67e74705SXin Li                                 const char *TextBegin,
324*67e74705SXin Li                                 char Marker, const CommandInfo *Info);
325*67e74705SXin Li 
326*67e74705SXin Li   void lexVerbatimBlockFirstLine(Token &T);
327*67e74705SXin Li 
328*67e74705SXin Li   void lexVerbatimBlockBody(Token &T);
329*67e74705SXin Li 
330*67e74705SXin Li   void setupAndLexVerbatimLine(Token &T, const char *TextBegin,
331*67e74705SXin Li                                const CommandInfo *Info);
332*67e74705SXin Li 
333*67e74705SXin Li   void lexVerbatimLineText(Token &T);
334*67e74705SXin Li 
335*67e74705SXin Li   void lexHTMLCharacterReference(Token &T);
336*67e74705SXin Li 
337*67e74705SXin Li   void setupAndLexHTMLStartTag(Token &T);
338*67e74705SXin Li 
339*67e74705SXin Li   void lexHTMLStartTag(Token &T);
340*67e74705SXin Li 
341*67e74705SXin Li   void setupAndLexHTMLEndTag(Token &T);
342*67e74705SXin Li 
343*67e74705SXin Li   void lexHTMLEndTag(Token &T);
344*67e74705SXin Li 
345*67e74705SXin Li public:
346*67e74705SXin Li   Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
347*67e74705SXin Li         const CommandTraits &Traits,
348*67e74705SXin Li         SourceLocation FileLoc,
349*67e74705SXin Li         const char *BufferStart, const char *BufferEnd);
350*67e74705SXin Li 
351*67e74705SXin Li   void lex(Token &T);
352*67e74705SXin Li 
353*67e74705SXin Li   StringRef getSpelling(const Token &Tok,
354*67e74705SXin Li                         const SourceManager &SourceMgr,
355*67e74705SXin Li                         bool *Invalid = nullptr) const;
356*67e74705SXin Li };
357*67e74705SXin Li 
358*67e74705SXin Li } // end namespace comments
359*67e74705SXin Li } // end namespace clang
360*67e74705SXin Li 
361*67e74705SXin Li #endif
362*67e74705SXin Li 
363