1*67e74705SXin Li //===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===// 2*67e74705SXin Li // 3*67e74705SXin Li // The LLVM Compiler Infrastructure 4*67e74705SXin Li // 5*67e74705SXin Li // This file is distributed under the University of Illinois Open Source 6*67e74705SXin Li // License. See LICENSE.TXT for details. 7*67e74705SXin Li // 8*67e74705SXin Li //===----------------------------------------------------------------------===// 9*67e74705SXin Li // 10*67e74705SXin Li // This file defines lexer for structured comments and supporting token class. 11*67e74705SXin Li // 12*67e74705SXin Li //===----------------------------------------------------------------------===// 13*67e74705SXin Li 14*67e74705SXin Li #ifndef LLVM_CLANG_AST_COMMENTLEXER_H 15*67e74705SXin Li #define LLVM_CLANG_AST_COMMENTLEXER_H 16*67e74705SXin Li 17*67e74705SXin Li #include "clang/Basic/Diagnostic.h" 18*67e74705SXin Li #include "clang/Basic/SourceManager.h" 19*67e74705SXin Li #include "llvm/ADT/SmallString.h" 20*67e74705SXin Li #include "llvm/ADT/SmallVector.h" 21*67e74705SXin Li #include "llvm/ADT/StringRef.h" 22*67e74705SXin Li #include "llvm/Support/Allocator.h" 23*67e74705SXin Li #include "llvm/Support/raw_ostream.h" 24*67e74705SXin Li 25*67e74705SXin Li namespace clang { 26*67e74705SXin Li namespace comments { 27*67e74705SXin Li 28*67e74705SXin Li class Lexer; 29*67e74705SXin Li class TextTokenRetokenizer; 30*67e74705SXin Li struct CommandInfo; 31*67e74705SXin Li class CommandTraits; 32*67e74705SXin Li 33*67e74705SXin Li namespace tok { 34*67e74705SXin Li enum TokenKind { 35*67e74705SXin Li eof, 36*67e74705SXin Li newline, 37*67e74705SXin Li text, 38*67e74705SXin Li unknown_command, // Command that does not have an ID. 39*67e74705SXin Li backslash_command, // Command with an ID, that used backslash marker. 40*67e74705SXin Li at_command, // Command with an ID, that used 'at' marker. 41*67e74705SXin Li verbatim_block_begin, 42*67e74705SXin Li verbatim_block_line, 43*67e74705SXin Li verbatim_block_end, 44*67e74705SXin Li verbatim_line_name, 45*67e74705SXin Li verbatim_line_text, 46*67e74705SXin Li html_start_tag, // <tag 47*67e74705SXin Li html_ident, // attr 48*67e74705SXin Li html_equals, // = 49*67e74705SXin Li html_quoted_string, // "blah\"blah" or 'blah\'blah' 50*67e74705SXin Li html_greater, // > 51*67e74705SXin Li html_slash_greater, // /> 52*67e74705SXin Li html_end_tag // </tag 53*67e74705SXin Li }; 54*67e74705SXin Li } // end namespace tok 55*67e74705SXin Li 56*67e74705SXin Li /// \brief Comment token. 57*67e74705SXin Li class Token { 58*67e74705SXin Li friend class Lexer; 59*67e74705SXin Li friend class TextTokenRetokenizer; 60*67e74705SXin Li 61*67e74705SXin Li /// The location of the token. 62*67e74705SXin Li SourceLocation Loc; 63*67e74705SXin Li 64*67e74705SXin Li /// The actual kind of the token. 65*67e74705SXin Li tok::TokenKind Kind; 66*67e74705SXin Li 67*67e74705SXin Li /// Length of the token spelling in comment. Can be 0 for synthenized 68*67e74705SXin Li /// tokens. 69*67e74705SXin Li unsigned Length; 70*67e74705SXin Li 71*67e74705SXin Li /// Contains text value associated with a token. 72*67e74705SXin Li const char *TextPtr; 73*67e74705SXin Li 74*67e74705SXin Li /// Integer value associated with a token. 75*67e74705SXin Li /// 76*67e74705SXin Li /// If the token is a konwn command, contains command ID and TextPtr is 77*67e74705SXin Li /// unused (command spelling can be found with CommandTraits). Otherwise, 78*67e74705SXin Li /// contains the length of the string that starts at TextPtr. 79*67e74705SXin Li unsigned IntVal; 80*67e74705SXin Li 81*67e74705SXin Li public: getLocation()82*67e74705SXin Li SourceLocation getLocation() const LLVM_READONLY { return Loc; } setLocation(SourceLocation SL)83*67e74705SXin Li void setLocation(SourceLocation SL) { Loc = SL; } 84*67e74705SXin Li getEndLocation()85*67e74705SXin Li SourceLocation getEndLocation() const LLVM_READONLY { 86*67e74705SXin Li if (Length == 0 || Length == 1) 87*67e74705SXin Li return Loc; 88*67e74705SXin Li return Loc.getLocWithOffset(Length - 1); 89*67e74705SXin Li } 90*67e74705SXin Li getKind()91*67e74705SXin Li tok::TokenKind getKind() const LLVM_READONLY { return Kind; } setKind(tok::TokenKind K)92*67e74705SXin Li void setKind(tok::TokenKind K) { Kind = K; } 93*67e74705SXin Li is(tok::TokenKind K)94*67e74705SXin Li bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; } isNot(tok::TokenKind K)95*67e74705SXin Li bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; } 96*67e74705SXin Li getLength()97*67e74705SXin Li unsigned getLength() const LLVM_READONLY { return Length; } setLength(unsigned L)98*67e74705SXin Li void setLength(unsigned L) { Length = L; } 99*67e74705SXin Li getText()100*67e74705SXin Li StringRef getText() const LLVM_READONLY { 101*67e74705SXin Li assert(is(tok::text)); 102*67e74705SXin Li return StringRef(TextPtr, IntVal); 103*67e74705SXin Li } 104*67e74705SXin Li setText(StringRef Text)105*67e74705SXin Li void setText(StringRef Text) { 106*67e74705SXin Li assert(is(tok::text)); 107*67e74705SXin Li TextPtr = Text.data(); 108*67e74705SXin Li IntVal = Text.size(); 109*67e74705SXin Li } 110*67e74705SXin Li getUnknownCommandName()111*67e74705SXin Li StringRef getUnknownCommandName() const LLVM_READONLY { 112*67e74705SXin Li assert(is(tok::unknown_command)); 113*67e74705SXin Li return StringRef(TextPtr, IntVal); 114*67e74705SXin Li } 115*67e74705SXin Li setUnknownCommandName(StringRef Name)116*67e74705SXin Li void setUnknownCommandName(StringRef Name) { 117*67e74705SXin Li assert(is(tok::unknown_command)); 118*67e74705SXin Li TextPtr = Name.data(); 119*67e74705SXin Li IntVal = Name.size(); 120*67e74705SXin Li } 121*67e74705SXin Li getCommandID()122*67e74705SXin Li unsigned getCommandID() const LLVM_READONLY { 123*67e74705SXin Li assert(is(tok::backslash_command) || is(tok::at_command)); 124*67e74705SXin Li return IntVal; 125*67e74705SXin Li } 126*67e74705SXin Li setCommandID(unsigned ID)127*67e74705SXin Li void setCommandID(unsigned ID) { 128*67e74705SXin Li assert(is(tok::backslash_command) || is(tok::at_command)); 129*67e74705SXin Li IntVal = ID; 130*67e74705SXin Li } 131*67e74705SXin Li getVerbatimBlockID()132*67e74705SXin Li unsigned getVerbatimBlockID() const LLVM_READONLY { 133*67e74705SXin Li assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end)); 134*67e74705SXin Li return IntVal; 135*67e74705SXin Li } 136*67e74705SXin Li setVerbatimBlockID(unsigned ID)137*67e74705SXin Li void setVerbatimBlockID(unsigned ID) { 138*67e74705SXin Li assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end)); 139*67e74705SXin Li IntVal = ID; 140*67e74705SXin Li } 141*67e74705SXin Li getVerbatimBlockText()142*67e74705SXin Li StringRef getVerbatimBlockText() const LLVM_READONLY { 143*67e74705SXin Li assert(is(tok::verbatim_block_line)); 144*67e74705SXin Li return StringRef(TextPtr, IntVal); 145*67e74705SXin Li } 146*67e74705SXin Li setVerbatimBlockText(StringRef Text)147*67e74705SXin Li void setVerbatimBlockText(StringRef Text) { 148*67e74705SXin Li assert(is(tok::verbatim_block_line)); 149*67e74705SXin Li TextPtr = Text.data(); 150*67e74705SXin Li IntVal = Text.size(); 151*67e74705SXin Li } 152*67e74705SXin Li getVerbatimLineID()153*67e74705SXin Li unsigned getVerbatimLineID() const LLVM_READONLY { 154*67e74705SXin Li assert(is(tok::verbatim_line_name)); 155*67e74705SXin Li return IntVal; 156*67e74705SXin Li } 157*67e74705SXin Li setVerbatimLineID(unsigned ID)158*67e74705SXin Li void setVerbatimLineID(unsigned ID) { 159*67e74705SXin Li assert(is(tok::verbatim_line_name)); 160*67e74705SXin Li IntVal = ID; 161*67e74705SXin Li } 162*67e74705SXin Li getVerbatimLineText()163*67e74705SXin Li StringRef getVerbatimLineText() const LLVM_READONLY { 164*67e74705SXin Li assert(is(tok::verbatim_line_text)); 165*67e74705SXin Li return StringRef(TextPtr, IntVal); 166*67e74705SXin Li } 167*67e74705SXin Li setVerbatimLineText(StringRef Text)168*67e74705SXin Li void setVerbatimLineText(StringRef Text) { 169*67e74705SXin Li assert(is(tok::verbatim_line_text)); 170*67e74705SXin Li TextPtr = Text.data(); 171*67e74705SXin Li IntVal = Text.size(); 172*67e74705SXin Li } 173*67e74705SXin Li getHTMLTagStartName()174*67e74705SXin Li StringRef getHTMLTagStartName() const LLVM_READONLY { 175*67e74705SXin Li assert(is(tok::html_start_tag)); 176*67e74705SXin Li return StringRef(TextPtr, IntVal); 177*67e74705SXin Li } 178*67e74705SXin Li setHTMLTagStartName(StringRef Name)179*67e74705SXin Li void setHTMLTagStartName(StringRef Name) { 180*67e74705SXin Li assert(is(tok::html_start_tag)); 181*67e74705SXin Li TextPtr = Name.data(); 182*67e74705SXin Li IntVal = Name.size(); 183*67e74705SXin Li } 184*67e74705SXin Li getHTMLIdent()185*67e74705SXin Li StringRef getHTMLIdent() const LLVM_READONLY { 186*67e74705SXin Li assert(is(tok::html_ident)); 187*67e74705SXin Li return StringRef(TextPtr, IntVal); 188*67e74705SXin Li } 189*67e74705SXin Li setHTMLIdent(StringRef Name)190*67e74705SXin Li void setHTMLIdent(StringRef Name) { 191*67e74705SXin Li assert(is(tok::html_ident)); 192*67e74705SXin Li TextPtr = Name.data(); 193*67e74705SXin Li IntVal = Name.size(); 194*67e74705SXin Li } 195*67e74705SXin Li getHTMLQuotedString()196*67e74705SXin Li StringRef getHTMLQuotedString() const LLVM_READONLY { 197*67e74705SXin Li assert(is(tok::html_quoted_string)); 198*67e74705SXin Li return StringRef(TextPtr, IntVal); 199*67e74705SXin Li } 200*67e74705SXin Li setHTMLQuotedString(StringRef Str)201*67e74705SXin Li void setHTMLQuotedString(StringRef Str) { 202*67e74705SXin Li assert(is(tok::html_quoted_string)); 203*67e74705SXin Li TextPtr = Str.data(); 204*67e74705SXin Li IntVal = Str.size(); 205*67e74705SXin Li } 206*67e74705SXin Li getHTMLTagEndName()207*67e74705SXin Li StringRef getHTMLTagEndName() const LLVM_READONLY { 208*67e74705SXin Li assert(is(tok::html_end_tag)); 209*67e74705SXin Li return StringRef(TextPtr, IntVal); 210*67e74705SXin Li } 211*67e74705SXin Li setHTMLTagEndName(StringRef Name)212*67e74705SXin Li void setHTMLTagEndName(StringRef Name) { 213*67e74705SXin Li assert(is(tok::html_end_tag)); 214*67e74705SXin Li TextPtr = Name.data(); 215*67e74705SXin Li IntVal = Name.size(); 216*67e74705SXin Li } 217*67e74705SXin Li 218*67e74705SXin Li void dump(const Lexer &L, const SourceManager &SM) const; 219*67e74705SXin Li }; 220*67e74705SXin Li 221*67e74705SXin Li /// \brief Comment lexer. 222*67e74705SXin Li class Lexer { 223*67e74705SXin Li private: 224*67e74705SXin Li Lexer(const Lexer &) = delete; 225*67e74705SXin Li void operator=(const Lexer &) = delete; 226*67e74705SXin Li 227*67e74705SXin Li /// Allocator for strings that are semantic values of tokens and have to be 228*67e74705SXin Li /// computed (for example, resolved decimal character references). 229*67e74705SXin Li llvm::BumpPtrAllocator &Allocator; 230*67e74705SXin Li 231*67e74705SXin Li DiagnosticsEngine &Diags; 232*67e74705SXin Li 233*67e74705SXin Li const CommandTraits &Traits; 234*67e74705SXin Li 235*67e74705SXin Li const char *const BufferStart; 236*67e74705SXin Li const char *const BufferEnd; 237*67e74705SXin Li SourceLocation FileLoc; 238*67e74705SXin Li 239*67e74705SXin Li const char *BufferPtr; 240*67e74705SXin Li 241*67e74705SXin Li /// One past end pointer for the current comment. For BCPL comments points 242*67e74705SXin Li /// to newline or BufferEnd, for C comments points to star in '*/'. 243*67e74705SXin Li const char *CommentEnd; 244*67e74705SXin Li 245*67e74705SXin Li enum LexerCommentState { 246*67e74705SXin Li LCS_BeforeComment, 247*67e74705SXin Li LCS_InsideBCPLComment, 248*67e74705SXin Li LCS_InsideCComment, 249*67e74705SXin Li LCS_BetweenComments 250*67e74705SXin Li }; 251*67e74705SXin Li 252*67e74705SXin Li /// Low-level lexer state, track if we are inside or outside of comment. 253*67e74705SXin Li LexerCommentState CommentState; 254*67e74705SXin Li 255*67e74705SXin Li enum LexerState { 256*67e74705SXin Li /// Lexing normal comment text 257*67e74705SXin Li LS_Normal, 258*67e74705SXin Li 259*67e74705SXin Li /// Finished lexing verbatim block beginning command, will lex first body 260*67e74705SXin Li /// line. 261*67e74705SXin Li LS_VerbatimBlockFirstLine, 262*67e74705SXin Li 263*67e74705SXin Li /// Lexing verbatim block body line-by-line, skipping line-starting 264*67e74705SXin Li /// decorations. 265*67e74705SXin Li LS_VerbatimBlockBody, 266*67e74705SXin Li 267*67e74705SXin Li /// Finished lexing verbatim line beginning command, will lex text (one 268*67e74705SXin Li /// line). 269*67e74705SXin Li LS_VerbatimLineText, 270*67e74705SXin Li 271*67e74705SXin Li /// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes. 272*67e74705SXin Li LS_HTMLStartTag, 273*67e74705SXin Li 274*67e74705SXin Li /// Finished lexing \verbatim </TAG \endverbatim part, lexing '>'. 275*67e74705SXin Li LS_HTMLEndTag 276*67e74705SXin Li }; 277*67e74705SXin Li 278*67e74705SXin Li /// Current lexing mode. 279*67e74705SXin Li LexerState State; 280*67e74705SXin Li 281*67e74705SXin Li /// If State is LS_VerbatimBlock, contains the name of verbatim end 282*67e74705SXin Li /// command, including command marker. 283*67e74705SXin Li SmallString<16> VerbatimBlockEndCommandName; 284*67e74705SXin Li 285*67e74705SXin Li /// Given a character reference name (e.g., "lt"), return the character that 286*67e74705SXin Li /// it stands for (e.g., "<"). 287*67e74705SXin Li StringRef resolveHTMLNamedCharacterReference(StringRef Name) const; 288*67e74705SXin Li 289*67e74705SXin Li /// Given a Unicode codepoint as base-10 integer, return the character. 290*67e74705SXin Li StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const; 291*67e74705SXin Li 292*67e74705SXin Li /// Given a Unicode codepoint as base-16 integer, return the character. 293*67e74705SXin Li StringRef resolveHTMLHexCharacterReference(StringRef Name) const; 294*67e74705SXin Li 295*67e74705SXin Li void formTokenWithChars(Token &Result, const char *TokEnd, 296*67e74705SXin Li tok::TokenKind Kind); 297*67e74705SXin Li formTextToken(Token & Result,const char * TokEnd)298*67e74705SXin Li void formTextToken(Token &Result, const char *TokEnd) { 299*67e74705SXin Li StringRef Text(BufferPtr, TokEnd - BufferPtr); 300*67e74705SXin Li formTokenWithChars(Result, TokEnd, tok::text); 301*67e74705SXin Li Result.setText(Text); 302*67e74705SXin Li } 303*67e74705SXin Li getSourceLocation(const char * Loc)304*67e74705SXin Li SourceLocation getSourceLocation(const char *Loc) const { 305*67e74705SXin Li assert(Loc >= BufferStart && Loc <= BufferEnd && 306*67e74705SXin Li "Location out of range for this buffer!"); 307*67e74705SXin Li 308*67e74705SXin Li const unsigned CharNo = Loc - BufferStart; 309*67e74705SXin Li return FileLoc.getLocWithOffset(CharNo); 310*67e74705SXin Li } 311*67e74705SXin Li Diag(SourceLocation Loc,unsigned DiagID)312*67e74705SXin Li DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID) { 313*67e74705SXin Li return Diags.Report(Loc, DiagID); 314*67e74705SXin Li } 315*67e74705SXin Li 316*67e74705SXin Li /// Eat string matching regexp \code \s*\* \endcode. 317*67e74705SXin Li void skipLineStartingDecorations(); 318*67e74705SXin Li 319*67e74705SXin Li /// Lex stuff inside comments. CommentEnd should be set correctly. 320*67e74705SXin Li void lexCommentText(Token &T); 321*67e74705SXin Li 322*67e74705SXin Li void setupAndLexVerbatimBlock(Token &T, 323*67e74705SXin Li const char *TextBegin, 324*67e74705SXin Li char Marker, const CommandInfo *Info); 325*67e74705SXin Li 326*67e74705SXin Li void lexVerbatimBlockFirstLine(Token &T); 327*67e74705SXin Li 328*67e74705SXin Li void lexVerbatimBlockBody(Token &T); 329*67e74705SXin Li 330*67e74705SXin Li void setupAndLexVerbatimLine(Token &T, const char *TextBegin, 331*67e74705SXin Li const CommandInfo *Info); 332*67e74705SXin Li 333*67e74705SXin Li void lexVerbatimLineText(Token &T); 334*67e74705SXin Li 335*67e74705SXin Li void lexHTMLCharacterReference(Token &T); 336*67e74705SXin Li 337*67e74705SXin Li void setupAndLexHTMLStartTag(Token &T); 338*67e74705SXin Li 339*67e74705SXin Li void lexHTMLStartTag(Token &T); 340*67e74705SXin Li 341*67e74705SXin Li void setupAndLexHTMLEndTag(Token &T); 342*67e74705SXin Li 343*67e74705SXin Li void lexHTMLEndTag(Token &T); 344*67e74705SXin Li 345*67e74705SXin Li public: 346*67e74705SXin Li Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags, 347*67e74705SXin Li const CommandTraits &Traits, 348*67e74705SXin Li SourceLocation FileLoc, 349*67e74705SXin Li const char *BufferStart, const char *BufferEnd); 350*67e74705SXin Li 351*67e74705SXin Li void lex(Token &T); 352*67e74705SXin Li 353*67e74705SXin Li StringRef getSpelling(const Token &Tok, 354*67e74705SXin Li const SourceManager &SourceMgr, 355*67e74705SXin Li bool *Invalid = nullptr) const; 356*67e74705SXin Li }; 357*67e74705SXin Li 358*67e74705SXin Li } // end namespace comments 359*67e74705SXin Li } // end namespace clang 360*67e74705SXin Li 361*67e74705SXin Li #endif 362*67e74705SXin Li 363