xref: /aosp_15_r20/external/pdfium/xfa/fxfa/formcalc/cxfa_fmlexer.cpp (revision 3ac0a46f773bac49fa9476ec2b1cf3f8da5ec3a4)
1 // Copyright 2014 The PDFium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 
7 #include "xfa/fxfa/formcalc/cxfa_fmlexer.h"
8 
9 #include <algorithm>
10 
11 #include "core/fxcrt/fx_extension.h"
12 
13 namespace {
14 
// Returns true when |c| falls inside one of the three code-point ranges the
// lexer accepts: 0x09-0x0D, 0x20-0xD7FF, or 0xE000-0xFFFD. Everything else
// (including NUL and 0xD800-0xDFFF) is rejected.
bool IsFormCalcCharacter(wchar_t c) {
  if (c >= 0x09 && c <= 0x0D)
    return true;
  if (c >= 0x20 && c <= 0xd7FF)
    return true;
  return c >= 0xE000 && c <= 0xFFFD;
}
19 
IsIdentifierCharacter(wchar_t c)20 bool IsIdentifierCharacter(wchar_t c) {
21   return FXSYS_iswalnum(c) || c == 0x005F ||  // '_'
22          c == 0x0024;                         // '$'
23 }
24 
IsInitialIdentifierCharacter(wchar_t c)25 bool IsInitialIdentifierCharacter(wchar_t c) {
26   return FXSYS_iswalpha(c) || c == 0x005F ||  // '_'
27          c == 0x0024 ||                       // '$'
28          c == 0x0021;                         // '!'
29 }
30 
// Returns true for the four in-line blanks: horizontal tab, vertical tab, form
// feed and space. Line feed and carriage return are deliberately not part of
// this set.
bool IsWhitespaceCharacter(wchar_t c) {
  switch (c) {
    case L'\t':  // 0x0009
    case L'\v':  // 0x000B
    case L'\f':  // 0x000C
    case L' ':   // 0x0020
      return true;
    default:
      return false;
  }
}
37 
// One row of the keyword table: a token type paired with the ASCII spelling
// that produces it. Kept as a plain aggregate so rows can be written as brace
// initializers; m_keyword is a non-owning pointer.
38 struct XFA_FMKeyword {
39   XFA_FM_TOKEN m_type;
40   const char* m_keyword;  // Raw, POD struct.
41 };
42 
// Keyword table: each row maps one keyword spelling to its token type.
// TokenizeIdentifier() scans the rows front to back with std::find_if and
// takes the first match, so row order does not change which token a string
// resolves to as long as every spelling appears only once.
43 const XFA_FMKeyword keyWords[] = {
44     {TOKdo, "do"},
45     {TOKkseq, "eq"},
46     {TOKksge, "ge"},
47     {TOKksgt, "gt"},
48     {TOKif, "if"},
49     {TOKin, "in"},
50     {TOKksle, "le"},
51     {TOKkslt, "lt"},
52     {TOKksne, "ne"},
53     {TOKksor, "or"},
54     {TOKnull, "null"},
55     {TOKbreak, "break"},
56     {TOKksand, "and"},
57     {TOKend, "end"},
58     {TOKeof, "eof"},
59     {TOKfor, "for"},
60     {TOKnan, "nan"},
61     {TOKksnot, "not"},
62     {TOKvar, "var"},
63     {TOKthen, "then"},
64     {TOKelse, "else"},
65     {TOKexit, "exit"},
66     {TOKdownto, "downto"},
67     {TOKreturn, "return"},
68     {TOKinfinity, "infinity"},
69     {TOKendwhile, "endwhile"},
70     {TOKforeach, "foreach"},
71     {TOKendfunc, "endfunc"},
72     {TOKelseif, "elseif"},
73     {TOKwhile, "while"},
74     {TOKendfor, "endfor"},
75     {TOKthrow, "throw"},
76     {TOKstep, "step"},
77     {TOKupto, "upto"},
78     {TOKcontinue, "continue"},
79     {TOKfunc, "func"},
80     {TOKendif, "endif"},
81 };
82 
83 #ifndef NDEBUG
// Printable token names, compiled only when NDEBUG is not defined.
// Token::ToDebugString() indexes this array directly with the token's type, so
// entry i must be the name of the token whose numeric value is i.
// NOTE(review): the XFA_FM_TOKEN enumerator order lives in the header and is
// not visible here -- verify both stay in step when adding or moving a token.
84 const char* const tokenStrings[] = {
85     "TOKand",        "TOKlparen",     "TOKrparen",   "TOKmul",
86     "TOKplus",       "TOKcomma",      "TOKminus",    "TOKdot",
87     "TOKdiv",        "TOKlt",         "TOKassign",   "TOKgt",
88     "TOKlbracket",   "TOKrbracket",   "TOKor",       "TOKdotscream",
89     "TOKdotstar",    "TOKdotdot",     "TOKle",       "TOKne",
90     "TOKeq",         "TOKge",         "TOKdo",       "TOKkseq",
91     "TOKksge",       "TOKksgt",       "TOKif",       "TOKin",
92     "TOKksle",       "TOKkslt",       "TOKksne",     "TOKksor",
93     "TOKnull",       "TOKbreak",      "TOKksand",    "TOKend",
94     "TOKeof",        "TOKfor",        "TOKnan",      "TOKksnot",
95     "TOKvar",        "TOKthen",       "TOKelse",     "TOKexit",
96     "TOKdownto",     "TOKreturn",     "TOKinfinity", "TOKendwhile",
97     "TOKforeach",    "TOKendfunc",    "TOKelseif",   "TOKwhile",
98     "TOKendfor",     "TOKthrow",      "TOKstep",     "TOKupto",
99     "TOKcontinue",   "TOKfunc",       "TOKendif",    "TOKstar",
100     "TOKidentifier", "TOKunderscore", "TOKdollar",   "TOKexclamation",
101     "TOKcall",       "TOKstring",     "TOKnumber",   "TOKreserver",
102 };
103 #endif  // NDEBUG
104 
TokenizeIdentifier(WideStringView str)105 XFA_FM_TOKEN TokenizeIdentifier(WideStringView str) {
106   const XFA_FMKeyword* result =
107       std::find_if(std::begin(keyWords), std::end(keyWords),
108                    [str](const XFA_FMKeyword& iter) {
109                      return str.EqualsASCII(iter.m_keyword);
110                    });
111   if (result != std::end(keyWords) && str.EqualsASCII(result->m_keyword))
112     return result->m_type;
113   return TOKidentifier;
114 }
115 
116 }  // namespace
117 
// Default token. Members take whatever in-class initializers the header
// declares (not visible in this file). NextToken() and the AdvanceFor*()
// helpers return a token built this way after reporting a lexer error.
118 CXFA_FMLexer::Token::Token() = default;
119 
// Token that carries only a type; m_string is not set by this constructor.
Token(XFA_FM_TOKEN token)120 CXFA_FMLexer::Token::Token(XFA_FM_TOKEN token) : m_type(token) {}
121 
// Token that carries a type and stores the view |str| in m_string.
Token(XFA_FM_TOKEN token,WideStringView str)122 CXFA_FMLexer::Token::Token(XFA_FM_TOKEN token, WideStringView str)
123     : m_type(token), m_string(str) {}
124 
// Member-wise copy.
125 CXFA_FMLexer::Token::Token(const Token& that) = default;
126 
127 CXFA_FMLexer::Token::~Token() = default;
128 
129 #ifndef NDEBUG
ToDebugString() const130 WideString CXFA_FMLexer::Token::ToDebugString() const {
131   WideString str = WideString::FromASCII("type = ");
132   str += WideString::FromASCII(tokenStrings[m_type]);
133   str += WideString::FromASCII(", string = ");
134   str += m_string;
135   return str;
136 }
137 #endif  // NDEBUG
138 
// Starts a lexer over |wsFormCalc|. Only the span obtained from
// wsFormCalc.span() is stored in m_spInput.
// NOTE(review): this looks like a non-owning view, so the caller presumably
// has to keep the underlying text alive while tokens are being read -- confirm
// against the declaration of m_spInput in the header.
CXFA_FMLexer(WideStringView wsFormCalc)139 CXFA_FMLexer::CXFA_FMLexer(WideStringView wsFormCalc)
140     : m_spInput(wsFormCalc.span()) {}
141 
142 CXFA_FMLexer::~CXFA_FMLexer() = default;
143 
NextToken()144 CXFA_FMLexer::Token CXFA_FMLexer::NextToken() {
145   if (m_bLexerError)
146     return Token();
147 
148   while (!IsComplete() && m_spInput[m_nCursor]) {
149     if (!IsFormCalcCharacter(m_spInput[m_nCursor])) {
150       RaiseError();
151       return Token();
152     }
153 
154     switch (m_spInput[m_nCursor]) {
155       case '\n':
156         ++m_nCursor;
157         break;
158       case '\r':
159         ++m_nCursor;
160         break;
161       case ';':
162         AdvanceForComment();
163         break;
164       case '"':
165         return AdvanceForString();
166       case '0':
167       case '1':
168       case '2':
169       case '3':
170       case '4':
171       case '5':
172       case '6':
173       case '7':
174       case '8':
175       case '9':
176         return AdvanceForNumber();
177       case '=':
178         ++m_nCursor;
179         if (m_nCursor >= m_spInput.size())
180           return Token(TOKassign);
181 
182         if (!IsFormCalcCharacter(m_spInput[m_nCursor])) {
183           RaiseError();
184           return Token();
185         }
186         if (m_spInput[m_nCursor] == '=') {
187           ++m_nCursor;
188           return Token(TOKeq);
189         }
190         return Token(TOKassign);
191       case '<':
192         ++m_nCursor;
193         if (m_nCursor >= m_spInput.size())
194           return Token(TOKlt);
195 
196         if (!IsFormCalcCharacter(m_spInput[m_nCursor])) {
197           RaiseError();
198           return Token();
199         }
200         if (m_spInput[m_nCursor] == '=') {
201           ++m_nCursor;
202           return Token(TOKle);
203         }
204         if (m_spInput[m_nCursor] == '>') {
205           ++m_nCursor;
206           return Token(TOKne);
207         }
208         return Token(TOKlt);
209       case '>':
210         ++m_nCursor;
211         if (m_nCursor >= m_spInput.size())
212           return Token(TOKgt);
213 
214         if (!IsFormCalcCharacter(m_spInput[m_nCursor])) {
215           RaiseError();
216           return Token();
217         }
218         if (m_spInput[m_nCursor] == '=') {
219           ++m_nCursor;
220           return Token(TOKge);
221         }
222         return Token(TOKgt);
223       case ',':
224         ++m_nCursor;
225         return Token(TOKcomma);
226       case '(':
227         ++m_nCursor;
228         return Token(TOKlparen);
229       case ')':
230         ++m_nCursor;
231         return Token(TOKrparen);
232       case '[':
233         ++m_nCursor;
234         return Token(TOKlbracket);
235       case ']':
236         ++m_nCursor;
237         return Token(TOKrbracket);
238       case '&':
239         ++m_nCursor;
240         return Token(TOKand);
241       case '|':
242         ++m_nCursor;
243         return Token(TOKor);
244       case '+':
245         ++m_nCursor;
246         return Token(TOKplus);
247       case '-':
248         ++m_nCursor;
249         return Token(TOKminus);
250       case '*':
251         ++m_nCursor;
252         return Token(TOKmul);
253       case '/': {
254         ++m_nCursor;
255         if (m_nCursor >= m_spInput.size())
256           return Token(TOKdiv);
257 
258         if (!IsFormCalcCharacter(m_spInput[m_nCursor])) {
259           RaiseError();
260           return Token();
261         }
262         if (m_spInput[m_nCursor] != '/')
263           return Token(TOKdiv);
264 
265         AdvanceForComment();
266         break;
267       }
268       case '.':
269         ++m_nCursor;
270         if (m_nCursor >= m_spInput.size())
271           return Token(TOKdot);
272 
273         if (!IsFormCalcCharacter(m_spInput[m_nCursor])) {
274           RaiseError();
275           return Token();
276         }
277 
278         if (m_spInput[m_nCursor] == '.') {
279           ++m_nCursor;
280           return Token(TOKdotdot);
281         }
282         if (m_spInput[m_nCursor] == '*') {
283           ++m_nCursor;
284           return Token(TOKdotstar);
285         }
286         if (m_spInput[m_nCursor] == '#') {
287           ++m_nCursor;
288           return Token(TOKdotscream);
289         }
290         if (FXSYS_IsDecimalDigit(m_spInput[m_nCursor])) {
291           --m_nCursor;
292           return AdvanceForNumber();
293         }
294         return Token(TOKdot);
295       default:
296         if (IsWhitespaceCharacter(m_spInput[m_nCursor])) {
297           ++m_nCursor;
298           break;
299         }
300         if (!IsInitialIdentifierCharacter(m_spInput[m_nCursor])) {
301           RaiseError();
302           return Token();
303         }
304         return AdvanceForIdentifier();
305     }
306   }
307   return Token(TOKeof);
308 }
309 
AdvanceForNumber()310 CXFA_FMLexer::Token CXFA_FMLexer::AdvanceForNumber() {
311   // This will set end to the character after the end of the number.
312   size_t used_length = 0;
313   if (m_nCursor < m_spInput.size()) {
314     FXSYS_wcstof(&m_spInput[m_nCursor], m_spInput.size() - m_nCursor,
315                  &used_length);
316   }
317   size_t end = m_nCursor + used_length;
318   if (used_length == 0 ||
319       (end < m_spInput.size() && FXSYS_iswalpha(m_spInput[end]))) {
320     RaiseError();
321     return Token();
322   }
323   WideStringView str(m_spInput.subspan(m_nCursor, end - m_nCursor));
324   m_nCursor = end;
325   return Token(TOKnumber, str);
326 }
327 
AdvanceForString()328 CXFA_FMLexer::Token CXFA_FMLexer::AdvanceForString() {
329   size_t start = m_nCursor;
330   ++m_nCursor;
331   while (!IsComplete() && m_spInput[m_nCursor]) {
332     if (!IsFormCalcCharacter(m_spInput[m_nCursor]))
333       break;
334 
335     if (m_spInput[m_nCursor] == '"') {
336       // Check for escaped "s, i.e. "".
337       ++m_nCursor;
338       // If the end of the input has been reached it was not escaped.
339       if (m_nCursor >= m_spInput.size()) {
340         return Token(TOKstring, WideStringView(m_spInput.subspan(
341                                     start, m_nCursor - start)));
342       }
343       // If the next character is not a " then the end of the string has been
344       // found.
345       if (m_spInput[m_nCursor] != '"') {
346         if (!IsFormCalcCharacter(m_spInput[m_nCursor]))
347           break;
348 
349         return Token(TOKstring, WideStringView(m_spInput.subspan(
350                                     start, m_nCursor - start)));
351       }
352     }
353     ++m_nCursor;
354   }
355 
356   // Didn't find the end of the string.
357   RaiseError();
358   return Token();
359 }
360 
AdvanceForIdentifier()361 CXFA_FMLexer::Token CXFA_FMLexer::AdvanceForIdentifier() {
362   size_t start = m_nCursor;
363   ++m_nCursor;
364   while (!IsComplete() && m_spInput[m_nCursor]) {
365     if (!IsFormCalcCharacter(m_spInput[m_nCursor])) {
366       RaiseError();
367       return Token();
368     }
369     if (!IsIdentifierCharacter(m_spInput[m_nCursor]))
370       break;
371 
372     ++m_nCursor;
373   }
374 
375   WideStringView str(m_spInput.subspan(start, m_nCursor - start));
376   return Token(TokenizeIdentifier(str), str);
377 }
378 
AdvanceForComment()379 void CXFA_FMLexer::AdvanceForComment() {
380   ++m_nCursor;
381   while (!IsComplete() && m_spInput[m_nCursor]) {
382     if (!IsFormCalcCharacter(m_spInput[m_nCursor])) {
383       RaiseError();
384       return;
385     }
386     if (m_spInput[m_nCursor] == L'\r') {
387       ++m_nCursor;
388       return;
389     }
390     if (m_spInput[m_nCursor] == L'\n') {
391       ++m_nCursor;
392       return;
393     }
394     ++m_nCursor;
395   }
396 }
397