xref: /aosp_15_r20/external/skia/src/sksl/lex/NFAtoDFA.h (revision c8dee2aa9b3f27cf6c858bd81872bdeb2c07ed17)
1*c8dee2aaSAndroid Build Coastguard Worker /*
2*c8dee2aaSAndroid Build Coastguard Worker  * Copyright 2017 Google Inc.
3*c8dee2aaSAndroid Build Coastguard Worker  *
4*c8dee2aaSAndroid Build Coastguard Worker  * Use of this source code is governed by a BSD-style license that can be
5*c8dee2aaSAndroid Build Coastguard Worker  * found in the LICENSE file.
6*c8dee2aaSAndroid Build Coastguard Worker  */
7*c8dee2aaSAndroid Build Coastguard Worker #ifndef NFAtoDFA_DEFINED
8*c8dee2aaSAndroid Build Coastguard Worker #define NFAtoDFA_DEFINED
9*c8dee2aaSAndroid Build Coastguard Worker 
10*c8dee2aaSAndroid Build Coastguard Worker #include "src/sksl/lex/DFA.h"
11*c8dee2aaSAndroid Build Coastguard Worker #include "src/sksl/lex/DFAState.h"
12*c8dee2aaSAndroid Build Coastguard Worker #include "src/sksl/lex/NFA.h"
13*c8dee2aaSAndroid Build Coastguard Worker #include "src/sksl/lex/NFAState.h"
14*c8dee2aaSAndroid Build Coastguard Worker 
15*c8dee2aaSAndroid Build Coastguard Worker #include <algorithm>
16*c8dee2aaSAndroid Build Coastguard Worker #include <climits>
17*c8dee2aaSAndroid Build Coastguard Worker #include <memory>
18*c8dee2aaSAndroid Build Coastguard Worker #include <unordered_map>
19*c8dee2aaSAndroid Build Coastguard Worker #include <set>
20*c8dee2aaSAndroid Build Coastguard Worker #include <vector>
21*c8dee2aaSAndroid Build Coastguard Worker 
22*c8dee2aaSAndroid Build Coastguard Worker /**
23*c8dee2aaSAndroid Build Coastguard Worker  * Converts a nondeterministic finite automaton to a deterministic finite automaton. Since NFAs and
24*c8dee2aaSAndroid Build Coastguard Worker  * DFAs differ only in that an NFA allows multiple states at the same time, we can find each
25*c8dee2aaSAndroid Build Coastguard Worker  * possible combination of simultaneous NFA states and give this combination a label. These labelled
26*c8dee2aaSAndroid Build Coastguard Worker  * nodes are our DFA nodes, since we can only be in one such unique set of NFA states at a time.
27*c8dee2aaSAndroid Build Coastguard Worker  *
28*c8dee2aaSAndroid Build Coastguard Worker  * As an NFA can end up in multiple accept states at the same time (for instance, the token "while"
29*c8dee2aaSAndroid Build Coastguard Worker  * is valid for both WHILE and IDENTIFIER), we disambiguate by preferring the first matching regex
30*c8dee2aaSAndroid Build Coastguard Worker  * (in terms of the order in which they were added to the NFA).
31*c8dee2aaSAndroid Build Coastguard Worker  */
32*c8dee2aaSAndroid Build Coastguard Worker class NFAtoDFA {
33*c8dee2aaSAndroid Build Coastguard Worker public:
34*c8dee2aaSAndroid Build Coastguard Worker     inline static constexpr char START_CHAR = 9;
35*c8dee2aaSAndroid Build Coastguard Worker     inline static constexpr char END_CHAR = 126;
36*c8dee2aaSAndroid Build Coastguard Worker 
NFAtoDFA(NFA * nfa)37*c8dee2aaSAndroid Build Coastguard Worker     NFAtoDFA(NFA* nfa)
38*c8dee2aaSAndroid Build Coastguard Worker     : fNFA(*nfa) {}
39*c8dee2aaSAndroid Build Coastguard Worker 
40*c8dee2aaSAndroid Build Coastguard Worker     /**
41*c8dee2aaSAndroid Build Coastguard Worker      * Returns a DFA created from the NFA.
42*c8dee2aaSAndroid Build Coastguard Worker      */
convert()43*c8dee2aaSAndroid Build Coastguard Worker     DFA convert() {
44*c8dee2aaSAndroid Build Coastguard Worker         // create state 0, the "reject" state
45*c8dee2aaSAndroid Build Coastguard Worker         getState(DFAState::Label({}));
46*c8dee2aaSAndroid Build Coastguard Worker         // create a state representing being in all of the NFA's start states at once
47*c8dee2aaSAndroid Build Coastguard Worker         std::vector<int> startStates = fNFA.fStartStates;
48*c8dee2aaSAndroid Build Coastguard Worker         std::sort(startStates.begin(), startStates.end());
49*c8dee2aaSAndroid Build Coastguard Worker         // this becomes state 1, our start state
50*c8dee2aaSAndroid Build Coastguard Worker         DFAState* start = getState(DFAState::Label(startStates));
51*c8dee2aaSAndroid Build Coastguard Worker         this->scanState(start);
52*c8dee2aaSAndroid Build Coastguard Worker 
53*c8dee2aaSAndroid Build Coastguard Worker         this->computeMappings();
54*c8dee2aaSAndroid Build Coastguard Worker 
55*c8dee2aaSAndroid Build Coastguard Worker         int stateCount = 0;
56*c8dee2aaSAndroid Build Coastguard Worker         for (const auto& row : fTransitions) {
57*c8dee2aaSAndroid Build Coastguard Worker             stateCount = std::max(stateCount, (int) row.size());
58*c8dee2aaSAndroid Build Coastguard Worker         }
59*c8dee2aaSAndroid Build Coastguard Worker         return DFA(fCharMappings, fTransitions, fAccepts);
60*c8dee2aaSAndroid Build Coastguard Worker     }
61*c8dee2aaSAndroid Build Coastguard Worker 
62*c8dee2aaSAndroid Build Coastguard Worker private:
63*c8dee2aaSAndroid Build Coastguard Worker     /**
64*c8dee2aaSAndroid Build Coastguard Worker      * Returns an existing state with the given label, or creates a new one and returns it.
65*c8dee2aaSAndroid Build Coastguard Worker      */
getState(DFAState::Label label)66*c8dee2aaSAndroid Build Coastguard Worker     DFAState* getState(DFAState::Label label) {
67*c8dee2aaSAndroid Build Coastguard Worker         auto found = fStates.find(label);
68*c8dee2aaSAndroid Build Coastguard Worker         if (found == fStates.end()) {
69*c8dee2aaSAndroid Build Coastguard Worker             int id = fStates.size();
70*c8dee2aaSAndroid Build Coastguard Worker             fStates[label] = std::unique_ptr<DFAState>(new DFAState(id, label));
71*c8dee2aaSAndroid Build Coastguard Worker             return fStates[label].get();
72*c8dee2aaSAndroid Build Coastguard Worker         }
73*c8dee2aaSAndroid Build Coastguard Worker         return found->second.get();
74*c8dee2aaSAndroid Build Coastguard Worker     }
75*c8dee2aaSAndroid Build Coastguard Worker 
add(int nfaState,std::vector<int> * states)76*c8dee2aaSAndroid Build Coastguard Worker     void add(int nfaState, std::vector<int>* states) {
77*c8dee2aaSAndroid Build Coastguard Worker         NFAState state = fNFA.fStates[nfaState];
78*c8dee2aaSAndroid Build Coastguard Worker         if (state.fKind == NFAState::kRemapped_Kind) {
79*c8dee2aaSAndroid Build Coastguard Worker             for (int next : state.fData) {
80*c8dee2aaSAndroid Build Coastguard Worker                 this->add(next, states);
81*c8dee2aaSAndroid Build Coastguard Worker             }
82*c8dee2aaSAndroid Build Coastguard Worker         } else {
83*c8dee2aaSAndroid Build Coastguard Worker             for (int entry : *states) {
84*c8dee2aaSAndroid Build Coastguard Worker                 if (nfaState == entry) {
85*c8dee2aaSAndroid Build Coastguard Worker                     return;
86*c8dee2aaSAndroid Build Coastguard Worker                 }
87*c8dee2aaSAndroid Build Coastguard Worker             }
88*c8dee2aaSAndroid Build Coastguard Worker             states->push_back(nfaState);
89*c8dee2aaSAndroid Build Coastguard Worker         }
90*c8dee2aaSAndroid Build Coastguard Worker     }
91*c8dee2aaSAndroid Build Coastguard Worker 
addTransition(char c,int start,int next)92*c8dee2aaSAndroid Build Coastguard Worker     void addTransition(char c, int start, int next) {
93*c8dee2aaSAndroid Build Coastguard Worker         while (fTransitions.size() <= (size_t) c) {
94*c8dee2aaSAndroid Build Coastguard Worker             fTransitions.push_back(std::vector<int>());
95*c8dee2aaSAndroid Build Coastguard Worker         }
96*c8dee2aaSAndroid Build Coastguard Worker         std::vector<int>& row = fTransitions[c];
97*c8dee2aaSAndroid Build Coastguard Worker         while (row.size() <= (size_t) start) {
98*c8dee2aaSAndroid Build Coastguard Worker             row.push_back(INVALID);
99*c8dee2aaSAndroid Build Coastguard Worker         }
100*c8dee2aaSAndroid Build Coastguard Worker         row[start] = next;
101*c8dee2aaSAndroid Build Coastguard Worker     }
102*c8dee2aaSAndroid Build Coastguard Worker 
scanState(DFAState * state)103*c8dee2aaSAndroid Build Coastguard Worker     void scanState(DFAState* state) {
104*c8dee2aaSAndroid Build Coastguard Worker         state->fIsScanned = true;
105*c8dee2aaSAndroid Build Coastguard Worker         for (char c = START_CHAR; c <= END_CHAR; ++c) {
106*c8dee2aaSAndroid Build Coastguard Worker             std::vector<int> next;
107*c8dee2aaSAndroid Build Coastguard Worker             int bestAccept = INT_MAX;
108*c8dee2aaSAndroid Build Coastguard Worker             for (int idx : state->fLabel.fStates) {
109*c8dee2aaSAndroid Build Coastguard Worker                 const NFAState& nfaState = fNFA.fStates[idx];
110*c8dee2aaSAndroid Build Coastguard Worker                 if (nfaState.accept(c)) {
111*c8dee2aaSAndroid Build Coastguard Worker                     for (int nextState : nfaState.fNext) {
112*c8dee2aaSAndroid Build Coastguard Worker                         if (fNFA.fStates[nextState].fKind == NFAState::kAccept_Kind) {
113*c8dee2aaSAndroid Build Coastguard Worker                             bestAccept = std::min(bestAccept, fNFA.fStates[nextState].fData[0]);
114*c8dee2aaSAndroid Build Coastguard Worker                         }
115*c8dee2aaSAndroid Build Coastguard Worker                         this->add(nextState, &next);
116*c8dee2aaSAndroid Build Coastguard Worker                     }
117*c8dee2aaSAndroid Build Coastguard Worker                 }
118*c8dee2aaSAndroid Build Coastguard Worker             }
119*c8dee2aaSAndroid Build Coastguard Worker             std::sort(next.begin(), next.end());
120*c8dee2aaSAndroid Build Coastguard Worker             DFAState* nextState = this->getState(DFAState::Label(next));
121*c8dee2aaSAndroid Build Coastguard Worker             this->addTransition(c, state->fId, nextState->fId);
122*c8dee2aaSAndroid Build Coastguard Worker             if (bestAccept != INT_MAX) {
123*c8dee2aaSAndroid Build Coastguard Worker                 while (fAccepts.size() <= (size_t) nextState->fId) {
124*c8dee2aaSAndroid Build Coastguard Worker                     fAccepts.push_back(INVALID);
125*c8dee2aaSAndroid Build Coastguard Worker                 }
126*c8dee2aaSAndroid Build Coastguard Worker                 fAccepts[nextState->fId] = bestAccept;
127*c8dee2aaSAndroid Build Coastguard Worker             }
128*c8dee2aaSAndroid Build Coastguard Worker             if (!nextState->fIsScanned) {
129*c8dee2aaSAndroid Build Coastguard Worker                 this->scanState(nextState);
130*c8dee2aaSAndroid Build Coastguard Worker             }
131*c8dee2aaSAndroid Build Coastguard Worker         }
132*c8dee2aaSAndroid Build Coastguard Worker     }
133*c8dee2aaSAndroid Build Coastguard Worker 
134*c8dee2aaSAndroid Build Coastguard Worker     // collapse rows with the same transitions to a single row. This is common, as each row
135*c8dee2aaSAndroid Build Coastguard Worker     // represents a character and often there are many characters for which all transitions are
136*c8dee2aaSAndroid Build Coastguard Worker     // identical (e.g. [0-9] are treated the same way by all lexer rules)
computeMappings()137*c8dee2aaSAndroid Build Coastguard Worker     void computeMappings() {
138*c8dee2aaSAndroid Build Coastguard Worker         // mappings[<input row>] = <output row>
139*c8dee2aaSAndroid Build Coastguard Worker         std::vector<std::vector<int>*> uniques;
140*c8dee2aaSAndroid Build Coastguard Worker         // this could be done more efficiently, but O(n^2) is plenty fast for our purposes
141*c8dee2aaSAndroid Build Coastguard Worker         for (size_t i = 0; i < fTransitions.size(); ++i) {
142*c8dee2aaSAndroid Build Coastguard Worker             int found = -1;
143*c8dee2aaSAndroid Build Coastguard Worker             for (size_t j = 0; j < uniques.size(); ++j) {
144*c8dee2aaSAndroid Build Coastguard Worker                 if (*uniques[j] == fTransitions[i]) {
145*c8dee2aaSAndroid Build Coastguard Worker                     found = j;
146*c8dee2aaSAndroid Build Coastguard Worker                     break;
147*c8dee2aaSAndroid Build Coastguard Worker                 }
148*c8dee2aaSAndroid Build Coastguard Worker             }
149*c8dee2aaSAndroid Build Coastguard Worker             if (found == -1) {
150*c8dee2aaSAndroid Build Coastguard Worker                 found = (int) uniques.size();
151*c8dee2aaSAndroid Build Coastguard Worker                 uniques.push_back(&fTransitions[i]);
152*c8dee2aaSAndroid Build Coastguard Worker             }
153*c8dee2aaSAndroid Build Coastguard Worker             fCharMappings.push_back(found);
154*c8dee2aaSAndroid Build Coastguard Worker         }
155*c8dee2aaSAndroid Build Coastguard Worker         std::vector<std::vector<int>> newTransitions;
156*c8dee2aaSAndroid Build Coastguard Worker         for (std::vector<int>* row : uniques) {
157*c8dee2aaSAndroid Build Coastguard Worker             newTransitions.push_back(*row);
158*c8dee2aaSAndroid Build Coastguard Worker         }
159*c8dee2aaSAndroid Build Coastguard Worker         fTransitions = newTransitions;
160*c8dee2aaSAndroid Build Coastguard Worker     }
161*c8dee2aaSAndroid Build Coastguard Worker 
162*c8dee2aaSAndroid Build Coastguard Worker     const NFA& fNFA;
163*c8dee2aaSAndroid Build Coastguard Worker     std::unordered_map<DFAState::Label, std::unique_ptr<DFAState>> fStates;
164*c8dee2aaSAndroid Build Coastguard Worker     std::vector<std::vector<int>> fTransitions;
165*c8dee2aaSAndroid Build Coastguard Worker     std::vector<int> fCharMappings;
166*c8dee2aaSAndroid Build Coastguard Worker     std::vector<int> fAccepts;
167*c8dee2aaSAndroid Build Coastguard Worker };
168*c8dee2aaSAndroid Build Coastguard Worker #endif  // NFAtoDFA_DEFINED
169