1 /*
2  * Copyright (C) 2013, The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef LATINIME_VER4_PATRICIA_TRIE_POLICY_H
18 #define LATINIME_VER4_PATRICIA_TRIE_POLICY_H
19 
20 #include <vector>
21 
22 #include "defines.h"
23 #include "dictionary/header/header_policy.h"
24 #include "dictionary/interface/dictionary_structure_with_buffer_policy.h"
25 #include "dictionary/structure/pt_common/dynamic_pt_updating_helper.h"
26 #include "dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h"
27 #include "dictionary/structure/v4/ver4_dict_buffers.h"
28 #include "dictionary/structure/v4/ver4_patricia_trie_node_reader.h"
29 #include "dictionary/structure/v4/ver4_patricia_trie_node_writer.h"
30 #include "dictionary/structure/v4/ver4_patricia_trie_writing_helper.h"
31 #include "dictionary/structure/v4/ver4_pt_node_array_reader.h"
32 #include "dictionary/utils/buffer_with_extendable_buffer.h"
33 #include "dictionary/utils/entry_counters.h"
34 #include "utils/int_array_view.h"
35 
36 namespace latinime {
37 
// Forward declarations: this header only refers to these types through pointers
// (see createAndGetAllChildDicNodes()), so their full definitions are not required here.
class DicNode;
class DicNodeVector;
40 
41 // Word id = Artificial id that is stored in the PtNode looked up by the word.
42 class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
43  public:
Ver4PatriciaTriePolicy(Ver4DictBuffers::Ver4DictBuffersPtr buffers)44     Ver4PatriciaTriePolicy(Ver4DictBuffers::Ver4DictBuffersPtr buffers)
45             : mBuffers(std::move(buffers)), mHeaderPolicy(mBuffers->getHeaderPolicy()),
46               mDictBuffer(mBuffers->getWritableTrieBuffer()),
47               mShortcutPolicy(mBuffers->getMutableShortcutDictContent(),
48                       mBuffers->getTerminalPositionLookupTable()),
49               mNodeReader(mDictBuffer), mPtNodeArrayReader(mDictBuffer),
50               mNodeWriter(mDictBuffer, mBuffers.get(), &mNodeReader, &mPtNodeArrayReader,
51                       &mShortcutPolicy),
52               mUpdatingHelper(mDictBuffer, &mNodeReader, &mNodeWriter),
53               mWritingHelper(mBuffers.get()),
54               mEntryCounters(mHeaderPolicy->getNgramCounts().getCountArray()),
55               mTerminalPtNodePositionsForIteratingWords(), mIsCorrupted(false) {};
56 
getRootPosition()57     AK_FORCE_INLINE int getRootPosition() const {
58         return 0;
59     }
60 
61     void createAndGetAllChildDicNodes(const DicNode *const dicNode,
62             DicNodeVector *const childDicNodes) const;
63 
64     int getCodePointsAndReturnCodePointCount(const int wordId, const int maxCodePointCount,
65             int *const outCodePoints) const;
66 
67     int getWordId(const CodePointArrayView wordCodePoints, const bool forceLowerCaseSearch) const;
68 
69     const WordAttributes getWordAttributesInContext(const WordIdArrayView prevWordIds,
70             const int wordId, MultiBigramMap *const multiBigramMap) const;
71 
72     // TODO: Remove
getProbability(const int unigramProbability,const int bigramProbability)73     int getProbability(const int unigramProbability, const int bigramProbability) const {
74         // Not used.
75         return NOT_A_PROBABILITY;
76     }
77 
78     int getProbabilityOfWord(const WordIdArrayView prevWordIds, const int wordId) const;
79 
80     void iterateNgramEntries(const WordIdArrayView prevWordIds,
81             NgramListener *const listener) const;
82 
83     BinaryDictionaryShortcutIterator getShortcutIterator(const int wordId) const;
84 
getHeaderStructurePolicy()85     const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const {
86         return mHeaderPolicy;
87     }
88 
89     bool addUnigramEntry(const CodePointArrayView wordCodePoints,
90             const UnigramProperty *const unigramProperty);
91 
92     bool removeUnigramEntry(const CodePointArrayView wordCodePoints);
93 
94     bool addNgramEntry(const NgramProperty *const ngramProperty);
95 
96     bool removeNgramEntry(const NgramContext *const ngramContext,
97             const CodePointArrayView wordCodePoints);
98 
99     bool updateEntriesForWordWithNgramContext(const NgramContext *const ngramContext,
100             const CodePointArrayView wordCodePoints, const bool isValidWord,
101             const HistoricalInfo historicalInfo);
102 
103     bool flush(const char *const filePath);
104 
105     bool flushWithGC(const char *const filePath);
106 
107     bool needsToRunGC(const bool mindsBlockByGC) const;
108 
109     void getProperty(const char *const query, const int queryLength, char *const outResult,
110             const int maxResultLength);
111 
112     const WordProperty getWordProperty(const CodePointArrayView wordCodePoints) const;
113 
114     int getNextWordAndNextToken(const int token, int *const outCodePoints,
115             int *const outCodePointCount);
116 
isCorrupted()117     bool isCorrupted() const {
118         return mIsCorrupted;
119     }
120 
121  private:
122     DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4PatriciaTriePolicy);
123 
124     static const char *const UNIGRAM_COUNT_QUERY;
125     static const char *const BIGRAM_COUNT_QUERY;
126     static const char *const MAX_UNIGRAM_COUNT_QUERY;
127     static const char *const MAX_BIGRAM_COUNT_QUERY;
128     // When the dictionary size is near the maximum size, we have to refuse dynamic operations to
129     // prevent the dictionary from overflowing.
130     static const int MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS;
131     static const int MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS;
132 
133     const Ver4DictBuffers::Ver4DictBuffersPtr mBuffers;
134     const HeaderPolicy *const mHeaderPolicy;
135     BufferWithExtendableBuffer *const mDictBuffer;
136     Ver4ShortcutListPolicy mShortcutPolicy;
137     Ver4PatriciaTrieNodeReader mNodeReader;
138     Ver4PtNodeArrayReader mPtNodeArrayReader;
139     Ver4PatriciaTrieNodeWriter mNodeWriter;
140     DynamicPtUpdatingHelper mUpdatingHelper;
141     Ver4PatriciaTrieWritingHelper mWritingHelper;
142     MutableEntryCounters mEntryCounters;
143     std::vector<int> mTerminalPtNodePositionsForIteratingWords;
144     mutable bool mIsCorrupted;
145 
146     int getShortcutPositionOfWord(const int wordId) const;
147 };
148 } // namespace latinime
149 #endif // LATINIME_VER4_PATRICIA_TRIE_POLICY_H
150