xref: /aosp_15_r20/external/icing/icing/tokenization/plain-tokenizer.h (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_TOKENIZATION_PLAIN_TOKENIZER_H_
16 #define ICING_TOKENIZATION_PLAIN_TOKENIZER_H_
17 
18 #include <memory>
19 #include <string_view>
20 #include <vector>
21 
22 #include "icing/text_classifier/lib3/utils/base/statusor.h"
23 #include "icing/tokenization/language-segmenter.h"
24 #include "icing/tokenization/tokenizer.h"
25 
26 namespace icing {
27 namespace lib {
28 
29 // Provides basic tokenization on input text
30 class PlainTokenizer : public Tokenizer {
31  public:
PlainTokenizer(const LanguageSegmenter * language_segmenter)32   explicit PlainTokenizer(const LanguageSegmenter* language_segmenter)
33       : language_segmenter_(*language_segmenter) {}
34 
35   libtextclassifier3::StatusOr<std::unique_ptr<Tokenizer::Iterator>> Tokenize(
36       std::string_view text) const override;
37 
38   libtextclassifier3::StatusOr<std::vector<Token>> TokenizeAll(
39       std::string_view text) const override;
40 
41  private:
42   // Used to segment input texts based on language understanding
43   const LanguageSegmenter& language_segmenter_;
44 };
45 
46 }  // namespace lib
47 }  // namespace icing
48 
49 #endif  // ICING_TOKENIZATION_PLAIN_TOKENIZER_H_
50