1 // Copyright (C) 2019 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_TOKENIZATION_PLAIN_TOKENIZER_H_ 16 #define ICING_TOKENIZATION_PLAIN_TOKENIZER_H_ 17 18 #include <memory> 19 #include <string_view> 20 #include <vector> 21 22 #include "icing/text_classifier/lib3/utils/base/statusor.h" 23 #include "icing/tokenization/language-segmenter.h" 24 #include "icing/tokenization/tokenizer.h" 25 26 namespace icing { 27 namespace lib { 28 29 // Provides basic tokenization on input text 30 class PlainTokenizer : public Tokenizer { 31 public: PlainTokenizer(const LanguageSegmenter * language_segmenter)32 explicit PlainTokenizer(const LanguageSegmenter* language_segmenter) 33 : language_segmenter_(*language_segmenter) {} 34 35 libtextclassifier3::StatusOr<std::unique_ptr<Tokenizer::Iterator>> Tokenize( 36 std::string_view text) const override; 37 38 libtextclassifier3::StatusOr<std::vector<Token>> TokenizeAll( 39 std::string_view text) const override; 40 41 private: 42 // Used to segment input texts based on language understanding 43 const LanguageSegmenter& language_segmenter_; 44 }; 45 46 } // namespace lib 47 } // namespace icing 48 49 #endif // ICING_TOKENIZATION_PLAIN_TOKENIZER_H_ 50