/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
15*993b0882SAndroid Build Coastguard Worker */ 16*993b0882SAndroid Build Coastguard Worker 17*993b0882SAndroid Build Coastguard Worker #ifndef LIBTEXTCLASSIFIER_UTILS_WORDPIECE_TOKENIZER_H_ 18*993b0882SAndroid Build Coastguard Worker #define LIBTEXTCLASSIFIER_UTILS_WORDPIECE_TOKENIZER_H_ 19*993b0882SAndroid Build Coastguard Worker 20*993b0882SAndroid Build Coastguard Worker #include <string> 21*993b0882SAndroid Build Coastguard Worker #include <vector> 22*993b0882SAndroid Build Coastguard Worker 23*993b0882SAndroid Build Coastguard Worker #include "absl/strings/string_view.h" 24*993b0882SAndroid Build Coastguard Worker 25*993b0882SAndroid Build Coastguard Worker namespace libtextclassifier3 { 26*993b0882SAndroid Build Coastguard Worker 27*993b0882SAndroid Build Coastguard Worker struct LookupStatus { LookupStatusLookupStatus28*993b0882SAndroid Build Coastguard Worker LookupStatus() : error_msg(""), success(true) {} LookupStatusLookupStatus29*993b0882SAndroid Build Coastguard Worker explicit LookupStatus(const std::string& msg) 30*993b0882SAndroid Build Coastguard Worker : error_msg(msg), success(false) {} 31*993b0882SAndroid Build Coastguard Worker std::string error_msg; 32*993b0882SAndroid Build Coastguard Worker bool success; 33*993b0882SAndroid Build Coastguard Worker OKLookupStatus34*993b0882SAndroid Build Coastguard Worker static LookupStatus OK() { return LookupStatus(); } 35*993b0882SAndroid Build Coastguard Worker }; 36*993b0882SAndroid Build Coastguard Worker 37*993b0882SAndroid Build Coastguard Worker class WordpieceVocab { 38*993b0882SAndroid Build Coastguard Worker public: ~WordpieceVocab()39*993b0882SAndroid Build Coastguard Worker virtual ~WordpieceVocab() {} 40*993b0882SAndroid Build Coastguard Worker virtual LookupStatus Contains(const absl::string_view key, 41*993b0882SAndroid Build Coastguard Worker bool* value) const = 0; 42*993b0882SAndroid Build Coastguard Worker }; 43*993b0882SAndroid Build Coastguard Worker 44*993b0882SAndroid Build 
Coastguard Worker LookupStatus WordpieceTokenize( 45*993b0882SAndroid Build Coastguard Worker const absl::string_view token, const int max_bytes_per_token, 46*993b0882SAndroid Build Coastguard Worker const int max_chars_per_subtoken, const std::string& suffix_indicator, 47*993b0882SAndroid Build Coastguard Worker bool use_unknown_token, const std::string& unknown_token, 48*993b0882SAndroid Build Coastguard Worker bool split_unknown_characters, const WordpieceVocab* vocab_map, 49*993b0882SAndroid Build Coastguard Worker std::vector<std::string>* subwords, std::vector<int>* begin_offset, 50*993b0882SAndroid Build Coastguard Worker std::vector<int>* end_offset, int* num_word_pieces); 51*993b0882SAndroid Build Coastguard Worker 52*993b0882SAndroid Build Coastguard Worker // As above but with `max_bytes_per_subtoken` unknown, 53*993b0882SAndroid Build Coastguard Worker // and split_unknown_characters=false. (For backwards compatibility.) 54*993b0882SAndroid Build Coastguard Worker LookupStatus WordpieceTokenize( 55*993b0882SAndroid Build Coastguard Worker const absl::string_view token, const int max_bytes_per_token, 56*993b0882SAndroid Build Coastguard Worker const std::string& suffix_indicator, bool use_unknown_token, 57*993b0882SAndroid Build Coastguard Worker const std::string& unknown_token, const WordpieceVocab* vocab_map, 58*993b0882SAndroid Build Coastguard Worker std::vector<std::string>* subwords, std::vector<int>* begin_offset, 59*993b0882SAndroid Build Coastguard Worker std::vector<int>* end_offset, int* num_word_pieces); 60*993b0882SAndroid Build Coastguard Worker 61*993b0882SAndroid Build Coastguard Worker } // namespace libtextclassifier3 62*993b0882SAndroid Build Coastguard Worker 63*993b0882SAndroid Build Coastguard Worker #endif // LIBTEXTCLASSIFIER_UTILS_WORDPIECE_TOKENIZER_H_ 64