// Copyright (C) 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
14*8b6cd535SAndroid Build Coastguard Worker 15*8b6cd535SAndroid Build Coastguard Worker #ifndef ICING_TRANSFORM_NORMALIZER_H_ 16*8b6cd535SAndroid Build Coastguard Worker #define ICING_TRANSFORM_NORMALIZER_H_ 17*8b6cd535SAndroid Build Coastguard Worker 18*8b6cd535SAndroid Build Coastguard Worker #include <memory> 19*8b6cd535SAndroid Build Coastguard Worker #include <string> 20*8b6cd535SAndroid Build Coastguard Worker #include <string_view> 21*8b6cd535SAndroid Build Coastguard Worker 22*8b6cd535SAndroid Build Coastguard Worker #include "icing/text_classifier/lib3/utils/base/statusor.h" 23*8b6cd535SAndroid Build Coastguard Worker #include "icing/util/character-iterator.h" 24*8b6cd535SAndroid Build Coastguard Worker 25*8b6cd535SAndroid Build Coastguard Worker namespace icing { 26*8b6cd535SAndroid Build Coastguard Worker namespace lib { 27*8b6cd535SAndroid Build Coastguard Worker 28*8b6cd535SAndroid Build Coastguard Worker // Normalizes strings for text matching. 29*8b6cd535SAndroid Build Coastguard Worker // 30*8b6cd535SAndroid Build Coastguard Worker // Example use: 31*8b6cd535SAndroid Build Coastguard Worker // ICING_ASSIGN_OR_RETURN(auto normalizer, 32*8b6cd535SAndroid Build Coastguard Worker // normalizer_factory::Create(/*max_term_byte_size=*/5); 33*8b6cd535SAndroid Build Coastguard Worker // 34*8b6cd535SAndroid Build Coastguard Worker // std::string normalized_text = normalizer->NormalizeText("HELLO!"); 35*8b6cd535SAndroid Build Coastguard Worker // ICING_LOG(INFO) << normalized_text; // prints "hello" 36*8b6cd535SAndroid Build Coastguard Worker class Normalizer { 37*8b6cd535SAndroid Build Coastguard Worker public: 38*8b6cd535SAndroid Build Coastguard Worker virtual ~Normalizer() = default; 39*8b6cd535SAndroid Build Coastguard Worker 40*8b6cd535SAndroid Build Coastguard Worker // Normalizes the input term based on rules. See implementation classes for 41*8b6cd535SAndroid Build Coastguard Worker // specific transformation rules. 
42*8b6cd535SAndroid Build Coastguard Worker struct NormalizedTerm { 43*8b6cd535SAndroid Build Coastguard Worker std::string text; 44*8b6cd535SAndroid Build Coastguard Worker }; 45*8b6cd535SAndroid Build Coastguard Worker virtual NormalizedTerm NormalizeTerm(std::string_view term) const = 0; 46*8b6cd535SAndroid Build Coastguard Worker 47*8b6cd535SAndroid Build Coastguard Worker // Returns a CharacterIterator pointing to one past the end of the segment of 48*8b6cd535SAndroid Build Coastguard Worker // term that (once normalized) matches with normalized_term. 49*8b6cd535SAndroid Build Coastguard Worker // 50*8b6cd535SAndroid Build Coastguard Worker // Ex. FindNormalizedMatchEndPosition("YELLOW", "yell") will return 51*8b6cd535SAndroid Build Coastguard Worker // CharacterIterator(u8:4, u16:4, u32:4). 52*8b6cd535SAndroid Build Coastguard Worker // 53*8b6cd535SAndroid Build Coastguard Worker // Ex. FindNormalizedMatchEndPosition("YELLOW", "red") will return 54*8b6cd535SAndroid Build Coastguard Worker // CharacterIterator(u8:0, u16:0, u32:0). 55*8b6cd535SAndroid Build Coastguard Worker virtual CharacterIterator FindNormalizedMatchEndPosition( 56*8b6cd535SAndroid Build Coastguard Worker std::string_view term, std::string_view normalized_term) const = 0; 57*8b6cd535SAndroid Build Coastguard Worker }; 58*8b6cd535SAndroid Build Coastguard Worker 59*8b6cd535SAndroid Build Coastguard Worker } // namespace lib 60*8b6cd535SAndroid Build Coastguard Worker } // namespace icing 61*8b6cd535SAndroid Build Coastguard Worker 62*8b6cd535SAndroid Build Coastguard Worker #endif // ICING_TRANSFORM_NORMALIZER_H_ 63