xref: /aosp_15_r20/external/libtextclassifier/native/utils/sentencepiece/encoder.cc (revision 993b0882672172b81d12fad7a7ac0c3e5c824a12)
1*993b0882SAndroid Build Coastguard Worker /*
2*993b0882SAndroid Build Coastguard Worker  * Copyright (C) 2018 The Android Open Source Project
3*993b0882SAndroid Build Coastguard Worker  *
4*993b0882SAndroid Build Coastguard Worker  * Licensed under the Apache License, Version 2.0 (the "License");
5*993b0882SAndroid Build Coastguard Worker  * you may not use this file except in compliance with the License.
6*993b0882SAndroid Build Coastguard Worker  * You may obtain a copy of the License at
7*993b0882SAndroid Build Coastguard Worker  *
8*993b0882SAndroid Build Coastguard Worker  *      http://www.apache.org/licenses/LICENSE-2.0
9*993b0882SAndroid Build Coastguard Worker  *
10*993b0882SAndroid Build Coastguard Worker  * Unless required by applicable law or agreed to in writing, software
11*993b0882SAndroid Build Coastguard Worker  * distributed under the License is distributed on an "AS IS" BASIS,
12*993b0882SAndroid Build Coastguard Worker  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13*993b0882SAndroid Build Coastguard Worker  * See the License for the specific language governing permissions and
14*993b0882SAndroid Build Coastguard Worker  * limitations under the License.
15*993b0882SAndroid Build Coastguard Worker  */
16*993b0882SAndroid Build Coastguard Worker 
17*993b0882SAndroid Build Coastguard Worker #include "utils/sentencepiece/encoder.h"
18*993b0882SAndroid Build Coastguard Worker 
19*993b0882SAndroid Build Coastguard Worker namespace libtextclassifier3 {
20*993b0882SAndroid Build Coastguard Worker 
Encode(StringPiece normalized_text,std::vector<int> * encoded_text) const21*993b0882SAndroid Build Coastguard Worker bool Encoder::Encode(StringPiece normalized_text,
22*993b0882SAndroid Build Coastguard Worker                      std::vector<int>* encoded_text) const {
23*993b0882SAndroid Build Coastguard Worker   const int len = normalized_text.size();
24*993b0882SAndroid Build Coastguard Worker   if (len <= 0) {
25*993b0882SAndroid Build Coastguard Worker     *encoded_text = {start_code_, end_code_};
26*993b0882SAndroid Build Coastguard Worker     return true;
27*993b0882SAndroid Build Coastguard Worker   }
28*993b0882SAndroid Build Coastguard Worker   // We use `previous_pos` to indicate whether a dynamic programming state was
29*993b0882SAndroid Build Coastguard Worker   // reachable.
30*993b0882SAndroid Build Coastguard Worker   std::vector<SegmentationEntry> segmentation(
31*993b0882SAndroid Build Coastguard Worker       len + 1, {/*score=*/0, /*previous_pos=*/-1, /*piece_id=*/-1,
32*993b0882SAndroid Build Coastguard Worker                 /*num_pieces=*/0});
33*993b0882SAndroid Build Coastguard Worker   for (int i = 0; i < len; i++) {
34*993b0882SAndroid Build Coastguard Worker     // State couldn't be reached.
35*993b0882SAndroid Build Coastguard Worker     if (i > 0 && segmentation[i].previous_pos < 0) {
36*993b0882SAndroid Build Coastguard Worker       // Advance position.
37*993b0882SAndroid Build Coastguard Worker       normalized_text.RemovePrefix(1);
38*993b0882SAndroid Build Coastguard Worker       continue;
39*993b0882SAndroid Build Coastguard Worker     }
40*993b0882SAndroid Build Coastguard Worker     // Check whether we can use the unknown token.
41*993b0882SAndroid Build Coastguard Worker     if (unknown_code_ >= 0) {
42*993b0882SAndroid Build Coastguard Worker       const int pos = i + 1;
43*993b0882SAndroid Build Coastguard Worker       const float unknown_penalty = segmentation[i].score + unknown_score_;
44*993b0882SAndroid Build Coastguard Worker       if (segmentation[pos].previous_pos < 0 ||
45*993b0882SAndroid Build Coastguard Worker           segmentation[pos].score < unknown_penalty) {
46*993b0882SAndroid Build Coastguard Worker         // Merge multiple unknown tokens into one.
47*993b0882SAndroid Build Coastguard Worker         if (segmentation[i].piece_id == unknown_code_) {
48*993b0882SAndroid Build Coastguard Worker           segmentation[pos] = {/*score=*/unknown_penalty,
49*993b0882SAndroid Build Coastguard Worker                                /*previous_pos=*/segmentation[i].previous_pos,
50*993b0882SAndroid Build Coastguard Worker                                /*piece_id=*/unknown_code_,
51*993b0882SAndroid Build Coastguard Worker                                /*num_pieces=*/segmentation[i].num_pieces};
52*993b0882SAndroid Build Coastguard Worker         } else {
53*993b0882SAndroid Build Coastguard Worker           segmentation[pos] = {/*score=*/unknown_penalty,
54*993b0882SAndroid Build Coastguard Worker                                /*previous_pos=*/i,
55*993b0882SAndroid Build Coastguard Worker                                /*piece_id=*/unknown_code_,
56*993b0882SAndroid Build Coastguard Worker                                /*num_pieces=*/segmentation[i].num_pieces + 1};
57*993b0882SAndroid Build Coastguard Worker         }
58*993b0882SAndroid Build Coastguard Worker       }
59*993b0882SAndroid Build Coastguard Worker     }
60*993b0882SAndroid Build Coastguard Worker     std::vector<StringSet::Match> matches;
61*993b0882SAndroid Build Coastguard Worker     if (!pieces_->FindAllPrefixMatches(normalized_text, &matches)) {
62*993b0882SAndroid Build Coastguard Worker       TC3_LOG(ERROR)
63*993b0882SAndroid Build Coastguard Worker           << "Couldn't successfully gather prefix sentence piece matches.";
64*993b0882SAndroid Build Coastguard Worker       return false;
65*993b0882SAndroid Build Coastguard Worker     }
66*993b0882SAndroid Build Coastguard Worker     for (const auto& match : matches) {
67*993b0882SAndroid Build Coastguard Worker       TC3_CHECK(match.id >= 0 && match.id < num_pieces_);
68*993b0882SAndroid Build Coastguard Worker       const int pos = i + match.match_length;
69*993b0882SAndroid Build Coastguard Worker       const float candidate_score = segmentation[i].score + scores_[match.id];
70*993b0882SAndroid Build Coastguard Worker       if (segmentation[pos].previous_pos < 0 ||
71*993b0882SAndroid Build Coastguard Worker           segmentation[pos].score < candidate_score) {
72*993b0882SAndroid Build Coastguard Worker         segmentation[pos] = {/*score=*/candidate_score, /*previous_pos=*/i,
73*993b0882SAndroid Build Coastguard Worker                              /*piece_id=*/match.id + encoding_offset_,
74*993b0882SAndroid Build Coastguard Worker                              /*num_pieces=*/segmentation[i].num_pieces + 1};
75*993b0882SAndroid Build Coastguard Worker       }
76*993b0882SAndroid Build Coastguard Worker     }
77*993b0882SAndroid Build Coastguard Worker     // Advance position.
78*993b0882SAndroid Build Coastguard Worker     normalized_text.RemovePrefix(1);
79*993b0882SAndroid Build Coastguard Worker   }
80*993b0882SAndroid Build Coastguard Worker   if (segmentation[len].num_pieces <= 0) {
81*993b0882SAndroid Build Coastguard Worker     *encoded_text = {start_code_, end_code_};
82*993b0882SAndroid Build Coastguard Worker     return true;
83*993b0882SAndroid Build Coastguard Worker   }
84*993b0882SAndroid Build Coastguard Worker   const int num_pieces = segmentation[len].num_pieces;
85*993b0882SAndroid Build Coastguard Worker   encoded_text->resize(num_pieces + 2);
86*993b0882SAndroid Build Coastguard Worker   (*encoded_text)[num_pieces + 1] = end_code_;
87*993b0882SAndroid Build Coastguard Worker   int pos = len;
88*993b0882SAndroid Build Coastguard Worker   for (int i = num_pieces; i > 0; i--) {
89*993b0882SAndroid Build Coastguard Worker     (*encoded_text)[i] = segmentation[pos].piece_id;
90*993b0882SAndroid Build Coastguard Worker     pos = segmentation[pos].previous_pos;
91*993b0882SAndroid Build Coastguard Worker   }
92*993b0882SAndroid Build Coastguard Worker   (*encoded_text)[0] = start_code_;
93*993b0882SAndroid Build Coastguard Worker   return true;
94*993b0882SAndroid Build Coastguard Worker }
95*993b0882SAndroid Build Coastguard Worker 
96*993b0882SAndroid Build Coastguard Worker }  // namespace libtextclassifier3
97