1 /* Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 // LINT.IfChange 16 17 #ifndef TENSORFLOW_CORE_UTIL_CTC_CTC_DECODER_H_ 18 #define TENSORFLOW_CORE_UTIL_CTC_CTC_DECODER_H_ 19 20 #include <memory> 21 #include <vector> 22 23 #include "third_party/eigen3/Eigen/Core" 24 #include "tensorflow/core/lib/core/errors.h" 25 #include "tensorflow/core/lib/core/status.h" 26 27 namespace tensorflow { 28 namespace ctc { 29 30 // The CTCDecoder is an abstract interface to be implemented when providing a 31 // decoding method on the timestep output of a RNN trained with CTC loss. 32 // 33 // The two types of decoding available are: 34 // - greedy path, through the CTCGreedyDecoder 35 // - beam search, through the CTCBeamSearchDecoder 36 template <class T> 37 class CTCDecoder { 38 public: 39 typedef Eigen::Map<const Eigen::ArrayXi> SequenceLength; 40 typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> 41 Input; 42 typedef std::vector<std::vector<int>> Output; 43 typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> 44 ScoreOutput; 45 CTCDecoder(int num_classes,int batch_size,bool merge_repeated)46 CTCDecoder(int num_classes, int batch_size, bool merge_repeated) 47 : num_classes_(num_classes), 48 blank_index_(num_classes - 1), 49 batch_size_(batch_size), 50 merge_repeated_(merge_repeated) {} 51 ~CTCDecoder()52 virtual ~CTCDecoder() {} 53 54 // Dimensionality of the input/output is expected to be: 55 // - seq_len[b] - b = 0 to batch_size_ 56 // - input[t].rows(b) - t = 0 to timesteps; b = 0 t batch_size_ 57 // - output.size() specifies the number of beams to be returned. 58 // - scores(b, i) - b = 0 to batch_size; i = 0 to output.size() 59 virtual Status Decode(const SequenceLength& seq_len, 60 const std::vector<Input>& input, 61 std::vector<Output>* output, ScoreOutput* scores) = 0; 62 batch_size()63 int batch_size() { return batch_size_; } num_classes()64 int num_classes() { return num_classes_; } 65 66 protected: 67 int num_classes_; 68 int blank_index_; 69 int batch_size_; 70 bool merge_repeated_; 71 }; 72 73 // CTCGreedyDecoder is an implementation of the simple best path decoding 74 // algorithm, selecting at each timestep the most likely class at each timestep. 75 template <class T> 76 class CTCGreedyDecoder : public CTCDecoder<T> { 77 public: 78 typedef CTCDecoder<T> Decoder; CTCGreedyDecoder(int num_classes,int batch_size,bool merge_repeated)79 CTCGreedyDecoder(int num_classes, int batch_size, bool merge_repeated) 80 : CTCDecoder<T>(num_classes, batch_size, merge_repeated) {} 81 Decode(const typename CTCDecoder<T>::SequenceLength & seq_len,const std::vector<typename CTCDecoder<T>::Input> & input,std::vector<typename CTCDecoder<T>::Output> * output,typename CTCDecoder<T>::ScoreOutput * scores)82 Status Decode(const typename CTCDecoder<T>::SequenceLength& seq_len, 83 const std::vector<typename CTCDecoder<T>::Input>& input, 84 std::vector<typename CTCDecoder<T>::Output>* output, 85 typename CTCDecoder<T>::ScoreOutput* scores) override { 86 if (output->empty() || (*output)[0].size() < Decoder::batch_size_) { 87 return errors::InvalidArgument( 88 "output needs to be of size at least (1, batch_size)."); 89 } 90 if (scores->rows() < Decoder::batch_size_ || scores->cols() == 0) { 91 return errors::InvalidArgument( 92 "scores needs to be of size at least (batch_size, 1)."); 93 } 94 // For each batch entry, identify the transitions 95 for (int b = 0; b < Decoder::batch_size_; ++b) { 96 int seq_len_b = seq_len[b]; 97 // Only writing to beam 0 98 std::vector<int>& output_b = (*output)[0][b]; 99 100 int prev_class_ix = -1; 101 (*scores)(b, 0) = 0; 102 for (int t = 0; t < seq_len_b; ++t) { 103 auto row = input[t].row(b); 104 int max_class_ix; 105 (*scores)(b, 0) += -row.maxCoeff(&max_class_ix); 106 if (max_class_ix != Decoder::blank_index_ && 107 !(Decoder::merge_repeated_ && max_class_ix == prev_class_ix)) { 108 output_b.push_back(max_class_ix); 109 } 110 prev_class_ix = max_class_ix; 111 } 112 } 113 return OkStatus(); 114 } 115 }; 116 117 } // namespace ctc 118 } // namespace tensorflow 119 120 #endif // TENSORFLOW_CORE_UTIL_CTC_CTC_DECODER_H_ 121 // LINT.ThenChange(//tensorflow/lite/kernels/ctc/ctc_decoder.h) 122