1 /* 2 * Copyright (c) Meta Platforms, Inc. and affiliates. 3 * All rights reserved. 4 * 5 * This source code is licensed under the BSD-style license found in the 6 * LICENSE file in the root directory of this source tree. 7 */ 8 9 #pragma once 10 11 #include <executorch/extension/llm/tokenizer/tokenizer.h> 12 #include <memory> 13 14 namespace executorch { 15 namespace extension { 16 namespace llm { 17 18 struct TokenIndex { 19 const char* str; 20 int32_t id; 21 }; 22 23 // A simple Byte Pair Encoding (BPE) Tokenizer. Note that the current C++ code 24 // won't work with this class, it needs to go through tokenizer.py first. 25 class ET_EXPERIMENTAL BPETokenizer : public Tokenizer { 26 public: 27 explicit BPETokenizer(); 28 ~BPETokenizer() override; 29 30 ::executorch::runtime::Error load(const std::string& tokenizer_path) override; 31 32 ::executorch::runtime::Result<std::vector<uint64_t>> 33 encode(const std::string& input, int8_t bos, int8_t eos) const override; 34 35 ::executorch::runtime::Result<std::string> decode( 36 uint64_t prev_token, 37 uint64_t token) const override; 38 39 private: 40 std::unique_ptr<char*[]> vocab_ = nullptr; 41 std::unique_ptr<float[]> vocab_scores_ = nullptr; 42 std::unique_ptr<TokenIndex[]> sorted_vocab_ = nullptr; 43 unsigned int max_token_length_ = 0; 44 unsigned char byte_pieces_[512]; // stores all single-byte strings 45 }; 46 47 } // namespace llm 48 } // namespace extension 49 } // namespace executorch 50 51 namespace torch { 52 namespace executor { 53 // TODO(T197294990): Remove these deprecated aliases once all users have moved 54 // to the new `::executorch` namespaces. 55 using ::executorch::extension::llm::BPETokenizer; 56 using ::executorch::extension::llm::TokenIndex; 57 } // namespace executor 58 } // namespace torch 59