xref: /aosp_15_r20/external/executorch/extension/llm/tokenizer/bpe_tokenizer.h (revision 523fa7a60841cd1ecfb9cc4201f1ca8b03ed023a)
1 /*
2  * Copyright (c) Meta Platforms, Inc. and affiliates.
3  * All rights reserved.
4  *
5  * This source code is licensed under the BSD-style license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 #pragma once
10 
11 #include <executorch/extension/llm/tokenizer/tokenizer.h>
12 #include <memory>
13 
14 namespace executorch {
15 namespace extension {
16 namespace llm {
17 
// Pairs a token's string with its integer id. BPETokenizer keeps an array of
// these in `sorted_vocab_`.
//
// Both fields carry default member initializers so that a default-initialized
// `TokenIndex` never holds indeterminate values. The struct remains an
// aggregate, so `TokenIndex{ptr, id}` keeps working unchanged.
struct TokenIndex {
  // Token text as a C string.
  // NOTE(review): presumably a non-owning pointer into the vocabulary storage
  // -- confirm against the tokenizer implementation.
  const char* str = nullptr;
  // Id associated with `str`.
  int32_t id = 0;
};
22 
23 // A simple Byte Pair Encoding (BPE) Tokenizer. Note that the current C++ code
24 // won't work with this class, it needs to go through tokenizer.py first.
25 class ET_EXPERIMENTAL BPETokenizer : public Tokenizer {
26  public:
27   explicit BPETokenizer();
28   ~BPETokenizer() override;
29 
30   ::executorch::runtime::Error load(const std::string& tokenizer_path) override;
31 
32   ::executorch::runtime::Result<std::vector<uint64_t>>
33   encode(const std::string& input, int8_t bos, int8_t eos) const override;
34 
35   ::executorch::runtime::Result<std::string> decode(
36       uint64_t prev_token,
37       uint64_t token) const override;
38 
39  private:
40   std::unique_ptr<char*[]> vocab_ = nullptr;
41   std::unique_ptr<float[]> vocab_scores_ = nullptr;
42   std::unique_ptr<TokenIndex[]> sorted_vocab_ = nullptr;
43   unsigned int max_token_length_ = 0;
44   unsigned char byte_pieces_[512]; // stores all single-byte strings
45 };
46 
47 } // namespace llm
48 } // namespace extension
49 } // namespace executorch
50 
51 namespace torch {
52 namespace executor {
53 // TODO(T197294990): Remove these deprecated aliases once all users have moved
54 // to the new `::executorch` namespaces.
55 using ::executorch::extension::llm::BPETokenizer;
56 using ::executorch::extension::llm::TokenIndex;
57 } // namespace executor
58 } // namespace torch
59