/*
 * Copyright (c) Qualcomm Innovation Center, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

/**
 * @file
 *
 * This tool can run Llama3 8b with Qualcomm AI Engine Direct.
 *
 * Users can specify arguments such as the desired prompt, eval_mode, etc.;
 * see the example invocation after the flag definitions below.
 */

#include <executorch/backends/qualcomm/runtime/QnnExecuTorch.h>
#include <executorch/examples/qualcomm/qaihub_scripts/llama/runner/runner.h>
#include <executorch/runtime/platform/log.h>

#include <gflags/gflags.h>

#include <fstream>
#include <string>
#include <vector>

DEFINE_string(sharded_1_path, "", "Path to 1st sharded pte file");
DEFINE_string(sharded_2_path, "", "Path to 2nd sharded pte file");
DEFINE_string(sharded_3_path, "", "Path to 3rd sharded pte file");
DEFINE_string(sharded_4_path, "", "Path to 4th sharded pte file");
DEFINE_string(sharded_5_path, "", "Path to 5th sharded pte file");

DEFINE_string(freq_cos_path, "", "Path to precomputed cos position embeddings");
DEFINE_string(freq_sin_path, "", "Path to precomputed sin position embeddings");

DEFINE_string(output_path, "outputs", "ExecuTorch inference data output path.");
DEFINE_string(tokenizer_path, "tokenizer.bin", "Path to the tokenizer binary file.");
DEFINE_string(prompt, "The answer to the ultimate question is", "Prompt.");
DEFINE_string(
    system_prompt,
    "",
    "Tells the model what kind of assistant it should be. For example: \"You are a helpful AI assistant for travel tips and recommendations.\" Defaults to empty (no system prompt).");
DEFINE_double(
    temperature,
    0.0f,
    "Sampling temperature. Default is 0.0f; 0 means greedy argmax sampling (deterministic), and lower temperatures are more deterministic.");
DEFINE_int32(
    eval_mode,
    0,
    "0: PromptProcessor / 1: TokenGenerator / 2: MixedMode (TBD)");
DEFINE_int32(
    seq_len,
    128,
    "Total number of tokens to generate (prompt + output). If the number of input tokens + seq_len > max_seq_len, the output will be truncated to max_seq_len tokens.");
DEFINE_double(logits_scale, 0.0, "Scale of the model's quantized output logits.");
DEFINE_int32(logits_offset, 0, "Offset of the model's quantized output logits.");
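
// Example invocation (a minimal sketch; the binary name and file names below
// are illustrative placeholders, and logits_scale / logits_offset should
// match the quantization parameters of the exported model):
//
//   ./qaihub_llama3_8b_runner \
//       --sharded_1_path llama3_8b_0.pte \
//       --sharded_2_path llama3_8b_1.pte \
//       --sharded_3_path llama3_8b_2.pte \
//       --sharded_4_path llama3_8b_3.pte \
//       --sharded_5_path llama3_8b_4.pte \
//       --freq_cos_path freq_cos.raw \
//       --freq_sin_path freq_sin.raw \
//       --tokenizer_path tokenizer.bin \
//       --eval_mode 0 \
//       --seq_len 128 \
//       --logits_scale 0.1 \
//       --logits_offset 100 \
//       --prompt "The answer to the ultimate question is" \
//       --output_path outputs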

int main(int argc, char** argv) {
  gflags::ParseCommandLineFlags(&argc, &argv, true);

  // gather the sharded program files in load order
  std::vector<std::string> models_path = {
      FLAGS_sharded_1_path,
      FLAGS_sharded_2_path,
      FLAGS_sharded_3_path,
      FLAGS_sharded_4_path,
      FLAGS_sharded_5_path};
  // precomputed position-embedding tables (cos and sin)
  std::vector<std::string> pos_embs_path = {
      FLAGS_freq_cos_path, FLAGS_freq_sin_path};

  // create llama runner
  example::Runner runner(
      models_path,
      pos_embs_path,
      // decoder layers per shard (4 + 8 + 8 + 8 + 4 = 32, matching the 32
      // decoder layers of Llama3 8b)
      {4, 8, 8, 8, 4},
      FLAGS_tokenizer_path.c_str(),
      FLAGS_eval_mode,
      FLAGS_temperature,
      FLAGS_logits_scale,
      FLAGS_logits_offset);

  // generate tokens & store inference output; the callback streams each
  // decoded piece into the output file as it is produced
  std::ofstream fout(FLAGS_output_path.c_str());
  runner.generate(
      FLAGS_prompt,
      FLAGS_system_prompt,
      FLAGS_seq_len,
      [&](const std::string& piece) { fout << piece; });
  fout.close();
  return 0;
}