/*
 * Copyright (c) Qualcomm Innovation Center, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

/**
 * @file
 *
 * This tool runs Llama3 8B with Qualcomm AI Engine Direct.
 *
 * Users can specify arguments such as the prompt, eval_mode, etc.
 */
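
/*
 * Example invocation (binary name and file paths are illustrative; substitute
 * the artifacts produced by the export scripts for your device):
 *
 *   ./qaihub_llama3_8b_runner \
 *       --sharded_1_path llama3_8b_shard1.pte \
 *       --sharded_2_path llama3_8b_shard2.pte \
 *       --sharded_3_path llama3_8b_shard3.pte \
 *       --sharded_4_path llama3_8b_shard4.pte \
 *       --sharded_5_path llama3_8b_shard5.pte \
 *       --freq_cos_path freq_cos.raw \
 *       --freq_sin_path freq_sin.raw \
 *       --tokenizer_path tokenizer.bin \
 *       --prompt "The answer to the ultimate question is" \
 *       --seq_len 128 \
 *       --output_path outputs.txt
 */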

#include <executorch/backends/qualcomm/runtime/QnnExecuTorch.h>
#include <executorch/examples/qualcomm/qaihub_scripts/llama/runner/runner.h>
#include <executorch/runtime/platform/log.h>

#include <gflags/gflags.h>

#include <fstream>

DEFINE_string(sharded_1_path, "", "Path to 1st sharded pte file");
DEFINE_string(sharded_2_path, "", "Path to 2nd sharded pte file");
DEFINE_string(sharded_3_path, "", "Path to 3rd sharded pte file");
DEFINE_string(sharded_4_path, "", "Path to 4th sharded pte file");
DEFINE_string(sharded_5_path, "", "Path to 5th sharded pte file");

DEFINE_string(freq_cos_path, "", "Path to precomputed cosine position embeddings");
DEFINE_string(freq_sin_path, "", "Path to precomputed sine position embeddings");

DEFINE_string(output_path, "outputs", "ExecuTorch inference data output path.");
DEFINE_string(tokenizer_path, "tokenizer.bin", "Path to tokenizer binary file.");
DEFINE_string(prompt, "The answer to the ultimate question is", "Prompt.");
DEFINE_string(
    system_prompt,
    "",
    "Tells the model what kind of assistant it should be. For example, \"You are a helpful AI assistant for travel tips and recommendations.\" Default is empty.");
DEFINE_double(
    temperature,
    0.0f,
    "Temperature; Default is 0.0f. 0 = greedy argmax sampling (deterministic). Lower temperature = more deterministic");
DEFINE_int32(
    eval_mode,
    0,
    "0: PromptProcessor / 1: TokenGenerator / 2: MixedMode (TBD)");
DEFINE_int32(
    seq_len,
    128,
    "Total number of tokens to generate (prompt + output). Defaults to max_seq_len. If the number of input tokens + seq_len > max_seq_len, the output will be truncated to max_seq_len tokens.");
DEFINE_double(logits_scale, 0.0, "Scale used to dequantize output logits");
DEFINE_int32(logits_offset, 0, "Offset used to dequantize output logits");

int main(int argc, char** argv) {
  gflags::ParseCommandLineFlags(&argc, &argv, true);

  std::vector<std::string> models_path = {
      FLAGS_sharded_1_path,
      FLAGS_sharded_2_path,
      FLAGS_sharded_3_path,
      FLAGS_sharded_4_path,
      FLAGS_sharded_5_path};
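
  // Precomputed position-embedding tables (rotary cos/sin frequencies);
  // the cos table is passed first, followed by the sin table.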
  std::vector<std::string> pos_embs_path = {
      FLAGS_freq_cos_path, FLAGS_freq_sin_path};

  // Create the Llama runner. {4, 8, 8, 8, 4} is the per-shard layer split
  // (4 + 8 + 8 + 8 + 4 = 32 decoder layers for Llama3 8B).
  example::Runner runner(
      models_path,
      pos_embs_path,
      {4, 8, 8, 8, 4},
      FLAGS_tokenizer_path.c_str(),
      FLAGS_eval_mode,
      FLAGS_temperature,
      FLAGS_logits_scale,
      FLAGS_logits_offset);

  // generate tokens & store inference output
  std::ofstream fout(FLAGS_output_path.c_str());
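  // Fail fast if the output file cannot be opened for writing.
  if (!fout.is_open()) {
    ET_LOG(Error, "Failed to open output path: %s", FLAGS_output_path.c_str());
    return 1;
  }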
  runner.generate(
      FLAGS_prompt,
      FLAGS_system_prompt,
      FLAGS_seq_len,
      [&](const std::string& piece) { fout << piece; });
  fout.close();
  return 0;
}