# Any targets that should be shared between fbcode and xplat must be defined in
# targets.bzl. This file can contain fbcode-only targets.

load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
load(":targets.bzl", "define_common_targets")

# NOTE: "verision" is the spelling of the symbol this file loads from
# qnn_version.bzl; it has to match the name defined there.
load("@fbsource//xplat/executorch/backends/qualcomm/qnn_version.bzl", "get_qnn_library_verision")

oncall("executorch")

# Instantiate the targets defined in targets.bzl.
define_common_targets()

# Python library packaging llama_transformer.py and rope.py under the
# executorch.examples.models.llama base module.
runtime.python_library(
    name = "llama_transformer",
    srcs = [
        "llama_transformer.py",
        "rope.py",
    ],
    _is_external_target = True,
    base_module = "executorch.examples.models.llama",
    visibility = [
        "//executorch/...",
        "@EXECUTORCH_CLIENTS",
    ],
    deps = [
        "//caffe2:torch",
    ],
)

# Python library packaging the model entry points (__init__.py, fairseq2.py,
# model.py) together with the params target exposed as the "params" resource.
runtime.python_library(
    name = "llama2_model",
    srcs = [
        "__init__.py",
        "fairseq2.py",
        "model.py",
    ],
    _is_external_target = True,
    base_module = "executorch.examples.models.llama",
    resources = {
        "//executorch/examples/models/llama/params:params": "params",
    },
    visibility = [
        "//bento/...",
        "//bento_kernels/...",
        "//executorch/...",
    ],
    deps = [
        "//caffe2:torch",
        "//executorch/examples/models:model_base",
        "//executorch/examples/models/llama:llama_transformer",
        "//executorch/examples/models:checkpoint",
    ],
)

# Binary whose entry point is export_llama.main; built on top of
# :export_library with the ATen pybindings.
runtime.python_binary(
    name = "export_llama",
    main_function = "executorch.examples.models.llama.export_llama.main",
    # visibility = ["//executorch/examples/..."],
    preload_deps = [
        "//executorch/extension/llm/custom_ops:model_sharding_py",
        "//executorch/extension/llm/custom_ops:custom_ops_aot_lib",
        "//executorch/kernels/quantized:aot_lib",
    ],
    deps = [
        ":export_library",
        "//caffe2:torch",
        "//executorch/extension/pybindings:aten_lib",
    ],
)

# Alias for :export_llama that sets LD_LIBRARY_PATH to the location of the
# qnn_offline_compile_libs target for the QNN version returned by
# get_qnn_library_verision().
runtime.command_alias(
    name = "export_llama_qnn",
    env = {
        "LD_LIBRARY_PATH": "$(location fbsource//third-party/qualcomm/qnn/qnn-{0}:qnn_offline_compile_libs)".format(get_qnn_library_verision()),
    },
    exe = ":export_llama",
)

# Python library packaging the export entry points, model.py and the
# source_transformation/ modules listed below.
runtime.python_library(
    name = "export_library",
    srcs = [
        "export_llama.py",
        "export_llama_lib.py",
        "model.py",
        "source_transformation/apply_spin_quant_r1_r2.py",
        "source_transformation/attention.py",
        "source_transformation/lora.py",
        "source_transformation/pre_quantization.py",
        "source_transformation/prune_vocab.py",
        "source_transformation/quantize.py",
        "source_transformation/quantized_kv_cache.py",
        "source_transformation/rms_norm.py",
        "source_transformation/rope.py",
        "source_transformation/sdpa.py",
        "source_transformation/spin_quant.py",
        "source_transformation/vulkan_rope.py",
    ],
    _is_external_target = True,
    base_module = "executorch.examples.models.llama",
    visibility = [
        "//bento/...",
        "//bento_kernels/...",
        "//executorch/examples/...",
        "@EXECUTORCH_CLIENTS",
    ],
    deps = [
        "//ai_codesign/gen_ai/fast_hadamard_transform:fast_hadamard_transform",
        "//caffe2:torch",
        "//executorch/backends/vulkan/_passes:vulkan_passes",
        "//executorch/examples/models:model_base",
        "//executorch/examples/models:models",
        "//executorch/extension/llm/custom_ops:custom_ops_aot_py",
        "//executorch/extension/llm/export:export_lib",
        # One of the following pybindings definitions has to be included by
        # the user of this library, depending on which one the client wants
        # to use. They are intentionally NOT listed as deps here:
        # "//executorch/extension/pybindings:aten_lib",
        # "//executorch/extension/pybindings:portable_lib",
        # "//executorch/extension/pybindings:portable_lib_plus_custom",
        "//executorch/devtools/etrecord:etrecord",
        "//executorch/util:memory_profiler",
        "//executorch/util:python_profiler",
        "fbsource//third-party/pypi/coremltools:coremltools",
        "fbsource//third-party/pypi/sentencepiece:sentencepiece",
        "//pytorch/ao:torchao",
    ],
)

# Binary whose entry point is eval_llama.main; built on top of :eval_library.
runtime.python_binary(
    name = "eval_llama",
    main_function = "executorch.examples.models.llama.eval_llama.main",
    deps = [
        ":eval_library",
        "//caffe2:torch",
    ],
)

# Python library packaging the evaluation entry points and
# evaluate/eager_eval.py; depends on :export_library and the portable
# pybindings.
runtime.python_library(
    name = "eval_library",
    srcs = [
        "eval_llama.py",
        "eval_llama_lib.py",
        "evaluate/eager_eval.py",
    ],
    _is_external_target = True,
    base_module = "executorch.examples.models.llama",
    visibility = [
        "//bento/...",
        "//bento_kernels/...",
        "//executorch/examples/...",
        "@EXECUTORCH_CLIENTS",
    ],
    deps = [
        "fbsource//third-party/pypi/lm-eval:lm-eval",
        "fbsource//third-party/pypi/tiktoken:tiktoken",
        ":export_library",
        "//executorch/examples/models/llama/tokenizer:tiktoken_py",
        "//executorch/extension/llm/export:export_lib",
        "//executorch/extension/llm/tokenizer:tokenizer_py_lib",
        "//executorch/extension/pybindings:portable_lib",
    ],
)

# Standalone library for source_transformation/quantized_kv_cache.py
# (the same file is also listed in :export_library's srcs).
runtime.python_library(
    name = "quantized_kv_cache",
    srcs = [
        "source_transformation/quantized_kv_cache.py",
    ],
    _is_external_target = True,
    visibility = ["//executorch/..."],
    deps = [
        "//caffe2:torch",
    ],
)

# Standalone library for source_transformation/sdpa.py
# (the same file is also listed in :export_library's srcs).
runtime.python_library(
    name = "sdpa",
    srcs = [
        "source_transformation/sdpa.py",
    ],
    _is_external_target = True,
    visibility = ["//executorch/..."],
    deps = [
        "//caffe2:torch",
    ],
)

# Test target for source_transformation/test_quantized_kv_cache.py.
runtime.python_test(
    name = "quantized_kv_cache_test",
    srcs = [
        "source_transformation/test_quantized_kv_cache.py",
    ],
    preload_deps = [
        "//executorch/extension/llm/custom_ops:custom_ops_aot_lib",
    ],
    deps = [
        ":quantized_kv_cache",
        "//caffe2:torch",
        "//executorch/examples/models/llama:llama_transformer",
    ],
)

# Test target for source_transformation/test_sdpa_with_quantized_kv_cache.py.
runtime.python_test(
    name = "quantized_sdpa_with_kv_cache_test",
    srcs = [
        "source_transformation/test_sdpa_with_quantized_kv_cache.py",
    ],
    preload_deps = [
        "//executorch/extension/llm/custom_ops:custom_ops_aot_lib",
    ],
    deps = [
        ":quantized_kv_cache",
        ":sdpa",
        "//caffe2:torch",
        "//executorch/examples/models/llama:llama_transformer",
    ],
)