# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from typing import Any, Dict

import torch
import torch.nn as nn
from executorch.examples.models.llama.llama_transformer import (
    ModelArgs as LlamaModelArgs,
    Transformer as LlamaTransformer,
)
from executorch.extension.gguf_util.load_gguf import GGUFModelArgs, GGUFWeights


def _create_pt_model(
    gguf_model_args: GGUFModelArgs,
) -> nn.Module:
    # Map the GGUF hyperparameters onto the llama_transformer ModelArgs and
    # build an eval-mode Transformer.
    llama_model_args = LlamaModelArgs(
        dim=gguf_model_args.embedding_length,
        n_layers=gguf_model_args.block_count,
        n_heads=gguf_model_args.attention.head_count,
        n_kv_heads=gguf_model_args.attention.head_count_kv,
        vocab_size=gguf_model_args.vocab_size,
        norm_eps=gguf_model_args.attention.layer_norm_rms_epsilon,
        hidden_dim=gguf_model_args.feed_forward_length,
        rope_freq_base=gguf_model_args.rope.freq_base,
    )
    pt_model = LlamaTransformer(llama_model_args)
    pt_model.eval()
    return pt_model


# Substring replacements that map GGUF tensor names onto the module paths
# used by llama_transformer.py.
_name_replacements = [
    ("blk", "layers"),
    ("token_embd", "tok_embeddings"),
    ("attn_q", "attention.wq"),
    ("attn_k", "attention.wk"),
    ("attn_v", "attention.wv"),
    ("attn_output", "attention.wo"),
    ("attn_norm", "attention_norm"),
    ("output_norm.weight", "norm.weight"),
    ("ffn_down", "feed_forward.w2"),
    ("ffn_gate", "feed_forward.w1"),
    ("ffn_up", "feed_forward.w3"),
]


def _convert_gguf_tensor_name_to_llama_nn(gguf_name: str) -> str:
    # Strings are immutable and str.replace returns a new string, so no
    # copy of the input is needed.
    result = gguf_name
    for gguf_string, replacement in _name_replacements:
        result = result.replace(gguf_string, replacement)
    return result
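
# A few representative mappings produced by the replacement table above
# (input names follow GGUF's llama tensor-naming convention):
#   "token_embd.weight"     -> "tok_embeddings.weight"
#   "blk.0.attn_q.weight"   -> "layers.0.attention.wq.weight"
#   "blk.0.ffn_gate.weight" -> "layers.0.feed_forward.w1.weight"
#   "output_norm.weight"    -> "norm.weight"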

def _convert_to_state_dict(gguf_weights: GGUFWeights) -> Dict[str, Any]:
    state_dict: Dict[str, Any] = {}
    for tensor in gguf_weights.tensors:
        gguf_tensor_name = tensor.name
        nn_tensor_name = _convert_gguf_tensor_name_to_llama_nn(gguf_tensor_name)
        # GGUF lists tensor dimensions in the reverse of PyTorch's order, so
        # reverse the shape before reshaping the flat data buffer.
        reversed_shape = tensor.shape[::-1]
        new_tensor = tensor.data.reshape(reversed_shape)
        state_dict[nn_tensor_name] = torch.from_numpy(new_tensor)

    return state_dict


def _load_weights_into_nn(
    pt_model: nn.Module, gguf_model_args: GGUFModelArgs, gguf_weights: GGUFWeights
) -> None:
    state_dict: Dict[str, Any] = _convert_to_state_dict(gguf_weights)

    # Fake-initialize the causal attention masks so that the state dict
    # matches the mask buffers registered by llama_transformer.py.
    for layer_id in range(gguf_model_args.block_count):
        mask_name = f"layers.{layer_id}.attention.mask"
        mask = torch.full(
            (1, 1, pt_model.params.max_seq_len, pt_model.params.max_seq_len),
            float("-inf"),
        )
        mask = torch.triu(mask, diagonal=1)
        state_dict[mask_name] = mask

    pt_model.load_state_dict(state_dict)


def _create_pte_program(pt_model: nn.Module) -> bytes:
    # TODO (mnachin): Export
    return


def convert_to_pte(gguf_model_args: GGUFModelArgs, gguf_weights: GGUFWeights) -> bytes:
    """Convert a GGUF model into an ExecuTorch program.

    Args:
        gguf_model_args: The arguments for the GGUF model.
        gguf_weights: The weights of the GGUF model.
    """
    assert (
        gguf_model_args.arch == "llama"
    ), "Only LLaMa models are supported by this converter."

    # Step 1: Create the PyTorch model
    print("Create the PyTorch model")
    pt_model = _create_pt_model(
        gguf_model_args,
    )

    # Step 2: Load the weights into the PyTorch model
    print("Load the weights into the PyTorch model")
    _load_weights_into_nn(pt_model, gguf_model_args, gguf_weights)

    # Step 3: Export to ExecuTorch
    print("Export to ExecuTorch")
    pte_program = _create_pte_program(pt_model)
    return pte_program
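

# Minimal usage sketch, kept out of the converter proper. It assumes the
# sibling loader `load_file` in load_gguf returns a (GGUFModelArgs,
# GGUFWeights) pair for a .gguf path; adjust if the loader's API differs.
# Since _create_pte_program is still a TODO, the resulting program is
# currently empty.
if __name__ == "__main__":
    import sys

    from executorch.extension.gguf_util.load_gguf import load_file

    gguf_model_args, gguf_weights = load_file(sys.argv[1])  # path to a .gguf file
    pte_program = convert_to_pte(gguf_model_args, gguf_weights)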