Different output from the transformers library and TensorRT-LLM when using LoRA
System Info
A100
Who can help?
@juney-nvidia @ncomly-nvidia @kaiyux @byshiue
Information
- [ ] The official example scripts
- [X] My own modified scripts
Tasks
- [ ] An officially supported task in the `examples` folder (such as GLUE/SQuAD, ...)
- [X] My own task or dataset (give details below)
Reproduction
I want to set LoRA weights at run time for a Llama 3 8B based model, but the outputs of the transformers library and tensorrt_llm differ. When I use the transformers library, my code looks like this:
```python
import json
import os

import numpy as np
import pandas as pd
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig


class RunWithTransformer:
    def __init__(self):
        self.file_path = os.path.dirname(__file__)
        # self.model_config = json.loads(args["model_config"])
        self.model = self.get_model()
        self.tokenizer = self.get_tokenizer()

    def get_model(self):
        base_model_path = self.file_path + "/../../data/base_model/"
        base_model = AutoModelForCausalLM.from_pretrained(
            base_model_path,
            device_map="auto",
            trust_remote_code=True,
            torch_dtype=torch.float16,
        )
        lora_model_path = self.file_path + "/../../data/lora/torch/"
        ft_model = PeftModel.from_pretrained(base_model, lora_model_path)
        ft_model.eval()
        return ft_model

    def get_tokenizer(self):
        tokenizer_path = self.file_path + "/../../data/base_model/"
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
        return tokenizer

    def execute(self, prompts):
        model_input = self.tokenizer(prompts, return_tensors="pt").input_ids.to("cuda")
        config = dict()
        config["do_sample"] = True
        config["max_new_tokens"] = 512
        config["temperature"] = 0.1
        config["top_p"] = 0.96
        config["top_k"] = 1
        config["repetition_penalty"] = 1.2
        config["pad_token_id"] = self.tokenizer.eos_token_id
        model_output = self.model.generate(input_ids=model_input, **config)
        predicted_texts = self.tokenizer.batch_decode(model_output)
        return predicted_texts


if __name__ == "__main__":
    run_with_transformer = RunWithTransformer()
    prompts = ["### Human: \nپیشنهاد غذا برای دورهمی های ایرانی ### Assistant:"]
    run_with_transformer.execute(prompts)
```
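Side note: even with `top_k=1`, `do_sample=True` still routes generation through the sampling path, so when comparing two backends token by token it can help to rule out sampling entirely. A minimal sketch reusing the `run_with_transformer` instance and `prompts` from the script above; the switch to `do_sample=False` / `num_beams=1` is my own suggestion for debugging, not part of the original repro:

```python
# Hypothetical greedy-decoding variant for a deterministic comparison.
# temperature/top_p/top_k are dropped because they are ignored when sampling is off.
greedy_config = {
    "do_sample": False,          # plain greedy decoding
    "num_beams": 1,              # no beam search
    "max_new_tokens": 512,
    "repetition_penalty": 1.2,
    "pad_token_id": run_with_transformer.tokenizer.eos_token_id,
}
model_input = run_with_transformer.tokenizer(prompts, return_tensors="pt").input_ids.to("cuda")
greedy_output = run_with_transformer.model.generate(input_ids=model_input, **greedy_config)
print(run_with_transformer.tokenizer.batch_decode(greedy_output))
```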
To run with tensorrt_llm, I am using the Docker image tritonserver 24.06-trtllm-python-py3; my tensorrt_llm version is 0.10. I run this Python file: https://github.com/NVIDIA/TensorRT-LLM/blob/v0.10.0/examples/llama/convert_checkpoint.py
```bash
python3 src/convert/convert_checkpoint.py --model_dir ./data/base_model \
    --output_dir ./data/tllm_checkpoint \
    --dtype float16

trtllm-build --checkpoint_dir ./data/tllm_checkpoint \
    --output_dir ./data/trt_engines \
    --gpt_attention_plugin float16 \
    --gemm_plugin float16 \
    --remove_input_padding enable \
    --context_fmha enable \
    --lora_plugin float16 \
    --lora_dir ./data/lora/torch \
    --max_lora_rank 256 \
    --lora_target_modules "attn_q" "attn_k" "attn_v" "attn_dense" "mlp_h_to_4h" "mlp_4h_to_h" "mlp_gate"
```
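It can be worth confirming that these LoRA options actually landed in the built engine. A minimal sketch, assuming the engine directory contains a `config.json` (which `trtllm-build` writes next to the engine in the versions I have used); the helper function is my own and the exact key layout differs between TensorRT-LLM versions, so it just searches for anything LoRA-related:

```python
# Sanity check (my own helper, not from the original report): dump anything
# LoRA-related from the engine's config.json to confirm the build picked up
# --lora_plugin / --max_lora_rank / --lora_target_modules as intended.
import json

with open("./data/trt_engines/config.json") as f:
    engine_cfg = json.load(f)

def find_lora_keys(node, path=""):
    # Recursively print every key that mentions "lora"; the exact nesting
    # of the engine config differs between TensorRT-LLM versions.
    if isinstance(node, dict):
        for k, v in node.items():
            p = f"{path}.{k}" if path else k
            if "lora" in k.lower():
                print(p, "=", v)
            find_lora_keys(v, p)
    elif isinstance(node, list):
        for i, v in enumerate(node):
            find_lora_keys(v, f"{path}[{i}]")

find_lora_keys(engine_cfg)
```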
Converting the LoRA weights:
```bash
python3 src/convert/hf_lora_convert.py -i ./data/lora/torch -o ./data/lora/tensorrt --storage-type float16
```
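Before wiring the converted adapter into the executor, I find it useful to inspect the converted artifacts. A small sketch (file names taken from the inference code below; the interpretation of the config rows follows the format I have seen from `hf_lora_convert.py` and may vary by version):

```python
# Quick inspection of the converted LoRA artifacts; purely for debugging
# shapes/dtypes, not required at runtime.
import numpy as np

weights = np.load("./data/lora/tensorrt/model.lora_weights.npy")
config = np.load("./data/lora/tensorrt/model.lora_config.npy")

print("weights:", weights.shape, weights.dtype)  # expected float16 given --storage-type float16
print("config :", config.shape, config.dtype)
# Each config row describes one (module, layer, adapter rank) entry; with 7
# target modules on a 32-layer Llama 3 8B, on the order of 7 * 32 = 224 entries
# would be expected, each reporting rank 256 for this adapter.
```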
At inference time, I run this code:
```python
import importlib
import os

import numpy as np
import torch
from torch import from_numpy

import tensorrt_llm.bindings.executor as trtllm


def import_lib(path, file_name, package_name):
    file_path = path + "/" + file_name + ".py"
    spec = importlib.util.spec_from_file_location(file_name, file_path)
    imported_file = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(imported_file)
    return getattr(imported_file, package_name)


load_tokenizer = import_lib(os.path.dirname(__file__), "/../run/utils", "load_tokenizer")
file_path = os.path.dirname(__file__)


def load_base_model():
    engine_dir = file_path + "/../../data/trt_engines"
    kv_cache_config = trtllm.KvCacheConfig(
        free_gpu_memory_fraction=0.1,
        max_attention_window=None,
        sink_token_length=None)
    executor = trtllm.Executor(
        engine_dir, trtllm.ModelType.DECODER_ONLY,
        trtllm.ExecutorConfig(max_beam_width=1,
                              kv_cache_config=kv_cache_config,
                              medusa_choices=None))
    return executor


def get_sampling_config(**kwargs):
    accepted_parameters = [
        "num_beams", "top_k", "top_p", "top_p_min", "top_p_reset_ids",
        "top_p_decay", "random_seed", "temperature", "min_length",
        "beam_search_diversity_rate", "repetition_penalty",
        "presence_penalty", "frequency_penalty", "length_penalty",
        "early_stopping"
    ]
    rename_params = {"num_beams": "beam_width"}
    sampling_params = {
        k: v for k, v in kwargs.items() if k in accepted_parameters
    }
    for k, v in rename_params.items():
        if k in sampling_params:
            sampling_params[v] = sampling_params.pop(k)
    if "top_p" in sampling_params and sampling_params["top_p"] == 0.0:
        sampling_params["top_p"] = None
    if "temperature" in sampling_params and sampling_params["temperature"] == 0.0:
        print("Convert `temperature=0.0` to `temperature=None` and `top_k=1` to prevent overflow.")
        sampling_params["temperature"] = None
        sampling_params["top_k"] = 1
    # Force the same decoding parameters as used on the transformers side.
    sampling_params["top_k"] = 1
    sampling_params["top_p"] = 0.96
    sampling_params["temperature"] = 0.1
    sampling_params["repetition_penalty"] = 1.2
    sampling_config = trtllm.SamplingConfig(**sampling_params)
    return sampling_config


def get_output_config():
    output_config = trtllm.OutputConfig(
        return_context_logits=False,
        return_generation_logits=False,
        return_log_probs=False,
    )
    return output_config


def get_lora_config_from_request():
    task_id = int(1)
    weights = torch.tensor(np.load(file_path + "/../../data/lora/tensorrt/model.lora_weights.npy")[0])
    config = torch.tensor(np.load(file_path + "/../../data/lora/tensorrt/model.lora_config.npy")[0])
    lora_config = trtllm.LoraConfig(task_id=task_id, weights=weights, config=config)
    return lora_config


def get_tokenizer():
    # tokenizer_dir = file_path + "/../../data/merged_model"
    tokenizer_dir = file_path + "/../../data/base_model"
    tokenizer, pad_id, end_id = load_tokenizer(
        tokenizer_dir=tokenizer_dir,
        vocab_file=None,
        model_name=None,
        model_version=None,
        tokenizer_type=None,
    )
    return tokenizer, pad_id, end_id


def prepare_inputs(batch_input_ids, pad_id, remove_input_padding):
    # Cast to int32
    batch_input_ids = [x.type(torch.int32) for x in batch_input_ids]
    input_lengths = [x.size(0) for x in batch_input_ids]
    max_length = max(input_lengths)
    if remove_input_padding:
        batch_input_ids = torch.concat(batch_input_ids)
    else:
        # Right padding for trt-llm
        paddings = [
            torch.ones(max_length - l, dtype=torch.int32) * pad_id
            for l in input_lengths
        ]
        batch_input_ids = [
            torch.cat([x, pad]) for x, pad in zip(batch_input_ids, paddings)
        ]
        batch_input_ids = torch.stack(batch_input_ids)
    input_lengths = torch.tensor(input_lengths, dtype=torch.int32)
    return batch_input_ids, input_lengths


def make_a_request(executor, input_ids, lora_enable):
    max_new_tokens = 512
    pad_id = 2
    end_id = 2
    stop_words_list = None
    bad_words_list = None
    sampling_config = get_sampling_config()
    streaming = False
    output_config = get_output_config()
    prompt_tuning_config = None
    lora_config = None
    if lora_enable:
        lora_config = get_lora_config_from_request()
    requests = [trtllm.Request(input_token_ids=input_ids,
                               max_new_tokens=max_new_tokens,
                               pad_id=pad_id,
                               end_id=end_id,
                               stop_words=stop_words_list,
                               bad_words=bad_words_list,
                               sampling_config=sampling_config,
                               streaming=streaming,
                               output_config=output_config,
                               prompt_tuning_config=prompt_tuning_config,
                               lora_config=lora_config)]
    request_ids = executor.enqueue_requests(requests)
    multi_responses = executor.await_responses(request_ids)
    response = multi_responses[0][0]
    output_ids = [[[]] for _ in range(len(multi_responses))]
    reqid_pos = request_ids.index(response.request_id)
    for beam, output_tokens in enumerate(response.result.output_token_ids):
        output_ids[reqid_pos][beam] += output_tokens
    print(output_ids[0][0][-10:-1])
    with torch.no_grad():
        output_ids = torch.tensor(output_ids, dtype=torch.int32, device="cuda:0")
    torch.cuda.synchronize()
    return output_ids


prompt = "### Human: \nپیشنهاد غذا برای دورهمی های ایرانی ### Assistant:"
executor = load_base_model()
tokenizer, pad_id, end_id = get_tokenizer()
input_ids = tokenizer.encode(prompt)
output_ids = make_a_request(executor, input_ids, lora_enable=True)
print(tokenizer.decode(output_ids.tolist()[0][0][:]))
```
If I manually set the LoRA weights to zero, both responses are the same, but if I use the real LoRA weights, the outputs are different.
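One way to narrow this down further (sketch only, not part of the repro above) is to fold the adapter into the base weights with PEFT's standard `merge_and_unload()` and run the merged model through the plain transformers path; if the merged model matches the PeftModel output but not the TensorRT-LLM output, the difference comes from how the runtime LoRA weights are applied rather than from the adapter itself:

```python
# Hypothetical cross-check: merge the LoRA delta into the base weights with PEFT,
# then generate with the merged model through transformers only.
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base = AutoModelForCausalLM.from_pretrained(
    "./data/base_model", torch_dtype=torch.float16, device_map="auto")
merged = PeftModel.from_pretrained(base, "./data/lora/torch").merge_and_unload()

tokenizer = AutoTokenizer.from_pretrained("./data/base_model")
prompt = "### Human: \nپیشنهاد غذا برای دورهمی های ایرانی ### Assistant:"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda")
out = merged.generate(input_ids=input_ids, do_sample=False, max_new_tokens=512)
print(tokenizer.decode(out[0]))
```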
Expected behavior
Get the same answer from both transformers and TensorRT-LLM.
Actual behavior
Get a different answer.
Additional notes
My adapter_config.json:
```json
{
  "alpha_pattern": {},
  "auto_mapping": null,
  "base_model_name_or_path": "/path/to/my/model",
  "bias": "none",
  "fan_in_fan_out": false,
  "inference_mode": true,
  "init_lora_weights": true,
  "layers_pattern": null,
  "layers_to_transform": null,
  "loftq_config": {},
  "lora_alpha": 128,
  "lora_dropout": 0.1,
  "megatron_config": null,
  "megatron_core": "megatron.core",
  "modules_to_save": null,
  "peft_type": "LORA",
  "r": 256,
  "rank_pattern": {},
  "revision": null,
  "target_modules": [
    "down_proj",
    "gate_proj",
    "up_proj",
    "k_proj",
    "v_proj",
    "q_proj",
    "o_proj"
  ],
  "task_type": "CAUSAL_LM",
  "use_rslora": false
}
```
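With `r=256` and `lora_alpha=128`, the adapter's scaling factor is `lora_alpha / r = 0.5` (and `use_rslora` is false, so the rank-stabilized `lora_alpha / sqrt(r)` variant does not apply). A tiny illustration of the update both backends should effectively compute; the shapes are made up for the example:

```python
# Illustration only: how the LoRA delta is scaled with this adapter_config.
# W_eff = W + (lora_alpha / r) * (B @ A); here lora_alpha / r = 128 / 256 = 0.5.
import torch

r, d_in, d_out = 256, 4096, 4096   # hypothetical shapes for one q_proj
lora_alpha = 128
scaling = lora_alpha / r            # 0.5 (rsLoRA would use lora_alpha / sqrt(r))

A = torch.randn(r, d_in) * 0.01     # lora_A
B = torch.zeros(d_out, r)           # lora_B (zero-initialized at training start)
delta_w = scaling * (B @ A)         # the delta applied on top of the base weight
print(delta_w.shape, scaling)
```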
Hi @Alireza3242, tritonserver 24.07-trtllm-python-py3 has been released, which contains tensorrt_llm 0.11. Could you please try it?
Hi @QiJune, this problem is solved with tritonserver 24.07-trtllm-python-py3. I also have another question: tensorrt_llm supports ['attn_q', 'attn_v', 'attn_k', 'attn_qkv', ...] layers for LoRA, but it does not support "lm_head". Why is that?