Different output from the transformers library and TensorRT-LLM when using LoRA
System Info
A100
Who can help?
@juney-nvidia @ncomly-nvidia @kaiyux @byshiue
Information
- [ ] The official example scripts
- [X] My own modified scripts
Tasks
- [ ] An officially supported task in the `examples` folder (such as GLUE/SQuAD, ...)
- [X] My own task or dataset (give details below)
Reproduction
I want to set LoRA weights at run time for a Llama 3 8B based model, but the outputs of the transformers library and tensorrt_llm differ. When I use the transformers library, my code looks like this:
```python
import json
import os

import numpy as np
import pandas as pd
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig


class RunWithTransformer:
    def __init__(self):
        self.file_path = os.path.dirname(__file__)
        # self.model_config = json.loads(args["model_config"])
        self.model = self.get_model()
        self.tokenizer = self.get_tokenizer()

    def get_model(self):
        base_model_path = self.file_path + "/../../data/base_model/"
        base_model = AutoModelForCausalLM.from_pretrained(
            base_model_path,
            device_map="auto",
            trust_remote_code=True,
            torch_dtype=torch.float16,
        )
        lora_model_path = self.file_path + "/../../data/lora/torch/"
        ft_model = PeftModel.from_pretrained(base_model, lora_model_path)
        ft_model.eval()
        return ft_model

    def get_tokenizer(self):
        tokenizer_path = self.file_path + "/../../data/base_model/"
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
        return tokenizer

    def execute(self, prompts):
        model_input = self.tokenizer(prompts, return_tensors="pt").input_ids.to("cuda")
        config = dict()
        config["do_sample"] = True
        config["max_new_tokens"] = 512
        config["temperature"] = 0.1
        config["top_p"] = 0.96
        config["top_k"] = 1
        config["repetition_penalty"] = 1.2
        config["pad_token_id"] = self.tokenizer.eos_token_id
        model_output = self.model.generate(input_ids=model_input, **config)
        predicted_texts = self.tokenizer.batch_decode(model_output)
        return predicted_texts


if __name__ == "__main__":
    run_with_transformer = RunWithTransformer()
    prompts = ["### Human: \nپیشنهاد غذا برای دورهمی های ایرانی ### Assistant:"]
    run_with_transformer.execute(prompts)
```
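Side note: even with `top_k=1`, `do_sample=True` still routes generation through the sampling path, so when comparing two backends token by token it can help to rule out sampling entirely. A minimal sketch reusing the `run_with_transformer` instance and `prompts` from the script above; the switch to `do_sample=False` / `num_beams=1` is my own suggestion for debugging, not part of the original repro:

```python
# Hypothetical greedy-decoding variant for a deterministic comparison.
# temperature/top_p/top_k are dropped because they are ignored when sampling is off.
greedy_config = {
    "do_sample": False,          # plain greedy decoding
    "num_beams": 1,              # no beam search
    "max_new_tokens": 512,
    "repetition_penalty": 1.2,
    "pad_token_id": run_with_transformer.tokenizer.eos_token_id,
}
model_input = run_with_transformer.tokenizer(prompts, return_tensors="pt").input_ids.to("cuda")
greedy_output = run_with_transformer.model.generate(input_ids=model_input, **greedy_config)
print(run_with_transformer.tokenizer.batch_decode(greedy_output))
```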
To run with tensorrt_llm, I am using the Docker image tritonserver 24.06-trtllm-python-py3; my tensorrt_llm version is 0.10. I run this Python file: https://github.com/NVIDIA/TensorRT-LLM/blob/v0.10.0/examples/llama/convert_checkpoint.py
```bash
python3 src/convert/convert_checkpoint.py --model_dir ./data/base_model \
    --output_dir ./data/tllm_checkpoint \
    --dtype float16

trtllm-build --checkpoint_dir ./data/tllm_checkpoint \
    --output_dir ./data/trt_engines \
    --gpt_attention_plugin float16 \
    --gemm_plugin float16 \
    --remove_input_padding enable \
    --context_fmha enable \
    --lora_plugin float16 \
    --lora_dir ./data/lora/torch \
    --max_lora_rank 256 \
    --lora_target_modules "attn_q" "attn_k" "attn_v" "attn_dense" "mlp_h_to_4h" "mlp_4h_to_h" "mlp_gate"
```
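It can be worth confirming that these LoRA options actually landed in the built engine. A minimal sketch, assuming the engine directory contains a `config.json` (which `trtllm-build` writes next to the engine in the versions I have used); the helper function is my own and the exact key layout differs between TensorRT-LLM versions, so it just searches for anything LoRA-related:

```python
# Sanity check (my own helper, not from the original report): dump anything
# LoRA-related from the engine's config.json to confirm the build picked up
# --lora_plugin / --max_lora_rank / --lora_target_modules as intended.
import json

with open("./data/trt_engines/config.json") as f:
    engine_cfg = json.load(f)

def find_lora_keys(node, path=""):
    # Recursively print every key that mentions "lora"; the exact nesting
    # of the engine config differs between TensorRT-LLM versions.
    if isinstance(node, dict):
        for k, v in node.items():
            p = f"{path}.{k}" if path else k
            if "lora" in k.lower():
                print(p, "=", v)
            find_lora_keys(v, p)
    elif isinstance(node, list):
        for i, v in enumerate(node):
            find_lora_keys(v, f"{path}[{i}]")

find_lora_keys(engine_cfg)
```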
Converting the LoRA weights:
```bash
python3 src/convert/hf_lora_convert.py -i ./data/lora/torch -o ./data/lora/tensorrt --storage-type float16
```
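Before wiring the converted adapter into the executor, I find it useful to inspect the converted artifacts. A small sketch (file names taken from the inference code below; the interpretation of the config rows follows the format I have seen from `hf_lora_convert.py` and may vary by version):

```python
# Quick inspection of the converted LoRA artifacts; purely for debugging
# shapes/dtypes, not required at runtime.
import numpy as np

weights = np.load("./data/lora/tensorrt/model.lora_weights.npy")
config = np.load("./data/lora/tensorrt/model.lora_config.npy")

print("weights:", weights.shape, weights.dtype)  # expected float16 given --storage-type float16
print("config :", config.shape, config.dtype)
# Each config row describes one (module, layer, adapter rank) entry; with 7
# target modules on a 32-layer Llama 3 8B, on the order of 7 * 32 = 224 entries
# would be expected, each reporting rank 256 for this adapter.
```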
At inference time, I run this code:
```python
import importlib
import os

import numpy as np
import torch
from torch import from_numpy

import tensorrt_llm.bindings.executor as trtllm


def import_lib(path, file_name, package_name):
    file_path = path + "/" + file_name + ".py"
    spec = importlib.util.spec_from_file_location(file_name, file_path)
    imported_file = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(imported_file)
    return getattr(imported_file, package_name)


load_tokenizer = import_lib(os.path.dirname(__file__), "/../run/utils", "load_tokenizer")
file_path = os.path.dirname(__file__)


def load_base_model():
    engine_dir = file_path + "/../../data/trt_engines"
    kv_cache_config = trtllm.KvCacheConfig(
        free_gpu_memory_fraction=0.1,
        max_attention_window=None,
        sink_token_length=None)
    executor = trtllm.Executor(
        engine_dir, trtllm.ModelType.DECODER_ONLY,
        trtllm.ExecutorConfig(max_beam_width=1,
                              kv_cache_config=kv_cache_config,
                              medusa_choices=None))
    return executor


def get_sampling_config(**kwargs):
    accepted_parameters = [
        "num_beams", "top_k", "top_p", "top_p_min", "top_p_reset_ids",
        "top_p_decay", "random_seed", "temperature", "min_length",
        "beam_search_diversity_rate", "repetition_penalty",
        "presence_penalty", "frequency_penalty", "length_penalty",
        "early_stopping"
    ]
    rename_params = {"num_beams": "beam_width"}
    sampling_params = {
        k: v for k, v in kwargs.items() if k in accepted_parameters
    }
    for k, v in rename_params.items():
        if k in sampling_params:
            sampling_params[v] = sampling_params.pop(k)
    if "top_p" in sampling_params and sampling_params["top_p"] == 0.0:
        sampling_params["top_p"] = None
    if "temperature" in sampling_params and sampling_params["temperature"] == 0.0:
        print("Convert `temperature=0.0` to `temperature=None` and `top_k=1` to prevent overflow.")
        sampling_params["temperature"] = None
        sampling_params["top_k"] = 1
    # Force the same decoding parameters as used on the transformers side.
    sampling_params["top_k"] = 1
    sampling_params["top_p"] = 0.96
    sampling_params["temperature"] = 0.1
    sampling_params["repetition_penalty"] = 1.2
    sampling_config = trtllm.SamplingConfig(**sampling_params)
    return sampling_config


def get_output_config():
    output_config = trtllm.OutputConfig(
        return_context_logits=False,
        return_generation_logits=False,
        return_log_probs=False,
    )
    return output_config


def get_lora_config_from_request():
    task_id = int(1)
    weights = torch.tensor(np.load(file_path + "/../../data/lora/tensorrt/model.lora_weights.npy")[0])
    config = torch.tensor(np.load(file_path + "/../../data/lora/tensorrt/model.lora_config.npy")[0])
    lora_config = trtllm.LoraConfig(task_id=task_id, weights=weights, config=config)
    return lora_config


def get_tokenizer():
    # tokenizer_dir = file_path + "/../../data/merged_model"
    tokenizer_dir = file_path + "/../../data/base_model"
    tokenizer, pad_id, end_id = load_tokenizer(
        tokenizer_dir=tokenizer_dir,
        vocab_file=None,
        model_name=None,
        model_version=None,
        tokenizer_type=None,
    )
    return tokenizer, pad_id, end_id


def prepare_inputs(batch_input_ids, pad_id, remove_input_padding):
    # Cast to int32
    batch_input_ids = [x.type(torch.int32) for x in batch_input_ids]
    input_lengths = [x.size(0) for x in batch_input_ids]
    max_length = max(input_lengths)
    if remove_input_padding:
        batch_input_ids = torch.concat(batch_input_ids)
    else:
        # Right padding for trt-llm
        paddings = [
            torch.ones(max_length - l, dtype=torch.int32) * pad_id
            for l in input_lengths
        ]
        batch_input_ids = [
            torch.cat([x, pad]) for x, pad in zip(batch_input_ids, paddings)
        ]
        batch_input_ids = torch.stack(batch_input_ids)
    input_lengths = torch.tensor(input_lengths, dtype=torch.int32)
    return batch_input_ids, input_lengths


def make_a_request(executor, input_ids, lora_enable):
    max_new_tokens = 512
    pad_id = 2
    end_id = 2
    stop_words_list = None
    bad_words_list = None
    sampling_config = get_sampling_config()
    streaming = False
    output_config = get_output_config()
    prompt_tuning_config = None
    lora_config = None
    if lora_enable:
        lora_config = get_lora_config_from_request()
    requests = [trtllm.Request(input_token_ids=input_ids,
                               max_new_tokens=max_new_tokens,
                               pad_id=pad_id,
                               end_id=end_id,
                               stop_words=stop_words_list,
                               bad_words=bad_words_list,
                               sampling_config=sampling_config,
                               streaming=streaming,
                               output_config=output_config,
                               prompt_tuning_config=prompt_tuning_config,
                               lora_config=lora_config)]
    request_ids = executor.enqueue_requests(requests)
    multi_responses = executor.await_responses(request_ids)
    response = multi_responses[0][0]
    output_ids = [[[]] for _ in range(len(multi_responses))]
    reqid_pos = request_ids.index(response.request_id)
    for beam, output_tokens in enumerate(response.result.output_token_ids):
        output_ids[reqid_pos][beam] += output_tokens
    print(output_ids[0][0][-10:-1])
    with torch.no_grad():
        output_ids = torch.tensor(output_ids, dtype=torch.int32, device="cuda:0")
    torch.cuda.synchronize()
    return output_ids


prompt = "### Human: \nپیشنهاد غذا برای دورهمی های ایرانی ### Assistant:"
executor = load_base_model()
tokenizer, pad_id, end_id = get_tokenizer()
input_ids = tokenizer.encode(prompt)
output_ids = make_a_request(executor, input_ids, lora_enable=True)
print(tokenizer.decode(output_ids.tolist()[0][0][:]))
```
If I manually set the LoRA weights to zero, both responses are the same, but if I use the real LoRA weights, the outputs are different.
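One way to narrow this down further (sketch only, not part of the repro above) is to fold the adapter into the base weights with PEFT's standard `merge_and_unload()` and run the merged model through the plain transformers path; if the merged model matches the PeftModel output but not the TensorRT-LLM output, the difference comes from how the runtime LoRA weights are applied rather than from the adapter itself:

```python
# Hypothetical cross-check: merge the LoRA delta into the base weights with PEFT,
# then generate with the merged model through transformers only.
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base = AutoModelForCausalLM.from_pretrained(
    "./data/base_model", torch_dtype=torch.float16, device_map="auto")
merged = PeftModel.from_pretrained(base, "./data/lora/torch").merge_and_unload()

tokenizer = AutoTokenizer.from_pretrained("./data/base_model")
prompt = "### Human: \nپیشنهاد غذا برای دورهمی های ایرانی ### Assistant:"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda")
out = merged.generate(input_ids=input_ids, do_sample=False, max_new_tokens=512)
print(tokenizer.decode(out[0]))
```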
Expected behavior
Get the same answer from both transformers and TensorRT-LLM.
Actual behavior
Get a different answer.
Additional notes
My adapter_config.json:
```json
{
  "alpha_pattern": {},
  "auto_mapping": null,
  "base_model_name_or_path": "/path/to/my/model",
  "bias": "none",
  "fan_in_fan_out": false,
  "inference_mode": true,
  "init_lora_weights": true,
  "layers_pattern": null,
  "layers_to_transform": null,
  "loftq_config": {},
  "lora_alpha": 128,
  "lora_dropout": 0.1,
  "megatron_config": null,
  "megatron_core": "megatron.core",
  "modules_to_save": null,
  "peft_type": "LORA",
  "r": 256,
  "rank_pattern": {},
  "revision": null,
  "target_modules": [
    "down_proj",
    "gate_proj",
    "up_proj",
    "k_proj",
    "v_proj",
    "q_proj",
    "o_proj"
  ],
  "task_type": "CAUSAL_LM",
  "use_rslora": false
}
```
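With `r=256` and `lora_alpha=128`, the adapter's scaling factor is `lora_alpha / r = 0.5` (and `use_rslora` is false, so the rank-stabilized `lora_alpha / sqrt(r)` variant does not apply). A tiny illustration of the update both backends should effectively compute; the shapes are made up for the example:

```python
# Illustration only: how the LoRA delta is scaled with this adapter_config.
# W_eff = W + (lora_alpha / r) * (B @ A); here lora_alpha / r = 128 / 256 = 0.5.
import torch

r, d_in, d_out = 256, 4096, 4096   # hypothetical shapes for one q_proj
lora_alpha = 128
scaling = lora_alpha / r            # 0.5 (rsLoRA would use lora_alpha / sqrt(r))

A = torch.randn(r, d_in) * 0.01     # lora_A
B = torch.zeros(d_out, r)           # lora_B (zero-initialized at training start)
delta_w = scaling * (B @ A)         # the delta applied on top of the base weight
print(delta_w.shape, scaling)
```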
Hi @Alireza3242, tritonserver 24.07-trtllm-python-py3 has been released, which contains tensorrt_llm 0.11. Could you please try it?
Hi @QiJune, this problem is solved with tritonserver 24.07-trtllm-python-py3. I also have another question: tensorrt_llm supports ['attn_q', 'attn_v', 'attn_k', 'attn_qkv', ...] layers for LoRA, but it does not support "lm_head". Why is that?