Quantized model behaves differently on multiple GPUs with device_map="auto" compared to a single GPU
I quantized Llama-2-70B into INT8 format using the vLLM example.
I found that if I load the model with device_map="auto" on 2 GPUs, the hidden states produced by the second half of the decoder layers (the ones placed on cuda:1) differ from the single-GPU case. Here is my script, modified from Spec-Bench:
from typing import Optional, Callable
import torch
import argparse
from evaluation.eval import run_eval, reorg_answer_file
import pdb
from fastchat.utils import str_to_torch_dtype
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationMixin
from evaluation.decoding_tp import _assisted_decoding
from gptqmodel import GPTQModel, QuantizeConfig
def sps_forward(inputs, model, tokenizer, max_new_tokens, do_sample=False, temperature=0.0, drafter=None):
    input_ids = inputs.input_ids
    model.generation_config.max_new_tokens = max_new_tokens
    model.generation_config.output_hidden_states = True
    # the patched _assisted_decoding returns (output_ids, step_index, accept_length_list)
    output_ids, idx, accept_length_list = model.generate(
        **inputs,
        generation_config=model.generation_config,
        assistant_model=drafter,
        do_sample=do_sample,
        temperature=temperature,
        return_dict_in_generate=True,
        output_hidden_states=True,
    )
    new_token = len(output_ids[0][len(input_ids[0]):])
    return output_ids, new_token, idx + 1, accept_length_list
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model-path",
        type=str,
        required=True,
    )
    parser.add_argument(
        "--drafter-path",
        type=str,
        required=True,
    )
    parser.add_argument("--model-id", type=str, required=True)
    parser.add_argument(
        "--bench-name",
        type=str,
        default="mt_bench",
        help="The name of the benchmark question set.",
    )
    parser.add_argument(
        "--question-begin",
        type=int,
        help="A debug option. The begin index of questions.",
    )
    parser.add_argument(
        "--question-end",
        type=int,
        help="A debug option. The end index of questions.",
    )
    parser.add_argument("--answer-file", type=str, help="The output answer file.")
    parser.add_argument(
        "--max-new-tokens",
        type=int,
        default=1024,
        help="The maximum number of new generated tokens.",
    )
    parser.add_argument(
        "--num-choices",
        type=int,
        default=1,
        help="How many completion choices to generate.",
    )
    parser.add_argument(
        "--num-gpus-per-model",
        type=int,
        default=1,
        help="The number of GPUs per model.",
    )
    parser.add_argument(
        "--num-gpus-total", type=int, default=1, help="The total number of GPUs."
    )
    parser.add_argument(
        "--temperature",
        type=float,
        default=0.0,
        help="The temperature for medusa sampling.",
    )
    parser.add_argument(
        "--dtype",
        type=str,
        default="float16",
        choices=["float32", "float64", "float16", "bfloat16"],
        help="Override the default dtype. If not set, it will use float16 on GPU.",
    )
    parser.add_argument(
        "--drafter-dtype",
        type=str,
        default="float16",
        choices=["float32", "float64", "float16", "bfloat16"],
        help="Override the default dtype. If not set, it will use float16 on GPU.",
    )
    args = parser.parse_args()

    # _set_backend_determinism()
    GenerationMixin._assisted_decoding = _assisted_decoding
    print("[INFO] Patched GenerationMixin._assisted_decoding -> evaluation.decoding_tp._assisted_decoding")

    question_file = f"data/{args.bench_name}/question.jsonl"
    if args.answer_file:
        answer_file = args.answer_file
    else:
        answer_file = f"data/{args.bench_name}/model_answer/{args.model_id}.jsonl"
    print(f"Output to {answer_file}")

    model = GPTQModel.load(
        model_id_or_path=args.model_path,
        quantize_config=QuantizeConfig(bits=8),
        device_map="auto",
        low_cpu_mem_usage=True,
        trust_remote_code=True,
    )
    drafter = AutoModelForCausalLM.from_pretrained(
        args.drafter_path,
        torch_dtype=str_to_torch_dtype(args.drafter_dtype),
        low_cpu_mem_usage=True,
        device_map="auto",
    )
    tokenizer = AutoTokenizer.from_pretrained(args.model_path)
    model.eval()
    drafter.eval()

    do_sample = args.temperature > 0.0

    run_eval(
        model=model,
        tokenizer=tokenizer,
        forward_func=sps_forward,
        model_id=args.model_id,
        question_file=question_file,
        question_begin=args.question_begin,
        question_end=args.question_end,
        answer_file=answer_file,
        max_new_tokens=args.max_new_tokens,
        num_choices=args.num_choices,
        num_gpus_per_model=args.num_gpus_per_model,
        num_gpus_total=args.num_gpus_total,
        drafter=drafter,
        temperature=args.temperature,
        do_sample=do_sample,
    )

    reorg_answer_file(answer_file)
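For reference, the commented-out `_set_backend_determinism()` call above is a small helper along these lines (my sketch of typical PyTorch determinism flags; the actual helper in my repo may differ slightly):

import os
import torch

def _set_backend_determinism():
    # Typical flags to make CUDA kernels deterministic; must run before the first CUDA call.
    os.environ.setdefault("CUBLAS_WORKSPACE_CONFIG", ":4096:8")
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.use_deterministic_algorithms(True, warn_only=True)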
I modified the `_assisted_decoding` function to save the information from one step:
# NOTE: os, pickle, and pathlib.Path are assumed to be imported at the top of this module.
SAVE_DEBUG_FILE = os.environ.get("SAVE_SPECULATIVE_DEBUG", None)

# after 2.3.
if SAVE_DEBUG_FILE:
    saved_hidden_states = [
        hs[:, -candidate_length - 1 :].cpu().clone() for hs in outputs.hidden_states
    ]
    step_data = {
        "step": step,
        "cur_len": cur_len,
        "candidate_length": candidate_length,
        "input_ids": input_ids.cpu().clone(),
        "candidate_input_ids": candidate_input_ids.cpu().clone(),
        "candidate_logits": candidate_logits.cpu().clone() if candidate_logits is not None else None,
        "target_logits": new_logits.cpu().clone(),
        "attention_mask": model_inputs.get("attention_mask").cpu().clone() if "attention_mask" in model_inputs else None,
        "position_ids": model_inputs.get("position_ids").cpu().clone() if "position_ids" in model_inputs else None,
        "drafter_device": str(drafter_device),
        "target_device": str(self.device),
        "hidden_states": saved_hidden_states,
    }
    debug_data["steps"].append(step_data)
    debug_data["final_output_ids"] = None
    debug_data["accept_length_list"] = accept_length_list

    Path(SAVE_DEBUG_FILE).parent.mkdir(parents=True, exist_ok=True)
    with open(SAVE_DEBUG_FILE, "wb") as f:
        pickle.dump(debug_data, f)

    print(f" drafter token: {candidate_input_ids[0, cur_len:].tolist()[:candidate_length]}")
    print(f" target argmax : {new_logits[0, :-1].argmax(-1).tolist()}")
    import sys; sys.exit(0)
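To sanity-check a dump before comparing, it can be loaded back like this (a quick sketch; the path is a placeholder for whatever SAVE_SPECULATIVE_DEBUG pointed to):

import pickle

# Placeholder path: pass the same file you set via SAVE_SPECULATIVE_DEBUG.
with open("spec_debug_single.pkl", "rb") as f:
    dbg = pickle.load(f)

step = dbg["steps"][0]
print(step["step"], step["cur_len"], step["candidate_length"])
print(step["target_logits"].shape, len(step["hidden_states"]))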
I then compared the results between the single-GPU and multi-GPU runs using the following code:
# compare_spec_debug.py
import pickle
import torch
import torch.nn.functional as F
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--single", type=str, required=True)
parser.add_argument("--multi", type=str, required=True)
args = parser.parse_args()

def load_first_step(p):
    with open(p, "rb") as f:
        data = pickle.load(f)
    return [s for s in data["steps"] if s["step"] == 1][0]

single = load_first_step(args.single)
multi = load_first_step(args.multi)

# Extract the verified positions (drop the last position, which is used for sampling)
s_tgt = single["target_logits"][:, :-1].squeeze(0)  # [L, V]
m_tgt = multi["target_logits"][:, :-1].squeeze(0)
s_cand = single["candidate_logits"].squeeze(0) if single["candidate_logits"] is not None else None
m_cand = multi["candidate_logits"].squeeze(0) if multi["candidate_logits"] is not None else None
drafter_tokens = single["candidate_input_ids"][0, single["cur_len"]: single["cur_len"] + single["candidate_length"]]

print("=" * 80)
print("Full first-step comparison report".center(80))
print("=" * 80)
print(f"candidate_length : {single['candidate_length']}")
print(f"drafter guessed tokens : {drafter_tokens.tolist()}")
print(f"input_ids equal : {torch.equal(single['input_ids'], multi['input_ids'])}")
print(f"candidate_input_ids equal : {torch.equal(single['candidate_input_ids'], multi['candidate_input_ids'])}")
print()

# 1. target_logits: single GPU vs multi GPU
print("1. target_logits single vs multi difference".center(70))
diff_tgt = torch.abs(s_tgt - m_tgt)
print(f" max absolute error : {diff_tgt.max().item():12.6f}")
print(f" mean absolute error : {diff_tgt.mean().item():12.6f}")
print()

# 2. candidate_logits: single GPU vs multi GPU
print("2. candidate_logits single vs multi difference".center(70))
if s_cand is not None and m_cand is not None:
    diff_cand = torch.abs(s_cand - m_cand)
    print(f" max absolute error : {diff_cand.max().item():12.6f}")
    print(f" mean absolute error : {diff_cand.mean().item():12.6f}")
    print(f" std : {diff_cand.std().item():12.6f}")
print()

# 3. drafter quality (candidate_logits -> target_logits)
print("3. drafter quality (candidate_logits -> target_logits)".center(70))
print(f"{'':20} {'max err':>12} {'mean err':>12} {'accepted':>8}")
if s_cand is not None:
    err_s = torch.abs(s_cand - s_tgt)
    accept_s = (s_tgt.argmax(-1) == drafter_tokens).sum().item()
    print(f"{'single':20} {err_s.max().item():12.6f} {err_s.mean().item():12.6f} {accept_s:5}/{len(drafter_tokens)}")
else:
    accept_s = (s_tgt.argmax(-1) == drafter_tokens).sum().item()
    print(f"{'single':20} {'(no cand)':>12} {'(no cand)':>12} {accept_s:5}/{len(drafter_tokens)}")
if m_cand is not None:
    err_m = torch.abs(m_cand - m_tgt)
    accept_m = (m_tgt.argmax(-1) == drafter_tokens).sum().item()
    print(f"{'multi':20} {err_m.max().item():12.6f} {err_m.mean().item():12.6f} {accept_m:5}/{len(drafter_tokens)}")
else:
    accept_m = (m_tgt.argmax(-1) == drafter_tokens).sum().item()
    print(f"{'multi':20} {'(no cand)':>12} {'(no cand)':>12} {accept_m:5}/{len(drafter_tokens)}")
print()

# 4. per-position argmax
print("Per-position argmax comparison".center(70))
s_top1 = s_tgt.argmax(-1)
m_top1 = m_tgt.argmax(-1)
for i, token in enumerate(drafter_tokens.tolist()):
    print(f"Pos {i}: drafter {token:5d} | single {s_top1[i].item():5d} {'✓' if s_top1[i].item() == token else '✗'} | "
          f"multi {m_top1[i].item():5d} {'✓' if m_top1[i].item() == token else '✗'}")

# 5. per-layer hidden_states difference
print("5. hidden_states per-layer difference".center(70))
s_hs = single.get("hidden_states", None)
m_hs = multi.get("hidden_states", None)
if s_hs is None or m_hs is None:
    print(" hidden_states missing in the single- or multi-GPU dump; skipping.")
else:
    if len(s_hs) != len(m_hs):
        print(f" layer counts differ: single={len(s_hs)}, multi={len(m_hs)}")
    else:
        print(f"{'layer':>5} {'same_shape':>10} {'max_abs':>12} {'mean_abs':>12}")
        first_diff_layer = None
        for li, (sh, mh) in enumerate(zip(s_hs, m_hs)):
            same_shape = tuple(sh.shape) == tuple(mh.shape)
            if not same_shape:
                print(f"{li:4d} {'False':>10} {'-':>12} {'-':>12}")
                continue
            diff = (sh - mh).abs()
            max_abs = diff.max().item()
            mean_abs = diff.mean().item()
            # record the first layer that differs
            if first_diff_layer is None and (max_abs > 0.0 or mean_abs > 0.0):
                first_diff_layer = li
            print(f"{li:4d} {str(same_shape):>10} {max_abs:12.6f} {mean_abs:12.6f}")
The result shows that the Llama-2-7B drafter logits are identical in both cases, but the quantized Llama-2-70B target's hidden states are dramatically different starting on the second GPU:
candidate_length : 1
drafter guessed tokens : [26901]
input_ids equal : True
candidate_input_ids equal : True
1. target_logits single vs multi difference
max absolute error : 18.126953
mean absolute error : 2.402046
2. candidate_logits single vs multi difference
max absolute error : 0.000000
mean absolute error : 0.000000
std : 0.000000
3. drafter quality (candidate_logits -> target_logits)
            max err     mean err   accepted
single     9.171875     1.171219     0/1
multi     17.580078     2.810862     0/1
Per-position argmax comparison
Pos 0: drafter 26901 | single 18585 ✗ | multi 19259 ✗
5. hidden_states per-layer difference
layer same_shape   max_abs   mean_abs
0 True 0.000000 0.000000
1 True 0.000000 0.000000
2 True 0.000000 0.000000
3 True 0.000000 0.000000
4 True 0.000000 0.000000
5 True 0.000000 0.000000
6 True 0.000000 0.000000
7 True 0.000000 0.000000
8 True 0.000000 0.000000
9 True 0.000000 0.000000
10 True 0.000000 0.000000
11 True 0.000000 0.000000
12 True 0.000000 0.000000
13 True 0.000000 0.000000
14 True 0.000000 0.000000
15 True 0.000000 0.000000
16 True 0.000000 0.000000
17 True 0.000000 0.000000
18 True 0.000000 0.000000
19 True 0.000000 0.000000
20 True 0.000000 0.000000
21 True 0.000000 0.000000
22 True 0.000000 0.000000
23 True 0.000000 0.000000
24 True 0.000000 0.000000
25 True 0.000000 0.000000
26 True 0.000000 0.000000
27 True 0.000000 0.000000
28 True 0.000000 0.000000
29 True 0.000000 0.000000
30 True 0.000000 0.000000
31 True 0.000000 0.000000
32 True 0.000000 0.000000
33 True 0.000000 0.000000
34 True 0.000000 0.000000
35 True 0.000000 0.000000
36 True 0.000000 0.000000
37 True 0.000000 0.000000
38 True 0.000000 0.000000
39 True 0.000000 0.000000
40 True 0.000000 0.000000
41 True 1.765625 0.326904
42 True 2.902344 0.520020
43 True 5.023438 0.672363
44 True 7.066406 0.794434
45 True 5.703125 0.947754
46 True 8.804688 1.071289
47 True 11.265625 1.173828
48 True 15.304688 1.296875
49 True 16.218750 1.413086
50 True 20.765625 1.499023
51 True 25.500000 1.581055
52 True 28.000000 1.673828
53 True 33.781250 1.760742
54 True 38.500000 1.835938
55 True 40.375000 1.911133
56 True 43.531250 1.989258
57 True 54.968750 2.070312
58 True 55.031250 2.150391
59 True 60.500000 2.222656
60 True 65.062500 2.294922
61 True 67.750000 2.388672
62 True 69.562500 2.455078
63 True 75.125000 2.533203
64 True 78.750000 2.607422
65 True 84.062500 2.699219
66 True 85.375000 2.775391
67 True 90.625000 2.857422
68 True 96.937500 2.931641
69 True 102.562500 3.019531
70 True 101.187500 3.117188
71 True 107.500000 3.218750
72 True 105.187500 3.328125
73 True 109.562500 3.462891
74 True 112.937500 3.609375
75 True 118.437500 3.736328
76 True 109.250000 3.857422
77 True 123.187500 4.031250
78 True 141.875000 4.218750
79 True 133.000000 4.457031
80 True 39.062500 1.160156
What causes this behavior, and how can I solve it?
@DarkenStar
- Disable the draft model
- Remove the low_cpu_mem_usage option

model = GPTQModel.load(
    model_id_or_path=args.model_path,
    quantize_config=QuantizeConfig(bits=8),
    device_map="auto",
    low_cpu_mem_usage=True,  # <-- remove this
    trust_remote_code=True,
)
drafter = AutoModelForCausalLM.from_pretrained(  # <-- remove this block ... to isolate your problem
    args.drafter_path,
    torch_dtype=str_to_torch_dtype(args.drafter_dtype),
    low_cpu_mem_usage=True,
    device_map="auto",
)
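In other words, for the isolation test the loading code should reduce to roughly this (a sketch reusing the same arguments, with the two items above removed):

model = GPTQModel.load(
    model_id_or_path=args.model_path,
    quantize_config=QuantizeConfig(bits=8),
    device_map="auto",
    trust_remote_code=True,
)
drafter = None  # draft model disabled while isolating the multi-GPU difference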
OK, thank you very much! Let me write a test case.
@Qubitium Hello, I wrote a new script that only runs a forward pass of the quantized model to collect hidden states and logits:
import argparse
import pickle
from pathlib import Path
import torch
from transformers import AutoTokenizer
from gptqmodel import GPTQModel, QuantizeConfig
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model-path",
        type=str,
        default="/home/aocheng/QuantSpec/Llama-2-70b-hf-gptqmodel-8bit",
        help="Path to the GPTQ model (including tokenizer).",
    )
    parser.add_argument(
        "--save-path",
        type=str,
        required=True,
        help="Path to save the debug pickle file, e.g. /tmp/gptq_hidden_debug.pkl",
    )
    parser.add_argument(
        "--prompt",
        type=str,
        default="Hello, this is a test input for multi-GPU GPTQ inference.",
        help="Prompt text for the test run.",
    )
    parser.add_argument(
        "--dtype",
        type=str,
        default="float16",
        choices=["float16", "bfloat16", "float32"],
        help="Torch dtype for inference.",
    )
    args = parser.parse_args()

    torch_dtype = {
        "float16": torch.float16,
        "bfloat16": torch.bfloat16,
        "float32": torch.float32,
    }[args.dtype]

    print(f"[INFO] Loading GPTQModel from {args.model_path}")
    model = GPTQModel.load(
        model_id_or_path=args.model_path,
        quantize_config=QuantizeConfig(bits=8),
        device_map="auto",
        trust_remote_code=True,
        torch_dtype=torch_dtype,
    )
    print("[INFO] Model loaded")

    tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True)
    print("[INFO] Tokenizer loaded")

    model.eval()

    print(f"[INFO] Prompt: {args.prompt!r}")
    inputs = tokenizer(
        args.prompt,
        return_tensors="pt",
        add_special_tokens=True,
    )
    print("[INFO] Tokenized prompt")
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    print("[INFO] Inputs moved to device(s)")

    with torch.no_grad():
        print("[INFO] Starting forward pass...")
        outputs = model(
            **inputs,
            output_hidden_states=True,
            return_dict=True,
        )
        print("[INFO] Forward pass finished")

    # outputs.logits: [bs, seq_len, vocab]
    # outputs.hidden_states: tuple(num_layers + 1) of [bs, seq_len, hidden]
    if outputs.hidden_states is None:
        raise RuntimeError(
            "Model did not return hidden_states. "
            "Please make sure the model supports output_hidden_states=True."
        )

    hidden_states = [h.cpu().clone() for h in outputs.hidden_states]
    logits = outputs.logits.cpu().clone()

    debug_obj = {
        "prompt": args.prompt,
        "input_ids": inputs["input_ids"].cpu().clone(),
        "logits": logits,
        "hidden_states": hidden_states,  # list[num_layers + 1], each [1, seq_len, hidden]
        "model_dtype": str(torch_dtype),
        "model_device": str(model.device),
    }

    save_path = Path(args.save_path)
    save_path.parent.mkdir(parents=True, exist_ok=True)
    with open(save_path, "wb") as f:
        pickle.dump(debug_obj, f)
    print(f"[INFO] Forward pass finished. Debug data saved to: {save_path}")


if __name__ == "__main__":
    main()
I then used the following script to compare the results. Unfortunately, it again shows different hidden-state values on the second device; the console output is below.
import argparse
import pickle
import torch
parser = argparse.ArgumentParser(description="Compare single-GPU and multi-GPU hidden/debug dumps.")
parser.add_argument("--single", type=str, required=True, help="Path to single-GPU debug pickle.")
parser.add_argument("--multi", type=str, required=True, help="Path to multi-GPU debug pickle.")
args = parser.parse_args()
def load_debug(p):
    with open(p, "rb") as f:
        data = pickle.load(f)
    return data
single = load_debug(args.single)
multi = load_debug(args.multi)
print("=" * 80)
print("First-pass debug comparison (single-GPU vs multi-GPU)".center(80))
print("=" * 80)
print()
# 0. Basic metadata comparison
print("0. Basic metadata".center(70))
prompt_equal = (single.get("prompt", None) == multi.get("prompt", None))
print(f"prompt equal : {prompt_equal}")
print(f"single prompt : {repr(single.get('prompt', None))}")
print(f"multi prompt : {repr(multi.get('prompt', None))}")
single_dtype = single.get("model_dtype", "N/A")
multi_dtype = multi.get("model_dtype", "N/A")
print(f"model_dtype single : {single_dtype}")
print(f"model_dtype multi : {multi_dtype}")
single_device = single.get("model_device", "N/A")
multi_device = multi.get("model_device", "N/A")
print(f"model_device single : {single_device}")
print(f"model_device multi : {multi_device}")
print()
# 1. input_ids comparison
print("1. input_ids comparison".center(70))
s_ids = single["input_ids"]
m_ids = multi["input_ids"]
print(f"shape single : {tuple(s_ids.shape)}")
print(f"shape multi : {tuple(m_ids.shape)}")
same_shape_ids = tuple(s_ids.shape) == tuple(m_ids.shape)
print(f"same shape : {same_shape_ids}")
same_ids = torch.equal(s_ids, m_ids) if same_shape_ids else False
print(f"exactly equal : {same_ids}")
if same_shape_ids and not same_ids:
    diff_positions = (s_ids != m_ids).nonzero(as_tuple=False)
    print(f"num differing tokens: {diff_positions.shape[0]}")
print()
# 2. logits comparison
print("2. logits comparison".center(70))
s_logits = single["logits"] # [1, L, V]
m_logits = multi["logits"]
print(f"shape single : {tuple(s_logits.shape)}")
print(f"shape multi : {tuple(m_logits.shape)}")
same_shape_logits = tuple(s_logits.shape) == tuple(m_logits.shape)
print(f"same shape : {same_shape_logits}")
if same_shape_logits:
    diff_logits = (s_logits - m_logits).abs()
    max_abs = diff_logits.max().item()
    mean_abs = diff_logits.mean().item()
    print(f"max absolute error : {max_abs:12.6f}")
    print(f"mean absolute error : {mean_abs:12.6f}")
    # argmax token comparison along vocab dimension
    s_top1 = s_logits.argmax(-1)  # [1, L]
    m_top1 = m_logits.argmax(-1)
    same_top1 = torch.equal(s_top1, m_top1)
    print(f"top-1 token equal : {same_top1}")
    if not same_top1:
        diff_pos = (s_top1 != m_top1).nonzero(as_tuple=False)
        print(f"num differing positions in top-1: {diff_pos.shape[0]}")
else:
    print("logits shapes mismatch, skip numeric comparison.")
print()
# 3. hidden_states comparison (per-layer)
print("3. hidden_states (per-layer) comparison".center(70))
s_hs = single.get("hidden_states", None)
m_hs = multi.get("hidden_states", None)
if s_hs is None or m_hs is None:
    print("Either single or multi does not contain 'hidden_states'; skipping.")
else:
    # ensure list/tuple of tensors
    if not isinstance(s_hs, (list, tuple)) or not isinstance(m_hs, (list, tuple)):
        print("hidden_states is not a list/tuple of tensors; skipping detailed comparison.")
    else:
        print(f"num layers (including embedding output): single={len(s_hs)}, multi={len(m_hs)}")
        if len(s_hs) != len(m_hs):
            print("layer counts differ, cannot do full one-to-one comparison.")
        print(f"{'layer':>6} {'same_shape':>12} {'max_abs':>12} {'mean_abs':>12}")
        first_diff_layer = None
        for li, (sh, mh) in enumerate(zip(s_hs, m_hs)):
            if not (isinstance(sh, torch.Tensor) and isinstance(mh, torch.Tensor)):
                print(f"{li:6d} {'False':>12} {'-':>12} {'-':>12}")
                continue
            same_shape = tuple(sh.shape) == tuple(mh.shape)
            if not same_shape:
                print(f"{li:6d} {'False':>12} {'-':>12} {'-':>12}")
                continue
            diff = (sh - mh).abs()
            max_abs = diff.max().item()
            mean_abs = diff.mean().item()
            if first_diff_layer is None and (max_abs > 0.0 or mean_abs > 0.0):
                first_diff_layer = li
            print(f"{li:6d} {str(same_shape):>12} {max_abs:12.6f} {mean_abs:12.6f}")
        print()
        if first_diff_layer is None:
            print("All compared layers are numerically identical (zero difference).")
        else:
            print(f"First layer with non-zero difference: {first_diff_layer}")
print()
print("=" * 80)
print("End of comparison report".center(80))
print("=" * 80)
Output:
================================================================================
First-pass debug comparison (single-GPU vs multi-GPU)
================================================================================
0. Basic metadata
prompt equal : True
single prompt : 'Hello, this is a test input for multi-GPU GPTQ inference.'
multi prompt : 'Hello, this is a test input for multi-GPU GPTQ inference.'
model_dtype single : torch.float16
model_dtype multi : torch.float16
model_device single : cuda:0
model_device multi : cuda:0
1. input_ids comparison
shape single : (1, 18)
shape multi : (1, 18)
same shape : True
exactly equal : True
2. logits comparison
shape single : (1, 18, 32000)
shape multi : (1, 18, 32000)
same shape : True
max absolute error : 23.187500
mean absolute error : 4.042969
top-1 token equal : False
num differing positions in top-1: 18
3. hidden_states (per-layer) comparison
num layers (including embedding output): single=81, multi=81
layer same_shape max_abs mean_abs
0 True 0.000000 0.000000
1 True 0.000000 0.000000
2 True 0.000000 0.000000
3 True 0.000000 0.000000
4 True 0.000000 0.000000
5 True 0.000000 0.000000
6 True 0.000000 0.000000
7 True 0.000000 0.000000
8 True 0.000000 0.000000
9 True 0.000000 0.000000
10 True 0.000000 0.000000
11 True 0.000000 0.000000
12 True 0.000000 0.000000
13 True 0.000000 0.000000
14 True 0.000000 0.000000
15 True 0.000000 0.000000
16 True 0.000000 0.000000
17 True 0.000000 0.000000
18 True 0.000000 0.000000
19 True 0.000000 0.000000
20 True 0.000000 0.000000
21 True 0.000000 0.000000
22 True 0.000000 0.000000
23 True 0.000000 0.000000
24 True 0.000000 0.000000
25 True 0.000000 0.000000
26 True 0.000000 0.000000
27 True 0.000000 0.000000
28 True 0.000000 0.000000
29 True 0.000000 0.000000
30 True 0.000000 0.000000
31 True 0.000000 0.000000
32 True 0.000000 0.000000
33 True 0.000000 0.000000
34 True 0.000000 0.000000
35 True 0.000000 0.000000
36 True 0.000000 0.000000
37 True 0.000000 0.000000
38 True 0.000000 0.000000
39 True 0.000000 0.000000
40 True 0.000000 0.000000
41 True 9.984375 1.247070
42 True 10.906250 1.321289
43 True 15.312500 1.399414
44 True 22.937500 1.476562
45 True 30.062500 1.576172
46 True 38.406250 1.667969
47 True 44.531250 1.754883
48 True 70.437500 1.845703
49 True 84.000000 1.910156
50 True 97.000000 1.981445
51 True 100.937500 2.048828
52 True 109.000000 2.128906
53 True 127.750000 2.205078
54 True 137.625000 2.275391
55 True 143.750000 2.345703
56 True 154.500000 2.414062
57 True 169.750000 2.468750
58 True 178.375000 2.525391
59 True 183.125000 2.589844
60 True 191.625000 2.648438
61 True 198.625000 2.712891
62 True 201.750000 2.761719
63 True 205.750000 2.816406
64 True 206.250000 2.873047
65 True 209.625000 2.929688
66 True 210.750000 2.984375
67 True 212.500000 3.058594
68 True 215.500000 3.123047
69 True 221.750000 3.189453
70 True 226.625000 3.267578
71 True 230.625000 3.363281
72 True 233.250000 3.457031
73 True 243.875000 3.554688
74 True 237.250000 3.658203
75 True 240.375000 3.769531
76 True 236.000000 3.908203
77 True 934.000000 4.074219
78 True 1385.000000 4.312500
79 True 1688.000000 4.644531
80 True 95.312500 1.112305
First layer with non-zero difference: 41
================================================================================
End of comparison report
================================================================================
Could you please tell me how to fix this problem?
Did you set do_sample to False and temperature to 1.0?
Make sure sampling is disabled so we get deterministic output to compare between the two runs.
Do you have two GPUs of the same model?
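For the generate-based comparison, something like this keeps decoding deterministic (a sketch using standard Hugging Face generate arguments):

output = model.generate(
    **inputs,
    assistant_model=drafter,
    do_sample=False,       # greedy verification; temperature is then ignored
    max_new_tokens=64,
)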
@Qubitium I collected the hidden states and logits using a small debug script:
CUDA_VISIBLE_DEVICES=6 python -m evaluation.inference_gptq_hidden_debug \
--save-path ./gptq_single_hidden_debug.pkl \
--dtype float16
CUDA_VISIBLE_DEVICES=6,7 python -m evaluation.inference_gptq_hidden_debug \
--save-path ./gptq_multi_hidden_debug.pkl \
--dtype float16
The script only runs a single forward pass under torch.no_grad() and does not use sampling at all (see the script above):
with torch.no_grad():
    print("[INFO] Starting forward pass...")
    outputs = model(
        **inputs,
        output_hidden_states=True,
        return_dict=True,
    )
So there is no do_sample / temperature involved here – it is a plain deterministic forward pass on the same input IDs. And I ran the script on GPUs of the same model:
Fri Nov 28 01:29:31 2025
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.144.03 Driver Version: 550.144.03 CUDA Version: 12.4 |
|-----------------------------------------+------------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA H20-3e On | 00000000:65:02.0 Off | 0 |
| N/A 36C P0 119W / 500W | 14575MiB / 143771MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+------------------------+----------------------+
| 1 NVIDIA H20-3e On | 00000000:65:03.0 Off | 0 |
| N/A 41C P0 124W / 500W | 2270MiB / 143771MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+------------------------+----------------------+
| 2 NVIDIA H20-3e On | 00000000:67:02.0 Off | 0 |
| N/A 53C P0 269W / 500W | 38355MiB / 143771MiB | 99% Default |
| | | Disabled |
+-----------------------------------------+------------------------+----------------------+
| 3 NVIDIA H20-3e On | 00000000:67:03.0 Off | 0 |
| N/A 44C P0 200W / 500W | 38099MiB / 143771MiB | 2% Default |
| | | Disabled |
+-----------------------------------------+------------------------+----------------------+
| 4 NVIDIA H20-3e On | 00000000:69:02.0 Off | 0 |
| N/A 37C P0 121W / 500W | 2649MiB / 143771MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+------------------------+----------------------+
| 5 NVIDIA H20-3e On | 00000000:69:03.0 Off | 0 |
| N/A 41C P0 127W / 500W | 2272MiB / 143771MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+------------------------+----------------------+
| 6 NVIDIA H20-3e On | 00000000:6B:02.0 Off | 0 |
| N/A 40C P0 123W / 500W | 2134MiB / 143771MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+------------------------+----------------------+
| 7 NVIDIA H20-3e On | 00000000:6B:03.0 Off | 0 |
| N/A 35C P0 124W / 500W | 330MiB / 143771MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+------------------------+----------------------+
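To show exactly where the layer split falls in the multi-GPU run, I can also dump the dispatch map (a sketch; it assumes the wrapped Hugging Face model exposes hf_device_map after device_map="auto" dispatch, which may live on model or model.model for GPTQModel):

# Sketch: print which module accelerate placed on which GPU.
hf_model = getattr(model, "model", model)          # unwrap if GPTQModel wraps the HF model
device_map = getattr(hf_model, "hf_device_map", None)
if device_map is None:
    print("hf_device_map not found on this object")
else:
    for module_name, device in device_map.items():
        print(f"{module_name:.<50s} {device}")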
@Qubitium Hello, could you please help me fix or explain this error? I think I have minimized the problem.