vidore-benchmark icon indicating copy to clipboard operation
vidore-benchmark copied to clipboard

Unexpected Test Results Variance with Different Batch Sizes

Open VickiCui opened this issue 9 months ago • 2 comments

In theory, the performance should be stable across different batch sizes. But when testing the ViDoRe Benchmark 2 with different batch_query and batch_passage values, I observed significant performance discrepancies.

With batch_query=4 and batch_passage=4, the test results of colqwen2.5-v0.2 (max_num_visual_tokens=768) on esg_reports_human_labeled_v2 are:

{'ndcg_at_1': 0.62179, 'ndcg_at_3': 0.67745, 'ndcg_at_5': 0.67287, 'ndcg_at_10': 0.70728, 'ndcg_at_20': 0.72364, 'ndcg_at_50': 0.74802, 'ndcg_at_100': 0.75417, 'map_at_1': 0.41233, 'map_at_3': 0.57055, 'map_at_5': 0.59417, 'map_at_10': 0.62827, 'map_at_20': 0.63596, 'map_at_50': 0.64363, 'map_at_100': 0.64546, 'recall_at_1': 0.41233, 'recall_at_3': 0.6822, 'recall_at_5': 0.72194, 'recall_at_10': 0.82212, 'recall_at_20': 0.87167, 'recall_at_50': 0.95485, 'recall_at_100': 0.981, 'precision_at_1': 0.63462, 'precision_at_3': 0.40385, 'precision_at_5': 0.27308, 'precision_at_10': 0.16538, 'precision_at_20': 0.09327, 'precision_at_50': 0.04346, 'precision_at_100': 0.02308, 'mrr_at_1': 0.6730769230769231, 'mrr_at_3': 0.7596153846153846, 'mrr_at_5': 0.7692307692307693, 'mrr_at_10': 0.7743589743589744, 'mrr_at_20': 0.7743589743589744, 'mrr_at_50': 0.7765232345174206, 'mrr_at_100': 0.7765232345174206, 'naucs_at_1_max': np.float64(0.15808842304687007), 'naucs_at_1_std': np.float64(0.09610820916153678), 'naucs_at_1_diff1': np.float64(0.6580852599392208), 'naucs_at_3_max': np.float64(-0.05157372399638369), 'naucs_at_3_std': np.float64(0.06351447012509337), 'naucs_at_3_diff1': np.float64(-0.14346690686426106), 'naucs_at_5_max': np.float64(-0.12159532915431565), 'naucs_at_5_std': np.float64(0.03573545110145072), 'naucs_at_5_diff1': np.float64(-0.25402425764340436), 'naucs_at_10_max': np.float64(-0.19579840780747396), 'naucs_at_10_std': np.float64(-0.0588230883206641), 'naucs_at_10_diff1': np.float64(-0.34384957425129475), 'naucs_at_20_max': np.float64(-0.1746776235137422), 'naucs_at_20_std': np.float64(-0.04811434229056555), 'naucs_at_20_diff1': np.float64(-0.41708692070003905), 'naucs_at_50_max': np.float64(-0.167966993303741), 'naucs_at_50_std': np.float64(-0.042923187520787366), 'naucs_at_50_diff1': np.float64(-0.4540246380452276), 'naucs_at_100_max': np.float64(-0.15290268208724309), 'naucs_at_100_std': np.float64(-0.03148299510892002), 'naucs_at_100_diff1': 
np.float64(-0.4484928211339262)}

However, with batch_query=32 and batch_passage=32, the test results are:

{'ndcg_at_1': 0.44872, 'ndcg_at_3': 0.42337, 'ndcg_at_5': 0.42367, 'ndcg_at_10': 0.42917, 'ndcg_at_20': 0.44455, 'ndcg_at_50': 0.45184, 'ndcg_at_100': 0.45877, 'map_at_1': 0.30481, 'map_at_3': 0.35801, 'map_at_5': 0.37332, 'map_at_10': 0.37951, 'map_at_20': 0.38592, 'map_at_50': 0.38873, 'map_at_100': 0.38945, 'recall_at_1': 0.30481, 'recall_at_3': 0.39487, 'recall_at_5': 0.43494, 'recall_at_10': 0.4603, 'recall_at_20': 0.49607, 'recall_at_50': 0.52685, 'recall_at_100': 0.56531, 'precision_at_1': 0.46154, 'precision_at_3': 0.22436, 'precision_at_5': 0.15385, 'precision_at_10': 0.08846, 'precision_at_20': 0.05385, 'precision_at_50': 0.02385, 'precision_at_100': 0.0125, 'mrr_at_1': 0.5, 'mrr_at_3': 0.5288461538461539, 'mrr_at_5': 0.5375, 'mrr_at_10': 0.5399038461538461, 'mrr_at_20': 0.5429857001972387, 'mrr_at_50': 0.5438218205985764, 'mrr_at_100': 0.544285576306339, 'naucs_at_1_max': np.float64(0.25587549371639967), 'naucs_at_1_std': np.float64(0.014463906636920631), 'naucs_at_1_diff1': np.float64(0.4968858770946776), 'naucs_at_3_max': np.float64(-0.002553332813834949), 'naucs_at_3_std': np.float64(-0.10030853800895284), 'naucs_at_3_diff1': np.float64(0.11776417938443044), 'naucs_at_5_max': np.float64(-0.05793992323759826), 'naucs_at_5_std': np.float64(-0.16493527873739347), 'naucs_at_5_diff1': np.float64(0.0027769146142092027), 'naucs_at_10_max': np.float64(-0.022817220388127064), 'naucs_at_10_std': np.float64(-0.08731661589664137), 'naucs_at_10_diff1': np.float64(-0.10630531273010901), 'naucs_at_20_max': np.float64(-0.013609007863728513), 'naucs_at_20_std': np.float64(-0.05326622089219273), 'naucs_at_20_diff1': np.float64(-0.14949256053059332), 'naucs_at_50_max': np.float64(0.011573411943572498), 'naucs_at_50_std': np.float64(0.002126999025563241), 'naucs_at_50_diff1': np.float64(-0.19579215392936578), 'naucs_at_100_max': np.float64(-0.015311183218817678), 'naucs_at_100_std': np.float64(-0.012216797767884598), 'naucs_at_100_diff1': np.float64(-0.18778334725455362)}

Reproduction Code:

import torch
from colpali_engine.models import ColQwen2_5, ColQwen2_5_Processor
from colpali_engine.utils.torch_utils import get_torch_device
from datasets import load_dataset
from tqdm import tqdm

from vidore_benchmark.evaluation.vidore_evaluators import ViDoReEvaluatorBEIR
from vidore_benchmark.retrievers import VisionRetriever

# Path to the local checkpoint under test.
model_name = "local_path_to/colqwen2.5-v0.2"
device = get_torch_device("auto")

# Load the model in bf16 and its processor, capping visual tokens at 768.
model = ColQwen2_5.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map=device,
).eval()
processor = ColQwen2_5_Processor.from_pretrained(
    model_name,
    max_num_visual_tokens=768,
)

# Wrap model + processor in a retriever and build a BEIR-style evaluator.
retriever = VisionRetriever(model=model, processor=processor)
evaluator = ViDoReEvaluatorBEIR(retriever)

# Load the three BEIR splits of a single dataset
# (e.g. one of the ViDoRe Benchmark 2 datasets).
dataset_id = "vidore/esg_reports_human_labeled_v2"
ds = {
    split_name: load_dataset(dataset_id, name=split_name, split="test")
    for split_name in ("corpus", "queries", "qrels")
}

# Run the evaluation with the batch sizes under investigation.
metrics_dataset_beir = evaluator.evaluate_dataset(
    ds=ds,
    batch_query=4,
    batch_passage=4,
    batch_score=128,
)
print(metrics_dataset_beir)

VickiCui avatar Jun 06 '25 03:06 VickiCui

Upon further analysis, I discovered significant differences in embedding values at non-padding positions across different batch sizes. This discrepancy appears to correlate with the observed instability in the results. For example, when batch_size=4:

query_embeddings[0][-1] = tensor([-6.5430e-02, 6.8359e-02, 1.9434e-01, -2.6001e-02, -1.8652e-01, -2.9663e-02, -8.0078e-02, -1.6309e-01, 9.6680e-02, 6.4453e-02, -8.3008e-03, 1.5320e-02, 7.6172e-02, -1.0840e-01, 1.5918e-01, 3.2959e-02, 1.0254e-01, 5.9814e-02, 1.2695e-01, 5.0049e-03, 1.7676e-01, 7.1777e-02, 2.8992e-03, 4.4189e-02, -1.1865e-01, -3.8086e-02, -5.0537e-02, 8.8867e-02, 6.1279e-02, -8.3496e-02, 6.1279e-02, 4.8340e-02, -3.1982e-02, 3.2959e-02, -1.0938e-01, 8.5938e-02, 8.5449e-03, -5.8289e-03, -1.3867e-01, -4.4250e-03, 1.1047e-02, 4.3213e-02, 1.1133e-01, 7.1777e-02, 4.7363e-02, 1.3867e-01, 8.6670e-03, -6.2256e-02, 4.6631e-02, -3.6621e-02, 1.3574e-01, -1.2012e-01, 1.1292e-02, -1.4941e-01, -2.7924e-03, -8.5449e-02, 1.2354e-01, -1.4258e-01, 2.6855e-02, -8.1055e-02, 8.5449e-02, 5.2734e-02, -4.4678e-02, -8.3984e-02, 1.2109e-01, -1.8066e-02, -9.2285e-02, -6.1035e-03, -1.8787e-04, -6.5430e-02, 4.1748e-02, -1.6504e-01, 8.1543e-02, 8.2520e-02, 7.6599e-03, 1.0449e-01, -1.9409e-02, -9.9609e-02, -7.0312e-02, 4.0771e-02, -1.7480e-01, -2.6733e-02, -1.3086e-01, 1.9531e-02, -7.7820e-03, -9.4238e-02, -1.9434e-01, 1.1658e-02, 4.8584e-02, 1.0840e-01, -4.3701e-02, -4.7363e-02, 2.3346e-03, -1.2988e-01, 7.7148e-02, 4.6143e-02, 8.3496e-02, -4.0771e-02, -4.5654e-02, 1.2256e-01, 1.1084e-01, -1.5234e-01, -1.0840e-01, 3.1738e-02, -8.2031e-02, -1.3281e-01, 1.1475e-01, -5.1025e-02, 1.4832e-02, -3.9062e-02, 1.2061e-01, -1.7383e-01, -1.0596e-01, -2.9175e-02, 4.8828e-03, 9.5215e-02, 1.5625e-01, 3.5400e-02, -1.2109e-01, 1.4844e-01, -1.1768e-01, 2.5269e-02, 1.6174e-03, 6.1646e-03, -7.0801e-02, -8.3008e-03, -2.8076e-02, -3.1982e-02], dtype=torch.bfloat16)

When batch_size=32:

query_embeddings[0][-1] = tensor([-0.0664, 0.0669, 0.1934, -0.0267, -0.1865, -0.0327, -0.0806, -0.1621, 0.0962, 0.0625, -0.0089, 0.0159, 0.0791, -0.1074, 0.1602, 0.0325, 0.1035, 0.0579, 0.1260, 0.0068, 0.1768, 0.0718, 0.0027, 0.0464, -0.1191, -0.0386, -0.0488, 0.0898, 0.0601, -0.0859, 0.0615, 0.0481, -0.0320, 0.0315, -0.1089, 0.0859, 0.0096, -0.0037, -0.1387, -0.0073, 0.0115, 0.0461, 0.1089, 0.0713, 0.0479, 0.1406, 0.0073, -0.0618, 0.0459, -0.0378, 0.1328, -0.1162, 0.0139, -0.1514, -0.0043, -0.0850, 0.1240, -0.1416, 0.0284, -0.0786, 0.0825, 0.0540, -0.0476, -0.0835, 0.1191, -0.0168, -0.0903, -0.0052, -0.0006, -0.0659, 0.0391, -0.1650, 0.0820, 0.0854, 0.0050, 0.1011, -0.0214, -0.1001, -0.0703, 0.0420, -0.1738, -0.0262, -0.1338, 0.0195, -0.0087, -0.0957, -0.1934, 0.0092, 0.0474, 0.1064, -0.0447, -0.0457, 0.0022, -0.1289, 0.0791, 0.0461, 0.0845, -0.0420, -0.0461, 0.1226, 0.1113, -0.1543, -0.1089, 0.0322, -0.0811, -0.1318, 0.1187, -0.0530, 0.0127, -0.0400, 0.1250, -0.1738, -0.1030, -0.0288, 0.0025, 0.0952, 0.1562, 0.0354, -0.1201, 0.1445, -0.1143, 0.0266, 0.0012, 0.0055, -0.0713, -0.0052, -0.0291, -0.0320], dtype=torch.bfloat16)

VickiCui avatar Jun 06 '25 07:06 VickiCui

Hello @VickiCui, thanks for the heads-up — we are already aware of this issue (and of its quite big impact on the esg_reports_human_labeled_v2 dataset). Since the images in this dataset are really similar, it is the most affected by this variation in embeddings.

The problem seems inherent to the models' forward pass rather than to the benchmark itself. We are investigating to pinpoint exactly where the issue lies.

QuentinJGMace avatar Jun 06 '25 09:06 QuentinJGMace