[BUG] cuVS CAGRA gets stalled when using SLURM

Open abs51295 opened this issue 11 months ago • 0 comments

Describe the bug: I ran cuVS CAGRA on the same dataset in two ways:

1. Log into a node and run `python script.py` directly. This finished in 900 seconds, and GPU utilization stayed at 100% almost all the time.
2. Use `sbatch` in SLURM to submit a job that runs the same Python script. This took 2 hours, and GPU utilization fluctuated between 0% and 100%.

Steps/Code to reproduce bug

import numpy as np
import pytest
from pylibraft.common import device_ndarray
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize

from cuvs.neighbors import cagra
from cuvs.test.ann_utils import calc_recall, generate_data
import tqdm

import math

import argparse

import time

def parse_args():
    """Parse the command-line options of the CAGRA KNN script.

    Three integer options are defined, all of them required:
    ``--intermediate-graph-degree``, ``--graph-degree`` and ``--itopk-size``.

    Returns:
        argparse.Namespace: the parsed values, available as
        ``intermediate_graph_degree``, ``graph_degree`` and ``itopk_size``.
    """
    cli = argparse.ArgumentParser(description='Compute CAGRA KNN')
    required_int_flags = (
        '--intermediate-graph-degree',
        '--graph-degree',
        '--itopk-size',
    )
    for flag in required_int_flags:
        cli.add_argument(flag, type=int, required=True)
    return cli.parse_args()

def main():
    """Build a CAGRA index over ``dataset.npy`` and run a batched k-NN search.

    Steps performed:

    1. Parse the CLI options (graph degrees and ``itopk_size``).
    2. Reinitialize RMM with managed memory and no pool allocator, and make
       CuPy allocate through RMM.
    3. Load ``dataset.npy`` as float32 and build a CAGRA index with the
       ``nn_descent`` build algorithm and the ``sqeuclidean`` metric.
    4. Query the index with the dataset itself, in batches of 65000 rows,
       collecting the ``k = 15`` nearest neighbors and distances on the host.
    5. Print the elapsed wall-clock time, which covers both the index build
       and the batched search.

    Returns:
        None. Progress and timing are reported on stdout.
    """
    args = parse_args()

    build_kwargs = {
        "graph_degree": args.graph_degree,
        "intermediate_graph_degree": args.intermediate_graph_degree,
    }
    search_kwargs = {"itopk_size": args.itopk_size}

    # Imported here, as in the original script, so the allocator setup stays
    # next to the imports it depends on.
    import rmm
    from rmm.allocators.cupy import rmm_cupy_allocator
    rmm.reinitialize(
        managed_memory=True,
        pool_allocator=False,
    )
    import cupy as cp
    cp.cuda.set_allocator(rmm_cupy_allocator)

    k = 15

    print("Loading dataset...............")

    Y = np.load('dataset.npy').astype(np.float32)

    print("Dataset loaded......")

    build_params = cagra.IndexParams(metric="sqeuclidean", build_algo="nn_descent", **build_kwargs)

    # The timer starts before the build, so the reported duration includes
    # index construction as well as the search loop.
    start = time.time()
    print("Building index....")
    index = cagra.build(build_params, Y)
    print("Index built.......")

    n_samples = Y.shape[0]

    print("Initializing neighbors and distances array...............")
    all_neighbors = np.zeros((n_samples, k), dtype=np.int32)
    all_distances = np.zeros((n_samples, k), dtype=np.float32)

    batchsize = 65000
    n_batches = math.ceil(n_samples / batchsize)
    print(f"Starting processing of batches {n_batches}")

    # The search parameters do not change between batches, so build them once
    # instead of on every iteration.
    search_params = cagra.SearchParams(**search_kwargs)

    for batch in tqdm.tqdm(range(n_batches)):
        start_idx = batch * batchsize
        stop_idx = min(start_idx + batchsize, n_samples)

        # Copy only the current slice of queries to the device.
        batch_Y = device_ndarray(Y[start_idx:stop_idx, :])

        distances, neighbors = cagra.search(
            search_params, index, batch_Y, k
        )

        # Bring the per-batch results back into the preallocated host arrays.
        all_neighbors[start_idx:stop_idx, :] = neighbors.copy_to_host()
        all_distances[start_idx:stop_idx, :] = distances.copy_to_host()

    print(f"\nKNN computation completed in {time.time() - start:.2f} seconds")

    # The index was built with the "sqeuclidean" metric; take the square root
    # to obtain Euclidean distances.
    # NOTE(review): the result is not used after this point in the script.
    all_distances = np.sqrt(all_distances)

# Run the benchmark only when this file is executed as a script.
if __name__ == "__main__":
    main()

Expected behavior: Both runs should take roughly the same amount of time.

Environment details (please complete the following information):

Environment location: HPC
Method of RAFT install: conda with rapids-24.12

Feb 26 '25 22:02 abs51295