cuvs
cuvs copied to clipboard
[BUG] cuVS CAGRA gets stalled when using SLURM
Describe the bug
I ran cuVS CAGRA on the same dataset in two ways:
- Logging into a node and running `python script.py` directly. This finished in 900 seconds, and GPU utilization stayed at 100% almost the whole time.
- Using `sbatch` in SLURM to submit a job that runs the same Python script. This took 2 hours, and GPU utilization fluctuated between 0% and 100%.
Steps/Code to reproduce bug
import numpy as np
import pytest
from pylibraft.common import device_ndarray
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize
from cuvs.neighbors import cagra
from cuvs.test.ann_utils import calc_recall, generate_data
import tqdm
import math
import argparse
import time
def parse_args(argv=None):
    """Parse the CAGRA build/search options.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse falls back to ``sys.argv[1:]``, so a
            plain ``parse_args()`` call behaves as before.

    Returns:
        An ``argparse.Namespace`` with the integer attributes
        ``intermediate_graph_degree``, ``graph_degree`` and ``itopk_size``.

    Raises:
        SystemExit: Raised by argparse when a required option is missing
            or a value is not an integer.
    """
    parser = argparse.ArgumentParser(description='Compute CAGRA KNN')
    # Every option is a mandatory integer, so they share one declaration.
    for flag in ('--intermediate-graph-degree', '--graph-degree', '--itopk-size'):
        parser.add_argument(flag, required=True, type=int)
    return parser.parse_args(argv)
def main():
    """Build a CAGRA index over ``dataset.npy`` and run a batched k-NN search.

    Options come from ``parse_args()``. RMM is reinitialized with managed
    memory and no pool allocator, and CuPy is pointed at the RMM allocator.
    The index is built with the ``nn_descent`` build algorithm and the
    ``sqeuclidean`` metric, then the dataset is searched against its own
    index in batches of 65000 rows with ``k = 15``.

    Returns:
        A tuple ``(all_neighbors, all_distances)`` of host NumPy arrays with
        shape ``(n_samples, 15)``: ``int32`` neighbor indices and ``float32``
        distances. The distances are the element-wise square root of the
        values returned by the search.
    """
    args = parse_args()
    build_kwargs = {
        "graph_degree": args.graph_degree,
        "intermediate_graph_degree": args.intermediate_graph_degree,
    }
    search_kwargs = {"itopk_size": args.itopk_size}

    # Kept as function-scope imports in the original order: RMM is
    # reconfigured first, then CuPy is imported and given the RMM allocator.
    import rmm
    from rmm.allocators.cupy import rmm_cupy_allocator

    rmm.reinitialize(
        managed_memory=True,
        pool_allocator=False,
    )
    import cupy as cp

    cp.cuda.set_allocator(rmm_cupy_allocator)

    k = 15
    print("Loading dataset...............")
    Y = np.load('dataset.npy').astype(np.float32)
    print("Dataset loaded......")

    build_params = cagra.IndexParams(
        metric="sqeuclidean", build_algo="nn_descent", **build_kwargs
    )
    # The timer starts before the index build, so the elapsed time printed
    # at the end covers both the build and the search.
    start = time.time()
    print("Building index....")
    index = cagra.build(build_params, Y)
    print("Index built.......")

    n_samples = Y.shape[0]
    print("Initializing neighbors and distances array...............")
    all_neighbors = np.zeros((n_samples, k), dtype=np.int32)
    all_distances = np.zeros((n_samples, k), dtype=np.float32)

    batchsize = 65000
    n_batches = math.ceil(n_samples / batchsize)
    print(f"Starting processing of batches {n_batches}")

    # The search parameters do not change between batches; build them once.
    search_params = cagra.SearchParams(**search_kwargs)
    for batch in tqdm.tqdm(range(n_batches)):
        start_idx = batch * batchsize
        stop_idx = min(start_idx + batchsize, n_samples)
        # Move only the current slice of queries to the device.
        batch_Y = device_ndarray(Y[start_idx:stop_idx, :])
        distances, neighbors = cagra.search(
            search_params, index, batch_Y, k
        )
        all_neighbors[start_idx:stop_idx, :] = neighbors.copy_to_host()
        all_distances[start_idx:stop_idx, :] = distances.copy_to_host()

    print(f"\nKNN computation completed in {time.time() - start:.2f} seconds")

    # The index uses the "sqeuclidean" metric; take the square root of the
    # collected distances and return them instead of discarding the result.
    all_distances = np.sqrt(all_distances)
    return all_neighbors, all_distances
# Run the reproduction only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Expected behavior
Both runs (direct execution on the node and the SLURM `sbatch` job) should take roughly the same time.
Environment details (please complete the following information):
- Environment location: HPC
- Method of RAFT install: conda with rapids-24.12