DNABERT_2
CUDA out of memory error
Hi, I am trying to fine-tune DNABERT-2 and DNABERT-S on my data. The dataset has roughly 800k sequences of about 500 nucleotides each, and I am fine-tuning on Kaggle's P100 GPU. I get a CUDA out-of-memory error unless I use only a very small subset of the data; on the full dataset, trainer.train() got through about 70k sequences before crashing.
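For context, this is roughly how the run is set up (a minimal sketch paraphrased from my notebook; the checkpoint name is the DNABERT-2 model on the Hub, but the column names, hyperparameters, and the train_df/test_df DataFrames are illustrative, not my exact code):

import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)

# DNABERT-2 checkpoint from the Hugging Face Hub; trust_remote_code is needed for its custom model code
model_name = "zhihan1996/DNABERT-2-117M"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=2, trust_remote_code=True
)

def tokenize(batch):
    # sequences are ~500 nt each
    return tokenizer(batch["sequence"], truncation=True, padding="max_length", max_length=128)

# train_df / test_df are pandas DataFrames with "sequence" and "labels" columns (illustrative)
train_dataset = Dataset.from_pandas(train_df).map(tokenize, batched=True)
test_dataset = Dataset.from_pandas(test_df).map(tokenize, batched=True)

training_args = TrainingArguments(
    output_dir="dnabert2-finetune",
    per_device_train_batch_size=16,   # illustrative values
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    evaluation_strategy="epoch",
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

The error from trainer.train() on the full dataset is below.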
---------------------------------------------------------------------------
OutOfMemoryError Traceback (most recent call last)
<ipython-input-10-9f78b7486d29> in <cell line: 2>()
1 # Step 6: Train the model
----> 2 trainer.train()
/usr/local/lib/python3.10/dist-packages/transformers/trainer.py in train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1631 self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size
1632 )
-> 1633 return inner_training_loop(
1634 args=args,
1635 resume_from_checkpoint=resume_from_checkpoint,
/usr/local/lib/python3.10/dist-packages/transformers/trainer.py in _inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
1992
1993 self.control = self.callback_handler.on_epoch_end(args, self.state, self.control)
-> 1994 self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval)
1995
1996 if DebugOption.TPU_METRICS_DEBUG in self.args.debug:
/usr/local/lib/python3.10/dist-packages/transformers/trainer.py in _maybe_log_save_evaluate(self, tr_loss, model, trial, epoch, ignore_keys_for_eval)
2234 )
2235 else:
-> 2236 metrics = self.evaluate(ignore_keys=ignore_keys_for_eval)
2237 self._report_to_hp_search(trial, self.state.global_step, metrics)
2238
/usr/local/lib/python3.10/dist-packages/transformers/trainer.py in evaluate(self, eval_dataset, ignore_keys, metric_key_prefix)
2930
2931 eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop
-> 2932 output = eval_loop(
2933 eval_dataloader,
2934 description="Evaluation",
/usr/local/lib/python3.10/dist-packages/transformers/trainer.py in evaluation_loop(self, dataloader, description, prediction_loss_only, ignore_keys, metric_key_prefix)
3138 if self.preprocess_logits_for_metrics is not None:
3139 logits = self.preprocess_logits_for_metrics(logits, labels)
-> 3140 preds_host = logits if preds_host is None else nested_concat(preds_host, logits, padding_index=-100)
3141 self.control = self.callback_handler.on_prediction_step(args, self.state, self.control)
3142
/usr/local/lib/python3.10/dist-packages/transformers/trainer_pt_utils.py in nested_concat(tensors, new_tensors, padding_index)
111 ), f"Expected `tensors` and `new_tensors` to have the same type but found {type(tensors)} and {type(new_tensors)}."
112 if isinstance(tensors, (list, tuple)):
--> 113 return type(tensors)(nested_concat(t, n, padding_index=padding_index) for t, n in zip(tensors, new_tensors))
114 elif isinstance(tensors, torch.Tensor):
115 return torch_pad_and_concatenate(tensors, new_tensors, padding_index=padding_index)
/usr/local/lib/python3.10/dist-packages/transformers/trainer_pt_utils.py in <genexpr>(.0)
111 ), f"Expected `tensors` and `new_tensors` to have the same type but found {type(tensors)} and {type(new_tensors)}."
112 if isinstance(tensors, (list, tuple)):
--> 113 return type(tensors)(nested_concat(t, n, padding_index=padding_index) for t, n in zip(tensors, new_tensors))
114 elif isinstance(tensors, torch.Tensor):
115 return torch_pad_and_concatenate(tensors, new_tensors, padding_index=padding_index)
/usr/local/lib/python3.10/dist-packages/transformers/trainer_pt_utils.py in nested_concat(tensors, new_tensors, padding_index)
113 return type(tensors)(nested_concat(t, n, padding_index=padding_index) for t, n in zip(tensors, new_tensors))
114 elif isinstance(tensors, torch.Tensor):
--> 115 return torch_pad_and_concatenate(tensors, new_tensors, padding_index=padding_index)
116 elif isinstance(tensors, Mapping):
117 return type(tensors)(
/usr/local/lib/python3.10/dist-packages/transformers/trainer_pt_utils.py in torch_pad_and_concatenate(tensor1, tensor2, padding_index)
78
79 # Now let's fill the result tensor
---> 80 result = tensor1.new_full(new_shape, padding_index)
81 result[: tensor1.shape[0], : tensor1.shape[1]] = tensor1
82 result[tensor1.shape[0] :, : tensor2.shape[1]] = tensor2
OutOfMemoryError: CUDA out of memory. Tried to allocate 6.99 GiB. GPU 0 has a total capacity of 15.89 GiB of which 6.99 GiB is free. Process 3145 has 8.90 GiB memory in use. Of the allocated memory 8.33 GiB is allocated by PyTorch, and 281.04 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
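The error message itself suggests PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. If that is relevant here, my understanding is that it has to be set before the first CUDA allocation, e.g. at the very top of the notebook:

import os

# Read when the CUDA caching allocator is initialized, so set it before
# any tensor is placed on the GPU.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import torch  # imported only after the environment variable is set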
Interestingly, when I ran the same code with just 20k sequences for training and 6k for inference, it crashed with a CUDA out-of-memory error during inference (trainer.predict) instead. The error message from that run is below.
---------------------------------------------------------------------------
OutOfMemoryError Traceback (most recent call last)
<ipython-input-13-941869335d74> in <cell line: 1>()
----> 1 predictions = trainer.predict(test_dataset)
2
3 y_test = test_dataset['labels']
4 y_pred = predictions.predictions[0].argmax(-1)
5
/usr/local/lib/python3.10/dist-packages/transformers/trainer.py in predict(self, test_dataset, ignore_keys, metric_key_prefix)
3006
3007 eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop
-> 3008 output = eval_loop(
3009 test_dataloader, description="Prediction", ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix
3010 )
/usr/local/lib/python3.10/dist-packages/transformers/trainer.py in evaluation_loop(self, dataloader, description, prediction_loss_only, ignore_keys, metric_key_prefix)
3138 if self.preprocess_logits_for_metrics is not None:
3139 logits = self.preprocess_logits_for_metrics(logits, labels)
-> 3140 preds_host = logits if preds_host is None else nested_concat(preds_host, logits, padding_index=-100)
3141 self.control = self.callback_handler.on_prediction_step(args, self.state, self.control)
3142
/usr/local/lib/python3.10/dist-packages/transformers/trainer_pt_utils.py in nested_concat(tensors, new_tensors, padding_index)
111 ), f"Expected `tensors` and `new_tensors` to have the same type but found {type(tensors)} and {type(new_tensors)}."
112 if isinstance(tensors, (list, tuple)):
--> 113 return type(tensors)(nested_concat(t, n, padding_index=padding_index) for t, n in zip(tensors, new_tensors))
114 elif isinstance(tensors, torch.Tensor):
115 return torch_pad_and_concatenate(tensors, new_tensors, padding_index=padding_index)
/usr/local/lib/python3.10/dist-packages/transformers/trainer_pt_utils.py in <genexpr>(.0)
111 ), f"Expected `tensors` and `new_tensors` to have the same type but found {type(tensors)} and {type(new_tensors)}."
112 if isinstance(tensors, (list, tuple)):
--> 113 return type(tensors)(nested_concat(t, n, padding_index=padding_index) for t, n in zip(tensors, new_tensors))
114 elif isinstance(tensors, torch.Tensor):
115 return torch_pad_and_concatenate(tensors, new_tensors, padding_index=padding_index)
/usr/local/lib/python3.10/dist-packages/transformers/trainer_pt_utils.py in nested_concat(tensors, new_tensors, padding_index)
113 return type(tensors)(nested_concat(t, n, padding_index=padding_index) for t, n in zip(tensors, new_tensors))
114 elif isinstance(tensors, torch.Tensor):
--> 115 return torch_pad_and_concatenate(tensors, new_tensors, padding_index=padding_index)
116 elif isinstance(tensors, Mapping):
117 return type(tensors)(
/usr/local/lib/python3.10/dist-packages/transformers/trainer_pt_utils.py in torch_pad_and_concatenate(tensor1, tensor2, padding_index)
78
79 # Now let's fill the result tensor
---> 80 result = tensor1.new_full(new_shape, padding_index)
81 result[: tensor1.shape[0], : tensor1.shape[1]] = tensor1
82 result[tensor1.shape[0] :, : tensor2.shape[1]] = tensor2
OutOfMemoryError: CUDA out of memory. Tried to allocate 6.99 GiB. GPU 0 has a total capacity of 15.89 GiB of which 7.01 GiB is free. Process 2689 has 8.88 GiB memory in use. Of the allocated memory 8.34 GiB is allocated by PyTorch, and 252.77 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
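In both tracebacks the failing allocation is inside nested_concat / torch_pad_and_concatenate, i.e. while the Trainer is concatenating the accumulated prediction logits on the GPU, not in the forward or backward pass itself. The workaround I was considering, based on the Trainer's eval_accumulation_steps and preprocess_logits_for_metrics arguments, is roughly this (the specific values are guesses, and model / train_dataset / test_dataset are as in the sketch above):

from transformers import Trainer, TrainingArguments

def preprocess_logits_for_metrics(logits, labels):
    # DNABERT-2 returns a tuple of outputs; the class logits are the first element.
    if isinstance(logits, tuple):
        logits = logits[0]
    # Keep only the predicted class per example so the Trainer does not have to
    # accumulate the full logits tensor across the whole evaluation set.
    return logits.argmax(dim=-1)

training_args = TrainingArguments(
    output_dir="dnabert2-finetune",
    per_device_eval_batch_size=8,   # smaller eval batches
    eval_accumulation_steps=50,     # move accumulated predictions from GPU to CPU every 50 eval steps
    # ... other arguments as before ...
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)

With this, predictions.predictions from trainer.predict(test_dataset) would already contain class indices, so I would drop the .argmax(-1) call when computing y_pred.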
Please let me know how to resolve this.