NaN when training DeepAR
- PyTorch-Forecasting version: 0.9.1
- PyTorch version: 1.10.0+cu102
- Python version: 3.9.7
- Operating System: Red Hat Enterprise Linux 7.9
Expected behavior
I am trying to train a DeepAR model on my data set.
Actual behavior
I get a ValueError, and the RNN weights and outputs contain NaN. A typical failing case is a time series that is all zeros except for a single very large spike, as constructed in the reproduction code below.
It seems related to max_encoder_length and max_prediction_length, since changing these values influences whether the error occurs. Using a different target_normalizer also influences whether the error appears, as in the sketch below.
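A sketch of the kind of experiment meant here, reusing the names from the reproduction code below (EncoderNormalizer is already imported there; GroupNormalizer from pytorch_forecasting.data is another option):

training = TimeSeriesDataSet(
    df[lambda x: x.time_idx < training_cutoff],
    time_idx="time_idx",
    target="value",
    group_ids=["group"],
    max_encoder_length=max_encoder_length,
    max_prediction_length=max_prediction_length,
    time_varying_unknown_reals=["value"],
    time_varying_known_reals=["time_idx"],
    # Explicit normalizer instead of the default chosen by the library;
    # which normalizer is used changes whether training produces NaN.
    target_normalizer=EncoderNormalizer(),
)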
Code to reproduce the problem
import numpy as np
import pandas as pd
import random
import torch
import pytorch_lightning as pl
from pytorch_forecasting import TimeSeriesDataSet
from pytorch_forecasting.data.encoders import EncoderNormalizer
from pytorch_forecasting.models import DeepAR
import warnings

warnings.filterwarnings("ignore")

# Single group of 100 points: all zeros except one spike of order 1e5 at t=70
length = 100
np.random.seed(3)
data = np.zeros(length)
df = pd.DataFrame({"value": data, "time_idx": range(length)})
df.iloc[70, 0] = np.random.rand(1) * 10**5
df["group"] = 0
print(df)

# Fix all seeds for reproducibility
random.seed(0)
torch.manual_seed(0)
np.random.seed(0)
max_encoder_length = 37
max_prediction_length = 6
training_cutoff = df["time_idx"].max() - max_prediction_length

training = TimeSeriesDataSet(
    df[lambda x: x.time_idx < training_cutoff],
    time_idx="time_idx",
    target="value",
    group_ids=["group"],
    max_encoder_length=max_encoder_length,
    max_prediction_length=max_prediction_length,
    time_varying_unknown_reals=["value"],
    time_varying_known_reals=["time_idx"],
)
validation = TimeSeriesDataSet.from_dataset(
    training, df, min_prediction_idx=training.index.time.max() + 1, stop_randomization=True
)

batch_size = 4
train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=2)
val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size, num_workers=2)
net = DeepAR.from_dataset(
    training,
    learning_rate=0.001,
)
print(f"Number of parameters in network: {net.size() / 1e3:.1f}k")

trainer = pl.Trainer(
    gpus=0,
    gradient_clip_val=0.1,
    max_epochs=5,
)
trainer.fit(
    net,
    train_dataloader=train_dataloader,
    val_dataloaders=val_dataloader,
)
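As a side note, PyTorch's built-in anomaly detection can help locate the operation where the NaN first appears in the backward pass (a debugging sketch; it slows training noticeably, so enable it only while diagnosing):

# Debugging sketch: report the first backward operation that produces NaN.
torch.autograd.set_detect_anomaly(True)
trainer.fit(
    net,
    train_dataloader=train_dataloader,
    val_dataloaders=val_dataloader,
)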
Traceback:
ValueError Traceback (most recent call last)
/tmp/ipykernel_25063/3740955615.py in <module>
----> 1 trainer.fit(
2 net,
3 train_dataloader=train_dataloader,
4 val_dataloaders=val_dataloader,
5 )
~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py in fit(self, model, train_dataloaders, val_dataloaders, datamodule, train_dataloader)
550 self.checkpoint_connector.resume_start()
551
--> 552 self._run(model)
553
554 assert self.state.stopped
~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py in _run(self, model)
920
921 # dispatch `start_training` or `start_evaluating` or `start_predicting`
--> 922 self._dispatch()
923
924 # plugin will finalized fitting (e.g. ddp_spawn will load trained model)
~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py in _dispatch(self)
988 self.accelerator.start_predicting(self)
989 else:
--> 990 self.accelerator.start_training(self)
991
992 def run_stage(self):
~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_lightning/accelerators/accelerator.py in start_training(self, trainer)
90
91 def start_training(self, trainer: "pl.Trainer") -> None:
---> 92 self.training_type_plugin.start_training(trainer)
93
94 def start_evaluating(self, trainer: "pl.Trainer") -> None:
~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py in start_training(self, trainer)
159 def start_training(self, trainer: "pl.Trainer") -> None:
160 # double dispatch to initiate the training loop
--> 161 self._results = trainer.run_stage()
162
163 def start_evaluating(self, trainer: "pl.Trainer") -> None:
~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py in run_stage(self)
998 if self.predicting:
999 return self._run_predict()
-> 1000 return self._run_train()
1001
1002 def _pre_training_routine(self):
~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py in _run_train(self)
1047 # reset trainer on this loop and all child loops in case user connected a custom loop
1048 self.fit_loop.trainer = self
-> 1049 self.fit_loop.run()
1050 except KeyboardInterrupt:
1051 rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")
~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_lightning/loops/base.py in run(self, *args, **kwargs)
109 try:
110 self.on_advance_start(*args, **kwargs)
--> 111 self.advance(*args, **kwargs)
112 self.on_advance_end()
113 self.iteration_count += 1
~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_lightning/loops/fit_loop.py in advance(self)
198 with self.trainer.profiler.profile("run_training_epoch"):
199 # run train epoch
--> 200 epoch_output = self.epoch_loop.run(train_dataloader)
201
202 if epoch_output is None:
~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_lightning/loops/base.py in run(self, *args, **kwargs)
109 try:
110 self.on_advance_start(*args, **kwargs)
--> 111 self.advance(*args, **kwargs)
112 self.on_advance_end()
113 self.iteration_count += 1
~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py in advance(self, dataloader_iter, **kwargs)
128
129 with self.trainer.profiler.profile("run_training_batch"):
--> 130 batch_output = self.batch_loop.run(batch, self.iteration_count, self._dataloader_idx)
131 self.batches_seen += 1
132
~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py in run(self, batch, batch_idx, dataloader_idx)
98 self.trainer.fit_loop.epoch_loop.batch_progress.increment_started()
99
--> 100 super().run(batch, batch_idx, dataloader_idx)
101 output = AttributeDict(signal=0, training_step_output=self.batch_outputs)
102 self.batch_outputs = None # free memory
~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_lightning/loops/base.py in run(self, *args, **kwargs)
109 try:
110 self.on_advance_start(*args, **kwargs)
--> 111 self.advance(*args, **kwargs)
112 self.on_advance_end()
113 self.iteration_count += 1
~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py in advance(self, batch, batch_idx, dataloader_idx)
145 self.optim_progress.optimizer_idx = opt_idx
146
--> 147 result = self._run_optimization(batch_idx, split_batch, opt_idx, optimizer)
148 if result:
149 self.batch_outputs[opt_idx].append(deepcopy(result.training_step_output))
~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py in _run_optimization(self, batch_idx, split_batch, opt_idx, optimizer)
199 else:
200 if self.trainer.lightning_module.automatic_optimization:
--> 201 self._optimizer_step(optimizer, opt_idx, batch_idx, closure)
202 else:
203 result = self._training_step(split_batch, batch_idx, opt_idx, self._hiddens)
~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py in _optimizer_step(self, optimizer, opt_idx, batch_idx, train_step_and_backward_closure)
393
394 # model hook
--> 395 model_ref.optimizer_step(
396 self.trainer.current_epoch,
397 batch_idx,
~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_lightning/core/lightning.py in optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu, using_native_amp, using_lbfgs)
1614
1615 """
-> 1616 optimizer.step(closure=optimizer_closure)
1617
1618 def optimizer_zero_grad(self, epoch: int, batch_idx: int, optimizer: Optimizer, optimizer_idx: int):
~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_lightning/core/optimizer.py in step(self, closure, **kwargs)
204 profiler_name = f"optimizer_step_and_closure_{self._optimizer_idx}"
205
--> 206 self.__optimizer_step(closure=closure, profiler_name=profiler_name, **kwargs)
207 self._total_optimizer_step_calls += 1
208
~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_lightning/core/optimizer.py in __optimizer_step(self, closure, profiler_name, **kwargs)
126
127 with trainer.profiler.profile(profiler_name):
--> 128 trainer.accelerator.optimizer_step(self._optimizer, self._optimizer_idx, lambda_closure=closure, **kwargs)
129
130 def step(self, closure: Optional[Callable] = None, **kwargs):
~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_lightning/accelerators/accelerator.py in optimizer_step(self, optimizer, opt_idx, lambda_closure, **kwargs)
294 )
295 if make_optimizer_step:
--> 296 self.run_optimizer_step(optimizer, opt_idx, lambda_closure, **kwargs)
297 self.precision_plugin.post_optimizer_step(optimizer, opt_idx)
298 self.training_type_plugin.post_optimizer_step(optimizer, opt_idx, **kwargs)
~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_lightning/accelerators/accelerator.py in run_optimizer_step(self, optimizer, optimizer_idx, lambda_closure, **kwargs)
301 self, optimizer: Optimizer, optimizer_idx: int, lambda_closure: Callable, **kwargs: Any
302 ) -> None:
--> 303 self.training_type_plugin.optimizer_step(optimizer, lambda_closure=lambda_closure, **kwargs)
304
305 def optimizer_zero_grad(self, current_epoch: int, batch_idx: int, optimizer: Optimizer, opt_idx: int) -> None:
~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py in optimizer_step(self, optimizer, lambda_closure, **kwargs)
224
225 def optimizer_step(self, optimizer: torch.optim.Optimizer, lambda_closure: Callable, **kwargs):
--> 226 optimizer.step(closure=lambda_closure, **kwargs)
227
228 @property
~/.local/lib/python3.9/site-packages/torch/optim/optimizer.py in wrapper(*args, **kwargs)
86 profile_name = "Optimizer.step#{}.step".format(obj.__class__.__name__)
87 with torch.autograd.profiler.record_function(profile_name):
---> 88 return func(*args, **kwargs)
89 return wrapper
90
~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_forecasting/optim.py in step(self, closure)
129 closure: A closure that reevaluates the model and returns the loss.
130 """
--> 131 _ = closure()
132 loss = None
133 # note - below is commented out b/c I have other work that passes back
~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py in _training_step_and_backward_closure(self, split_batch, batch_idx, opt_idx, optimizer, hiddens, return_result)
233 """
234
--> 235 result = self.training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hiddens)
236 if result is not None:
237 return_result.update(result)
~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py in training_step_and_backward(self, split_batch, batch_idx, opt_idx, optimizer, hiddens)
534 with self.trainer.profiler.profile("training_step_and_backward"):
535 # lightning module hook
--> 536 result = self._training_step(split_batch, batch_idx, opt_idx, hiddens)
537
538 if not self._skip_backward and self.trainer.lightning_module.automatic_optimization:
~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py in _training_step(self, split_batch, batch_idx, opt_idx, hiddens)
304 model_ref._current_fx_name = "training_step"
305 with self.trainer.profiler.profile("training_step"):
--> 306 training_step_output = self.trainer.accelerator.training_step(step_kwargs)
307 self.trainer.accelerator.post_training_step()
308
~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_lightning/accelerators/accelerator.py in training_step(self, step_kwargs)
191 """
192 with self.precision_plugin.train_step_context(), self.training_type_plugin.train_step_context():
--> 193 return self.training_type_plugin.training_step(*step_kwargs.values())
194
195 def post_training_step(self) -> None:
~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py in training_step(self, *args, **kwargs)
170
171 def training_step(self, *args, **kwargs):
--> 172 return self.model.training_step(*args, **kwargs)
173
174 def post_training_step(self):
~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_forecasting/models/base_model.py in training_step(self, batch, batch_idx)
359 """
360 x, y = batch
--> 361 log, out = self.step(x, y, batch_idx)
362 log.update(self.create_log(x, y, out, batch_idx))
363 return log
~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_forecasting/models/base_model.py in step(self, x, y, batch_idx, **kwargs)
499 )
500 else:
--> 501 loss = self.loss(prediction, y)
502
503 self.log(f"{['val', 'train'][self.training]}_loss", loss, on_step=self.training, on_epoch=True, prog_bar=True)
~/.local/lib/python3.9/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1100 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1101 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1102 return forward_call(*input, **kwargs)
1103 # Do not call functions when jit is used
1104 full_backward_hooks, non_full_backward_hooks = [], []
~/.local/lib/python3.9/site-packages/torchmetrics/metric.py in forward(self, *args, **kwargs)
195
196 with torch.no_grad():
--> 197 self.update(*args, **kwargs)
198
199 if self.compute_on_step:
~/.local/lib/python3.9/site-packages/torchmetrics/metric.py in wrapped_func(*args, **kwargs)
253 self._computed = None
254 self._update_called = True
--> 255 return update(*args, **kwargs)
256
257 return wrapped_func
~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_forecasting/metrics.py in update(self, y_pred, target)
534 lengths = torch.full((target.size(0),), fill_value=target.size(1), dtype=torch.long, device=target.device)
535
--> 536 losses = self.loss(y_pred, target)
537 # weight samples
538 if weight is not None:
~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_forecasting/metrics.py in loss(self, y_pred, y_actual)
959 torch.Tensor: metric value on which backpropagation can be applied
960 """
--> 961 distribution = self.map_x_to_distribution(y_pred)
962 loss = -distribution.log_prob(y_actual)
963 return loss
~/.conda/envs/pytorch-cu102/lib/python3.9/site-packages/pytorch_forecasting/metrics.py in map_x_to_distribution(self, x)
1033
1034 def map_x_to_distribution(self, x: torch.Tensor) -> distributions.Normal:
-> 1035 return self.distribution_class(loc=x[..., 0], scale=x[..., 1])
1036
1037 def rescale_parameters(
~/.local/lib/python3.9/site-packages/torch/distributions/normal.py in __init__(self, loc, scale, validate_args)
48 else:
49 batch_shape = self.loc.size()
---> 50 super(Normal, self).__init__(batch_shape, validate_args=validate_args)
51
52 def expand(self, batch_shape, _instance=None):
~/.local/lib/python3.9/site-packages/torch/distributions/distribution.py in __init__(self, batch_shape, event_shape, validate_args)
53 valid = constraint.check(value)
54 if not valid.all():
---> 55 raise ValueError(
56 f"Expected parameter {param} "
57 f"({type(value).__name__} of shape {tuple(value.shape)}) "
ValueError: Expected parameter loc (Tensor of shape (4, 6)) of distribution Normal(loc: torch.Size([4, 6]), scale: torch.Size([4, 6])) to satisfy the constraint Real(), but found invalid values:
tensor([[nan, nan, nan, nan, nan, nan],
[nan, nan, nan, nan, nan, nan],
[nan, nan, nan, nan, nan, nan],
[nan, nan, nan, nan, nan, nan]], requires_grad=True)
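The ValueError itself is raised by torch.distributions argument validation: by the time the loss maps the network output to a Normal distribution, the predicted loc is already NaN. A minimal standalone illustration of the same check (assuming default argument validation, as in PyTorch 1.10):

import torch
from torch.distributions import Normal

loc = torch.full((4, 6), float("nan"), requires_grad=True)
scale = torch.ones(4, 6)
# Raises: Expected parameter loc ... to satisfy the constraint Real(), but found invalid values
Normal(loc=loc, scale=scale)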
I'm facing the same issue. Have you reached any conclusions about why this is happening?
You can try gradient clipping; it fixed it in my case.
Exact same issue here, resolved by using gradient clipping. Thanks @15m43lk4155y
Exact same issue, but gradient clipping does not work for me. I have tried gradient_clip_val=0.1, 0.5, 0.6, and 1.0.
Do you mean this gradient clipping, @15m43lk4155y?

trainer = pl.Trainer(
    gpus=[0] if torch.cuda.is_available() else None,
    max_epochs=max_epochs,
    gradient_clip_val=0.1,  # <----- this arg?
    callbacks=[early_stop_callback, model_checkpt],
    log_every_n_steps=50,
)
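For reference, gradient_clip_val is indeed the Trainer argument that enables clipping. By default Lightning clips the gradient norm; clipping element-wise gradient values instead may also be worth a try (a sketch, assuming a Lightning version that supports gradient_clip_algorithm):

trainer = pl.Trainer(
    gpus=[0] if torch.cuda.is_available() else None,
    max_epochs=max_epochs,
    gradient_clip_val=0.1,
    gradient_clip_algorithm="value",  # clip each gradient element instead of the norm
    callbacks=[early_stop_callback, model_checkpt],
    log_every_n_steps=50,
)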