Error in Distconv Input Layer Error Signal Mini-batch
An Identity layer with a parallel strategy defined crashes on the head of develop on Lassen with distconv enabled. LBANN
is built against the heads of Hydrogen, DiHydrogen, and Aluminum, with NVSHMEM enabled. bp_setup raises a failed assertion.
The error message is:
bp_setup Assertion (int)get_original_error_signals().get_local_shape()[-1] (2) == l.get_error_signals().LocalWidth() (0) failed.
The assertion is located here. l.get_error_signals().LocalWidth() should be the mini-batch size but returns 0. The error appears to be present on all ranks.
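Note that the numbers in the message line up with the scripts below: with 11 samples and a mini-batch size of 9, the final mini-batch holds 11 % 9 = 2 samples, matching the (2) reported for get_local_shape()[-1]. A minimal sketch of that arithmetic, with constants copied from the two scripts:

NUM_SAMPLES = 11      # from dataset.py
MINI_BATCH_SIZE = 9   # from distconv_identity_test.py
# Size of the final, partial mini-batch; matches the "(2)" in the assertion
print(NUM_SAMPLES % MINI_BATCH_SIZE)  # -> 2

This suggests, though does not prove, that the mismatch surfaces on the last, smaller mini-batch.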
Minimal sample code to recreate the error:

Input_layer_distconv_bug
├── dataset.py
└── distconv_identity_test.py
dataset.py is a simple random data generator:
import numpy as np

NUM_SAMPLES = 11
SAMPLE_SIZE = 32

_data = np.random.rand(NUM_SAMPLES, SAMPLE_SIZE)

def num_train_samples():
    return NUM_SAMPLES

def sample_dims():
    return (SAMPLE_SIZE,)

def get_sample(index):
    return _data[index]
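As a quick sanity check outside of LBANN, the reader hooks can be exercised directly (this snippet is illustrative and not part of the original report):

import dataset

assert dataset.num_train_samples() == 11
assert dataset.sample_dims() == (32,)
assert dataset.get_sample(0).shape == (32,)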
distconv_identity_test.py defines a small compute graph that fails:
import argparse
import os
import os.path

import lbann
import lbann.contrib.args
import lbann.contrib.launcher

desc = "Distconv Identity Test"
parser = argparse.ArgumentParser(description=desc)
lbann.contrib.args.add_scheduler_arguments(parser)
args = parser.parse_args()

MINI_BATCH_SIZE = 9  # NUM_SAMPLES is 11, so the final mini-batch has 2 samples
SAMPLE_SIZE = 32
NUM_GROUPS = 4
NUM_EPOCHS = 2

def create_parallel_strategy(num_channel_groups):
    # Parallel strategy for the distconv-enabled layers
    return {"channel_groups": num_channel_groups,
            "filter_groups": num_channel_groups}

def get_reader():
    # Python data reader backed by dataset.py
    reader = lbann.reader_pb2.DataReader()
    _reader = reader.reader.add()
    _reader.name = 'python'
    _reader.role = 'train'
    _reader.shuffle = False
    _reader.percent_of_data_to_use = 1.0
    _reader.python.module = 'dataset'
    _reader.python.module_dir = os.path.dirname(os.path.realpath(__file__))
    _reader.python.sample_function = 'get_sample'
    _reader.python.num_samples_function = 'num_train_samples'
    _reader.python.sample_dims_function = 'sample_dims'
    return reader

def main():
    _inputs = lbann.Input(data_field='samples')
    # Single slice covering the whole sample
    sliced_inputs = lbann.Slice(_inputs, slice_points=(0, SAMPLE_SIZE))
    _data = lbann.Identity(sliced_inputs,
                           name="input_data_data_parallel")

    # Weights summed into the data path so the backward pass
    # generates error signals for the input layer
    x_weights = lbann.Weights(optimizer=lbann.SGD(),
                              initializer=lbann.ConstantInitializer(value=0.0),
                              name='input_weights')
    _data = lbann.Sum(_data,
                      lbann.WeightsLayer(weights=x_weights,
                                         dims=[SAMPLE_SIZE]))

    # Reshape to a 3D tensor and hand off to distconv-enabled layers
    _data = lbann.Reshape(_data, dims=[SAMPLE_SIZE, 1, 1])
    _data = lbann.Identity(_data,
                           name="input_data_distconv",
                           parallel_strategy=create_parallel_strategy(NUM_GROUPS))
    _data = lbann.Relu(_data,
                       name="distconv_activation",
                       parallel_strategy=create_parallel_strategy(NUM_GROUPS))
    y = lbann.L2Norm2(_data)

    print_model = lbann.CallbackPrintModelDescription()
    callbacks = [print_model]
    model = lbann.Model(NUM_EPOCHS,
                        layers=lbann.traverse_layer_graph(_inputs),
                        objective_function=[y],
                        callbacks=callbacks)
    opt = lbann.NoOptimizer()
    data_reader = get_reader()
    trainer = lbann.Trainer(mini_batch_size=MINI_BATCH_SIZE)
    kwargs = lbann.contrib.args.get_scheduler_kwargs(args)
    lbann.contrib.launcher.run(trainer, model, data_reader, opt, **kwargs)

if __name__ == '__main__':
    main()
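Since create_parallel_strategy(NUM_GROUPS) asks for 4 channel/filter groups, the job presumably needs at least 4 ranks. A typical launch through the scheduler arguments registered above might look like the following (the exact flag names added by lbann.contrib.args.add_scheduler_arguments are an assumption here):

python distconv_identity_test.py --nodes 1 --procs-per-node 4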
Active environment variables:
LBANN_KEEP_ERROR_SIGNALS=1
LBANN_INIT_NVSHMEM=1
LBANN_NUM_IO_THREADS=1
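For completeness, the same variables can also be set through the Python front end rather than the shell; a hedged sketch, assuming lbann.contrib.launcher.run forwards an environment dict to the generated batch script:

# Hypothetical alternative to exporting the variables in the shell
lbann.contrib.launcher.run(
    trainer, model, data_reader, opt,
    environment={
        'LBANN_KEEP_ERROR_SIGNALS': 1,
        'LBANN_INIT_NVSHMEM': 1,
        'LBANN_NUM_IO_THREADS': 1,
    },
    **kwargs)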