Error converting Webdataset
Hello, I am trying to convert a imagenet sharded as webdataset tar files into an ffcv dataset.
I downloaded the ffcv-imagnet repo https://github.com/libffcv/ffcv-imagenet
and modified it as best as I could tell from the webdataset tutorial
def pipeline(dataset):
dataset.decode('rgb8').to_tuple('jpg;png;jpeg cls')
@section('cfg')
@param('dataset')
@param('split')
@param('data_dir')
@param('write_path')
@param('max_resolution')
@param('num_workers')
@param('chunk_size')
@param('subset')
@param('jpeg_quality')
@param('write_mode')
@param('compress_probability')
def main(dataset, split, data_dir, write_path, max_resolution, num_workers,
chunk_size, subset, jpeg_quality, write_mode,
compress_probability):
my_shards = glob(path.join(data_dir, f'*{split}*'))
writer = DatasetWriter(write_path, {
'image': RGBImageField(write_mode=write_mode,
max_resolution=max_resolution,
compress_probability=compress_probability,
jpeg_quality=jpeg_quality),
'label': IntField(),
}, num_workers=num_workers)
writer.from_webdataset(my_shards, pipeline=pipeline)
I am getting an error
Traceback (most recent call last):
File "/home/cc/ffcv-imagenet/write_imagenet_web.py", line 67, in <module>
main()
File "/home/cc/miniconda3/envs/ffcv/lib/python3.9/site-packages/fastargs/decorators.py", line 41, in __call__
raise e
File "/home/cc/miniconda3/envs/ffcv/lib/python3.9/site-packages/fastargs/decorators.py", line 35, in __call__
return self.func(*args, **filled_args)
File "/home/cc/ffcv-imagenet/write_imagenet_web.py", line 58, in main
writer.from_webdataset(my_shards, pipeline=pipeline)
File "/home/cc/miniconda3/envs/ffcv/lib/python3.9/site-packages/ffcv/writer.py", line 311, in from_webdataset
lengths = thread_map(counter, shards, max_workers=self.num_workers)
File "/home/cc/miniconda3/envs/ffcv/lib/python3.9/site-packages/tqdm/contrib/concurrent.py", line 94, in thread_map
return _executor_map(ThreadPoolExecutor, fn, *iterables, **tqdm_kwargs)
File "/home/cc/miniconda3/envs/ffcv/lib/python3.9/site-packages/tqdm/contrib/concurrent.py", line 76, in _executor_map
return list(tqdm_class(ex.map(fn, *iterables, **map_args), **kwargs))
File "/home/cc/miniconda3/envs/ffcv/lib/python3.9/site-packages/tqdm/std.py", line 1195, in __iter__
for obj in iterable:
File "/home/cc/miniconda3/envs/ffcv/lib/python3.9/concurrent/futures/_base.py", line 609, in result_iterator
yield fs.pop().result()
File "/home/cc/miniconda3/envs/ffcv/lib/python3.9/concurrent/futures/_base.py", line 446, in result
return self.__get_result()
File "/home/cc/miniconda3/envs/ffcv/lib/python3.9/concurrent/futures/_base.py", line 391, in __get_result
raise self._exception
File "/home/cc/miniconda3/envs/ffcv/lib/python3.9/concurrent/futures/thread.py", line 58, in run
result = self.fn(*self.args, **self.kwargs)
File "/home/cc/miniconda3/envs/ffcv/lib/python3.9/site-packages/ffcv/writer.py", line 36, in count_samples_in_shard
for _ in from_shard(shard, pipeline):
TypeError: 'NoneType' object is not iterable
it looks like an error with either the WebDataset or the pipeline (from writer.py)
def from_shard(shard, pipeline):
# We import webdataset here so that it desn't crash if it's not required
# (Webdataset is an optional depdency)
from webdataset import WebDataset
dataset = WebDataset(shard)
dataset = pipeline(dataset)
return dataset
def count_samples_in_shard(shard, pipeline):
#
# We count the length of the dataset
# We are not using __len__ since it might not be implemented
count = 0
print(shard)
for _ in from_shard(shard, pipeline):
count += 1
return count
what could cause the dataset to return None? I double checked that the glob is correct and returns the right paths to the tar files when called. I'm kinda at a loss here.
Hi @codestar12 ! Sorry for the late response here---is this resolved? If not, can you double check that you're able to iterate over the WebDataset itself (i.e., completely independently of FFCV)?