Loading the dataset from private S3 bucket gives "TypeError: cannot pickle '_contextvars.Context' object"
Describe the bug
I'm trying to read a Parquet file from a private S3 bucket using the `load_dataset` function, but I get the error `TypeError: cannot pickle '_contextvars.Context' object`.
I'm working on a machine with a `~/.aws/credentials` file. I can't share the credentials or the path to a file in a private bucket for obvious reasons, but I'll include as much output as I can.
Steps to reproduce the bug
# Minimal reproduction: imports needed to reach the private bucket.
import s3fs
from datasets import load_dataset
from aiobotocore.session import get_session
# Placeholder path; the real bucket name and key are withheld.
DATA_PATH = "s3://bucket_name/path/validation.parquet"
# The filesystem is built around an explicit aiobotocore session object;
# that same object is what later appears in fs.storage_options.
fs = s3fs.S3FileSystem(session=get_session())
`fs.stat` returns the file's metadata, so the filesystem works and we have the required permissions:
# Sanity check: the file is reachable with the current credentials.
fs.stat(DATA_PATH)
# Returns:
# {'ETag': '"123123a-19"',
# 'LastModified': datetime.datetime(2023, 11, 1, 10, 16, 57, tzinfo=tzutc()),
# 'size': 312237170,
# 'name': 'bucket_name/path/validation.parquet',
# 'type': 'file',
# 'StorageClass': 'STANDARD',
# 'VersionId': 'Abc.HtmsC9h.as',
# 'ContentType': 'binary/octet-stream'}
# The storage options carry the live session object, not plain credentials.
fs.storage_options
# Returns:
# {'session': <aiobotocore.session.AioSession at 0x7f9193fa53c0>}
# This call fails: the traceback below shows DownloadConfig.copy() deep-copying
# every field, which presumably reaches the AioSession held in storage_options
# and ends in "cannot pickle '_contextvars.Context' object".
ds = load_dataset("parquet", data_files={"train": DATA_PATH}, storage_options=fs.storage_options)
This raises the following error (expandable):
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Cell In[88], line 1
----> 1 ds = load_dataset("parquet", data_files={"train": DATA_PATH}, storage_options=fs.storage_options)
File ~/miniconda3/envs/test-env/lib/python3.10/site-packages/datasets/load.py:2153, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, ignore_verifications, keep_in_memory, save_infos, revision, token, use_auth_token, task, streaming, num_proc, storage_options, **config_kwargs)
2150 try_from_hf_gcs = path not in _PACKAGED_DATASETS_MODULES
2152 # Download and prepare data
-> 2153 builder_instance.download_and_prepare(
2154 download_config=download_config,
2155 download_mode=download_mode,
2156 verification_mode=verification_mode,
2157 try_from_hf_gcs=try_from_hf_gcs,
2158 num_proc=num_proc,
2159 storage_options=storage_options,
2160 )
2162 # Build dataset for splits
2163 keep_in_memory = (
2164 keep_in_memory if keep_in_memory is not None else is_small_dataset(builder_instance.info.dataset_size)
2165 )
File ~/miniconda3/envs/test-env/lib/python3.10/site-packages/datasets/builder.py:954, in DatasetBuilder.download_and_prepare(self, output_dir, download_config, download_mode, verification_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, file_format, max_shard_size, num_proc, storage_options, **download_and_prepare_kwargs)
952 if num_proc is not None:
953 prepare_split_kwargs["num_proc"] = num_proc
--> 954 self._download_and_prepare(
955 dl_manager=dl_manager,
956 verification_mode=verification_mode,
957 **prepare_split_kwargs,
958 **download_and_prepare_kwargs,
959 )
960 # Sync info
961 self.info.dataset_size = sum(split.num_bytes for split in self.info.splits.values())
File ~/miniconda3/envs/test-env/lib/python3.10/site-packages/datasets/builder.py:1027, in DatasetBuilder._download_and_prepare(self, dl_manager, verification_mode, **prepare_split_kwargs)
1025 split_dict = SplitDict(dataset_name=self.dataset_name)
1026 split_generators_kwargs = self._make_split_generators_kwargs(prepare_split_kwargs)
-> 1027 split_generators = self._split_generators(dl_manager, **split_generators_kwargs)
1029 # Checksums verification
1030 if verification_mode == VerificationMode.ALL_CHECKS and dl_manager.record_checksums:
File ~/miniconda3/envs/test-env/lib/python3.10/site-packages/datasets/packaged_modules/parquet/parquet.py:34, in Parquet._split_generators(self, dl_manager)
32 if not self.config.data_files:
33 raise ValueError(f"At least one data file must be specified, but got data_files={self.config.data_files}")
---> 34 data_files = dl_manager.download_and_extract(self.config.data_files)
35 if isinstance(data_files, (str, list, tuple)):
36 files = data_files
File ~/miniconda3/envs/test-env/lib/python3.10/site-packages/datasets/download/download_manager.py:565, in DownloadManager.download_and_extract(self, url_or_urls)
549 def download_and_extract(self, url_or_urls):
550 """Download and extract given `url_or_urls`.
551
552 Is roughly equivalent to:
(...)
563 extracted_path(s): `str`, extracted paths of given URL(s).
564 """
--> 565 return self.extract(self.download(url_or_urls))
File ~/miniconda3/envs/test-env/lib/python3.10/site-packages/datasets/download/download_manager.py:420, in DownloadManager.download(self, url_or_urls)
401 def download(self, url_or_urls):
402 """Download given URL(s).
403
404 By default, only one process is used for download. Pass customized `download_config.num_proc` to change this behavior.
(...)
418 ```
419 """
--> 420 download_config = self.download_config.copy()
421 download_config.extract_compressed_file = False
422 if download_config.download_desc is None:
File ~/miniconda3/envs/test-env/lib/python3.10/site-packages/datasets/download/download_config.py:94, in DownloadConfig.copy(self)
93 def copy(self) -> "DownloadConfig":
---> 94 return self.__class__(**{k: copy.deepcopy(v) for k, v in self.__dict__.items()})
File ~/miniconda3/envs/test-env/lib/python3.10/site-packages/datasets/download/download_config.py:94, in <dictcomp>(.0)
93 def copy(self) -> "DownloadConfig":
---> 94 return self.__class__(**{k: copy.deepcopy(v) for k, v in self.__dict__.items()})
File ~/miniconda3/envs/test-env/lib/python3.10/copy.py:146, in deepcopy(x, memo, _nil)
144 copier = _deepcopy_dispatch.get(cls)
145 if copier is not None:
--> 146 y = copier(x, memo)
147 else:
148 if issubclass(cls, type):
File ~/miniconda3/envs/test-env/lib/python3.10/copy.py:231, in _deepcopy_dict(x, memo, deepcopy)
229 memo[id(x)] = y
230 for key, value in x.items():
--> 231 y[deepcopy(key, memo)] = deepcopy(value, memo)
232 return y
File ~/miniconda3/envs/test-env/lib/python3.10/copy.py:172, in deepcopy(x, memo, _nil)
170 y = x
171 else:
--> 172 y = _reconstruct(x, memo, *rv)
174 # If is its own copy, don't memoize.
175 if y is not x:
File ~/miniconda3/envs/test-env/lib/python3.10/copy.py:271, in _reconstruct(x, memo, func, args, state, listiter, dictiter, deepcopy)
269 if state is not None:
270 if deep:
--> 271 state = deepcopy(state, memo)
272 if hasattr(y, '__setstate__'):
273 y.__setstate__(state)
File ~/miniconda3/envs/test-env/lib/python3.10/copy.py:146, in deepcopy(x, memo, _nil)
144 copier = _deepcopy_dispatch.get(cls)
145 if copier is not None:
--> 146 y = copier(x, memo)
147 else:
148 if issubclass(cls, type):
File ~/miniconda3/envs/test-env/lib/python3.10/copy.py:231, in _deepcopy_dict(x, memo, deepcopy)
229 memo[id(x)] = y
230 for key, value in x.items():
--> 231 y[deepcopy(key, memo)] = deepcopy(value, memo)
232 return y
File ~/miniconda3/envs/test-env/lib/python3.10/copy.py:172, in deepcopy(x, memo, _nil)
170 y = x
171 else:
--> 172 y = _reconstruct(x, memo, *rv)
174 # If is its own copy, don't memoize.
175 if y is not x:
File ~/miniconda3/envs/test-env/lib/python3.10/copy.py:271, in _reconstruct(x, memo, func, args, state, listiter, dictiter, deepcopy)
269 if state is not None:
270 if deep:
--> 271 state = deepcopy(state, memo)
272 if hasattr(y, '__setstate__'):
273 y.__setstate__(state)
[... skipping similar frames: _deepcopy_dict at line 231 (2 times), deepcopy at line 146 (2 times)]
File ~/miniconda3/envs/test-env/lib/python3.10/copy.py:172, in deepcopy(x, memo, _nil)
170 y = x
171 else:
--> 172 y = _reconstruct(x, memo, *rv)
174 # If is its own copy, don't memoize.
175 if y is not x:
File ~/miniconda3/envs/test-env/lib/python3.10/copy.py:271, in _reconstruct(x, memo, func, args, state, listiter, dictiter, deepcopy)
269 if state is not None:
270 if deep:
--> 271 state = deepcopy(state, memo)
272 if hasattr(y, '__setstate__'):
273 y.__setstate__(state)
[... skipping similar frames: deepcopy at line 146 (1 times)]
File ~/miniconda3/envs/test-env/lib/python3.10/copy.py:231, in _deepcopy_dict(x, memo, deepcopy)
229 memo[id(x)] = y
230 for key, value in x.items():
--> 231 y[deepcopy(key, memo)] = deepcopy(value, memo)
232 return y
File ~/miniconda3/envs/test-env/lib/python3.10/copy.py:146, in deepcopy(x, memo, _nil)
144 copier = _deepcopy_dispatch.get(cls)
145 if copier is not None:
--> 146 y = copier(x, memo)
147 else:
148 if issubclass(cls, type):
File ~/miniconda3/envs/test-env/lib/python3.10/copy.py:206, in _deepcopy_list(x, memo, deepcopy)
204 append = y.append
205 for a in x:
--> 206 append(deepcopy(a, memo))
207 return y
File ~/miniconda3/envs/test-env/lib/python3.10/copy.py:172, in deepcopy(x, memo, _nil)
170 y = x
171 else:
--> 172 y = _reconstruct(x, memo, *rv)
174 # If is its own copy, don't memoize.
175 if y is not x:
File ~/miniconda3/envs/test-env/lib/python3.10/copy.py:271, in _reconstruct(x, memo, func, args, state, listiter, dictiter, deepcopy)
269 if state is not None:
270 if deep:
--> 271 state = deepcopy(state, memo)
272 if hasattr(y, '__setstate__'):
273 y.__setstate__(state)
File ~/miniconda3/envs/test-env/lib/python3.10/copy.py:146, in deepcopy(x, memo, _nil)
144 copier = _deepcopy_dispatch.get(cls)
145 if copier is not None:
--> 146 y = copier(x, memo)
147 else:
148 if issubclass(cls, type):
File ~/miniconda3/envs/test-env/lib/python3.10/copy.py:231, in _deepcopy_dict(x, memo, deepcopy)
229 memo[id(x)] = y
230 for key, value in x.items():
--> 231 y[deepcopy(key, memo)] = deepcopy(value, memo)
232 return y
File ~/miniconda3/envs/test-env/lib/python3.10/copy.py:146, in deepcopy(x, memo, _nil)
144 copier = _deepcopy_dispatch.get(cls)
145 if copier is not None:
--> 146 y = copier(x, memo)
147 else:
148 if issubclass(cls, type):
File ~/miniconda3/envs/test-env/lib/python3.10/copy.py:238, in _deepcopy_method(x, memo)
237 def _deepcopy_method(x, memo): # Copy instance methods
--> 238 return type(x)(x.__func__, deepcopy(x.__self__, memo))
File ~/miniconda3/envs/test-env/lib/python3.10/copy.py:172, in deepcopy(x, memo, _nil)
170 y = x
171 else:
--> 172 y = _reconstruct(x, memo, *rv)
174 # If is its own copy, don't memoize.
175 if y is not x:
File ~/miniconda3/envs/test-env/lib/python3.10/copy.py:271, in _reconstruct(x, memo, func, args, state, listiter, dictiter, deepcopy)
269 if state is not None:
270 if deep:
--> 271 state = deepcopy(state, memo)
272 if hasattr(y, '__setstate__'):
273 y.__setstate__(state)
File ~/miniconda3/envs/test-env/lib/python3.10/copy.py:146, in deepcopy(x, memo, _nil)
144 copier = _deepcopy_dispatch.get(cls)
145 if copier is not None:
--> 146 y = copier(x, memo)
147 else:
148 if issubclass(cls, type):
File ~/miniconda3/envs/test-env/lib/python3.10/copy.py:231, in _deepcopy_dict(x, memo, deepcopy)
229 memo[id(x)] = y
230 for key, value in x.items():
--> 231 y[deepcopy(key, memo)] = deepcopy(value, memo)
232 return y
File ~/miniconda3/envs/test-env/lib/python3.10/copy.py:146, in deepcopy(x, memo, _nil)
144 copier = _deepcopy_dispatch.get(cls)
145 if copier is not None:
--> 146 y = copier(x, memo)
147 else:
148 if issubclass(cls, type):
File ~/miniconda3/envs/test-env/lib/python3.10/copy.py:231, in _deepcopy_dict(x, memo, deepcopy)
229 memo[id(x)] = y
230 for key, value in x.items():
--> 231 y[deepcopy(key, memo)] = deepcopy(value, memo)
232 return y
File ~/miniconda3/envs/test-env/lib/python3.10/copy.py:172, in deepcopy(x, memo, _nil)
170 y = x
171 else:
--> 172 y = _reconstruct(x, memo, *rv)
174 # If is its own copy, don't memoize.
175 if y is not x:
File ~/miniconda3/envs/test-env/lib/python3.10/copy.py:271, in _reconstruct(x, memo, func, args, state, listiter, dictiter, deepcopy)
269 if state is not None:
270 if deep:
--> 271 state = deepcopy(state, memo)
272 if hasattr(y, '__setstate__'):
273 y.__setstate__(state)
[... skipping similar frames: _deepcopy_dict at line 231 (3 times), deepcopy at line 146 (3 times), deepcopy at line 172 (3 times), _reconstruct at line 271 (2 times)]
File ~/miniconda3/envs/test-env/lib/python3.10/copy.py:271, in _reconstruct(x, memo, func, args, state, listiter, dictiter, deepcopy)
269 if state is not None:
270 if deep:
--> 271 state = deepcopy(state, memo)
272 if hasattr(y, '__setstate__'):
273 y.__setstate__(state)
[... skipping similar frames: _deepcopy_dict at line 231 (1 times), deepcopy at line 146 (1 times)]
File ~/miniconda3/envs/test-env/lib/python3.10/copy.py:146, in deepcopy(x, memo, _nil)
144 copier = _deepcopy_dispatch.get(cls)
145 if copier is not None:
--> 146 y = copier(x, memo)
147 else:
148 if issubclass(cls, type):
File ~/miniconda3/envs/test-env/lib/python3.10/copy.py:231, in _deepcopy_dict(x, memo, deepcopy)
229 memo[id(x)] = y
230 for key, value in x.items():
--> 231 y[deepcopy(key, memo)] = deepcopy(value, memo)
232 return y
File ~/miniconda3/envs/test-env/lib/python3.10/copy.py:172, in deepcopy(x, memo, _nil)
170 y = x
171 else:
--> 172 y = _reconstruct(x, memo, *rv)
174 # If is its own copy, don't memoize.
175 if y is not x:
File ~/miniconda3/envs/test-env/lib/python3.10/copy.py:265, in _reconstruct(x, memo, func, args, state, listiter, dictiter, deepcopy)
263 if deep and args:
264 args = (deepcopy(arg, memo) for arg in args)
--> 265 y = func(*args)
266 if deep:
267 memo[id(x)] = y
File ~/miniconda3/envs/test-env/lib/python3.10/copy.py:264, in <genexpr>(.0)
262 deep = memo is not None
263 if deep and args:
--> 264 args = (deepcopy(arg, memo) for arg in args)
265 y = func(*args)
266 if deep:
File ~/miniconda3/envs/test-env/lib/python3.10/copy.py:146, in deepcopy(x, memo, _nil)
144 copier = _deepcopy_dispatch.get(cls)
145 if copier is not None:
--> 146 y = copier(x, memo)
147 else:
148 if issubclass(cls, type):
File ~/miniconda3/envs/test-env/lib/python3.10/copy.py:211, in _deepcopy_tuple(x, memo, deepcopy)
210 def _deepcopy_tuple(x, memo, deepcopy=deepcopy):
--> 211 y = [deepcopy(a, memo) for a in x]
212 # We're not going to put the tuple in the memo, but it's still important we
213 # check for it, in case the tuple contains recursive mutable structures.
214 try:
File ~/miniconda3/envs/test-env/lib/python3.10/copy.py:211, in <listcomp>(.0)
210 def _deepcopy_tuple(x, memo, deepcopy=deepcopy):
--> 211 y = [deepcopy(a, memo) for a in x]
212 # We're not going to put the tuple in the memo, but it's still important we
213 # check for it, in case the tuple contains recursive mutable structures.
214 try:
File ~/miniconda3/envs/test-env/lib/python3.10/copy.py:172, in deepcopy(x, memo, _nil)
170 y = x
171 else:
--> 172 y = _reconstruct(x, memo, *rv)
174 # If is its own copy, don't memoize.
175 if y is not x:
File ~/miniconda3/envs/test-env/lib/python3.10/copy.py:271, in _reconstruct(x, memo, func, args, state, listiter, dictiter, deepcopy)
269 if state is not None:
270 if deep:
--> 271 state = deepcopy(state, memo)
272 if hasattr(y, '__setstate__'):
273 y.__setstate__(state)
File ~/miniconda3/envs/test-env/lib/python3.10/copy.py:146, in deepcopy(x, memo, _nil)
144 copier = _deepcopy_dispatch.get(cls)
145 if copier is not None:
--> 146 y = copier(x, memo)
147 else:
148 if issubclass(cls, type):
File ~/miniconda3/envs/test-env/lib/python3.10/copy.py:211, in _deepcopy_tuple(x, memo, deepcopy)
210 def _deepcopy_tuple(x, memo, deepcopy=deepcopy):
--> 211 y = [deepcopy(a, memo) for a in x]
212 # We're not going to put the tuple in the memo, but it's still important we
213 # check for it, in case the tuple contains recursive mutable structures.
214 try:
File ~/miniconda3/envs/test-env/lib/python3.10/copy.py:211, in <listcomp>(.0)
210 def _deepcopy_tuple(x, memo, deepcopy=deepcopy):
--> 211 y = [deepcopy(a, memo) for a in x]
212 # We're not going to put the tuple in the memo, but it's still important we
213 # check for it, in case the tuple contains recursive mutable structures.
214 try:
File ~/miniconda3/envs/test-env/lib/python3.10/copy.py:146, in deepcopy(x, memo, _nil)
144 copier = _deepcopy_dispatch.get(cls)
145 if copier is not None:
--> 146 y = copier(x, memo)
147 else:
148 if issubclass(cls, type):
File ~/miniconda3/envs/test-env/lib/python3.10/copy.py:231, in _deepcopy_dict(x, memo, deepcopy)
229 memo[id(x)] = y
230 for key, value in x.items():
--> 231 y[deepcopy(key, memo)] = deepcopy(value, memo)
232 return y
File ~/miniconda3/envs/test-env/lib/python3.10/copy.py:161, in deepcopy(x, memo, _nil)
159 reductor = getattr(x, "__reduce_ex__", None)
160 if reductor is not None:
--> 161 rv = reductor(4)
162 else:
163 reductor = getattr(x, "__reduce__", None)
TypeError: cannot pickle '_contextvars.Context' object
Expected behavior
If I load a file from a public bucket and pass `anon=True`, everything works, so I expected loading from a private bucket to work as well.
Environment info
- `datasets` version: 2.14.6
- Platform: macOS-10.16-x86_64-i386-64bit
- Python version: 3.10.13
- Huggingface_hub version: 0.19.1
- PyArrow version: 14.0.1
- Pandas version: 1.5.3
- s3fs version: 2023.10.0
- fsspec version: 2023.10.0
- aiobotocore version: 2.7.0
I have encountered the same problem with datasets-2.20.0.
I found the following workaround for this issue (including the fix from #6598):
- specify the AWS profile name in the `storage_options` instead of passing an existing session object
- use a custom `DownloadConfig` object to fix #6598
- pass the `storage_options` to the `DownloadConfig`
import warnings

from datasets import load_dataset, DownloadConfig
# Fix for DownloadConfig from https://github.com/huggingface/datasets/issues/6598#issuecomment-1986699619
class ReviseDownloadConfig(DownloadConfig):
    """`DownloadConfig` subclass with a reduced `__post_init__` hook.

    The override does not call the parent hook; it only maps the deprecated
    `use_auth_token` argument onto `token`.

    NOTE(review): per the linked issue #6598 comment, the stock hook does
    additional work that this workaround is meant to skip -- confirm against
    the installed `datasets` version.
    """

    def __post_init__(self, use_auth_token):
        """Translate the deprecated `use_auth_token` argument into `token`.

        Args:
            use_auth_token: Value of the deprecated init argument. The string
                "deprecated" is treated as "not supplied" and leaves
                `self.token` untouched.
        """
        if use_auth_token != "deprecated":
            warnings.warn(
                "'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n"
                f"You can remove this warning by passing 'token={use_auth_token}' instead.",
                FutureWarning,
            )
            # Only override the token when the deprecated argument was actually
            # supplied; otherwise the sentinel string would replace a real token.
            self.token = use_auth_token
# Authenticate through a named AWS profile instead of a live session object.
storage_options = {"profile": "my-aws-profile-name"}

# The same options are handed both to load_dataset and to the patched config.
download_config = ReviseDownloadConfig(storage_options=storage_options)

ds = load_dataset(
    "parquet",
    data_files={"train": DATA_PATH},
    storage_options=storage_options,
    download_config=download_config,
)