datasets
datasets copied to clipboard
Random 400 Client Error when pushing dataset
Describe the bug
When pushing a dataset, the client randomly fails with `400 Client Error: Bad Request for url: ...`.
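On the next call to `push_to_hub`, a new Parquet file is created for each shard, so the upload starts over (producing duplicates) instead of resuming from the shard that failed.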
The client may fail at any random shard.
Steps to reproduce the bug
dataset.push_to_hub("ORG/DATASET", private=True, branch="main")
Expected results
The whole dataset is pushed to the Hub with no duplicates. If an upload fails, the client should either retry or raise an error, and a subsequent call should continue from the last failed shard.
Actual results
---------------------------------------------------------------------------
HTTPError Traceback (most recent call last)
testing.ipynb Cell 29 in <cell line: 1>()
----> [1](testing.ipynb?line=0) dataset.push_to_hub("ORG/DATASET", private=True, branch="main")
File ~/.local/lib/python3.9/site-packages/datasets/arrow_dataset.py:4297, in Dataset.push_to_hub(self, repo_id, split, private, token, branch, max_shard_size, shard_size, embed_external_files)
4291 warnings.warn(
4292 "'shard_size' was renamed to 'max_shard_size' in version 2.1.1 and will be removed in 2.4.0.",
4293 FutureWarning,
4294 )
4295 max_shard_size = shard_size
-> 4297 repo_id, split, uploaded_size, dataset_nbytes, repo_files, deleted_size = self._push_parquet_shards_to_hub(
4298 repo_id=repo_id,
4299 split=split,
4300 private=private,
4301 token=token,
4302 branch=branch,
4303 max_shard_size=max_shard_size,
4304 embed_external_files=embed_external_files,
4305 )
4306 organization, dataset_name = repo_id.split("/")
4307 info_to_dump = self.info.copy()
File ~/.local/lib/python3.9/site-packages/datasets/arrow_dataset.py:4195, in Dataset._push_parquet_shards_to_hub(self, repo_id, split, private, token, branch, max_shard_size, embed_external_files)
4193 shard.to_parquet(buffer)
4194 uploaded_size += buffer.tell()
-> 4195 _retry(
4196 api.upload_file,
4197 func_kwargs=dict(
4198 path_or_fileobj=buffer.getvalue(),
4199 path_in_repo=shard_path_in_repo,
4200 repo_id=repo_id,
4201 token=token,
4202 repo_type="dataset",
4203 revision=branch,
4204 identical_ok=False,
4205 ),
4206 exceptions=HTTPError,
4207 status_codes=[504],
4208 base_wait_time=2.0,
4209 max_retries=5,
4210 max_wait_time=20.0,
4211 )
4212 shards_path_in_repo.append(shard_path_in_repo)
4214 # Cleanup to remove unused files
File ~/.local/lib/python3.9/site-packages/datasets/utils/file_utils.py:284, in _retry(func, func_args, func_kwargs, exceptions, status_codes, max_retries, base_wait_time, max_wait_time)
282 except exceptions as err:
283 if retry >= max_retries or (status_codes and err.response.status_code not in status_codes):
--> 284 raise err
285 else:
286 sleep_time = min(max_wait_time, base_wait_time * 2**retry) # Exponential backoff
File ~/.local/lib/python3.9/site-packages/datasets/utils/file_utils.py:281, in _retry(func, func_args, func_kwargs, exceptions, status_codes, max_retries, base_wait_time, max_wait_time)
279 while True:
280 try:
--> 281 return func(*func_args, **func_kwargs)
282 except exceptions as err:
283 if retry >= max_retries or (status_codes and err.response.status_code not in status_codes):
File ~/.local/lib/python3.9/site-packages/huggingface_hub/hf_api.py:1967, in HfApi.upload_file(self, path_or_fileobj, path_in_repo, repo_id, token, repo_type, revision, identical_ok, commit_message, commit_description, create_pr)
1957 commit_message = (
1958 commit_message
1959 if commit_message is not None
1960 else f"Upload {path_in_repo} with huggingface_hub"
1961 )
1962 operation = CommitOperationAdd(
1963 path_or_fileobj=path_or_fileobj,
1964 path_in_repo=path_in_repo,
1965 )
-> 1967 pr_url = self.create_commit(
1968 repo_id=repo_id,
1969 repo_type=repo_type,
1970 operations=[operation],
1971 commit_message=commit_message,
1972 commit_description=commit_description,
1973 token=token,
1974 revision=revision,
1975 create_pr=create_pr,
1976 )
1977 if pr_url is not None:
1978 re_match = re.match(REGEX_DISCUSSION_URL, pr_url)
File ~/.local/lib/python3.9/site-packages/huggingface_hub/hf_api.py:1844, in HfApi.create_commit(self, repo_id, operations, commit_message, commit_description, token, repo_type, revision, create_pr, num_threads)
1836 commit_url = f"{self.endpoint}/api/{repo_type}s/{repo_id}/commit/{revision}"
1838 commit_resp = requests.post(
1839 url=commit_url,
1840 headers={"Authorization": f"Bearer {token}"},
1841 json=commit_payload,
1842 params={"create_pr": 1} if create_pr else None,
1843 )
-> 1844 _raise_for_status(commit_resp)
1845 return commit_resp.json().get("pullRequestUrl", None)
File ~/.local/lib/python3.9/site-packages/huggingface_hub/utils/_errors.py:84, in _raise_for_status(request)
76 if request.status_code == 401:
77 # The repo was not found and the user is not Authenticated
78 raise RepositoryNotFoundError(
79 f"401 Client Error: Repository Not Found for url: {request.url}. If the"
80 " repo is private, make sure you are authenticated. (Request ID:"
81 f" {request_id})"
82 )
---> 84 _raise_with_request_id(request)
File ~/.local/lib/python3.9/site-packages/huggingface_hub/utils/_errors.py:95, in _raise_with_request_id(request)
92 if request_id is not None and len(e.args) > 0 and isinstance(e.args[0], str):
93 e.args = (e.args[0] + f" (Request ID: {request_id})",) + e.args[1:]
---> 95 raise e
File ~/.local/lib/python3.9/site-packages/huggingface_hub/utils/_errors.py:90, in _raise_with_request_id(request)
88 request_id = request.headers.get("X-Request-Id")
89 try:
---> 90 request.raise_for_status()
91 except Exception as e:
92 if request_id is not None and len(e.args) > 0 and isinstance(e.args[0], str):
File ~/.local/lib/python3.9/site-packages/requests/models.py:1021, in Response.raise_for_status(self)
1016 http_error_msg = (
1017 f"{self.status_code} Server Error: {reason} for url: {self.url}"
1018 )
1020 if http_error_msg:
-> 1021 raise HTTPError(http_error_msg, response=self)
HTTPError: 400 Client Error: Bad Request for url: https://huggingface.co/api/datasets/ORG/DATASET/commit/main (Request ID: a_F0IQAHJdxGKVRYyu1cF)
Environment info
- `datasets` version: 2.3.2
- Platform: Linux-5.13.0-1025-aws-x86_64-with-glibc2.31
- Python version: 3.9.4
- PyArrow version: 8.0.0
- Pandas version: 1.4.3