datasets icon indicating copy to clipboard operation
datasets copied to clipboard

Random 400 Client Error when pushing dataset

Open msis opened this issue 3 years ago • 0 comments

Describe the bug

When pushing a dataset, the client randomly fails with `400 Client Error: Bad Request for url: ...`. On the next call, a new parquet file is created for each shard. The failure can occur at any shard.

Steps to reproduce the bug

dataset.push_to_hub("ORG/DATASET", private=True, branch="main")

Expected results

The entire dataset is pushed to the Hub with no duplicates. If an upload fails, the client should either retry or raise an error, and a subsequent call should continue from the last failed shard.

Actual results

---------------------------------------------------------------------------
HTTPError                                 Traceback (most recent call last)
testing.ipynb Cell 29 in <cell line: 1>()
----> [1](testing.ipynb?line=0) dataset.push_to_hub("ORG/DATASET", private=True, branch="main")

File ~/.local/lib/python3.9/site-packages/datasets/arrow_dataset.py:4297, in Dataset.push_to_hub(self, repo_id, split, private, token, branch, max_shard_size, shard_size, embed_external_files)
   4291     warnings.warn(
   4292         "'shard_size' was renamed to 'max_shard_size' in version 2.1.1 and will be removed in 2.4.0.",
   4293         FutureWarning,
   4294     )
   4295     max_shard_size = shard_size
-> 4297 repo_id, split, uploaded_size, dataset_nbytes, repo_files, deleted_size = self._push_parquet_shards_to_hub(
   4298     repo_id=repo_id,
   4299     split=split,
   4300     private=private,
   4301     token=token,
   4302     branch=branch,
   4303     max_shard_size=max_shard_size,
   4304     embed_external_files=embed_external_files,
   4305 )
   4306 organization, dataset_name = repo_id.split("/")
   4307 info_to_dump = self.info.copy()

File ~/.local/lib/python3.9/site-packages/datasets/arrow_dataset.py:4195, in Dataset._push_parquet_shards_to_hub(self, repo_id, split, private, token, branch, max_shard_size, embed_external_files)
   4193         shard.to_parquet(buffer)
   4194         uploaded_size += buffer.tell()
-> 4195         _retry(
   4196             api.upload_file,
   4197             func_kwargs=dict(
   4198                 path_or_fileobj=buffer.getvalue(),
   4199                 path_in_repo=shard_path_in_repo,
   4200                 repo_id=repo_id,
   4201                 token=token,
   4202                 repo_type="dataset",
   4203                 revision=branch,
   4204                 identical_ok=False,
   4205             ),
   4206             exceptions=HTTPError,
   4207             status_codes=[504],
   4208             base_wait_time=2.0,
   4209             max_retries=5,
   4210             max_wait_time=20.0,
   4211         )
   4212     shards_path_in_repo.append(shard_path_in_repo)
   4214 # Cleanup to remove unused files

File ~/.local/lib/python3.9/site-packages/datasets/utils/file_utils.py:284, in _retry(func, func_args, func_kwargs, exceptions, status_codes, max_retries, base_wait_time, max_wait_time)
    282 except exceptions as err:
    283     if retry >= max_retries or (status_codes and err.response.status_code not in status_codes):
--> 284         raise err
    285     else:
    286         sleep_time = min(max_wait_time, base_wait_time * 2**retry)  # Exponential backoff

File ~/.local/lib/python3.9/site-packages/datasets/utils/file_utils.py:281, in _retry(func, func_args, func_kwargs, exceptions, status_codes, max_retries, base_wait_time, max_wait_time)
    279 while True:
    280     try:
--> 281         return func(*func_args, **func_kwargs)
    282     except exceptions as err:
    283         if retry >= max_retries or (status_codes and err.response.status_code not in status_codes):

File ~/.local/lib/python3.9/site-packages/huggingface_hub/hf_api.py:1967, in HfApi.upload_file(self, path_or_fileobj, path_in_repo, repo_id, token, repo_type, revision, identical_ok, commit_message, commit_description, create_pr)
   1957 commit_message = (
   1958     commit_message
   1959     if commit_message is not None
   1960     else f"Upload {path_in_repo} with huggingface_hub"
   1961 )
   1962 operation = CommitOperationAdd(
   1963     path_or_fileobj=path_or_fileobj,
   1964     path_in_repo=path_in_repo,
   1965 )
-> 1967 pr_url = self.create_commit(
   1968     repo_id=repo_id,
   1969     repo_type=repo_type,
   1970     operations=[operation],
   1971     commit_message=commit_message,
   1972     commit_description=commit_description,
   1973     token=token,
   1974     revision=revision,
   1975     create_pr=create_pr,
   1976 )
   1977 if pr_url is not None:
   1978     re_match = re.match(REGEX_DISCUSSION_URL, pr_url)

File ~/.local/lib/python3.9/site-packages/huggingface_hub/hf_api.py:1844, in HfApi.create_commit(self, repo_id, operations, commit_message, commit_description, token, repo_type, revision, create_pr, num_threads)
   1836 commit_url = f"{self.endpoint}/api/{repo_type}s/{repo_id}/commit/{revision}"
   1838 commit_resp = requests.post(
   1839     url=commit_url,
   1840     headers={"Authorization": f"Bearer {token}"},
   1841     json=commit_payload,
   1842     params={"create_pr": 1} if create_pr else None,
   1843 )
-> 1844 _raise_for_status(commit_resp)
   1845 return commit_resp.json().get("pullRequestUrl", None)

File ~/.local/lib/python3.9/site-packages/huggingface_hub/utils/_errors.py:84, in _raise_for_status(request)
     76 if request.status_code == 401:
     77     # The repo was not found and the user is not Authenticated
     78     raise RepositoryNotFoundError(
     79         f"401 Client Error: Repository Not Found for url: {request.url}. If the"
     80         " repo is private, make sure you are authenticated. (Request ID:"
     81         f" {request_id})"
     82     )
---> 84 _raise_with_request_id(request)

File ~/.local/lib/python3.9/site-packages/huggingface_hub/utils/_errors.py:95, in _raise_with_request_id(request)
     92 if request_id is not None and len(e.args) > 0 and isinstance(e.args[0], str):
     93     e.args = (e.args[0] + f" (Request ID: {request_id})",) + e.args[1:]
---> 95 raise e

File ~/.local/lib/python3.9/site-packages/huggingface_hub/utils/_errors.py:90, in _raise_with_request_id(request)
     88 request_id = request.headers.get("X-Request-Id")
     89 try:
---> 90     request.raise_for_status()
     91 except Exception as e:
     92     if request_id is not None and len(e.args) > 0 and isinstance(e.args[0], str):

File ~/.local/lib/python3.9/site-packages/requests/models.py:1021, in Response.raise_for_status(self)
   1016     http_error_msg = (
   1017         f"{self.status_code} Server Error: {reason} for url: {self.url}"
   1018     )
   1020 if http_error_msg:
-> 1021     raise HTTPError(http_error_msg, response=self)

HTTPError: 400 Client Error: Bad Request for url: https://huggingface.co/api/datasets/ORG/DATASET/commit/main (Request ID: a_F0IQAHJdxGKVRYyu1cF)

Environment info

  • datasets version: 2.3.2
  • Platform: Linux-5.13.0-1025-aws-x86_64-with-glibc2.31
  • Python version: 3.9.4
  • PyArrow version: 8.0.0
  • Pandas version: 1.4.3

msis avatar Jul 12 '22 15:07 msis