Job Manager crash on ApiResponseError
The job manager crashed on an ApiResponseError:
The error:
File "/data/users/Private/joris.c/lcfm-production/notebooks/sentinel1-jm.py", line 92, in <module>
job_manager.run_jobs(
File "/home/joris.c/mambaforge/envs/lcfm-production/lib/python3.11/site-packages/openeo/extra/job_management.py", line 365, in run_jobs
self._launch_job(start_job, df, i, backend_name)
File "/home/joris.c/mambaforge/envs/lcfm-production/lib/python3.11/site-packages/openeo/extra/job_management.py", line 400, in _launch_job
job = start_job(
^^^^^^^^^^
File "/data/users/Private/joris.c/lcfm-production/src/sentinel1/pipeline.py", line 87, in start_job
secondary_result = result_datacube.result_node()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/joris.c/mambaforge/envs/lcfm-production/lib/python3.11/site-packages/openeo/rest/connection.py", line 1764, in create_job
response = self.post("/jobs", json=pg_with_metadata, expected_status=201)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/joris.c/mambaforge/envs/lcfm-production/lib/python3.11/site-packages/openeo/rest/connection.py", line 249, in post
return self.request("post", path=path, json=json, allow_redirects=False, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/joris.c/mambaforge/envs/lcfm-production/lib/python3.11/site-packages/openeo/rest/connection.py", line 816, in request
return _request()
^^^^^^^^^^
File "/home/joris.c/mambaforge/envs/lcfm-production/lib/python3.11/site-packages/openeo/rest/connection.py", line 809, in _request
return super(Connection, self).request(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/joris.c/mambaforge/envs/lcfm-production/lib/python3.11/site-packages/openeo/rest/connection.py", line 187, in request
self._raise_api_error(resp)
File "/home/joris.c/mambaforge/envs/lcfm-production/lib/python3.11/site-packages/openeo/rest/connection.py", line 207, in _raise_api_error
raise OpenEoApiError(
openeo.rest.OpenEoApiError: [500] Internal: Server error: EjrApiResponseError('EJR API error: 500 \'Internal Server Error\' on `POST \'https://jobregistry.vgt.vito.be/jobs\'`: {"statusCode":500,"message":"Could not store jobs in database: illegal_argument_exception - no write index is defined for alias [openeo-jobs-prod]. The write index may be explicitly disabled using is_write_index=false or the alias points to multiple indices without one being designated as a write index"}') (ref: r-24092514fc8a4f3c96a98bd6ec4230c2)
openeo.rest.OpenEoApiError: [500] Internal: Server error: EjrApiResponseError('EJR API error: 500 'Internal Server Error' on POST \'https://jobregistry.vgt.vito.be/jobs\': {"statusCode":500,"message":"Could not store jobs in database: illegal_argument_exception - no write index is defined for alias [openeo-jobs-prod]. The write index may be explicitly disabled using is_write_index=false or the alias points to multiple indices without one being designated as a write index"}') (ref: r-24092514fc8a4f3c96a98bd6ec4230c2)
This is a back-end issue: creation of the job failed there.
What we can do client side in job manager:
- given that this is a server-side "500" error, we could retry a couple of times (with some wait time in between)
- but there is no guarantee that a retry will eventually succeed, so I guess we should ultimately mark the job as failed and continue with the other jobs (although these might all fail as well)
I had another similar 500 error, OidcException:
Traceback (most recent call last):
File "/data/users/Private/joris.c/lcfm-production/notebooks/sentinel1-jm.py", line 88, in <module>
job_manager.run_jobs(
File "/home/joris.c/mambaforge/envs/lcfm-production/lib/python3.11/site-packages/openeo/extra/job_management.py", line 365, in run_jobs
self._launch_job(start_job, df, i, backend_name)
File "/home/joris.c/mambaforge/envs/lcfm-production/lib/python3.11/site-packages/openeo/extra/job_management.py", line 414, in _launch_job
status = job.status()
^^^^^^^^^^^^
File "/home/joris.c/mambaforge/envs/lcfm-production/lib/python3.11/site-packages/openeo/rest/job.py", line 87, in status
return self.describe().get("status", "N/A")
^^^^^^^^^^^^^^^
File "/home/joris.c/mambaforge/envs/lcfm-production/lib/python3.11/site-packages/openeo/rest/job.py", line 77, in describe
return self.connection.get(f"/jobs/{self.job_id}", expected_status=200).json()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/joris.c/mambaforge/envs/lcfm-production/lib/python3.11/site-packages/openeo/rest/connection.py", line 239, in get
return self.request("get", path=path, stream=stream, auth=auth, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/joris.c/mambaforge/envs/lcfm-production/lib/python3.11/site-packages/openeo/rest/connection.py", line 816, in request
return _request()
^^^^^^^^^^
File "/home/joris.c/mambaforge/envs/lcfm-production/lib/python3.11/site-packages/openeo/rest/connection.py", line 809, in _request
return super(Connection, self).request(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/joris.c/mambaforge/envs/lcfm-production/lib/python3.11/site-packages/openeo/rest/connection.py", line 187, in request
self._raise_api_error(resp)
File "/home/joris.c/mambaforge/envs/lcfm-production/lib/python3.11/site-packages/openeo/rest/connection.py", line 207, in _raise_api_error
raise OpenEoApiError(
openeo.rest.OpenEoApiError: [500] Internal: Server error: OidcException('Failed to retrieve access token at \'https://sso.terrascope.be/auth/realms/terrascope/protocol/openid-connect/token\': 500 \'Internal Server Error\' \'{"error":"unknown_error"}\'') (ref: r-240925ae6e434800b096226d2de87769)
openeo.rest.OpenEoApiError: [500] Internal: Server error: OidcException('Failed to retrieve access token at 'https://sso.terrascope.be/auth/realms/terrascope/protocol/openid-connect/token': 500 'Internal Server Error' '{"error":"unknown_error"}'') (ref: r-240925ae6e434800b096226d2de87769)
With v0.32.0, I just got this error:
2024-10-24 07:19:34.070 | INFO | sentinel1.pipeline:start_job:120 - Starting Job: j-2410247e2d7f46239f21d65252bebc31
{'executor_memory': '2G', 'executor_memoryOverhead': '1G', 'driver-memory': '3G', 'driver-memoryOverhead': '1G', 'python-memory': '16m', 'max-executors': 10, 'executor-memory': '1G', 'executor-memoryOverhead': '1G'}
urllib3.exceptions.ResponseError: too many 503 error responses
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/home/joris.c/mambaforge/envs/lcfm-production/lib/python3.11/site-packages/requests/adapters.py", line 667, in send
resp = conn.urlopen(
^^^^^^^^^^^^^
File "/home/joris.c/mambaforge/envs/lcfm-production/lib/python3.11/site-packages/urllib3/connectionpool.py", line 944, in urlopen
return self.urlopen(
^^^^^^^^^^^^^
File "/home/joris.c/mambaforge/envs/lcfm-production/lib/python3.11/site-packages/urllib3/connectionpool.py", line 944, in urlopen
return self.urlopen(
^^^^^^^^^^^^^
File "/home/joris.c/mambaforge/envs/lcfm-production/lib/python3.11/site-packages/urllib3/connectionpool.py", line 944, in urlopen
return self.urlopen(
^^^^^^^^^^^^^
[Previous line repeated 2 more times]
File "/home/joris.c/mambaforge/envs/lcfm-production/lib/python3.11/site-packages/urllib3/connectionpool.py", line 934, in urlopen
retries = retries.increment(method, url, response=response, _pool=self)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/joris.c/mambaforge/envs/lcfm-production/lib/python3.11/site-packages/urllib3/util/retry.py", line 519, in increment
raise MaxRetryError(_pool, url, reason) from reason # type: ignore[arg-type]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='openeo.dataspace.copernicus.eu', port=443): Max retries exceeded with url: /openeo/1.2/jobs/j-241024b74812415ca0e89a771dc4ca22 (Caused by ResponseError('too many 503 error responses'))