(GCP) `GCPCluster` throws OAuth errors when attempting to create a new cluster.
What happened: RefreshError: ('invalid_scope: Invalid OAuth scope or ID token audience provided.', '{"error":"invalid_scope","error_description":"Invalid OAuth scope or ID token audience provided."}')
What you expected to happen: Cluster creation to succeed
Minimal Complete Verifiable Example:
- Successfully able to run `gcloud auth login`
- Subsequently able to run `gsutil` on a variety of authenticated tasks without a problem (for validation)
- cat ~/.config/dask/cloudprovider.yaml
cloudprovider:
gcp:
projectid: "<correct project>"
from dask.distributed import Client, wait, get_worker
from dask_cloudprovider.gcp import GCPCluster
cluster = GCPCluster(projectid="<correct project>",
machine_type="n1-standard-4",
zone="us-central1-c",
ngpus=1,
gpu_type="nvidia-tesla-t4",
n_workers=1)
client = Client(cluster)
---------------------------------------------------------------------------
RefreshError Traceback (most recent call last)
<ipython-input-6-032f333965aa> in <module>
----> 1 cluster = GCPCluster(projectid="<correct project>",
2 machine_type="n1-standard-4",
3 zone="us-central1-c",
4 ngpus=1,
5 gpu_type="nvidia-tesla-t4",
~/anaconda3/envs/rapids-core-0.18/lib/python3.8/site-packages/dask_cloudprovider/gcp/instances.py in __init__(self, projectid, zone, network, machine_type, on_host_maintenance, source_image, docker_image, ngpus, gpu_type, filesystem_size, disk_type, auto_shutdown, bootstrap, preemptible, debug, **kwargs)
601 self.worker_options = {**self.options}
602
--> 603 super().__init__(debug=debug, **kwargs)
604
605
~/anaconda3/envs/rapids-core-0.18/lib/python3.8/site-packages/dask_cloudprovider/generic/vmcluster.py in __init__(self, n_workers, worker_class, worker_options, scheduler_options, docker_image, docker_args, env_vars, security, protocol, debug, **kwargs)
287 self.uuid = str(uuid.uuid4())[:8]
288
--> 289 super().__init__(**kwargs, security=self.security)
290
291 async def call_async(self, f, *args, **kwargs):
~/anaconda3/envs/rapids-core-0.18/lib/python3.8/site-packages/distributed/deploy/spec.py in __init__(self, workers, scheduler, worker, asynchronous, loop, security, silence_logs, name, shutdown_on_close)
279 if not self.asynchronous:
280 self._loop_runner.start()
--> 281 self.sync(self._start)
282 self.sync(self._correct_state)
283
~/anaconda3/envs/rapids-core-0.18/lib/python3.8/site-packages/distributed/deploy/cluster.py in sync(self, func, asynchronous, callback_timeout, *args, **kwargs)
187 return future
188 else:
--> 189 return sync(self.loop, func, *args, **kwargs)
190
191 def _log(self, log):
~/anaconda3/envs/rapids-core-0.18/lib/python3.8/site-packages/distributed/utils.py in sync(loop, func, callback_timeout, *args, **kwargs)
338 if error[0]:
339 typ, exc, tb = error[0]
--> 340 raise exc.with_traceback(tb)
341 else:
342 return result[0]
~/anaconda3/envs/rapids-core-0.18/lib/python3.8/site-packages/distributed/utils.py in f()
322 if callback_timeout is not None:
323 future = asyncio.wait_for(future, callback_timeout)
--> 324 result[0] = yield future
325 except Exception as exc:
326 error[0] = sys.exc_info()
~/anaconda3/envs/rapids-core-0.18/lib/python3.8/site-packages/tornado/gen.py in run(self)
760
761 try:
--> 762 value = future.result()
763 except Exception:
764 exc_info = sys.exc_info()
~/anaconda3/envs/rapids-core-0.18/lib/python3.8/site-packages/dask_cloudprovider/generic/vmcluster.py in _start(self)
327 "Hang tight! ",
328 ):
--> 329 await super()._start()
330
331 def render_process_cloud_init(self, process):
~/anaconda3/envs/rapids-core-0.18/lib/python3.8/site-packages/distributed/deploy/spec.py in _start(self)
307
308 self.status = Status.starting
--> 309 self.scheduler = await self.scheduler
310 self.scheduler_comm = rpc(
311 getattr(self.scheduler, "external_address", None) or self.scheduler.address,
~/anaconda3/envs/rapids-core-0.18/lib/python3.8/site-packages/distributed/deploy/spec.py in _()
69 async with self.lock:
70 if self.status == Status.created:
---> 71 await self.start()
72 assert self.status == Status.running
73 return self
~/anaconda3/envs/rapids-core-0.18/lib/python3.8/site-packages/dask_cloudprovider/gcp/instances.py in start(self)
281
282 async def start(self):
--> 283 await self.start_scheduler()
284 self.status = Status.running
285
~/anaconda3/envs/rapids-core-0.18/lib/python3.8/site-packages/dask_cloudprovider/gcp/instances.py in start_scheduler(self)
296 )
297 self.cluster._log("Creating scheduler instance")
--> 298 self.internal_ip, self.external_ip = await self.create_vm()
299
300 if self.config.get("public_ingress", True) and not is_inside_gce():
~/anaconda3/envs/rapids-core-0.18/lib/python3.8/site-packages/dask_cloudprovider/gcp/instances.py in create_vm(self)
196
197 try:
--> 198 inst = await self.cluster.call_async(
199 self.cluster.compute.instances()
200 .insert(project=self.projectid, zone=self.zone, body=self.gcp_config)
~/anaconda3/envs/rapids-core-0.18/lib/python3.8/site-packages/dask_cloudprovider/generic/vmcluster.py in call_async(self, f, *args, **kwargs)
302 return_when=asyncio.ALL_COMPLETED,
303 )
--> 304 return done.result()
305
306 async def _start(
~/anaconda3/envs/rapids-core-0.18/lib/python3.8/concurrent/futures/thread.py in run(self)
55
56 try:
---> 57 result = self.fn(*self.args, **self.kwargs)
58 except BaseException as exc:
59 self.future.set_exception(exc)
~/anaconda3/envs/rapids-core-0.18/lib/python3.8/site-packages/dask_cloudprovider/generic/vmcluster.py in <lambda>()
299 """
300 [done], _ = await asyncio.wait(
--> 301 fs={self.loop.run_in_executor(None, lambda: f(*args, **kwargs))},
302 return_when=asyncio.ALL_COMPLETED,
303 )
~/anaconda3/envs/rapids-core-0.18/lib/python3.8/site-packages/googleapiclient/_helpers.py in positional_wrapper(*args, **kwargs)
132 elif positional_parameters_enforcement == POSITIONAL_WARNING:
133 logger.warning(message)
--> 134 return wrapped(*args, **kwargs)
135
136 return positional_wrapper
~/anaconda3/envs/rapids-core-0.18/lib/python3.8/site-packages/googleapiclient/http.py in execute(self, http, num_retries)
918
919 # Handle retries for server-side errors.
--> 920 resp, content = _retry_request(
921 http,
922 num_retries,
~/anaconda3/envs/rapids-core-0.18/lib/python3.8/site-packages/googleapiclient/http.py in _retry_request(http, num_retries, req_type, sleep, rand, uri, method, *args, **kwargs)
189 try:
190 exception = None
--> 191 resp, content = http.request(uri, method, *args, **kwargs)
192 # Retry on SSL errors and socket timeout errors.
193 except _ssl_SSLError as ssl_error:
~/anaconda3/envs/rapids-core-0.18/lib/python3.8/site-packages/google_auth_httplib2.py in request(self, uri, method, body, headers, redirections, connection_type, **kwargs)
207 request_headers = headers.copy() if headers is not None else {}
208
--> 209 self.credentials.before_request(self._request, method, uri, request_headers)
210
211 # Check if the body is a file-like stream, and if so, save the body
~/anaconda3/envs/rapids-core-0.18/lib/python3.8/site-packages/google/auth/credentials.py in before_request(self, request, method, url, headers)
131 # the http request.)
132 if not self.valid:
--> 133 self.refresh(request)
134 self.apply(headers)
135
~/anaconda3/envs/rapids-core-0.18/lib/python3.8/site-packages/google/oauth2/service_account.py in refresh(self, request)
359 def refresh(self, request):
360 assertion = self._make_authorization_grant_assertion()
--> 361 access_token, expiry, _ = _client.jwt_grant(request, self._token_uri, assertion)
362 self.token = access_token
363 self.expiry = expiry
~/anaconda3/envs/rapids-core-0.18/lib/python3.8/site-packages/google/oauth2/_client.py in jwt_grant(request, token_uri, assertion)
151 body = {"assertion": assertion, "grant_type": _JWT_GRANT_TYPE}
152
--> 153 response_data = _token_endpoint_request(request, token_uri, body)
154
155 try:
~/anaconda3/envs/rapids-core-0.18/lib/python3.8/site-packages/google/oauth2/_client.py in _token_endpoint_request(request, token_uri, body)
122 retry += 1
123 continue
--> 124 _handle_error_response(response_body)
125
126 return response_data
~/anaconda3/envs/rapids-core-0.18/lib/python3.8/site-packages/google/oauth2/_client.py in _handle_error_response(response_body)
58 error_details = response_body
59
---> 60 raise exceptions.RefreshError(error_details, response_body)
61
62
RefreshError: ('invalid_scope: Invalid OAuth scope or ID token audience provided.', '{"error":"invalid_scope","error_description":"Invalid OAuth scope or ID token audience provided."}')
Environment: Conda environment
- Dask version: 2021.02.0
- Python version: 3.8.8 / 3.9
- Operating System: Linux Mint 20
- Install method (conda, pip, source): Pip (same results from source)
Hmm, I'm not sure what the issue here is. In addition to running gsutil commands, can you also try a few `gcloud compute` commands to create machines? Perhaps your account does not have the correct permissions to create compute instances?
@quasiben Just checked, something like this works fine with gcloud compute
gcloud compute instances create drobisontest --project "<correct-project>" --machine-type "a2-highgpu-1g" --zone "us-central1-c" --image-family tf2-ent-2-3-cu110 --image-project deeplearning-platform-release --boot-disk-size 200GB --metadata "install-nvidia-driver=True,proxy-mode=project_editors" --scopes https://www.googleapis.com/auth/cloud-platform --maintenance-policy TERMINATE --restart-on-failure
NAME ZONE MACHINE_TYPE PREEMPTIBLE INTERNAL_IP EXTERNAL_IP STATUS
drobisontest us-central1-c a2-highgpu-1g ..... ..... RUNNING
I ran this recently on GCP. I was unable to reproduce the RefreshError: ('invalid_scope: Invalid OAuth scope or ID token audience provided.', '{"error":"invalid_scope","error_description":"Invalid OAuth scope or ID token audience provided."}') error. Something may have fixed it.
However, I did run into issues similar to Issue #292. Looking at cloud-init-output.log, it appears that the scheduler VM shuts down when trying to start the daskdev:dask:latest docker image with the following error:
docker: Error response from daemon: OCI runtime create failed: container_linux.go:380: starting container process caused: process_linux.go:545: container init caused: Running hook #0:: error running hook: exit status 1, stdout: , stderr: nvidia-container-cli: initialization error: nvml error: driver not loaded: unknown.
Further, I tried using a custom existing NGC image like the following:
from dask.distributed import Client, wait, get_worker
from dask_cloudprovider.gcp import GCPCluster
cluster = GCPCluster(projectid="nv-ai-infra",
machine_type="n1-standard-4",
zone="us-central1-a",
ngpus=1,
gpu_type="nvidia-tesla-v100",
n_workers=1,
source_image="projects/nvidia-ngc-public/global/images/nvidia-gpu-cloud-image-pytorch-20210609",
debug=True,
bootstrap=False,
silence_logs=False)
This fails with the same error. I would imagine passing a custom image which has NVIDIA drivers preinstalled would probably work. Is there such an image?
Any of the RAPIDS images should be ok.
Just wanted to mention I ran into the same issue as @drobison00 . No need for me to paste the output. It's literally the same exact error.
RefreshError: ('invalid_scope: Invalid OAuth scope or ID token audience provided.', '{"error":"invalid_scope","error_description":"Invalid OAuth scope or ID token audience provided."}')
This is with the example from the docs:
from dask_cloudprovider.gcp import GCPCluster
cluster = GCPCluster(projectid=[PROJECT], machine_type="n1-standard-4", zone="us-east1-b")
client = Client(cluster)
The only way I got it to work was by:
- creating a service account,
- Providing the `Service Account User` IAM role to myself and the account (not sure if both were needed)
- Using the service account key when calling the above.
I don't know if this issue is unique to dask though. I generally have OAuth token issues with several python libraries that try to use a subset of GCP services, particularly via the REST API (e.g. Google Sheets).