dask-cloudprovider
dask-cloudprovider copied to clipboard
AccessDeniedException from SageMaker
When I try to launch a Fargate cluster from a SageMaker notebook, it fails with an AccessDeniedException as shown below. I don't know if there's a way round this.
AccessDeniedException Traceback (most recent call last)
<ipython-input-9-1f7539e53e4e> in <module>()
3 # })
4
----> 5 cluster = FargateCluster()
6
7 from distributed import Client
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/dask_cloudprovider/providers/aws/ecs.py in __init__(self, **kwargs)
1051
1052 def __init__(self, **kwargs):
-> 1053 super().__init__(fargate_scheduler=True, fargate_workers=True, **kwargs)
1054
1055
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/dask_cloudprovider/providers/aws/ecs.py in __init__(self, fargate_scheduler, fargate_workers, image, scheduler_cpu, scheduler_mem, scheduler_timeout, worker_cpu, worker_mem, worker_gpu, n_workers, cluster_arn, cluster_name_template, execution_role_arn, task_role_arn, task_role_policies, cloudwatch_logs_group, cloudwatch_logs_stream_prefix, cloudwatch_logs_default_retention, vpc, subnets, security_groups, environment, tags, skip_cleanup, **kwargs)
553 self._skip_cleanup = skip_cleanup
554 self._lock = asyncio.Lock()
--> 555 super().__init__(**kwargs)
556
557 async def _start(self,):
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/distributed/deploy/spec.py in __init__(self, workers, scheduler, worker, asynchronous, loop, security, silence_logs, name)
240 if not self.asynchronous:
241 self._loop_runner.start()
--> 242 self.sync(self._start)
243 self.sync(self._correct_state)
244
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/distributed/deploy/cluster.py in sync(self, func, asynchronous, callback_timeout, *args, **kwargs)
160 return future
161 else:
--> 162 return sync(self.loop, func, *args, **kwargs)
163
164 async def _logs(self, scheduler=True, workers=True):
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/distributed/utils.py in sync(loop, func, callback_timeout, *args, **kwargs)
332 if error[0]:
333 typ, exc, tb = error[0]
--> 334 raise exc.with_traceback(tb)
335 else:
336 return result[0]
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/distributed/utils.py in f()
316 if callback_timeout is not None:
317 future = gen.with_timeout(timedelta(seconds=callback_timeout), future)
--> 318 result[0] = yield future
319 except Exception as exc:
320 error[0] = sys.exc_info()
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/tornado/gen.py in run(self)
1097
1098 try:
-> 1099 value = future.result()
1100 except Exception:
1101 self.had_exception = True
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/dask_cloudprovider/providers/aws/ecs.py in _start(self)
569 self._skip_cleanup = self.config.get("skip_cleanup")
570 if not self._skip_cleanup:
--> 571 await _cleanup_stale_resources()
572
573 self._clients = await self._get_clients()
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/dask_cloudprovider/providers/aws/ecs.py in _cleanup_stale_resources()
1073 active_clusters = []
1074 clusters_to_delete = []
-> 1075 async for page in ecs.get_paginator("list_clusters").paginate():
1076 clusters = (
1077 await ecs.describe_clusters(
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/async_generator/_impl.py in step()
364 try:
365 self.ag_running = True
--> 366 return await ANextIter(self._it, start_fn, *args)
367 except StopAsyncIteration:
368 self._pypy_issue2786_workaround.discard(self._coroutine)
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/async_generator/_impl.py in __next__(self)
197 return self._invoke(first_fn, *first_args)
198 else:
--> 199 return self._invoke(self._it.__next__)
200
201 def send(self, value):
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/async_generator/_impl.py in _invoke(self, fn, *args)
207 def _invoke(self, fn, *args):
208 try:
--> 209 result = fn(*args)
210 except StopIteration as e:
211 # The underlying generator returned, so we should signal the end
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/aiobotocore/paginate.py in __anext__(self)
78
79 while True:
---> 80 response = await self._make_request(current_kwargs)
81 parsed = self._extract_parsed_response(response)
82 if first_request:
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/aiobotocore/client.py in _make_api_call(self, operation_name, api_params)
103 error_code = parsed_response.get("Error", {}).get("Code")
104 error_class = self.exceptions.from_code(error_code)
--> 105 raise error_class(parsed_response, operation_name)
106 else:
107 return parsed_response
AccessDeniedException: An error occurred (AccessDeniedException) when calling the ListClusters operation: User: arn:aws:sts::536099501702:assumed-role/AmazonSageMaker-ExecutionRole-20171130T095318/SageMaker is not authorized to perform: ecs:ListClusters on resource: *
Thanks for raising this @RPrudden!
It looks like your SageMaker IAM role doesn't have the ECS/Fargate permissions this needs (the traceback shows `ecs:ListClusters` being denied). You'll either need to add those permissions to the role or set up different credentials for this. See the docs here for setting up your own credentials, or the AWS docs for adding permissions to your SageMaker role.
This error message is not very pleasant. We should definitely be catching this and providing a much clearer, more understandable error. Sorry about that!