dask-cloudprovider icon indicating copy to clipboard operation
dask-cloudprovider copied to clipboard

AccessDeniedException from SageMaker

Open RPrudden opened this issue 6 years ago • 1 comment

When I try to launch a Fargate cluster from a SageMaker notebook, it fails with an AccessDeniedException as shown below. I don't know if there's a way round this.

AccessDeniedException                     Traceback (most recent call last)
<ipython-input-9-1f7539e53e4e> in <module>()
      3 # })
      4 
----> 5 cluster = FargateCluster()
      6 
      7 from distributed import Client

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/dask_cloudprovider/providers/aws/ecs.py in __init__(self, **kwargs)
   1051 
   1052     def __init__(self, **kwargs):
-> 1053         super().__init__(fargate_scheduler=True, fargate_workers=True, **kwargs)
   1054 
   1055 

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/dask_cloudprovider/providers/aws/ecs.py in __init__(self, fargate_scheduler, fargate_workers, image, scheduler_cpu, scheduler_mem, scheduler_timeout, worker_cpu, worker_mem, worker_gpu, n_workers, cluster_arn, cluster_name_template, execution_role_arn, task_role_arn, task_role_policies, cloudwatch_logs_group, cloudwatch_logs_stream_prefix, cloudwatch_logs_default_retention, vpc, subnets, security_groups, environment, tags, skip_cleanup, **kwargs)
    553         self._skip_cleanup = skip_cleanup
    554         self._lock = asyncio.Lock()
--> 555         super().__init__(**kwargs)
    556 
    557     async def _start(self,):

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/distributed/deploy/spec.py in __init__(self, workers, scheduler, worker, asynchronous, loop, security, silence_logs, name)
    240         if not self.asynchronous:
    241             self._loop_runner.start()
--> 242             self.sync(self._start)
    243             self.sync(self._correct_state)
    244 

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/distributed/deploy/cluster.py in sync(self, func, asynchronous, callback_timeout, *args, **kwargs)
    160             return future
    161         else:
--> 162             return sync(self.loop, func, *args, **kwargs)
    163 
    164     async def _logs(self, scheduler=True, workers=True):

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/distributed/utils.py in sync(loop, func, callback_timeout, *args, **kwargs)
    332     if error[0]:
    333         typ, exc, tb = error[0]
--> 334         raise exc.with_traceback(tb)
    335     else:
    336         return result[0]

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/distributed/utils.py in f()
    316             if callback_timeout is not None:
    317                 future = gen.with_timeout(timedelta(seconds=callback_timeout), future)
--> 318             result[0] = yield future
    319         except Exception as exc:
    320             error[0] = sys.exc_info()

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/tornado/gen.py in run(self)
   1097 
   1098                     try:
-> 1099                         value = future.result()
   1100                     except Exception:
   1101                         self.had_exception = True

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/dask_cloudprovider/providers/aws/ecs.py in _start(self)
    569             self._skip_cleanup = self.config.get("skip_cleanup")
    570         if not self._skip_cleanup:
--> 571             await _cleanup_stale_resources()
    572 
    573         self._clients = await self._get_clients()

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/dask_cloudprovider/providers/aws/ecs.py in _cleanup_stale_resources()
   1073         active_clusters = []
   1074         clusters_to_delete = []
-> 1075         async for page in ecs.get_paginator("list_clusters").paginate():
   1076             clusters = (
   1077                 await ecs.describe_clusters(

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/async_generator/_impl.py in step()
    364             try:
    365                 self.ag_running = True
--> 366                 return await ANextIter(self._it, start_fn, *args)
    367             except StopAsyncIteration:
    368                 self._pypy_issue2786_workaround.discard(self._coroutine)

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/async_generator/_impl.py in __next__(self)
    197             return self._invoke(first_fn, *first_args)
    198         else:
--> 199             return self._invoke(self._it.__next__)
    200 
    201     def send(self, value):

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/async_generator/_impl.py in _invoke(self, fn, *args)
    207     def _invoke(self, fn, *args):
    208         try:
--> 209             result = fn(*args)
    210         except StopIteration as e:
    211             # The underlying generator returned, so we should signal the end

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/aiobotocore/paginate.py in __anext__(self)
     78 
     79         while True:
---> 80             response = await self._make_request(current_kwargs)
     81             parsed = self._extract_parsed_response(response)
     82             if first_request:

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/aiobotocore/client.py in _make_api_call(self, operation_name, api_params)
    103             error_code = parsed_response.get("Error", {}).get("Code")
    104             error_class = self.exceptions.from_code(error_code)
--> 105             raise error_class(parsed_response, operation_name)
    106         else:
    107             return parsed_response

AccessDeniedException: An error occurred (AccessDeniedException) when calling the ListClusters operation: User: arn:aws:sts::536099501702:assumed-role/AmazonSageMaker-ExecutionRole-20171130T095318/SageMaker is not authorized to perform: ecs:ListClusters on resource: *

RPrudden avatar Nov 12 '19 10:11 RPrudden

Thanks for raising this @RPrudden!

Looks like your IAM role in SageMaker doesn't have Fargate permissions. You'll either need to add those to the role or set up some different credentials for this. See the docs here for setting up your own credentials, or the AWS docs for adding permissions to your SageMaker role.

This error message is not very pleasant. We should definitely be catching this and providing a much nicer and more understandable error. Sorry about that!

jacobtomlinson avatar Nov 12 '19 11:11 jacobtomlinson