Ray processes failed to startup
TimeoutError Traceback (most recent call last)
File ~/miniconda3/envs/regulon/lib/python3.10/site-packages/ray/_private/node.py:292, in Node.__init__(self, ray_params, head, shutdown_at_exit, spawn_reaper, connect_only, default_worker)
291 try:
--> 292 ray._private.services.wait_for_node(
293 self.gcs_address,
294 self._plasma_store_socket_name,
295 )
296 except TimeoutError as te:
File ~/miniconda3/envs/regulon/lib/python3.10/site-packages/ray/_private/services.py:460, in wait_for_node(gcs_address, node_plasma_store_socket_name, timeout)
459 time.sleep(0.1)
--> 460 raise TimeoutError(
461 f"Timed out after {timeout} seconds while waiting for node to startup. "
462 f"Did not find socket name {node_plasma_store_socket_name} in the list "
463 "of object store socket names."
464 )
TimeoutError: Timed out after 30 seconds while waiting for node to startup. Did not find socket name ./Downloads/scratch/ray_spill/session_2023-07-11_16-06-36_773603_40437/sockets/plasma_store in the list of object store socket names.
The above exception was the direct cause of the following exception:
Exception Traceback (most recent call last)
Cell In[30], line 4
2 cistopic_obj = pickle.load(open(os.path.join(work_dir, 'scATAC/cistopic_obj.pkl'), 'rb'))
3 from pycisTopic.cistopic_class import *
----> 4 models=run_cgs_models(cistopic_obj,
5 n_topics=[2,4,10,16,32,48],
6 n_cpu=1,
7 n_iter=500,
8 random_state=555,
9 alpha=50,
10 alpha_by_topic=True,
11 eta=0.1,
12 eta_by_topic=False,
13 save_path=None,
14 _temp_dir = os.path.join(tmp_dir + 'ray_spill'))
File ~/miniconda3/envs/regulon/lib/python3.10/site-packages/pycisTopic-1.0.3.dev17+g0b5f4d1.d20230711-py3.10.egg/pycisTopic/lda_models.py:154, in run_cgs_models(cistopic_obj, n_topics, n_cpu, n_iter, random_state, alpha, alpha_by_topic, eta, eta_by_topic, top_topics_coh, save_path, **kwargs)
152 region_names = cistopic_obj.region_names
153 cell_names = cistopic_obj.cell_names
--> 154 ray.init(num_cpus=n_cpu, **kwargs)
155 model_list = ray.get(
156 [
157 run_cgs_model.remote(
(...)
172 ]
173 )
174 ray.shutdown()
File ~/miniconda3/envs/regulon/lib/python3.10/site-packages/ray/_private/client_mode_hook.py:105, in client_mode_hook.<locals>.wrapper(*args, **kwargs)
103 if func.__name__ != "init" or is_client_mode_enabled_by_default:
104 return getattr(ray, func.__name__)(*args, **kwargs)
--> 105 return func(*args, **kwargs)
File ~/miniconda3/envs/regulon/lib/python3.10/site-packages/ray/_private/worker.py:1523, in init(address, num_cpus, num_gpus, resources, object_store_memory, local_mode, ignore_reinit_error, include_dashboard, dashboard_host, dashboard_port, job_config, configure_logging, logging_level, logging_format, log_to_driver, namespace, runtime_env, storage, **kwargs)
1481 ray_params = ray._private.parameter.RayParams(
1482 node_ip_address=node_ip_address,
1483 raylet_ip_address=raylet_ip_address,
(...)
1517 node_name=_node_name,
1518 )
1519 # Start the Ray processes. We set shutdown_at_exit=False because we
1520 # shutdown the node in the ray.shutdown call that happens in the atexit
1521 # handler. We still spawn a reaper process in case the atexit handler
1522 # isn't called.
-> 1523 _global_node = ray._private.node.Node(
1524 head=True,
1525 shutdown_at_exit=False,
1526 spawn_reaper=True,
1527 ray_params=ray_params,
1528 )
1529 else:
1530 # In this case, we are connecting to an existing cluster.
1531 if num_cpus is not None or num_gpus is not None:
File ~/miniconda3/envs/regulon/lib/python3.10/site-packages/ray/_private/node.py:297, in Node.__init__(self, ray_params, head, shutdown_at_exit, spawn_reaper, connect_only, default_worker)
292 ray._private.services.wait_for_node(
293 self.gcs_address,
294 self._plasma_store_socket_name,
295 )
296 except TimeoutError as te:
--> 297 raise Exception(
298 "The current node timed out during startup. This "
299 "could happen because some of the Ray processes "
300 "failed to startup."
301 ) from te
302 node_info = ray._private.services.get_node_to_connect_for_driver(
303 self.gcs_address,
304 self._raylet_ip_address,
305 )
306 if self._ray_params.node_manager_port == 0:
Exception: The current node timed out during startup. This could happen because some of the Ray processes failed to startup.
Hi @crsky1023
What is your ray version?
import ray
ray.__version__
Does the following code produce the same error?
import ray
ray.init()
ray.shutdown()
If not, does this code produce the same error?
import ray
ray.init(num_cpus = 1)
ray.shutdown()
If not, does this code produce the same error?
import os  # tmp_dir should be the same directory you pass to run_cgs_models
import ray
ray.init(num_cpus = 1, _temp_dir = os.path.join(tmp_dir + 'ray_spill'))
ray.shutdown()
Best,
Seppe
This could be a problem with the ray version, as I think ray>=2.3 isn't compatible with pydantic>=2. In general, with pydantic 2.0.3, when I import ray I get:
import ray
ray.__version__
'2.5.1'
ray.init(num_cpus = 16)
gives me this error: AssertionError: pydantic.dataclasses.dataclass only supports init=False
If I downgrade ray:
import ray
ray.__version__
'2.1.0'
ray.init(num_cpus = 16)
gives me no error, only 'Started a local Ray instance.'
But I still get AttributeError: module 'pydantic.fields' has no attribute 'ModelField' when creating the models with run_cgs_models.
However, trying to downgrade pydantic instead of ray generates too many problems.
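A quick way to check whether your environment matches the failing combination is to print the installed versions before calling ray.init. This is only a sketch based on the observations above; the version bounds are not official pins:

from importlib.metadata import version

# Installed versions of ray and pydantic in the current environment.
ray_ver = version("ray")
pydantic_ver = version("pydantic")
print(f"ray {ray_ver}, pydantic {pydantic_ver}")

ray_major_minor = tuple(int(x) for x in ray_ver.split(".")[:2])
pydantic_major = int(pydantic_ver.split(".")[0])

# ray >= 2.3 together with pydantic >= 2 produced the errors above,
# while ray 2.1.0 with pydantic < 2 started a local Ray instance fine.
if ray_major_minor >= (2, 3) and pydantic_major >= 2:
    print("This matches the ray/pydantic combination that failed above.")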
Just set the system encoding to UTF-8 and the problem is solved.
@xaiopi could you please share the command you executed? I tried a couple of options and am still getting the issue.
Hi,
I got the same error when running run_cgs_models.
>>> import ray
>>> ray.__version__
'2.9.3'
>>> ray.init()
2024-03-28 20:52:39,683 ERROR services.py:1329 -- Failed to start the dashboard , return code -11
2024-03-28 20:52:39,684 ERROR services.py:1354 -- Error should be written to 'dashboard.log' or 'dashboard.err'. We are printing the last 20 lines for you. See 'https://docs.ray.io/en/master/ray-observability/ray-logging.html#logging-directory-structure' to find where the log file is.
2024-03-28 20:52:39,684 ERROR services.py:1398 --
The last 20 lines of /tmp/ray/session_2024-03-28_20-52-37_113955_328097/logs/dashboard.log (it contains the error message from the dashboard):
2024-03-28 20:52:39,165 INFO head.py:254 -- Starting dashboard metrics server on port 44227
2024-03-28 20:52:39,814 INFO worker.py:1724 -- Started a local Ray instance.
[2024-03-28 20:52:52,435 E 328097 328097] core_worker.cc:215: Failed to register worker 01000000ffffffffffffffffffffffffffffffffffffffffffffffff to Raylet. IOError: [RayletClient] Unable to register worker with raylet. No such file or directory
Looking forward to any solutions! Thanks, tingting
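For the dashboard failure above, one thing that may be worth trying (a sketch, not a confirmed fix for the raylet registration error) is starting Ray without the dashboard, since include_dashboard is a regular ray.init argument:

import ray

# Skip the dashboard process that exits with return code -11; this rules the
# dashboard out as the cause, but it will not help if the raylet itself
# cannot start.
ray.init(num_cpus=1, include_dashboard=False)
ray.shutdown()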