
[aDAG] Support all reduce collective in aDAG #47621

Merged: 96 commits, merged Oct 21, 2024
Changes shown below are from 19 of the 96 commits.

Commits
dd8afbe
(WIP) chore: Add input index in reader
dengwxn Aug 8, 2024
d957669
chore: Clean up
dengwxn Aug 8, 2024
0df4191
(WIP) chore: Update channel allocation for TaskReturnNode
dengwxn Aug 8, 2024
952daa0
(WIP) chore: Add tests
dengwxn Aug 9, 2024
3479587
chore: Return TaskReturnNode only when num_returns > 1
dengwxn Aug 9, 2024
3312712
chore: Add test for two returns three actors
dengwxn Aug 9, 2024
97faeb7
chore: Clean up
dengwxn Aug 9, 2024
197e65e
chore: Adjust input_idxs
dengwxn Aug 9, 2024
a48fecd
(WIP) chore: Allocate an output channel for each output value
dengwxn Aug 16, 2024
16e0323
(WIP) chore: Remove legacy dependencies
dengwxn Aug 16, 2024
d12f28b
(WIP) chore: Remove legacy dependencies
dengwxn Aug 16, 2024
00a6de7
chore: Update async writer
dengwxn Aug 16, 2024
99a42f6
chore: Update comments
dengwxn Aug 16, 2024
40c6bb2
chore: Remove legacy
dengwxn Aug 16, 2024
6ef6e60
chore: Update tests
dengwxn Aug 22, 2024
6772b3a
chore: Revert read_by_multi_output_node
dengwxn Aug 22, 2024
b35e632
chore: Add comment
dengwxn Aug 22, 2024
f2004a3
chore: Rename output task to upstream/downstream task
dengwxn Aug 22, 2024
a65b053
chore: Update tests
dengwxn Aug 22, 2024
db1d796
chore: Update private fields
dengwxn Aug 22, 2024
ce4694e
Merge branch 'master' into master
dengwxn Aug 22, 2024
d2d50f3
feat: Merge ClassMethodOutputNode into ClassMethodNode
dengwxn Aug 23, 2024
e64a3da
Merge branch 'master' into master
dengwxn Aug 23, 2024
007f656
chore: Add tests and unify internal fields
dengwxn Aug 28, 2024
c8e6a5b
chore: Adjust comments and tests
dengwxn Aug 29, 2024
c729e68
chore: Format code
dengwxn Aug 29, 2024
39b1953
chore: Merge master
dengwxn Aug 29, 2024
39a76c5
chore: Fix check of readers for each output
dengwxn Aug 29, 2024
a84ad8b
chore: Format code
dengwxn Aug 29, 2024
ce472ae
chore: Format code
dengwxn Aug 29, 2024
d04e687
chore: Format code
dengwxn Aug 29, 2024
315a192
chore: Fix typo
dengwxn Aug 30, 2024
f050638
chore: Fix typo
dengwxn Aug 30, 2024
8fc7cd0
chore: Fix typo
dengwxn Aug 30, 2024
a88defb
chore: Fix typo
dengwxn Aug 30, 2024
7e3eb68
chore: Merge master
dengwxn Aug 30, 2024
1d08bd7
chore: Format code
dengwxn Aug 30, 2024
28bc144
chore: Fix output channels
dengwxn Aug 30, 2024
81915af
Merge branch 'master' of https://github.com/ray-project/ray
dengwxn Aug 31, 2024
dbb85dc
chore: Fix reader_handles_set
dengwxn Aug 31, 2024
9af6490
Merge branch 'master' of github.com:dengwxn/ray
dengwxn Aug 31, 2024
e463a61
Merge branch 'master' of https://github.com/ray-project/ray
dengwxn Aug 31, 2024
b8173b7
chore: Fix mock class method call in tests
dengwxn Sep 1, 2024
d76a106
Merge branch 'master' into master
dengwxn Sep 2, 2024
39a4452
Merge branch 'ray-project:master' into master
dengwxn Sep 2, 2024
dc2586d
Merge branch 'ray-project:master' into master
dengwxn Sep 5, 2024
5a83526
chore: Boot waterfront
dengwxn Sep 5, 2024
f10d357
Merge branch 'master' of github.com:dengwxn/ray
dengwxn Sep 5, 2024
ef71af5
Merge branch 'master' of https://github.com/ray-project/ray
dengwxn Sep 18, 2024
2a476de
chore: Clean up
dengwxn Sep 18, 2024
6ae63c8
refactor: Merge ccar-0905
dengwxn Sep 24, 2024
76f57c9
refactor: Merge ccar-0905
dengwxn Sep 25, 2024
8292346
Merge branch 'master' into ccar-0905
dengwxn Sep 25, 2024
6d262fa
chore: Fix type check
dengwxn Sep 25, 2024
204581e
refactor: Fix remove in edges
dengwxn Sep 25, 2024
88c7dc6
chore: Revert file
dengwxn Sep 25, 2024
b3e1742
refactor: Fix api
dengwxn Sep 26, 2024
7236505
refactor: Fix api
dengwxn Sep 27, 2024
283869b
feat: Merge upstream
dengwxn Oct 13, 2024
6e59d3d
(WIP) refactor: Inherit ClassMethodNode for CollectiveOutputNode
dengwxn Oct 13, 2024
c94ffa9
Merge branch 'master' into ccar-0905
dengwxn Oct 14, 2024
a045889
refactor: Inherit from ClassMethodNode
dengwxn Oct 14, 2024
c73efd6
test: Change size
dengwxn Oct 15, 2024
7be6954
refactor: Code review
dengwxn Oct 16, 2024
b58d27e
refactor: Unify update candidate nodes
dengwxn Oct 16, 2024
6e81470
fix: Union two sets of nccl group ids
dengwxn Oct 17, 2024
bd7ba01
merge: Upstream master
dengwxn Oct 17, 2024
c013bc7
test: Reduce op values
dengwxn Oct 17, 2024
8e570c9
refactor: Fix reduce op values
dengwxn Oct 17, 2024
406dcd0
fix: API annotations
dengwxn Oct 17, 2024
f90270b
merge: Polish tests
dengwxn Oct 17, 2024
f41904a
chore: Polish tests
dengwxn Oct 17, 2024
789b9bc
test: Check num
dengwxn Oct 17, 2024
7e05f85
test: Remove non-tensor input case
dengwxn Oct 17, 2024
a0f1381
test: Remove allocate tensor case
dengwxn Oct 17, 2024
00949bd
chore: Polish tests
AndyUB Oct 17, 2024
6c80c59
merge: Upstream
AndyUB Oct 17, 2024
6e71c93
refactor: Test separate types
dengwxn Oct 18, 2024
ccea7a3
refactor: Test original types
dengwxn Oct 18, 2024
f634bbb
refactor: Use separate types by if-else
dengwxn Oct 18, 2024
ccbf68b
refactor: Use separate types by if-else
dengwxn Oct 18, 2024
70ea96d
refactor: Convert to ray op
dengwxn Oct 18, 2024
4b77907
revert: Skip ray types
dengwxn Oct 18, 2024
c5e2c20
revert: Skip ray types
dengwxn Oct 18, 2024
5e94216
merge: Upstream
AndyUB Oct 18, 2024
09c6709
chore: Cleanup tests
AndyUB Oct 18, 2024
b8d1891
chore: Code review
dengwxn Oct 19, 2024
b3770a6
chore: Simplify tests
AndyUB Oct 19, 2024
c249ec3
merge: Upstream
AndyUB Oct 19, 2024
ff7f720
chore: Polish tests
AndyUB Oct 19, 2024
d8c85fb
Merge pull request #15 from AndyUB/test-1017
dengwxn Oct 20, 2024
0486914
refactor: Polish tests
dengwxn Oct 20, 2024
6d0db72
chore: Format
dengwxn Oct 20, 2024
2d59839
Merge branch 'master' into ccar-0905
dengwxn Oct 20, 2024
860139c
test: Mock gpus
dengwxn Oct 20, 2024
7df480f
merge: Upstream branch
dengwxn Oct 20, 2024
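
Before the file-by-file diff, a rough usage sketch of what this PR enables: an accelerated DAG in which several actors produce tensors that are combined with a NCCL allreduce. The allreduce.bind entry point and the Worker class below are illustrative assumptions; only CollectiveOutputNode, _CollectiveOperation, and ReduceOp are confirmed by the diff that follows.

# Hypothetical usage sketch; allreduce.bind and the Worker methods are
# illustrative assumptions, not confirmed by this diff.
import ray
import torch
from ray.dag import InputNode, MultiOutputNode
from ray.experimental.collective import allreduce
from ray.experimental.util.types import ReduceOp

@ray.remote(num_gpus=1)
class Worker:
    def compute(self, value: int) -> "torch.Tensor":
        # Each worker produces a tensor on its own GPU.
        return torch.ones(4, device="cuda") * value

    def finish(self, reduced: "torch.Tensor") -> "torch.Tensor":
        # Consume the allreduced tensor.
        return reduced

workers = [Worker.remote() for _ in range(2)]

with InputNode() as inp:
    tensors = [w.compute.bind(inp) for w in workers]
    # Binding an allreduce yields one CollectiveOutputNode per participating actor.
    reduced = allreduce.bind(tensors, op=ReduceOp.SUM)
    dag = MultiOutputNode([w.finish.bind(t) for w, t in zip(workers, reduced)])

compiled_dag = dag.experimental_compile()
result = ray.get(compiled_dag.execute(1))

Each element of reduced feeds back to the actor that produced the corresponding input tensor, which is why the sketch zips workers with the allreduce outputs.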
1 change: 1 addition & 0 deletions python/ray/dag/BUILD
@@ -105,6 +105,7 @@ py_test_module_list(
"tests/experimental/test_detect_deadlock_dag.py",
"tests/experimental/test_multi_node_dag.py",
"tests/experimental/test_torch_tensor_dag.py",
"tests/experimental/test_collective_dag.py",
"tests/experimental/test_execution_schedule.py",
],
tags = [
4 changes: 2 additions & 2 deletions python/ray/dag/__init__.py
@@ -16,7 +16,7 @@
PREV_CLASS_METHOD_CALL_KEY,
BIND_INDEX_KEY,
IS_CLASS_METHOD_OUTPUT_KEY,
COLLECTIVE_GROUP_KEY,
COLLECTIVE_OPERATION_KEY,
DAGNODE_TYPE_KEY,
)
from ray.dag.vis_utils import plot
@@ -35,7 +35,7 @@
"PREV_CLASS_METHOD_CALL_KEY",
"BIND_INDEX_KEY",
"IS_CLASS_METHOD_OUTPUT_KEY",
"COLLECTIVE_GROUP_KEY",
"COLLECTIVE_OPERATION_KEY",
"DAGNODE_TYPE_KEY",
"plot",
"MultiOutputNode",
40 changes: 21 additions & 19 deletions python/ray/dag/collective_node.py
@@ -8,22 +8,27 @@
DAGNode,
ClassMethodNode,
)
from ray.dag.constants import COLLECTIVE_GROUP_KEY
from ray.util.annotations import DeveloperAPI
from ray.dag.constants import COLLECTIVE_OPERATION_KEY
from ray.experimental.channel import ChannelContext
from ray.experimental.channel.torch_tensor_nccl_channel import _init_nccl_group
from ray.experimental.channel.torch_tensor_type import GPUCommunicator, TorchTensorType
from ray.experimental.util.types import _CollectiveOp, ReduceOp
from ray.util.annotations import DeveloperAPI


class _CollectiveGroup:
class _CollectiveOperation:
"""
Represent metadata for a NCCL collective operation.

Args:
input_nodes: A list of input nodes to the collective operation.
op: The collective operation to perform.
transport: The transport to use for the collective operation.

Requirements:
1. Input nodes are unique.
2. Actor handles are unique.
3. Actor handles match the custom NCCL group if specified.
"""

def __init__(
@@ -34,9 +39,9 @@ def __init__(
):
self._input_nodes: List[DAGNode] = input_nodes
if len(self._input_nodes) == 0:
raise ValueError("Expected input nodes for a collective group")
raise ValueError("Expected input nodes for a collective operation")
if len(set(self._input_nodes)) != len(self._input_nodes):
raise ValueError("Expected unique input nodes for a collective group")
raise ValueError("Expected unique input nodes for a collective operation")

self._actor_handles: List["ray.actor.ActorHandle"] = []
for input_node in self._input_nodes:
@@ -51,8 +56,9 @@ def __init__(
if self._actor_handles.count(input_node._get_actor_handle()) > 1
]
raise ValueError(
"Expected unique actor handles for a collective group, but found "
f"duplicate actor handles from input nodes: {invalid_input_nodes}"
"Expected unique actor handles for a collective operation, "
"but found duplicate actor handles from input nodes: "
f"{invalid_input_nodes}"
)

self._op = op
@@ -91,7 +97,6 @@ def init_nccl_group(self, nccl_group_id: Optional[str] = None) -> str:
"""
type_hint = self._type_hint
if type_hint.nccl_group_id is not None:
# The NCCL group has already been initialized.
return type_hint.nccl_group_id
if nccl_group_id is None:
nccl_group_id = _init_nccl_group(
@@ -110,7 +115,7 @@ def get_nccl_group(self) -> GPUCommunicator:
raise ValueError("Expected a NCCL group")
return nccl_group

def method(self, send_buf: "torch.Tensor") -> "torch.Tensor":
def execute(self, send_buf: "torch.Tensor") -> "torch.Tensor":
"""
Call the collective operation on the input tensor. An output tensor is
allocated and returned.
@@ -147,16 +152,13 @@ def __init__(
):
raise ValueError("Expected a single input node")
self._input_node = method_args[0]
# Parse the collective group.
self._collective_group: _CollectiveGroup = other_args_to_resolve.get(
COLLECTIVE_GROUP_KEY, None
# Parse the collective operation.
self._collective_op: _CollectiveOperation = other_args_to_resolve.get(
COLLECTIVE_OPERATION_KEY, None
)
if self._collective_group is None:
raise ValueError("Expected a collective group")
if self._collective_op is None:
raise ValueError("Expected a collective operation")

# The actor creation task dependency is encoded as the first argument,
# and the ordering dependency as the second, which ensures they are
# executed prior to this node.
super().__init__(
method_name,
method_args,
@@ -186,5 +188,5 @@ def _execute_impl(self, *args, **kwargs):
)

@property
def collective_group(self) -> _CollectiveGroup:
return self._collective_group
def collective_op(self) -> _CollectiveOperation:
return self._collective_op
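
The body of execute() is collapsed in the rendered diff above. As a rough mental model only (the allreduce call signature below is an assumption, not the confirmed implementation), the operation validates the send buffer, allocates a matching receive buffer, and runs the collective on the communicator returned by get_nccl_group():

# Simplified, standalone mental model of _CollectiveOperation.execute; the real
# body is collapsed in the diff above, so the allreduce signature is an assumption.
import torch

def execute_allreduce(communicator, send_buf: torch.Tensor, op) -> torch.Tensor:
    if not isinstance(send_buf, torch.Tensor):
        raise ValueError("Expected a torch tensor as the send buffer")
    # Allocate the output tensor with the same shape, dtype, and device, then
    # run the collective on the cached communicator (GPUCommunicator in the diff).
    recv_buf = torch.empty_like(send_buf)
    communicator.allreduce(send_buf, recv_buf, op)
    return recv_buf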
134 changes: 74 additions & 60 deletions python/ray/dag/compiled_dag_node.py
@@ -309,10 +309,10 @@ def __init__(
self.input_type_hints: List[ChannelOutputType] = task.arg_type_hints
self.output_type_hint: ChannelOutputType = task.dag_node.type_hint

# The collective group that runs a NCCL collective operation.
self.collective_group: Optional["ray.dag.CollectiveGroup"] = None
# The NCCL collective operation.
self.collective_op: Optional["ray.dag.CollectiveOperation"] = None
if isinstance(task.dag_node, CollectiveOutputNode):
self.collective_group = task.dag_node.collective_group
self.collective_op = task.dag_node.collective_op

self.input_channels: List[ChannelInterface] = []
self.task_inputs: List[_ExecutableTaskInput] = []
@@ -451,9 +451,9 @@ def _compute(self, class_handle) -> bool:
for task_input in self.task_inputs:
resolved_inputs.append(task_input.resolve(input_data))

if self.collective_group is not None:
if self.collective_op is not None:
# Run a NCCL collective operation.
method = self.collective_group.method
method = self.collective_op.execute
else:
# Run an actor method.
method = getattr(class_handle, self.method_name)
@@ -685,7 +685,7 @@ def __init__(
self._use_default_nccl_group = False
# This is set to the specified custom nccl group
# if there exists a type hint of `transport=nccl_group`.
self._custom_nccl_group: Optional[GPUCommunicator] = None
self._custom_nccl_group_p2p: Optional[GPUCommunicator] = None
# The NCCL group ID for P2P send/recv operations.
self._nccl_group_id_p2p: Optional[str] = None
# All the NCCL group IDs for P2P send/recv and collective operations.
@@ -715,6 +715,14 @@ def _create_proxy_actor() -> "ray.actor.ActorHandle":

self._proxy_actor = _create_proxy_actor()

@property
def nccl_group_id_p2p(self) -> Optional[str]:
return self._nccl_group_id_p2p

@property
def nccl_group_ids(self) -> Set[str]:
return self._nccl_group_ids

def increment_max_finished_execution_index(self) -> None:
"""Increment the max finished execution index. It is used to
figure out the max number of in-flight requests to the DAG
@@ -752,13 +760,13 @@ def _preprocess(self) -> None:
InputNode,
MultiOutputNode,
)
from ray.dag.collective_node import _CollectiveGroup
from ray.dag.collective_node import _CollectiveOperation

self.input_task_idx, self.output_task_idx = None, None
self.actor_task_count.clear()

nccl_actors: Set["ray.actor.ActorHandle"] = set()
nccl_collective_groups: Set[_CollectiveGroup] = set()
nccl_actors_p2p: Set["ray.actor.ActorHandle"] = set()
nccl_collective_ops: Set[_CollectiveOperation] = set()

# Find the input node to the DAG.
for idx, task in self.idx_to_task.items():
Expand Down Expand Up @@ -833,7 +841,7 @@ def _preprocess(self) -> None:

# Collect actors for NCCL P2P methods.
if dag_node.type_hint.requires_nccl():
nccl_actors.add(actor_handle)
nccl_actors_p2p.add(actor_handle)
custom_nccl_group = dag_node.type_hint.get_custom_nccl_group()
mixed_nccl_group_error_message = (
"Accelerated DAGs do not support mixed usage of "
@@ -845,26 +853,26 @@
"make sure only one type of NCCL transport is specified."
)
if custom_nccl_group is None:
if self._custom_nccl_group is not None:
if self._custom_nccl_group_p2p is not None:
raise ValueError(mixed_nccl_group_error_message)
self._use_default_nccl_group = True
else:
if self._use_default_nccl_group:
raise ValueError(mixed_nccl_group_error_message)
if self._custom_nccl_group is not None:
if self._custom_nccl_group != custom_nccl_group:
if self._custom_nccl_group_p2p is not None:
if self._custom_nccl_group_p2p != custom_nccl_group:
raise ValueError(
"Accelerated DAGs currently only support "
"a single custom NCCL group, but multiple "
"have been specified. Check all the "
"TorchTensor(transport=nccl_group) type hints "
"to make sure only one NCCL group is used."
)
self._custom_nccl_group = custom_nccl_group
self._custom_nccl_group_p2p = custom_nccl_group

# Collect collective groups for NCCL collective operations.
# Collect NCCL collective operations.
if isinstance(dag_node, CollectiveOutputNode):
nccl_collective_groups.add(dag_node.collective_group)
nccl_collective_ops.add(dag_node.collective_op)
elif isinstance(dag_node, InputNode):
if dag_node.type_hint.requires_nccl():
raise ValueError(
@@ -935,72 +943,80 @@ def _preprocess(self) -> None:
task.arg_type_hints.append(upstream_task.dag_node.type_hint)

if upstream_task.dag_node.type_hint.requires_nccl():
# Add all readers to the NCCL group.
nccl_actors.add(downstream_actor_handle)
# Add all readers to the NCCL actors of P2P.
nccl_actors_p2p.add(downstream_actor_handle)

nccl_actors = list(nccl_actors)
if None in nccl_actors:
nccl_actors_p2p = list(nccl_actors_p2p)
if None in nccl_actors_p2p:
raise ValueError("Driver cannot participate in the NCCL group.")

# Initialize and cache a NCCL group for each custom NCCL group. All the
# custom NCCL groups are initialized before the default NCCL groups.
custom_nccl_group_to_id: Dict[GPUCommunicator, str] = {}
# Initialize and cache a NCCL group for each set of actors. A set of actors
# can perform P2P send/recv and collective operations. All the custom NCCL
# groups are initialized before the default NCCL groups. If there are
# multiple custom NCCL groups for a set of actors, only one is cached.
# can perform P2P send/recv and collective operations. If there are multiple
# custom NCCL groups for a set of actors, only one is cached.
actors_to_nccl_group_id: Dict[FrozenSet["ray.actor.ActorHandle"], str] = {}
# Initialize a NCCL group for each custom NCCL group.
custom_nccl_group_to_id: Dict[GPUCommunicator, str] = {}

# If a custom NCCL group is specified for P2P actors, initialize and cache
# the NCCL group ID.
if nccl_actors and self._custom_nccl_group:
if nccl_actors_p2p and self._custom_nccl_group_p2p:
if not set(nccl_actors_p2p).issubset(
set(self._custom_nccl_group_p2p.get_actor_handles())
):
raise ValueError(
"Expected P2P actor handles to be a subset of the custom NCCL group"
)
self._nccl_group_id_p2p = _init_nccl_group(
nccl_actors, self._custom_nccl_group
nccl_actors_p2p, self._custom_nccl_group_p2p
)
actors = frozenset(nccl_actors)
custom_nccl_group_to_id[
self._custom_nccl_group_p2p
] = self._nccl_group_id_p2p
actors = frozenset(nccl_actors_p2p)
actors_to_nccl_group_id[actors] = self._nccl_group_id_p2p
custom_nccl_group_to_id[self._custom_nccl_group] = self._nccl_group_id_p2p

# If a custom NCCL group is specified for collective actors, initialize and
# cache the NCCL group ID.
for collective_group in nccl_collective_groups:
type_hint = collective_group.type_hint
for collective_op in nccl_collective_ops:
type_hint = collective_op.type_hint
custom_nccl_group = type_hint.get_custom_nccl_group()
if custom_nccl_group:
nccl_group_id = collective_group.init_nccl_group(
nccl_group_id = collective_op.init_nccl_group(
custom_nccl_group_to_id.get(custom_nccl_group, None)
)
actors = frozenset(collective_group.actor_handles)
custom_nccl_group_to_id[custom_nccl_group] = nccl_group_id
actors = frozenset(collective_op.actor_handles)
if actors not in actors_to_nccl_group_id:
actors_to_nccl_group_id[actors] = nccl_group_id
custom_nccl_group_to_id[custom_nccl_group] = nccl_group_id

# If a NCCL group for P2P actors is not initialized, initialize and cache
# the NCCL group ID.
if nccl_actors and self._nccl_group_id_p2p is None:
actors = frozenset(nccl_actors)
if nccl_actors_p2p and self._nccl_group_id_p2p is None:
actors = frozenset(nccl_actors_p2p)
if actors in actors_to_nccl_group_id:
self._nccl_group_id_p2p = actors_to_nccl_group_id[actors]
else:
self._nccl_group_id_p2p = _init_nccl_group(
nccl_actors, self._custom_nccl_group
nccl_actors_p2p, self._custom_nccl_group_p2p
)
actors_to_nccl_group_id[actors] = self._nccl_group_id_p2p

# If a NCCL group for collective actors is not initialized, initialize and
# cache the NCCL group ID.
for collective_group in nccl_collective_groups:
type_hint = collective_group.type_hint
if type_hint.nccl_group_id is None:
actors = frozenset(collective_group.actor_handles)
if actors in actors_to_nccl_group_id:
nccl_group_id = actors_to_nccl_group_id[actors]
type_hint.set_nccl_group_id(nccl_group_id)
else:
nccl_group_id = collective_group.init_nccl_group()
for collective_op in nccl_collective_ops:
if collective_op.type_hint.nccl_group_id is None:
actors = frozenset(collective_op.actor_handles)
nccl_group_id = collective_op.init_nccl_group(
actors_to_nccl_group_id.get(actors, None)
)
if actors not in actors_to_nccl_group_id:
actors_to_nccl_group_id[actors] = nccl_group_id

# Store all the NCCL group IDs for P2P send/recv and collective operations.
self._nccl_group_ids = set(actors_to_nccl_group_id.values())
self._nccl_group_ids = set(actors_to_nccl_group_id.values()).union(
set(custom_nccl_group_to_id.values())
)

if direct_input:
self._input_num_positional_args = 1
@@ -1460,19 +1476,19 @@ def _generate_dag_operation_graph_node(
]
}
"""
from ray.dag.collective_node import CollectiveOutputNode, _CollectiveGroup
from ray.dag.collective_node import CollectiveOutputNode, _CollectiveOperation

assert self.idx_to_task
assert self.actor_to_executable_tasks

actor_to_operation_nodes: Dict[
"ray.actor.ActorHandle", List[List[_DAGOperationGraphNode]]
] = defaultdict(list)
collective_group_to_nodes: Dict[
_CollectiveGroup, Set[_DAGOperationGraphNode]
collective_op_to_nodes: Dict[
_CollectiveOperation, Set[_DAGOperationGraphNode]
] = defaultdict(set)
collective_group_to_idxs: Dict[
_CollectiveGroup, Tuple[int, _DAGNodeOperationType]
collective_op_to_idxs: Dict[
_CollectiveOperation, Tuple[int, _DAGNodeOperationType]
] = defaultdict(set)

for actor_handle, executable_tasks in self.actor_to_executable_tasks.items():
Expand Down Expand Up @@ -1507,18 +1523,16 @@ def _generate_dag_operation_graph_node(
[read_node, compute_node, write_node]
)
if isinstance(dag_node, CollectiveOutputNode):
collective_group_to_nodes[dag_node.collective_group].add(
compute_node
)
collective_group_to_idxs[dag_node.collective_group].add(
collective_op_to_nodes[dag_node.collective_op].add(compute_node)
collective_op_to_idxs[dag_node.collective_op].add(
(task_idx, _DAGNodeOperationType.COMPUTE)
)

# Set collective group nodes for all the NCCL collective nodes.
for collective_group, nodes in collective_group_to_nodes.items():
idxs = collective_group_to_idxs[collective_group]
# Set collective nodes for all the NCCL collective operation nodes.
for collective_op, nodes in collective_op_to_nodes.items():
idxs = collective_op_to_idxs[collective_op]
for node in nodes:
node.set_collective_group_idxs(idxs)
node.collective_idxs = idxs

return actor_to_operation_nodes
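
Much of the _preprocess change above is NCCL-group caching: groups are keyed by the frozenset of participating actor handles so that P2P transfers and collective operations over the same set of actors reuse a single group, and custom groups are initialized before default ones. A minimal standalone sketch of the keying idea, assuming a hypothetical _init_group helper in place of _init_nccl_group:

# Standalone sketch of the NCCL-group caching idea used in _preprocess;
# _init_group is a hypothetical stand-in for _init_nccl_group.
from typing import Dict, FrozenSet, List
import uuid

def _init_group(actors: List[str]) -> str:
    # Pretend to create a NCCL group and return its id.
    return uuid.uuid4().hex

actors_to_group_id: Dict[FrozenSet[str], str] = {}

def get_or_init_group(actors: List[str]) -> str:
    key = frozenset(actors)
    # Reuse the cached group whenever the same actor set appears again, so
    # P2P transfers and collectives over identical actors share one NCCL group.
    if key not in actors_to_group_id:
        actors_to_group_id[key] = _init_group(actors)
    return actors_to_group_id[key]

p2p_group = get_or_init_group(["actor_a", "actor_b"])
collective_group = get_or_init_group(["actor_b", "actor_a"])
assert p2p_group == collective_group  # same actor set -> one shared group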
