Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add log viewing tools #28

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,39 @@ https://dbc-dp-xxxx.cloud.databricks.com/driver-proxy/o/xxxx/xx-xxx-xxxx/8087/st

![](https://user-images.githubusercontent.com/1610850/281441285-9b84d5f1-d58a-45dc-9354-7385e1599d1f.png)

### Troubleshooting with cluster logs

If you're experiencing problems starting your Dask Databricks cluster then viewing logs for your init scripts can help narrow down the problem.

When you create your cluster we recommend that you [configure your logs](https://docs.databricks.com/en/clusters/configure.html#cluster-log-delivery) to write to somewhere like `dbfs:/cluster_init_logs`.

To make viewing these logs a little easier we've included a couple of CLI utilities in `dask-databricks` to help you navigate them.

#### Listing clusters

You can get a full list of available logs with the `dask databricks logs ls <path>` command, where the path is the DBFS location you configured your logs to write to.

```console
$ dask databricks logs ls dbfs:/cluster_init_logs

Cluster Start time Node Count Node IPs
──────────────────────────────────────────────────────────────────────────────────────
1234-987654-a1b2c3d4 Nov 16 2023 10:36 2 10.0.0.1, 10.0.0.2
```

#### Viewing logs

Once you have your cluster ID you can view the logs from the latest launch of that cluster with `dask databricks logs cat <path> <cluster>`.

```console
$ dask databricks logs cat dbfs:/cluster_init_logs 1234-987654-a1b2c3d4
Cluster: 1234-987654-a1b2c3d4
Start time: Nov 16 2023 10:36
10.0.0.1: Start Python bootstrap
10.0.0.1: PYSPARK_PYTHON is /databricks/python3/bin/python
...
```

## Releasing

Releases of this project are automated using [GitHub Actions and the `pypa/gh-action-pypi-publish` action](https://github.com/jacobtomlinson/dask-databricks/blob/main/.github/workflows/release.yaml).
Expand Down
134 changes: 134 additions & 0 deletions dask_databricks/cli.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,31 @@
import json
import logging
import os
import random
import socket
import subprocess
import sys
import time
from datetime import datetime

import click
from rich import box
from rich.color import ANSI_COLOR_NAMES
from rich.console import Console
from rich.logging import RichHandler
from rich.table import Table

console = Console()

NODE_COLOURS = ["medium_spring_green", "light_steel_blue1", "wheat1", "medium_orchid"]

# Generate list of random colours from rich
# import random
# from rich.color import Color
#
# for i in range(100):
# colour = Color.random()
# print(f'"{colour.name}",', end="


def get_logger():
Expand Down Expand Up @@ -96,5 +114,121 @@ def run(worker_command, worker_args, cuda):
sys.exit(1)


@main.group()
def logs():
"""View cluster init logs."""


def _get_logs_at_path(path):
try:
from databricks.sdk.runtime import dbutils
except ImportError:
raise RuntimeError("Please install databricks-sdk.")
clusters = {}

for cluster in dbutils.fs.ls(path):
cluster_id = cluster.path.split("/")[-1]
clusters[cluster_id] = {}
for node in dbutils.fs.ls(cluster.path + "/init_scripts"):
for log in dbutils.fs.ls(node.path):
filename = log.path.split("/")[-1]
channel = filename.split(".")[-2]
datetime = "_".join(filename.split("_")[:2])
node_name = log.path.split("/")[-2].split("_", 1)[-1].replace("_", ".")
if datetime not in clusters[cluster_id]:
clusters[cluster_id][datetime] = {}

if node_name not in clusters[cluster_id][datetime]:
clusters[cluster_id][datetime][node_name] = {}

clusters[cluster_id][datetime][node_name][channel] = log.path
return clusters


def _get_node_color(i):
if i < len(NODE_COLOURS):
return NODE_COLOURS[i]
else:
return random.choice(list(ANSI_COLOR_NAMES))


def _prettify_launch_time(launch_time):
return datetime.strptime(launch_time, "%Y%m%d_%H%M%S").strftime("%b %d %Y %H:%M")


@logs.command()
@click.argument("path")
@click.option("--show-filenames", help="Show filenames in the output", is_flag=True, default=False, show_default=True)
def ls(path, show_filenames):
# TODO add flag to list filenames
table = Table(box=box.SIMPLE_HEAD)
table.add_column("Cluster", style="cyan", no_wrap=True)
table.add_column("Start time", style="plum2")
table.add_column("Node Count")
table.add_column("Node IPs")
if show_filenames:
table.add_column("Filenames")
with console.status("[bright_black]Finding logs..."):
clusters = _get_logs_at_path(path)
for cluster in clusters:
first = True
for launch_time in sorted(clusters[cluster], reverse=True):
pretty_launch_time = _prettify_launch_time(launch_time)
cluster_name = cluster if first else ""
node_list = ", ".join(
f"[{_get_node_color(i)}]{name}[/{_get_node_color(i)}]"
for i, name in enumerate(clusters[cluster][launch_time])
)
data = [cluster_name, pretty_launch_time, str(len(clusters[cluster][launch_time])), node_list]
if show_filenames:
filenames = ""
for i, node in enumerate(clusters[cluster][launch_time]):
for channel in ["stdout", "stderr"]:
node_colour = _get_node_color(i)
filenames += f"[{node_colour}]{clusters[cluster][launch_time][node][channel]}[/{node_colour}]\n"
data.append(filenames)
table.add_row(*data)
first = False

console.print(table)


@logs.command()
@click.argument("path")
@click.argument("cluster")
def cat(path, cluster):
# TODO add a flag for selecting which start time to view
# TODO add a flag to filter which nodes to view logs for
try:
from databricks.sdk.runtime import dbutils
except ImportError:
raise RuntimeError("Please install databricks-sdk.")

with console.status("[bright_black]Finding logs..."):
clusters = _get_logs_at_path(path)

if cluster not in clusters:
console.print(f"Cluster {cluster} not found.", style="bold red", highlight=False)
console.print(
f"Hint: Try running dask [b i]databricks logs ls {path}[/b i] to list clusters.",
style="bright_black",
highlight=False,
)
sys.exit(1)

most_recent = sorted(clusters[cluster].keys())[-1]

console.print(f"Cluster: {cluster}", style="bold cyan", highlight=False)
console.print(f"Start time: {_prettify_launch_time(most_recent)}", style="bold cyan", highlight=False)

for i, node in enumerate(clusters[cluster][most_recent]):
for channel in ["stdout", "stderr"]:
for line in dbutils.fs.head(clusters[cluster][most_recent][node][channel], 65536).split("\n"):
node_colour = _get_node_color(i)
console.print(
f"[{node_colour}]{node}[/{node_colour}]: {line}", style="grey89" if channel == "stdout" else "plum4"
)


if __name__ == "__main__":
main()
Loading