Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add log viewing tools #28

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,39 @@ https://dbc-dp-xxxx.cloud.databricks.com/driver-proxy/o/xxxx/xx-xxx-xxxx/8087/st

![](https://user-images.githubusercontent.com/1610850/281441285-9b84d5f1-d58a-45dc-9354-7385e1599d1f.png)

### Troubleshooting with cluster logs

If you're experiencing problems starting your Dask Databricks cluster then viewing logs for your init scripts can help narrow down the problem.

When you create your cluster we recommend that you [configure your logs](https://docs.databricks.com/en/clusters/configure.html#cluster-log-delivery) to write to somewhere like `dbfs:/cluster_init_logs`.

To make viewing these logs a little easier we've included a couple of CLI utilities in `dask-databricks` to help you navigate them.

#### Listing clusters

You can get a full list of available logs with the `dask databricks logs ls <path>` command, where the path is the DBFS location you configured your logs to write to.

```console
$ dask databricks logs ls dbfs:/cluster_init_logs

Cluster Start time Node Count Node IPs
──────────────────────────────────────────────────────────────────────────────────────
1234-987654-a1b2c3d4 Nov 16 2023 10:36 2 10.0.0.1, 10.0.0.2
```

#### Viewing logs

Once you have your cluster ID you can view the logs from the latest launch of that cluster with `dask databricks logs cat <path> <cluster>`.

```console
$ dask databricks logs cat dbfs:/cluster_init_logs 1234-987654-a1b2c3d4
Cluster: 1234-987654-a1b2c3d4
Start time: Nov 16 2023 10:36
10.0.0.1: Start Python bootstrap
10.0.0.1: PYSPARK_PYTHON is /databricks/python3/bin/python
...
```

## Releasing

Releases of this project are automated using [GitHub Actions and the `pypa/gh-action-pypi-publish` action](https://github.com/jacobtomlinson/dask-databricks/blob/main/.github/workflows/release.yaml).
Expand Down
134 changes: 134 additions & 0 deletions dask_databricks/cli.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,31 @@
import json
import logging
import os
import random
import socket
import subprocess
import sys
import time
from datetime import datetime

import click
from rich import box
from rich.color import ANSI_COLOR_NAMES
from rich.console import Console
from rich.logging import RichHandler
from rich.table import Table

console = Console()

NODE_COLOURS = ["medium_spring_green", "light_steel_blue1", "wheat1", "medium_orchid"]

# Generate list of random colours from rich
# import random
# from rich.color import Color
#
# for i in range(100):
# colour = Color.random()
# print(f'"{colour.name}",', end="


def get_logger():
Expand Down Expand Up @@ -96,5 +114,121 @@ def run(worker_command, worker_args, cuda):
sys.exit(1)


@main.group()
def logs():
"""View cluster init logs."""


def _get_logs_at_path(path):
try:
from databricks.sdk.runtime import dbutils
except ImportError:
raise RuntimeError("Please install databricks-sdk.")
clusters = {}

for cluster in dbutils.fs.ls(path):
cluster_id = cluster.path.split("/")[-1]
clusters[cluster_id] = {}
for node in dbutils.fs.ls(cluster.path + "/init_scripts"):
for log in dbutils.fs.ls(node.path):
filename = log.path.split("/")[-1]
channel = filename.split(".")[-2]
datetime = "_".join(filename.split("_")[:2])
node_name = log.path.split("/")[-2].split("_", 1)[-1].replace("_", ".")
if datetime not in clusters[cluster_id]:
clusters[cluster_id][datetime] = {}

if node_name not in clusters[cluster_id][datetime]:
clusters[cluster_id][datetime][node_name] = {}

clusters[cluster_id][datetime][node_name][channel] = log.path
return clusters


def _get_node_color(i):
if i < len(NODE_COLOURS):
return NODE_COLOURS[i]
else:
return random.choice(list(ANSI_COLOR_NAMES))


def _prettify_launch_time(launch_time):
return datetime.strptime(launch_time, "%Y%m%d_%H%M%S").strftime("%b %d %Y %H:%M")


@logs.command()
@click.argument("path")
@click.option("--show-filenames", help="Show filenames in the output", is_flag=True, default=False, show_default=True)
def ls(path, show_filenames):
# TODO add flag to list filenames
table = Table(box=box.SIMPLE_HEAD)
table.add_column("Cluster", style="cyan", no_wrap=True)
table.add_column("Start time", style="plum2")
table.add_column("Node Count")
table.add_column("Node IPs")
if show_filenames:
table.add_column("Filenames")
with console.status("[bright_black]Finding logs..."):
clusters = _get_logs_at_path(path)
for cluster in clusters:
first = True
for launch_time in sorted(clusters[cluster], reverse=True):
pretty_launch_time = _prettify_launch_time(launch_time)
cluster_name = cluster if first else ""
node_list = ", ".join(
f"[{_get_node_color(i)}]{name}[/{_get_node_color(i)}]"
for i, name in enumerate(clusters[cluster][launch_time])
)
data = [cluster_name, pretty_launch_time, str(len(clusters[cluster][launch_time])), node_list]
if show_filenames:
filenames = ""
for i, node in enumerate(clusters[cluster][launch_time]):
for channel in ["stdout", "stderr"]:
node_colour = _get_node_color(i)
filenames += f"[{node_colour}]{clusters[cluster][launch_time][node][channel]}[/{node_colour}]\n"
data.append(filenames)
table.add_row(*data)
first = False

console.print(table)


@logs.command()
@click.argument("path")
@click.argument("cluster")
def cat(path, cluster):
# TODO add a flag for selecting which start time to view
# TODO add a flag to filter which nodes to view logs for
try:
from databricks.sdk.runtime import dbutils
except ImportError:
raise RuntimeError("Please install databricks-sdk.")

with console.status("[bright_black]Finding logs..."):
clusters = _get_logs_at_path(path)

if cluster not in clusters:
console.print(f"Cluster {cluster} not found.", style="bold red", highlight=False)
console.print(
f"Hint: Try running dask [b i]databricks logs ls {path}[/b i] to list clusters.",
style="bright_black",
highlight=False,
)
sys.exit(1)

most_recent = sorted(clusters[cluster].keys())[-1]

console.print(f"Cluster: {cluster}", style="bold cyan", highlight=False)
console.print(f"Start time: {_prettify_launch_time(most_recent)}", style="bold cyan", highlight=False)

for i, node in enumerate(clusters[cluster][most_recent]):
for channel in ["stdout", "stderr"]:
for line in dbutils.fs.head(clusters[cluster][most_recent][node][channel], 65536).split("\n"):
node_colour = _get_node_color(i)
console.print(
f"[{node_colour}]{node}[/{node_colour}]: {line}", style="grey89" if channel == "stdout" else "plum4"
)


if __name__ == "__main__":
main()
Loading