Skip to content

Commit

Permalink
Merge branch 'tools' into ibaldoall
Browse files Browse the repository at this point in the history
  • Loading branch information
ibaldonl committed Jun 6, 2024
2 parents 2d17c60 + 40371b8 commit 316b064
Show file tree
Hide file tree
Showing 8 changed files with 99 additions and 75 deletions.
18 changes: 13 additions & 5 deletions tools/analytics/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
aiohttp==3.9.3
aiohttp==3.9.5
aiosignal==1.3.1
async-timeout==4.0.3
attrs==23.2.0
beautifulsoup4==4.12.3
bleach==6.1.0
Expand All @@ -13,15 +14,19 @@ defusedxml==0.7.1
fastjsonschema==2.19.1
fonttools==4.48.1
frozenlist==1.4.1
gpg==1.15.1
html2image==2.0.4.3
idna==3.6
Jinja2==3.1.3
idna==3.7
importlib_metadata==7.1.0
importlib_resources==6.4.0
Jinja2==3.1.4
jsonschema==4.21.1
jsonschema-specifications==2023.12.1
jupyter_client==8.6.0
jupyter_core==5.7.1
jupyterlab_pygments==0.3.0
kiwisolver==1.4.5
libcomps==0.1.18
lxml==5.1.0
MarkupSafe==2.1.5
matplotlib==3.8.2
Expand All @@ -34,7 +39,7 @@ numpy==1.26.4
packaging==23.2
pandas==2.2.0
pandocfilters==1.5.1
pillow==10.2.0
pillow==10.3.0
platformdirs==4.2.0
pyarrow==15.0.0
Pygments==2.17.2
Expand All @@ -46,17 +51,20 @@ python-dateutil==2.8.2
pytz==2024.1
pyzmq==25.1.2
referencing==0.33.0
requests==2.31.0
requests==2.32.3
rpds-py==0.18.0
rpm==4.16.1.3
seaborn==0.13.2
six==1.16.0
soupsieve==2.5
tabulate==0.9.0
tinycss2==1.2.1
tornado==6.4
tqdm==4.66.4
traitlets==5.14.1
tzdata==2023.4
urllib3==2.2.0
webencodings==0.5.1
websocket-client==1.7.0
yarl==1.9.4
zipp==3.19.2
24 changes: 16 additions & 8 deletions tools/analytics/src/config.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
METRICS = ["mean", "max", "min", "stddev"]
COLOR_SCHEME = 'coolwarm'
COLOR_SCHEME = "coolwarm"

GRAPH_STYLE = {
"rot":45,
"fontsize":6,
"legend":None,
"rot": 45,
"fontsize": 6,
"legend": None,
"title": "Tokens per second (average)",
"x": "title",
"xlabel": "Model"
"xlabel": "Model",
}

SUPPORTED_FORMATS = ["free", "mpstat", "csv", "nvidia-smi"]
Expand All @@ -18,13 +18,21 @@

MPSTAT_OUTPUT = ["title", "InfCPU", "MaxCPU"]
FREE_OUTPUT = ["title", "MaxMem", "InfMem"]
NVIDIA_SMI_OUTPUT = ["title","InfVRAM", "MaxVRAM", "InfVRAMBW%", "InfMaxSinglVRAMBW%", "InfGPU%", "InfMaxSinglGPU%" ]
NVIDIA_SMI_OUTPUT = [
"title",
"InfVRAM",
"MaxVRAM",
"InfVRAMBW%",
"InfMaxSinglVRAMBW%",
"InfGPU%",
"InfMaxSinglGPU%",
]

OUTPUT_SCHEMAS = {
"csv": ["title", *METRICS],
"free": FREE_OUTPUT,
"mpstat": MPSTAT_OUTPUT,
"nvidia-smi": NVIDIA_SMI_OUTPUT
"nvidia-smi": NVIDIA_SMI_OUTPUT,
}

# Indicates the metric to highlight and the order of display
Expand All @@ -33,5 +41,5 @@
"csv": ("first%", False),
"free": (FREE_OUTPUT[1], True),
"mpstat": (MPSTAT_OUTPUT[1], True),
"nvidia-smi": (NVIDIA_SMI_OUTPUT[1], True)
"nvidia-smi": (NVIDIA_SMI_OUTPUT[1], True),
}
4 changes: 1 addition & 3 deletions tools/analytics/src/ec2benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

# from config import METRICS


def __get_avg_tok_p_s(path: str):
df = pd.read_csv(path).drop("note", axis=1)
df["tok_per_sec"] = df["tok_count"] / df["time"]
Expand All @@ -19,7 +20,6 @@ def __get_avg_tok_p_s(path: str):
# return df



def process(file):
# name = os.path.basename(file)
# data = (name[:-4], *__get_avg_tok_p_s(file))
Expand All @@ -30,5 +30,3 @@ def process(file):

# return data
return __get_avg_tok_p_s(file)


46 changes: 27 additions & 19 deletions tools/analytics/src/huggingface.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import pandas as pd
import numpy as np
import pandas as pd


def free(file, inf_mem_row=-3):
Expand All @@ -9,20 +9,20 @@ def free(file, inf_mem_row=-3):
By default it's the third-to-last value
"""
df = pd.read_csv(file, sep="\t")
chunk_size = len(df)//2
chunk_size = len(df) // 2
chunked = np.array_split(df["used"], chunk_size)
sums = np.empty(chunk_size)
for (i, chunk) in enumerate(chunked):
for i, chunk in enumerate(chunked):
sums[i] = chunk.iloc[0].sum()

inf_mem = sums[inf_mem_row]
inf_mem = sums[inf_mem_row]
return sums.max(), inf_mem


def mpstat(file, start=-13, end=-3):
"""
file: name of the file to read
start:end : time interval to consider, in seconds
start:end : time interval to consider, in seconds
The default range starts at the last 13 seconds and ends in the last 3
Metrics:
Expand All @@ -31,56 +31,64 @@ def mpstat(file, start=-13, end=-3):
"""
df = pd.read_csv(file, sep="\t")
# Mean of %idle
agg = df[ df["CPU"] == "all" ][["time", "%idle"]].iloc[start:end]
agg = df[df["CPU"] == "all"][["time", "%idle"]].iloc[start:end]
agg["%use"] = 100 - agg["%idle"]
infcpu = agg["%use"].mean()

# maxcpu = 100 - df[ df["CPU"] == str(max_cpu_target) ]["%idle"].iloc[start:end].min()
group = df[ df["CPU"] != "all" ].groupby("time").min()["%idle"][start:end]
group = df[df["CPU"] != "all"].groupby("time").min()["%idle"][start:end]
maxcpu = 100 - min(group)
return (infcpu, maxcpu)


def nvidia_smi(file, start=-13, end=-3):
    """
    Summarise a preprocessed, tab-separated nvidia-smi log.

    Keeping the header with units around for reference:
    gpu (Idx) pwr (W) gtemp (C) mtemp (C) sm (%) mem (%)
    enc (%) dec (%) jpg (%) ofa (%) mclk (MHz) pclk (MHz)
    pviol (%) tviol (bool) fb (MB) bar1 (MB) ccpm (MB)
    sbecc (errs) dbecc (errs) pci (errs) rxpci (MB/s) txpci (MB/s)

    file: name of the file to read; it must provide at least the columns
        "gpu", "fb", "sm" and "mem".
    start:end : slice bounds applied to the sequence of samples (one sample
        groups one row per GPU). Documented upstream as a time interval in
        seconds, i.e. one sample per second is assumed.
        The default range starts at the last 13 samples and ends in the last 3.

    Returns a 6-tuple, in this order:
        maxvram: largest per-sample total of "fb" over the whole file.
        infvram: per-sample total of "fb" at sample index `end`.
        infgpu: highest "sm" value seen in the start:end window.
        infvram_bw_percent: highest "mem" value seen in the start:end window.
        infmaxsinglgpu_percent: same value as infgpu.
        infmaxsinglvrambw: same value as infvram_bw_percent.

    Raises ValueError when the start:end window selects no sample and
    ZeroDivisionError when the file has no data rows.

    NOTE(review): InfGPU% was originally described as "average of the per
    second average of sm%", but the computation has always been a maximum,
    which makes it identical to InfMaxSinglGPU%. The values are kept as they
    were; confirm which definition is intended before changing them.
    """
    df = pd.read_csv(file, sep="\t")
    n_gpus = df["gpu"].nunique()
    n_chunks = int(len(df) / n_gpus)

    # As there are no time fields, separate the input in chunks, with a single
    # entry in each chunk for every GPU, keeping only the metrics of interest.
    # The split is done on row positions rather than on the DataFrame itself:
    # np.array_split(DataFrame) goes through the deprecated DataFrame.swapaxes.
    metrics = df[["fb", "sm", "mem"]]
    row_groups = np.array_split(np.arange(len(metrics)), n_chunks)
    chunked = [metrics.iloc[rows] for rows in row_groups]

    # Total framebuffer usage across all GPUs for every sample.
    sums = np.array([chunk["fb"].sum() for chunk in chunked], dtype=float)
    infvram = sums[end]
    maxvram = sums.max()

    window = chunked[start:end]
    peak_sm = max(chunk["sm"].max() for chunk in window)
    peak_mem = max(chunk["mem"].max() for chunk in window)

    infgpu = peak_sm
    infvram_bw_percent = peak_mem
    infmaxsinglgpu_percent = peak_sm
    infmaxsinglvrambw = peak_mem

    return (
        maxvram,
        infvram,
        infgpu,
        infvram_bw_percent,
        infmaxsinglgpu_percent,
        infmaxsinglvrambw,
    )
31 changes: 17 additions & 14 deletions tools/analytics/src/main.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
import sys, os
import os
import sys
from typing import List

import seaborn as sns

from render import AnalysisResults
from config import COLOR_SCHEME
from ec2benchmarks import process as process_ec2_data
from huggingface import free, mpstat, nvidia_smi
from preprocessing import preprocess_files
from render import AnalysisResults


def main(args):
directory = args[1]
Expand Down Expand Up @@ -38,17 +39,20 @@ def main(args):
infgpu,
infvram_bw_percent,
infmaxsinglgpu_percent,
infmaxsinglvrambw
infmaxsinglvrambw,
) = nvidia_smi(file, start=-20, end=-10)

results.add_entry(file, [
maxvram,
infvram,
infvram_bw_percent,
infmaxsinglvrambw,
infgpu,
infmaxsinglgpu_percent
])
results.add_entry(
file,
[
maxvram,
infvram,
infvram_bw_percent,
infmaxsinglvrambw,
infgpu,
infmaxsinglgpu_percent,
],
)

case "vmstat":
pass
Expand All @@ -59,7 +63,7 @@ def main(args):
def get_data_files(dir: str) -> List[str]:
    """Return the path of every file found below ``dir``, recursively.

    Each path is built as "<directory>/<file name>", where the directory part
    is exactly what os.walk reports. Directories themselves are not listed,
    and a directory that cannot be walked yields an empty list.
    """
    return [
        f"{folder}/{name}"
        for folder, _, names in os.walk(dir)
        for name in names
    ]
Expand All @@ -75,4 +79,3 @@ def get_data_files(dir: str) -> List[str]:
preprocess_files(get_data_files(sys.argv[2]))
else:
main(sys.argv)

3 changes: 2 additions & 1 deletion tools/analytics/src/postprocess.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import pandas as pd

from config import HIGHLIGHTED_METRIC, METRICS, NVIDIA_SMI_OUTPUT


Expand All @@ -10,13 +9,15 @@ def postprocess_csv(df: pd.DataFrame) -> pd.DataFrame:

return df


def postprocess_free(df: pd.DataFrame) -> pd.DataFrame:
    """Cast the `free` measurements to int and sort the rows.

    Every column except "title" is converted to int. The frame is then sorted
    in ascending order by the metric registered for "free" in
    HIGHLIGHTED_METRIC. `df` is modified in place and also returned.
    """
    sort_key, _ = HIGHLIGHTED_METRIC["free"]
    value_columns = [name for name in df.columns if name != "title"]
    df[value_columns] = df[value_columns].astype(int)
    df.sort_values(by=sort_key, ascending=True, inplace=True)
    return df


def postprocess_mpstat(df: pd.DataFrame) -> pd.DataFrame:
(metric, _) = HIGHLIGHTED_METRIC["mpstat"]
df.sort_values(by=metric, inplace=True, ascending=True)
Expand Down
24 changes: 9 additions & 15 deletions tools/analytics/src/preprocessing.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os, sys
from typing import List

import os
import shutil
import sys
from typing import List

from config import MPSTAT_HEADER

Expand All @@ -10,7 +10,7 @@ def preprocess_files(files: List[str]):
for file in files:
parent_dir = os.path.dirname(file).split("/")[0]
rest = os.path.dirname(file).split("/")[1:]
outdir = parent_dir + "_preprocessed/" + "/".join(rest)
outdir = parent_dir + "_preprocessed/" + "/".join(rest)
if not os.path.exists(outdir):
os.makedirs(outdir, exist_ok=True)
filename = os.path.basename(file)
Expand All @@ -26,7 +26,7 @@ def preprocess_files(files: List[str]):
result = _preprocess_nvidia_smi(file)
case "csv":
# Already correct formatting, just copy it over
shutil.copy(file, dst)
shutil.copy(file, dst)
continue
case _:
print(f"[WARNING] Unsupported file format: {dst}", file=sys.stderr)
Expand Down Expand Up @@ -54,27 +54,21 @@ def _preprocess_free(file) -> str:
lines = f.readlines()
header = lines[0]
rest = lines[1:]
data = "".join([
_replace_spaces_with_tabs(l)
for l in rest
if "free" not in l
])
data = "".join([_replace_spaces_with_tabs(l) for l in rest if "free" not in l])
# the column which indicates Mem or Swap lacks a name, call it type and add it to the header
return f"type\t{_replace_spaces_with_tabs(header)}{data}"


def _preprocess_nvidia_smi(file) -> str:
    """Convert a raw nvidia-smi log into tab-separated text.

    The first line is used as the header, minus its first two characters
    (the leading '#' marker). The second line (units) is skipped. Of the
    remaining lines, any that contains a '#' is dropped. Runs of spaces in
    the kept lines are replaced by single tabs.
    """
    with open(file) as handle:
        raw_lines = handle.readlines()

    # Strip the comment marker from the header line before converting it.
    header = _replace_spaces_with_tabs(raw_lines[0][2:])
    # Index 1 holds the units line, so the samples start at index 2.
    body = "".join(
        _replace_spaces_with_tabs(row) for row in raw_lines[2:] if "#" not in row
    )
    return header + body


def _replace_spaces_with_tabs(text):
    """Collapse every run of spaces in `text` into a single tab.

    Leading and trailing spaces are removed. Only the space character is
    treated as a separator: tabs and newlines already in `text` are kept.
    """
    fields = filter(None, text.split(" "))
    return "\t".join(fields)
Loading

0 comments on commit 316b064

Please sign in to comment.