Merge branch 'tools' into ibaldoall

ivanbaldo · Jun 6, 2024 · 316b064 · 316b064
2 parents 2d17c60 + 40371b8
commit 316b064
Show file tree

Hide file tree

Showing 8 changed files with 99 additions and 75 deletions.
diff --git a/tools/analytics/requirements.txt b/tools/analytics/requirements.txt
@@ -1,5 +1,6 @@
-aiohttp==3.9.3
+aiohttp==3.9.5
 aiosignal==1.3.1
+async-timeout==4.0.3
 attrs==23.2.0
 beautifulsoup4==4.12.3
 bleach==6.1.0
@@ -13,15 +14,19 @@ defusedxml==0.7.1
 fastjsonschema==2.19.1
 fonttools==4.48.1
 frozenlist==1.4.1
+gpg==1.15.1
 html2image==2.0.4.3
-idna==3.6
-Jinja2==3.1.3
+idna==3.7
+importlib_metadata==7.1.0
+importlib_resources==6.4.0
+Jinja2==3.1.4
 jsonschema==4.21.1
 jsonschema-specifications==2023.12.1
 jupyter_client==8.6.0
 jupyter_core==5.7.1
 jupyterlab_pygments==0.3.0
 kiwisolver==1.4.5
+libcomps==0.1.18
 lxml==5.1.0
 MarkupSafe==2.1.5
 matplotlib==3.8.2
@@ -34,7 +39,7 @@ numpy==1.26.4
 packaging==23.2
 pandas==2.2.0
 pandocfilters==1.5.1
-pillow==10.2.0
+pillow==10.3.0
 platformdirs==4.2.0
 pyarrow==15.0.0
 Pygments==2.17.2
@@ -46,17 +51,20 @@ python-dateutil==2.8.2
 pytz==2024.1
 pyzmq==25.1.2
 referencing==0.33.0
-requests==2.31.0
+requests==2.32.3
 rpds-py==0.18.0
+rpm==4.16.1.3
 seaborn==0.13.2
 six==1.16.0
 soupsieve==2.5
 tabulate==0.9.0
 tinycss2==1.2.1
 tornado==6.4
+tqdm==4.66.4
 traitlets==5.14.1
 tzdata==2023.4
 urllib3==2.2.0
 webencodings==0.5.1
 websocket-client==1.7.0
 yarl==1.9.4
+zipp==3.19.2
diff --git a/tools/analytics/src/config.py b/tools/analytics/src/config.py
@@ -1,13 +1,13 @@
 METRICS = ["mean", "max", "min", "stddev"]
-COLOR_SCHEME = 'coolwarm'
+COLOR_SCHEME = "coolwarm"
 
 GRAPH_STYLE = {
-    "rot":45,
-    "fontsize":6,
-    "legend":None,
+    "rot": 45,
+    "fontsize": 6,
+    "legend": None,
     "title": "Tokens per second (average)",
     "x": "title",
-    "xlabel": "Model"
+    "xlabel": "Model",
 }
 
 SUPPORTED_FORMATS = ["free", "mpstat", "csv", "nvidia-smi"]
@@ -18,13 +18,21 @@
 
 MPSTAT_OUTPUT = ["title", "InfCPU", "MaxCPU"]
 FREE_OUTPUT = ["title", "MaxMem", "InfMem"]
-NVIDIA_SMI_OUTPUT = ["title","InfVRAM",  "MaxVRAM", "InfVRAMBW%", "InfMaxSinglVRAMBW%", "InfGPU%",  "InfMaxSinglGPU%" ]
+NVIDIA_SMI_OUTPUT = [
+    "title",
+    "InfVRAM",
+    "MaxVRAM",
+    "InfVRAMBW%",
+    "InfMaxSinglVRAMBW%",
+    "InfGPU%",
+    "InfMaxSinglGPU%",
+]
 
 OUTPUT_SCHEMAS = {
     "csv": ["title", *METRICS],
     "free": FREE_OUTPUT,
     "mpstat": MPSTAT_OUTPUT,
-    "nvidia-smi": NVIDIA_SMI_OUTPUT
+    "nvidia-smi": NVIDIA_SMI_OUTPUT,
 }
 
 # Indicates the metric to highlight and the order of display
@@ -33,5 +41,5 @@
     "csv": ("first%", False),
     "free": (FREE_OUTPUT[1], True),
     "mpstat": (MPSTAT_OUTPUT[1], True),
-    "nvidia-smi": (NVIDIA_SMI_OUTPUT[1], True)
+    "nvidia-smi": (NVIDIA_SMI_OUTPUT[1], True),
 }
diff --git a/tools/analytics/src/ec2benchmarks.py b/tools/analytics/src/ec2benchmarks.py
@@ -2,6 +2,7 @@
 
 # from config import METRICS
 
+
 def __get_avg_tok_p_s(path: str):
     df = pd.read_csv(path).drop("note", axis=1)
     df["tok_per_sec"] = df["tok_count"] / df["time"]
@@ -19,7 +20,6 @@ def __get_avg_tok_p_s(path: str):
 #     return df
 
 
-
 def process(file):
     # name = os.path.basename(file)
     # data = (name[:-4], *__get_avg_tok_p_s(file))
@@ -30,5 +30,3 @@ def process(file):
 
     # return data
     return __get_avg_tok_p_s(file)
-
-
diff --git a/tools/analytics/src/huggingface.py b/tools/analytics/src/huggingface.py
@@ -1,5 +1,5 @@
-import pandas as pd
 import numpy as np
+import pandas as pd
 
 
 def free(file, inf_mem_row=-3):
@@ -9,20 +9,20 @@ def free(file, inf_mem_row=-3):
     By default it's the third-to-last value
     """
     df = pd.read_csv(file, sep="\t")
-    chunk_size = len(df)//2
+    chunk_size = len(df) // 2
     chunked = np.array_split(df["used"], chunk_size)
     sums = np.empty(chunk_size)
-    for (i, chunk) in enumerate(chunked):
+    for i, chunk in enumerate(chunked):
         sums[i] = chunk.iloc[0].sum()
 
-    inf_mem = sums[inf_mem_row] 
+    inf_mem = sums[inf_mem_row]
     return sums.max(), inf_mem
 
 
 def mpstat(file, start=-13, end=-3):
     """
     file: name of the file to read
-    start:end : time interval to consider, in seconds  
+    start:end : time interval to consider, in seconds
     The default range starts at the last 13 seconds and ends in the last 3
 
     Metrics:
@@ -31,56 +31,64 @@ def mpstat(file, start=-13, end=-3):
     """
     df = pd.read_csv(file, sep="\t")
     # Mean of %idle
-    agg = df[ df["CPU"] == "all" ][["time", "%idle"]].iloc[start:end]
+    agg = df[df["CPU"] == "all"][["time", "%idle"]].iloc[start:end]
     agg["%use"] = 100 - agg["%idle"]
     infcpu = agg["%use"].mean()
-    
+
     # maxcpu = 100 - df[ df["CPU"] == str(max_cpu_target) ]["%idle"].iloc[start:end].min()
-    group = df[ df["CPU"] != "all" ].groupby("time").min()["%idle"][start:end]
+    group = df[df["CPU"] != "all"].groupby("time").min()["%idle"][start:end]
     maxcpu = 100 - min(group)
     return (infcpu, maxcpu)
 
 
 def nvidia_smi(file, start=-13, end=-3):
     """
     Keeping the header with units around for reference:
-    gpu (Idx)	pwr (W)	gtemp (C)	mtemp (C)	sm (%)	mem (%)	
+    gpu (Idx)	pwr (W)	gtemp (C)	mtemp (C)	sm (%)	mem (%)
     enc (%)	dec (%)	jpg (%)	ofa (%) mclk (MHz)	pclk (MHz)
     pviol (%)	tviol (bool)	fb (MB)	bar1 (MB)	ccpm (MB)
     sbecc (errs)	dbecc (errs)	pci (errs)	rxpci (MB/s)	txpci (MB/s)
 
     file: name of the file to read
-    start:end : time interval to consider, in seconds  
+    start:end : time interval to consider, in seconds
     The default range starts at the last 13 seconds and ends in the last 3
 
     Metrics:
-    InfGPU%: average of the per second average of sm% in the start:end time interval. 
+    InfGPU%: average of the per second average of sm% in the start:end time interval.
     InfMaxSinglGPU%: max value of sm% among all GPUS in the start:end time interval
     """
     df = pd.read_csv(file, sep="\t")
     n_gpus = df["gpu"].nunique()
     n_chunks = int(len(df) / n_gpus)
     sums = np.empty(n_chunks)
-    
+
     # As there are no time fields, separate the input in chunks, with a single entry in each chunk for every GPU
     # Then keep only the metrics of interest
     chunked = np.array_split(df[["fb", "sm", "mem"]], n_chunks)
 
-    for (i, chunk) in enumerate(chunked):
+    for i, chunk in enumerate(chunked):
         sums[i] = chunk["fb"].sum()
     infvram = sums[end]
     maxvram = sums.max()
 
     # Average of the average of each chunk
     # `abs(start-end)` is the duration of the chosen timeframe
-    infgpu = max([
-        x for x in 
-        map(lambda chunk: chunk["sm"].max(), chunked[start:end])]) #/ abs(start - end)
+    infgpu = max(
+        [x for x in map(lambda chunk: chunk["sm"].max(), chunked[start:end])]
+    )  # / abs(start - end)
 
-    infvram_bw_percent = max([x for x in map(lambda chunk: chunk["mem"].max(), chunked[start:end])]) 
+    infvram_bw_percent = max(
+        [x for x in map(lambda chunk: chunk["mem"].max(), chunked[start:end])]
+    )
 
     infmaxsinglgpu_percent = max([ch["sm"].max() for ch in chunked[start:end]])
     infmaxsinglvrambw = max([ch["mem"].max() for ch in chunked[start:end]])
-
-    return (maxvram,infvram, infgpu, infvram_bw_percent, infmaxsinglgpu_percent, infmaxsinglvrambw)
 
+    return (
+        maxvram,
+        infvram,
+        infgpu,
+        infvram_bw_percent,
+        infmaxsinglgpu_percent,
+        infmaxsinglvrambw,
+    )
diff --git a/tools/analytics/src/main.py b/tools/analytics/src/main.py
@@ -1,13 +1,14 @@
-import sys, os
+import os
+import sys
 from typing import List
 
 import seaborn as sns
-
-from render import AnalysisResults
 from config import COLOR_SCHEME
 from ec2benchmarks import process as process_ec2_data
 from huggingface import free, mpstat, nvidia_smi
 from preprocessing import preprocess_files
+from render import AnalysisResults
+
 
 def main(args):
     directory = args[1]
@@ -38,17 +39,20 @@ def main(args):
                     infgpu,
                     infvram_bw_percent,
                     infmaxsinglgpu_percent,
-                    infmaxsinglvrambw
+                    infmaxsinglvrambw,
                 ) = nvidia_smi(file, start=-20, end=-10)
 
-                results.add_entry(file, [
-                    maxvram,
-                    infvram,
-                    infvram_bw_percent,
-                    infmaxsinglvrambw,
-                    infgpu,
-                    infmaxsinglgpu_percent
-                ])
+                results.add_entry(
+                    file,
+                    [
+                        maxvram,
+                        infvram,
+                        infvram_bw_percent,
+                        infmaxsinglvrambw,
+                        infgpu,
+                        infmaxsinglgpu_percent,
+                    ],
+                )
 
             case "vmstat":
                 pass
@@ -59,7 +63,7 @@ def main(args):
 def get_data_files(dir: str) -> List[str]:
     paths = []
     walk = os.walk(dir)
-    for root,_,files in walk:
+    for root, _, files in walk:
         for f in files:
             paths.append(f"{root}/{f}")
     return paths
@@ -75,4 +79,3 @@ def get_data_files(dir: str) -> List[str]:
         preprocess_files(get_data_files(sys.argv[2]))
     else:
         main(sys.argv)
-
diff --git a/tools/analytics/src/postprocess.py b/tools/analytics/src/postprocess.py
@@ -1,5 +1,4 @@
 import pandas as pd
-
 from config import HIGHLIGHTED_METRIC, METRICS, NVIDIA_SMI_OUTPUT
 
 
@@ -10,13 +9,15 @@ def postprocess_csv(df: pd.DataFrame) -> pd.DataFrame:
 
     return df
 
+
 def postprocess_free(df: pd.DataFrame) -> pd.DataFrame:
     cols = [col for col in df.columns if col != "title"]
     df[cols] = df[cols].astype(int)
     (metric, _) = HIGHLIGHTED_METRIC["free"]
     df.sort_values(by=metric, inplace=True, ascending=True)
     return df
 
+
 def postprocess_mpstat(df: pd.DataFrame) -> pd.DataFrame:
     (metric, _) = HIGHLIGHTED_METRIC["mpstat"]
     df.sort_values(by=metric, inplace=True, ascending=True)

diff --git a/tools/analytics/src/preprocessing.py b/tools/analytics/src/preprocessing.py
@@ -1,7 +1,7 @@
-import os, sys
-from typing import List
-
+import os
 import shutil
+import sys
+from typing import List
 
 from config import MPSTAT_HEADER
 
@@ -10,7 +10,7 @@ def preprocess_files(files: List[str]):
     for file in files:
         parent_dir = os.path.dirname(file).split("/")[0]
         rest = os.path.dirname(file).split("/")[1:]
-        outdir = parent_dir + "_preprocessed/" + "/".join(rest) 
+        outdir = parent_dir + "_preprocessed/" + "/".join(rest)
         if not os.path.exists(outdir):
             os.makedirs(outdir, exist_ok=True)
         filename = os.path.basename(file)
@@ -26,7 +26,7 @@ def preprocess_files(files: List[str]):
                 result = _preprocess_nvidia_smi(file)
             case "csv":
                 # Already correct formatting, just copy it over
-                shutil.copy(file, dst)  
+                shutil.copy(file, dst)
                 continue
             case _:
                 print(f"[WARNING] Unsupported file format: {dst}", file=sys.stderr)
@@ -54,27 +54,21 @@ def _preprocess_free(file) -> str:
         lines = f.readlines()
         header = lines[0]
         rest = lines[1:]
-        data = "".join([
-            _replace_spaces_with_tabs(l) 
-            for l in rest
-            if "free" not in l
-        ])
+        data = "".join([_replace_spaces_with_tabs(l) for l in rest if "free" not in l])
     # the column which indicates Mem or Swap lacks a name, call it type and add it to the header
     return f"type\t{_replace_spaces_with_tabs(header)}{data}"
 
+
 def _preprocess_nvidia_smi(file) -> str:
     with open(file) as f:
         lines = f.readlines()
         # Remove the two first lines that start with '#'
         header = lines[0][2:]
         # Skip units
         rest = lines[2:]
-        data = [
-            _replace_spaces_with_tabs(l)
-            for l in rest
-            if "#" not in l
-        ]
+        data = [_replace_spaces_with_tabs(l) for l in rest if "#" not in l]
         return _replace_spaces_with_tabs(header) + "".join(data)
 
+
 def _replace_spaces_with_tabs(text):
     return "\t".join([x for x in text.split(" ") if x != ""])