Commit b4bce66

fix linting issues

melodywang060 committed Oct 11, 2024
1 parent 6993b09 commit b4bce66
Showing 34 changed files with 552 additions and 182 deletions.
8 changes: 6 additions & 2 deletions extensions/rapids_notebook_files.py
@@ -16,7 +16,9 @@ def walk_files(app, dir, outdir):
     related_notebook_files = {}
     for page in dir.glob("*"):
         if page.is_dir():
-            related_notebook_files[page.name] = walk_files(app, page, outdir / page.name)
+            related_notebook_files[page.name] = walk_files(
+                app, page, outdir / page.name
+            )
         else:
             with contextlib.suppress(OSError):
                 os.remove(str(outdir / page.name))
@@ -57,7 +59,9 @@ def find_notebook_related_files(app, pagename, templatename, context, doctree):
     path_to_output_parent = output_root / rel_page_parent
 
     # Copy all related files to output and apply templating
-    related_notebook_files = walk_files(app, path_to_page_parent, path_to_output_parent)
+    related_notebook_files = walk_files(
+        app, path_to_page_parent, path_to_output_parent
+    )
 
     # Make archive of related files
     if related_notebook_files and len(related_notebook_files) > 1:
18 changes: 14 additions & 4 deletions extensions/rapids_related_examples.py
@@ -22,7 +22,9 @@ def read_notebook_tags(path: str) -> list[str]:
         return []
 
 
-def generate_notebook_grid_myst(notebooks: list[str], env: BuildEnvironment) -> list[str]:
+def generate_notebook_grid_myst(
+    notebooks: list[str], env: BuildEnvironment
+) -> list[str]:
     """Generate sphinx-design grid of notebooks in MyST markdown.
 
     Take a list of notebook documents and render out some MyST markdown displaying those
@@ -73,7 +75,11 @@ def get_title_for_notebook(path: str) -> str:
         if i == len(cell_source) - 1:  # no next_token
             continue
         next_token = cell_source[i + 1]
-        if token.type == "heading_open" and token.tag == "h1" and next_token.type == "inline":
+        if (
+            token.type == "heading_open"
+            and token.tag == "h1"
+            and next_token.type == "inline"
+        ):
             return next_token.content
     raise ValueError("No top-level heading found")
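
Note: the h1 scan in this hunk walks a markdown-it token stream, where a title arrives as a "heading_open" token (tag "h1") followed by an "inline" token carrying the text. A minimal standalone sketch of that pattern, assuming markdown-it-py is the tokenizer in play (not part of the diff):

from markdown_it import MarkdownIt

tokens = MarkdownIt().parse("# My Notebook\n\nSome text")
for i, token in enumerate(tokens):
    if (
        token.type == "heading_open"
        and token.tag == "h1"
        and i + 1 < len(tokens)
        and tokens[i + 1].type == "inline"
    ):
        print(tokens[i + 1].content)  # -> "My Notebook"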

@@ -140,7 +146,9 @@ def add_notebook_tag_map_to_context(app, pagename, templatename, context, doctre
         except KeyError:
             tag_tree[root] = [suffix]
     context["notebook_tag_tree"] = tag_tree
-    context["notebook_tags"] = [tag for tag, pages in app.env.notebook_tag_map.items() if pagename in pages]
+    context["notebook_tags"] = [
+        tag for tag, pages in app.env.notebook_tag_map.items() if pagename in pages
+    ]
 
 
 class NotebookGalleryTocTree(TocTree):
@@ -154,7 +162,9 @@ def run(self) -> list[nodes.Node]:
         output += toctree
 
         # Generate the card grid for all items in the toctree
-        notebooks = [notebook for _, notebook in toctree[0].children[0].attributes["entries"]]
+        notebooks = [
+            notebook for _, notebook in toctree[0].children[0].attributes["entries"]
+        ]
         grid_markdown = generate_notebook_grid_myst(notebooks=notebooks, env=self.env)
         for node in parse_markdown(markdown=grid_markdown, state=self.state):
             gallery += node
12 changes: 9 additions & 3 deletions extensions/rapids_version_templating.py
@@ -49,7 +49,9 @@ def visit_reference(self, node: nodes.reference) -> None:
         uri_str = re.sub(r"~~~(.*?)~~~", r"{{ \1 }}", uri_str)
 
         # fill in appropriate values based on app context
-        node.attributes["refuri"] = re.sub(r"(?<!\$)\{\{.*?\}\}", self.template_func, uri_str)
+        node.attributes["refuri"] = re.sub(
+            r"(?<!\$)\{\{.*?\}\}", self.template_func, uri_str
+        )
 
         # update the document
         node.parent.replace(node, node)
@@ -59,15 +61,19 @@ def visit_Text(self, node: nodes.Text) -> None:
         Replace template strings in generic text.
 
         This roughly corresponds to HTML ``<p>``, ``<pre>``, and similar elements.
         """
-        new_node = nodes.Text(re.sub(r"(?<!\$)\{\{.*?\}\}", self.template_func, node.astext()))
+        new_node = nodes.Text(
+            re.sub(r"(?<!\$)\{\{.*?\}\}", self.template_func, node.astext())
+        )
         node.parent.replace(node, new_node)
 
     def template_func(self, match: re.Match) -> str:
         """
         Replace template strings like ``{{ rapids_version }}`` with real
         values like ``24.10``.
         """
-        return self.app.builder.templates.render_string(source=match.group(), context=self.app.config.rapids_version)
+        return self.app.builder.templates.render_string(
+            source=match.group(), context=self.app.config.rapids_version
+        )
 
 
 def version_template(
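
Note: the "(?<!\$)" negative lookbehind in both substitutions above is what keeps GitHub-Actions-style "${{ ... }}" expressions intact while plain "{{ ... }}" placeholders get templated. A standalone sketch of that behaviour (the dict lookup stands in for the real Jinja rendering, and the context values here are made-up examples):

import re

TEMPLATE_RE = re.compile(r"(?<!\$)\{\{.*?\}\}")
context = {"rapids_version": "24.10"}  # hypothetical stand-in for app config

def render(match: re.Match) -> str:
    key = match.group().strip("{} ")  # "{{ rapids_version }}" -> "rapids_version"
    return context.get(key, match.group())

text = "Install RAPIDS {{ rapids_version }}; CI uses ${{ github.ref }}."
print(TEMPLATE_RE.sub(render, text))
# -> Install RAPIDS 24.10; CI uses ${{ github.ref }}.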
2 changes: 1 addition & 1 deletion package-lock.json

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion package.json
@@ -1,5 +1,5 @@
 {
   "devDependencies": {
-    "prettier": "3.3.3"
+    "prettier": "^3.3.3"
   }
 }
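
Note: the caret range "^3.3.3" lets npm resolve any compatible 3.x release at or above 3.3.3 (for example 3.4.0, but never 4.0.0), instead of pinning Prettier to exactly 3.3.3.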
20 changes: 15 additions & 5 deletions source/conf.py
@@ -43,12 +43,18 @@
     },
 }
 rapids_version = (
-    versions["stable"] if os.environ.get("DEPLOYMENT_DOCS_BUILD_STABLE", "false") == "true" else versions["nightly"]
+    versions["stable"]
+    if os.environ.get("DEPLOYMENT_DOCS_BUILD_STABLE", "false") == "true"
+    else versions["nightly"]
 )
 rapids_version["rapids_conda_channels_list"] = [
-    channel for channel in rapids_version["rapids_conda_channels"].split(" ") if channel != "-c"
+    channel
+    for channel in rapids_version["rapids_conda_channels"].split(" ")
+    if channel != "-c"
 ]
-rapids_version["rapids_conda_packages_list"] = rapids_version["rapids_conda_packages"].split(" ")
+rapids_version["rapids_conda_packages_list"] = rapids_version[
+    "rapids_conda_packages"
+].split(" ")
 
 # -- General configuration ---------------------------------------------------
 
@@ -88,7 +94,9 @@
 # -- Options for notebooks -------------------------------------------------
 
 nb_execution_mode = "off"
-rapids_deployment_notebooks_base_url = "https://github.com/rapidsai/deployment/blob/main/source/"
+rapids_deployment_notebooks_base_url = (
+    "https://github.com/rapidsai/deployment/blob/main/source/"
+)
 
 # -- Options for HTML output -------------------------------------------------
 
@@ -138,6 +146,8 @@
 def setup(app):
     app.add_css_file("https://docs.rapids.ai/assets/css/custom.css")
     app.add_css_file("css/custom.css")
-    app.add_js_file("https://docs.rapids.ai/assets/js/custom.js", loading_method="defer")
+    app.add_js_file(
+        "https://docs.rapids.ai/assets/js/custom.js", loading_method="defer"
+    )
     app.add_js_file("js/nav.js", loading_method="defer")
     app.add_js_file("js/notebook-gallery.js", loading_method="defer")
4 changes: 3 additions & 1 deletion source/examples/rapids-1brc-single-node/notebook.ipynb
@@ -200,7 +200,9 @@
    "source": [
     "n = 1_000_000_000  # Number of rows of data to generate\n",
     "\n",
-    "lookup_df = cudf.read_csv(\"lookup.csv\")  # Load our lookup table of stations and their mean temperatures\n",
+    "lookup_df = cudf.read_csv(\n",
+    "    \"lookup.csv\"\n",
+    ")  # Load our lookup table of stations and their mean temperatures\n",
     "std = 10.0  # We assume temperatures are normally distributed with a standard deviation of 10\n",
     "chunksize = 2e8  # Set the number of rows to generate in one go (reduce this if you run into GPU RAM limits)\n",
     "filename = Path(\"measurements.txt\")  # Choose where to write to\n",
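
Note: this cell sets up the 1BRC-style generator: station means come from lookup.csv, temperatures are drawn from a normal distribution with standard deviation 10, and data is produced chunksize rows at a time. A rough standalone sketch of one chunk under those assumptions (the column name and the use of CuPy are illustrative, not from the diff):

import cupy as cp

means = cp.asarray(lookup_df["mean_temp"].values)  # hypothetical column name
stations = cp.random.randint(0, len(means), size=int(chunksize))  # random station per row
temps = means[stations] + std * cp.random.standard_normal(int(chunksize))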
@@ -995,8 +995,12 @@
     "\n",
     "\n",
     "def map_haversine(part):\n",
-    "    pickup = cuspatial.GeoSeries.from_points_xy(part[[\"pickup_longitude\", \"pickup_latitude\"]].interleave_columns())\n",
-    "    dropoff = cuspatial.GeoSeries.from_points_xy(part[[\"dropoff_longitude\", \"dropoff_latitude\"]].interleave_columns())\n",
+    "    pickup = cuspatial.GeoSeries.from_points_xy(\n",
+    "        part[[\"pickup_longitude\", \"pickup_latitude\"]].interleave_columns()\n",
+    "    )\n",
+    "    dropoff = cuspatial.GeoSeries.from_points_xy(\n",
+    "        part[[\"dropoff_longitude\", \"dropoff_latitude\"]].interleave_columns()\n",
+    "    )\n",
     "    return cuspatial.haversine_distance(pickup, dropoff)\n",
     "\n",
     "\n",
@@ -1502,7 +1506,9 @@
     "from random import randrange\n",
     "\n",
     "\n",
-    "def generate_workload(stages=3, min_width=1, max_width=3, variation=1, input_workload=None):\n",
+    "def generate_workload(\n",
+    "    stages=3, min_width=1, max_width=3, variation=1, input_workload=None\n",
+    "):\n",
     "    graph = [input_workload] if input_workload is not None else [run_haversine()]\n",
     "    last_width = min_width\n",
     "    for _ in range(stages):\n",
@@ -1640,25 +1646,35 @@
    ],
    "source": [
     "%%time\n",
-    "start_time = (datetime.datetime.now() - datetime.timedelta(minutes=15)).strftime(\"%Y-%m-%dT%H:%M:%SZ\")\n",
+    "start_time = (datetime.datetime.now() - datetime.timedelta(minutes=15)).strftime(\n",
+    "    \"%Y-%m-%dT%H:%M:%SZ\"\n",
+    ")\n",
     "try:\n",
     "    # Start with a couple of concurrent workloads\n",
     "    workload = generate_workload(stages=10, max_width=2)\n",
     "    # Then increase demand as more users appear\n",
-    "    workload = generate_workload(stages=5, max_width=5, min_width=3, variation=5, input_workload=workload)\n",
+    "    workload = generate_workload(\n",
+    "        stages=5, max_width=5, min_width=3, variation=5, input_workload=workload\n",
+    "    )\n",
     "    # Now reduce the workload for a longer period of time, this could be over a lunchbreak or something\n",
     "    workload = generate_workload(stages=30, max_width=2, input_workload=workload)\n",
     "    # Everyone is back from lunch and it hitting the cluster hard\n",
-    "    workload = generate_workload(stages=10, max_width=10, min_width=3, variation=5, input_workload=workload)\n",
+    "    workload = generate_workload(\n",
+    "        stages=10, max_width=10, min_width=3, variation=5, input_workload=workload\n",
+    "    )\n",
     "    # The after lunch rush is easing\n",
-    "    workload = generate_workload(stages=5, max_width=5, min_width=3, variation=5, input_workload=workload)\n",
+    "    workload = generate_workload(\n",
+    "        stages=5, max_width=5, min_width=3, variation=5, input_workload=workload\n",
+    "    )\n",
     "    # As we get towards the end of the day demand slows off again\n",
     "    workload = generate_workload(stages=10, max_width=2, input_workload=workload)\n",
     "    workload.compute()\n",
     "finally:\n",
     "    client.close()\n",
     "    cluster.close()\n",
-    "    end_time = (datetime.datetime.now() + datetime.timedelta(minutes=15)).strftime(\"%Y-%m-%dT%H:%M:%SZ\")"
+    "    end_time = (datetime.datetime.now() + datetime.timedelta(minutes=15)).strftime(\n",
+    "        \"%Y-%m-%dT%H:%M:%SZ\"\n",
+    "    )"
    ]
   },
   {
@@ -2021,10 +2037,14 @@
     "    end_time,\n",
     "    \"1s\",\n",
     ")\n",
-    "running_pods = running_pods[running_pods.columns.drop(list(running_pods.filter(regex=\"prepull\")))]\n",
+    "running_pods = running_pods[\n",
+    "    running_pods.columns.drop(list(running_pods.filter(regex=\"prepull\")))\n",
+    "]\n",
     "nodes = p.query_range(\"count(kube_node_info)\", start_time, end_time, \"1s\")\n",
     "nodes.columns = [\"Available GPUs\"]\n",
-    "nodes[\"Available GPUs\"] = nodes[\"Available GPUs\"] * 2  # We know our nodes each had 2 GPUs\n",
+    "nodes[\"Available GPUs\"] = (\n",
+    "    nodes[\"Available GPUs\"] * 2\n",
+    ")  # We know our nodes each had 2 GPUs\n",
     "nodes[\"Utilized GPUs\"] = running_pods.sum(axis=1)"
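
Note: the running_pods line in the last hunk drops every column whose name matches "prepull", so image pre-pull pods are not counted as GPU work. The same pandas pattern on a toy frame, with made-up pod names:

import pandas as pd

df = pd.DataFrame({"pod-a": [1], "pod-b": [2], "prepull-xyz": [3]})
df = df[df.columns.drop(list(df.filter(regex="prepull")))]
print(list(df.columns))  # ['pod-a', 'pod-b']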
8 changes: 6 additions & 2 deletions source/examples/rapids-azureml-hpo/notebook.ipynb
@@ -218,7 +218,9 @@
     "    )\n",
     "    ml_client.compute.begin_create_or_update(gpu_target).result()\n",
     "\n",
-    "    print(f\"AMLCompute with name {gpu_target.name} is created, the compute size is {gpu_target.size}\")"
+    "    print(\n",
+    "        f\"AMLCompute with name {gpu_target.name} is created, the compute size is {gpu_target.size}\"\n",
+    "    )"
    ]
   },
   {
@@ -485,7 +487,9 @@
     "\n",
     "\n",
     "# Define the limits for this sweep\n",
-    "sweep_job.set_limits(max_total_trials=10, max_concurrent_trials=2, timeout=18000, trial_timeout=3600)\n",
+    "sweep_job.set_limits(\n",
+    "    max_total_trials=10, max_concurrent_trials=2, timeout=18000, trial_timeout=3600\n",
+    ")\n",
     "\n",
     "\n",
     "# Specify your experiment details\n",
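
Note: for scale here, set_limits caps the sweep at 10 trials total with 2 running concurrently, and the timeout values are in seconds, so 18000 is a 5-hour budget for the whole sweep and 3600 is 1 hour per trial.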
48 changes: 36 additions & 12 deletions source/examples/rapids-azureml-hpo/rapids_csp_azure.py
@@ -132,7 +132,9 @@ def load_hyperparams(self, model_name="XGBoost"):
             self.log_to_file(str(error))
             return
 
-    def load_data(self, filename="dataset.orc", col_labels=None, y_label="ArrDelayBinary"):
+    def load_data(
+        self, filename="dataset.orc", col_labels=None, y_label="ArrDelayBinary"
+    ):
         """
         Loading the data into the object from the filename and based on the columns that we are
         interested in. Also, generates y_label from 'ArrDelay' column to convert this into a binary
@@ -183,7 +185,9 @@ def load_data(self, filename="dataset.orc", col_labels=None, y_label="ArrDelayBi
 
                 elif "multi" in self.compute_type:
                     self.log_to_file("\n\tReading using dask dataframe")
-                    dataset = dask.dataframe.read_parquet(target_filename, columns=col_labels)
+                    dataset = dask.dataframe.read_parquet(
+                        target_filename, columns=col_labels
+                    )
 
             elif "GPU" in self.compute_type:
                 # GPU Reading Option
@@ -201,7 +205,9 @@ def load_data(self, filename="dataset.orc", col_labels=None, y_label="ArrDelayBi
 
                 elif "multi" in self.compute_type:
                     self.log_to_file("\n\tReading using dask_cudf")
-                    dataset = dask_cudf.read_parquet(target_filename, columns=col_labels)
+                    dataset = dask_cudf.read_parquet(
+                        target_filename, columns=col_labels
+                    )
 
             # cast all columns to float32
             for col in dataset.columns:
@@ -216,10 +222,14 @@ def load_data(self, filename="dataset.orc", col_labels=None, y_label="ArrDelayBi
             dataset = dataset.fillna(0.0)  # Filling the null values. Needed for dask-cudf
 
         self.log_to_file(f"\n\tIngestion completed in {ingestion_timer.duration}")
-        self.log_to_file(f"\n\tDataset descriptors: {dataset.shape}\n\t{dataset.dtypes}")
+        self.log_to_file(
+            f"\n\tDataset descriptors: {dataset.shape}\n\t{dataset.dtypes}"
+        )
         return dataset, col_labels, y_label, ingestion_timer.duration
 
-    def split_data(self, dataset, y_label, train_size=0.8, random_state=0, shuffle=True):
+    def split_data(
+        self, dataset, y_label, train_size=0.8, random_state=0, shuffle=True
+    ):
         """
         Splitting data into train and test split, has appropriate imports for different compute modes.
         CPU compute - Uses sklearn, we manually filter y_label column in the split call
@@ -311,9 +321,13 @@ def train_model(self, X_train, y_train, model_params):
 
         try:
             if self.model_type == "XGBoost":
-                trained_model, training_time = self.fit_xgboost(X_train, y_train, model_params)
+                trained_model, training_time = self.fit_xgboost(
+                    X_train, y_train, model_params
+                )
             elif self.model_type == "RandomForest":
-                trained_model, training_time = self.fit_random_forest(X_train, y_train, model_params)
+                trained_model, training_time = self.fit_random_forest(
+                    X_train, y_train, model_params
+                )
         except Exception as error:
             self.log_to_file("\n\n!error during model training: " + str(error))
         self.log_to_file(f"\n\tFinished training in {training_time:.4f} s")
@@ -340,7 +354,9 @@ def fit_xgboost(self, X_train, y_train, model_params):
             )
         elif "multi" in self.compute_type:
             self.log_to_file("\n\tTraining multi-GPU XGBoost")
-            train_DMatrix = xgboost.dask.DaskDMatrix(self.client, data=X_train, label=y_train)
+            train_DMatrix = xgboost.dask.DaskDMatrix(
+                self.client, data=X_train, label=y_train
+            )
             trained_model = xgboost.dask.train(
                 self.client,
                 dtrain=train_DMatrix,
@@ -425,8 +441,12 @@ def evaluate_test_perf(self, trained_model, X_test, y_test, threshold=0.5):
         try:
             if self.model_type == "XGBoost":
                 if "multi" in self.compute_type:
-                    test_DMatrix = xgboost.dask.DaskDMatrix(self.client, data=X_test, label=y_test)
-                    xgb_pred = xgboost.dask.predict(self.client, trained_model, test_DMatrix).compute()
+                    test_DMatrix = xgboost.dask.DaskDMatrix(
+                        self.client, data=X_test, label=y_test
+                    )
+                    xgb_pred = xgboost.dask.predict(
+                        self.client, trained_model, test_DMatrix
+                    ).compute()
                     xgb_pred = (xgb_pred > threshold) * 1.0
                     test_accuracy = accuracy_score(y_test.compute(), xgb_pred)
                 elif "single" in self.compute_type:
@@ -439,9 +459,13 @@ def evaluate_test_perf(self, trained_model, X_test, y_test, threshold=0.5):
                 if "multi" in self.compute_type:
                     cuml_pred = trained_model.predict(X_test).compute()
                     self.log_to_file("\n\tPrediction complete")
-                    test_accuracy = accuracy_score(y_test.compute(), cuml_pred, convert_dtype=True)
+                    test_accuracy = accuracy_score(
+                        y_test.compute(), cuml_pred, convert_dtype=True
+                    )
                 elif "single" in self.compute_type:
-                    test_accuracy = trained_model.score(X_test, y_test.astype("int32"))
+                    test_accuracy = trained_model.score(
+                        X_test, y_test.astype("int32")
+                    )
 
         except Exception as error:
             self.log_to_file("\n\n!error during inference: " + str(error))