ci: migrate to uv and update build tools 🔧
KarelZe committed Dec 2, 2024
1 parent 2b25376 commit 9126976
Showing 26 changed files with 5,673 additions and 5,222 deletions.
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
@@ -25,6 +25,7 @@ repos:
hooks:
- id: trailing-whitespace
- id: check-added-large-files
+    exclude: uv.lock
- id: check-builtin-literals
- id: check-byte-order-marker
- id: check-merge-conflict
22 changes: 10 additions & 12 deletions notebooks/1.0-mb-data-preprocessing-mem-reduce.ipynb
@@ -31,7 +31,7 @@
"ProgressBar.enable()\n",
"\n",
"import wandb\n",
"from tqdm.auto import tqdm\n"
"from tqdm.auto import tqdm"
]
},
{
@@ -47,7 +47,7 @@
"FILE_PATH_INPUT = (\n",
" \"gs://thesis-bucket-option-trade-classification/data/raw/matched_cboe_quotes.csv\"\n",
")\n",
"FILE_PATH_OUTPUT = \"gs://thesis-bucket-option-trade-classification/data/preprocessed/\"\n"
"FILE_PATH_OUTPUT = \"gs://thesis-bucket-option-trade-classification/data/preprocessed/\""
]
},
{
@@ -58,7 +58,7 @@
"source": [
"os.environ[\"GCLOUD_PROJECT\"] = \"flowing-mantis-239216\"\n",
"credentials, _ = google.auth.default()\n",
"fs = gcsfs.GCSFileSystem(project=\"thesis\", token=credentials)\n"
"fs = gcsfs.GCSFileSystem(project=\"thesis\", token=credentials)"
]
},
{
@@ -76,7 +76,7 @@
"source": [
"# connect to weights and biases\n",
"run = wandb.init(project=\"thesis\", job_type=\"dataset-creation\", entity=\"fbv\")\n",
"dataset = wandb.Artifact(name=f\"{EXCHANGE}_{STRATEGY}_csv\", type=\"raw_data\")\n"
"dataset = wandb.Artifact(name=f\"{EXCHANGE}_{STRATEGY}_csv\", type=\"raw_data\")"
]
},
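The cells above and below follow one W&B lineage pattern: open a run, create an artifact, attach references to files that stay in GCS, then log the artifact. A minimal, self-contained sketch of that pattern (the bucket URI and artifact name here are illustrative, not from this commit):

```python
import wandb

# open a tracked run; project/entity/job_type mirror the notebook cell above
run = wandb.init(project="thesis", job_type="dataset-creation", entity="fbv")

# an artifact records dataset lineage; type is a free-form label
dataset = wandb.Artifact(name="cboe_transfer_csv", type="raw_data")

# add_reference stores only the URI, so large files remain in the bucket
dataset.add_reference(
    "gs://example-bucket/data/preprocessed/part_0000.parquet",
    name="raw_parquet_0000",
)

# log the artifact as an output of the run and close it
run.log_artifact(dataset)
wandb.finish()
```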
{
@@ -88,8 +88,7 @@
"outputs": [],
"source": [
"def import_data(input_file: str) -> pd.DataFrame:\n",
" \"\"\"\n",
" create a dataframe and optimize its memory usage.\n",
" \"\"\"Create a dataframe and optimize its memory usage.\n",
"\n",
" I.e., apply some optimizations i.e, manual inference of dtypes, pre-selection\n",
" of unique columns and chunking to enable import.\n",
@@ -189,7 +188,7 @@
"\n",
" format = \"%d%b%y:%H:%M:%S\"\n",
" df[\"QUOTE_DATETIME\"] = pd.to_datetime(df[\"QUOTE_DATETIME\"], format=format)\n",
" return df\n"
" return df"
]
},
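Only the docstring and the datetime parsing of `import_data` are visible in this hunk; the dtype map and chunked CSV read are collapsed. A rough sketch of what such a memory-reducing import looks like (the dtype map and column names are assumptions, not taken from the diff):

```python
import pandas as pd

def import_data_sketch(input_file: str) -> pd.DataFrame:
    """Create a dataframe and optimize its memory usage (sketch)."""
    # assumed dtype map; manual dtypes avoid pandas defaulting to 64-bit types
    dtypes = {"TRADE_SIZE": "int32", "TRADE_PRICE": "float32"}
    # chunked read keeps peak memory bounded during import
    chunks = pd.read_csv(input_file, dtype=dtypes, chunksize=1_000_000)
    df = pd.concat(chunks)
    # datetime format taken from the visible part of the diff
    df["QUOTE_DATETIME"] = pd.to_datetime(df["QUOTE_DATETIME"], format="%d%b%y:%H:%M:%S")
    return df
```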
{
@@ -203,8 +202,7 @@
"def df_to_parquet(\n",
" x: pd.DataFrame, target_dir: str, chunk_size: int = 1000000, **parquet_wargs\n",
") -> None:\n",
" \"\"\"\n",
" Write pd.DataFrame to parquet format.\n",
" \"\"\"Write pd.DataFrame to parquet format.\n",
"\n",
" Args:\n",
" x (pd.DataFrame): input dataframe.\n",
@@ -222,7 +220,7 @@
" slc.to_parquet(output_path, **parquet_wargs)\n",
"\n",
" # log in w & b\n",
" dataset.add_reference(output_path, name=f\"raw_parquet_{chunk:04d}\")\n"
" dataset.add_reference(output_path, name=f\"raw_parquet_{chunk:04d}\")"
]
},
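The tail of `df_to_parquet` above shows per-chunk writes plus one W&B reference per part file. A minimal sketch of the chunking logic, assuming the part-file naming implied by the visible `add_reference` call:

```python
import pandas as pd

def df_to_parquet_sketch(
    x: pd.DataFrame, target_dir: str, chunk_size: int = 1_000_000, **parquet_kwargs
) -> None:
    """Write a DataFrame as numbered parquet part files (sketch)."""
    for chunk, start in enumerate(range(0, len(x), chunk_size)):
        slc = x.iloc[start : start + chunk_size]
        output_path = f"{target_dir}part_{chunk:04d}.parquet"  # naming assumed
        slc.to_parquet(output_path, **parquet_kwargs)
```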
{
@@ -805,7 +803,7 @@
"client = Client()\n",
"\n",
"df = import_data(FILE_PATH_INPUT)\n",
"df_to_parquet(df, FILE_PATH_OUTPUT)\n"
"df_to_parquet(df, FILE_PATH_OUTPUT)"
]
},
{
@@ -833,7 +831,7 @@
"source": [
"# Log the artifact to save it as an output of this run\n",
"run.log_artifact(dataset)\n",
"wandb.finish()\n"
"wandb.finish()"
]
}
],
62 changes: 30 additions & 32 deletions notebooks/2.0a-mb-data-preprocessing-supervised.ipynb
@@ -21,7 +21,7 @@
"from pandas._testing.asserters import assert_almost_equal\n",
"from tqdm.auto import tqdm\n",
"\n",
"sys.path.append(\"..\")\n"
"sys.path.append(\"..\")"
]
},
{
@@ -34,7 +34,7 @@
"source": [
"EXCHANGE = \"cboe\" # \"ise\"\n",
"STRATEGY = \"transfer\" # \"supervised\"\n",
"max_i = 50 if EXCHANGE == \"ise\" else 38 # number of partial files\n"
"max_i = 50 if EXCHANGE == \"ise\" else 38 # number of partial files"
]
},
{
@@ -53,7 +53,7 @@
"source": [
"# connect to weights and biases\n",
"run = wandb.init(project=\"thesis\", job_type=\"dataset-creation\", entity=\"fbv\")\n",
"dataset = wandb.Artifact(name=f\"{EXCHANGE}_{STRATEGY}_raw\", type=\"preprocessed_data\")\n"
"dataset = wandb.Artifact(name=f\"{EXCHANGE}_{STRATEGY}_raw\", type=\"preprocessed_data\")"
]
},
{
@@ -67,7 +67,7 @@
"source": [
"os.environ[\"GCLOUD_PROJECT\"] = \"flowing-mantis-239216\"\n",
"\n",
"fs = gcsfs.GCSFileSystem(project=\"thesis\")\n"
"fs = gcsfs.GCSFileSystem(project=\"thesis\")"
]
},
{
@@ -99,7 +99,7 @@
"source": [
"files = [\n",
" f\"gs://thesis-bucket-option-trade-classification/data/preprocessed/{'unmatched' if STRATEGY == 'unsupervised' else 'matched'}_{EXCHANGE}_quotes_min_mem_usage_extended_part_{i:04d}.parquet\"\n",
" for i in range(0, max_i)\n",
" for i in range(max_i)\n",
"]\n",
"\n",
"columns = [\n",
@@ -130,7 +130,7 @@
"dfs = [pd.read_parquet(gc_file, columns=columns) for gc_file in tqdm(files)]\n",
"df = pd.concat(dfs)\n",
"\n",
"del dfs\n"
"del dfs"
]
},
{
@@ -146,7 +146,7 @@
},
"outputs": [],
"source": [
"df.memory_usage(deep=True).sum()\n"
"df.memory_usage(deep=True).sum()"
]
},
{
@@ -157,7 +157,7 @@
},
"outputs": [],
"source": [
"len(df)\n"
"len(df)"
]
},
{
@@ -207,7 +207,7 @@
" assert_almost_equal(\n",
" stats_trade_size.values.tolist(), [18.14, 5.0, 223.24], atol=0.1\n",
" )\n",
" assert_almost_equal(stats_buy_trades, 0.4500, atol=0.01)\n"
" assert_almost_equal(stats_buy_trades, 0.4500, atol=0.01)"
]
},
{
@@ -227,7 +227,7 @@
},
"outputs": [],
"source": [
"df.sort_values(by=\"QUOTE_DATETIME\", inplace=True)\n"
"df.sort_values(by=\"QUOTE_DATETIME\", inplace=True)"
]
},
{
@@ -263,7 +263,7 @@
"\n",
"if EXCHANGE == \"cboe\" and STRATEGY == \"transfer\":\n",
" # use everything after *ISE* validation set for transfer learning\n",
" test_range = df.QUOTE_DATETIME.between(\"2015-11-06 00:00:01\", \"2017-10-31 23:59:00\")\n"
" test_range = df.QUOTE_DATETIME.between(\"2015-11-06 00:00:01\", \"2017-10-31 23:59:00\")"
]
},
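Only the CBOE transfer-learning test window is visible in this hunk. A sketch of the time-ordered split it belongs to; the train/validation bounds below are placeholders, not values from the diff:

```python
# taken from the diff: everything after the ISE validation set becomes the test set
test_range = df.QUOTE_DATETIME.between("2015-11-06 00:00:01", "2017-10-31 23:59:00")

# hypothetical earlier windows for the supervised setting; between() is
# inclusive on both ends, so the windows must not overlap
train_range = df.QUOTE_DATETIME.between("2005-05-02", "2013-10-24 23:59:59")
val_range = df.QUOTE_DATETIME.between("2013-10-25", "2015-11-05 23:59:59")

train, val, test = df[train_range], df[val_range], df[test_range]
```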
{
@@ -280,7 +280,6 @@
"outputs": [],
"source": [
"if STRATEGY == \"supervised\":\n",
"\n",
" train = df[train_range]\n",
"\n",
" len_train = len(train)\n",
@@ -317,7 +316,7 @@
"\n",
" output_path = f\"gs://thesis-bucket-option-trade-classification/data/preprocessed/{EXCHANGE}_{STRATEGY}_test.parquet\"\n",
" test.to_parquet(output_path)\n",
" dataset.add_reference(output_path, name=\"test_set\")\n"
" dataset.add_reference(output_path, name=\"test_set\")"
]
},
{
@@ -347,7 +346,7 @@
"# Log the artifact to save it as an output of this run\n",
"run.log_artifact(dataset)\n",
"\n",
"wandb.finish()\n"
"wandb.finish()"
]
},
{
@@ -370,7 +369,7 @@
"val = pd.read_parquet(\n",
" \"gs://thesis-bucket-option-trade-classification/data/ise_log_standardized/val_set.parquet\",\n",
" engine=\"fastparquet\",\n",
")\n"
")"
]
},
{
@@ -382,7 +381,7 @@
"val = pd.read_parquet(\n",
" \"gs://thesis-bucket-option-trade-classification/data/ise_log_standardized/val_set_20.parquet\",\n",
" engine=\"fastparquet\",\n",
")\n"
")"
]
},
{
@@ -392,7 +391,7 @@
"outputs": [],
"source": [
"y_train = train[\"buy_sell\"]\n",
"X_train = train.drop(columns=[\"buy_sell\"])\n"
"X_train = train.drop(columns=[\"buy_sell\"])"
]
},
{
@@ -402,7 +401,7 @@
"outputs": [],
"source": [
"y_val = val[\"buy_sell\"]\n",
"X_val = val.drop(columns=[\"buy_sell\"])\n"
"X_val = val.drop(columns=[\"buy_sell\"])"
]
},
{
@@ -411,7 +410,7 @@
"metadata": {},
"outputs": [],
"source": [
"X_train.head()\n"
"X_train.head()"
]
},
{
@@ -431,7 +430,6 @@
" timestamp = np.linspace(0, 1, length)\n",
" # keep weight fixed\n",
" for strategy in [\"uniform\", \"exponential\"]:\n",
"\n",
" if strategy == \"uniform\":\n",
" weight = np.ones(length)\n",
" else:\n",
@@ -474,7 +472,7 @@
" \"strategy\": strategy,\n",
" }\n",
" print(res)\n",
" results_p.append(res)\n"
" results_p.append(res)"
]
},
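The learning-curve loop compares uniform against time-decayed sample weights across training-set sizes, but the exponential branch is collapsed in this hunk. A sketch of the two weighting schemes, where the exponential form and decay rate are assumptions, not values from the diff:

```python
import numpy as np

length = 10_000                        # hypothetical number of trades
timestamp = np.linspace(0, 1, length)  # 0 = oldest observation, 1 = newest

# uniform: every observation contributes equally
weight_uniform = np.ones(length)

# exponential (assumed form): recent observations near timestamp = 1
# receive the largest weights, old ones decay toward exp(-decay)
decay = 5.0
weight_exponential = np.exp(decay * (timestamp - 1.0))
```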
{
@@ -483,7 +481,7 @@
"metadata": {},
"outputs": [],
"source": [
"results_df = pd.DataFrame(results_p)\n"
"results_df = pd.DataFrame(results_p)"
]
},
{
@@ -492,7 +490,7 @@
"metadata": {},
"outputs": [],
"source": [
"results_df\n"
"results_df"
]
},
{
@@ -501,7 +499,7 @@
"metadata": {},
"outputs": [],
"source": [
"results_df.to_csv(\"learning_curves_gbm_default_params.csv\")\n"
"results_df.to_csv(\"learning_curves_gbm_default_params.csv\")"
]
},
{
@@ -533,7 +531,7 @@
"data = pd.read_parquet(\n",
" \"gs://thesis-bucket-option-trade-classification/data/classical_size_features_log_normalized/train_set_extended_60.parquet\",\n",
" engine=\"fastparquet\",\n",
")\n"
")"
]
},
{
@@ -546,7 +544,7 @@
"source": [
"# try to predict last 10 % in training set using first 10 % of features. Accuracy should be above 50 %.\n",
"label = data[\"buy_sell\"]\n",
"data.drop(columns=[\"buy_sell\"], inplace=True)\n"
"data.drop(columns=[\"buy_sell\"], inplace=True)"
]
},
{
@@ -563,7 +561,7 @@
"X_train = data.iloc[0 : len(data) // 10, :]\n",
"X_test = data.iloc[-len(data) // 10 :, :]\n",
"\n",
"del label, data\n"
"del label, data"
]
},
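This cell sets up a drift check: fit on the earliest tenth of trades, score on the latest tenth; accuracy well above 0.5 would suggest the features stay informative over time. A sketch that includes the label slicing collapsed in the hunk (the slicing of `label` is assumed to mirror the visible slicing of `data`):

```python
n = len(data) // 10
X_train, y_train = data.iloc[:n], label.iloc[:n]    # earliest 10 %
X_test, y_test = data.iloc[-n:], label.iloc[-n:]    # latest 10 %
```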
{
@@ -578,7 +576,7 @@
},
"outputs": [],
"source": [
"y_train.shape\n"
"y_train.shape"
]
},
{
@@ -599,7 +597,7 @@
" \"eval_metric\": \"Accuracy\",\n",
" \"iterations\": 1000,\n",
" \"early_stopping_rounds\": 100,\n",
"}\n"
"}"
]
},
{
@@ -610,7 +608,7 @@
},
"outputs": [],
"source": [
"columns = X_train.columns\n"
"columns = X_train.columns"
]
},
{
@@ -644,7 +642,7 @@
" model = CatBoostClassifier(**params)\n",
" model.fit(X_train[[col]], y_train, eval_set=(X_test[[col]], y_test))\n",
" acc = model.score(X_test[[col]], y_test)\n",
" results.append([col, acc])\n"
" results.append([col, acc])"
]
},
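The loop above trains one CatBoost model per single-feature slice, so each feature's standalone predictive accuracy can be ranked. A sketch reassembled from the visible cells; the logging and seed options elided from the params dict in the diff are omitted here:

```python
import pandas as pd
from catboost import CatBoostClassifier

# params visible in the diff above
params = {"eval_metric": "Accuracy", "iterations": 1000, "early_stopping_rounds": 100}

results = []
for col in X_train.columns:
    model = CatBoostClassifier(**params)
    # early stopping monitors accuracy on the held-out slice
    model.fit(X_train[[col]], y_train, eval_set=(X_test[[col]], y_test))
    results.append([col, model.score(X_test[[col]], y_test)])

results_df = pd.DataFrame(results, columns=["feature", "accuracy"]).sort_values("accuracy")
```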
{
@@ -661,7 +659,7 @@
"outputs": [],
"source": [
"results_df = pd.DataFrame(results, columns=[\"feature\", \"accuracy\"])\n",
"results_df.sort_values(by=\"accuracy\")\n"
"results_df.sort_values(by=\"accuracy\")"
]
},
{