ci: migrate to uv and update build tools 🔧
KarelZe committed Dec 2, 2024
1 parent 2b25376 commit 9126976
Showing 26 changed files with 5,673 additions and 5,222 deletions.
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
@@ -25,6 +25,7 @@ repos:
hooks:
- id: trailing-whitespace
- id: check-added-large-files
+    exclude: uv.lock
- id: check-builtin-literals
- id: check-byte-order-marker
- id: check-merge-conflict
22 changes: 10 additions & 12 deletions notebooks/1.0-mb-data-preprocessing-mem-reduce.ipynb
@@ -31,7 +31,7 @@
"ProgressBar.enable()\n",
"\n",
"import wandb\n",
"from tqdm.auto import tqdm\n"
"from tqdm.auto import tqdm"
]
},
{
@@ -47,7 +47,7 @@
"FILE_PATH_INPUT = (\n",
" \"gs://thesis-bucket-option-trade-classification/data/raw/matched_cboe_quotes.csv\"\n",
")\n",
"FILE_PATH_OUTPUT = \"gs://thesis-bucket-option-trade-classification/data/preprocessed/\"\n"
"FILE_PATH_OUTPUT = \"gs://thesis-bucket-option-trade-classification/data/preprocessed/\""
]
},
{
@@ -58,7 +58,7 @@
"source": [
"os.environ[\"GCLOUD_PROJECT\"] = \"flowing-mantis-239216\"\n",
"credentials, _ = google.auth.default()\n",
"fs = gcsfs.GCSFileSystem(project=\"thesis\", token=credentials)\n"
"fs = gcsfs.GCSFileSystem(project=\"thesis\", token=credentials)"
]
},
{
@@ -76,7 +76,7 @@
"source": [
"# connect to weights and biases\n",
"run = wandb.init(project=\"thesis\", job_type=\"dataset-creation\", entity=\"fbv\")\n",
"dataset = wandb.Artifact(name=f\"{EXCHANGE}_{STRATEGY}_csv\", type=\"raw_data\")\n"
"dataset = wandb.Artifact(name=f\"{EXCHANGE}_{STRATEGY}_csv\", type=\"raw_data\")"
]
},
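The cells above and below follow one W&B lineage pattern: open a run, create an artifact, attach references to files that stay in GCS, then log the artifact. A minimal, self-contained sketch of that pattern (the bucket URI and artifact name here are illustrative, not from this commit):

```python
import wandb

# open a tracked run; project/entity/job_type mirror the notebook cell above
run = wandb.init(project="thesis", job_type="dataset-creation", entity="fbv")

# an artifact records dataset lineage; type is a free-form label
dataset = wandb.Artifact(name="cboe_transfer_csv", type="raw_data")

# add_reference stores only the URI, so large files remain in the bucket
dataset.add_reference(
    "gs://example-bucket/data/preprocessed/part_0000.parquet",
    name="raw_parquet_0000",
)

# log the artifact as an output of the run and close it
run.log_artifact(dataset)
wandb.finish()
```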
{
@@ -88,8 +88,7 @@
"outputs": [],
"source": [
"def import_data(input_file: str) -> pd.DataFrame:\n",
" \"\"\"\n",
" create a dataframe and optimize its memory usage.\n",
" \"\"\"Create a dataframe and optimize its memory usage.\n",
"\n",
" I.e., apply some optimizations i.e, manual inference of dtypes, pre-selection\n",
" of unique columns and chunking to enable import.\n",
@@ -189,7 +188,7 @@
"\n",
" format = \"%d%b%y:%H:%M:%S\"\n",
" df[\"QUOTE_DATETIME\"] = pd.to_datetime(df[\"QUOTE_DATETIME\"], format=format)\n",
" return df\n"
" return df"
]
},
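Only the docstring and the datetime parsing of `import_data` are visible in this hunk; the dtype map and chunked CSV read are collapsed. A rough sketch of what such a memory-reducing import looks like (the dtype map and column names are assumptions, not taken from the diff):

```python
import pandas as pd

def import_data_sketch(input_file: str) -> pd.DataFrame:
    """Create a dataframe and optimize its memory usage (sketch)."""
    # assumed dtype map; manual dtypes avoid pandas defaulting to 64-bit types
    dtypes = {"TRADE_SIZE": "int32", "TRADE_PRICE": "float32"}
    # chunked read keeps peak memory bounded during import
    chunks = pd.read_csv(input_file, dtype=dtypes, chunksize=1_000_000)
    df = pd.concat(chunks)
    # datetime format taken from the visible part of the diff
    df["QUOTE_DATETIME"] = pd.to_datetime(df["QUOTE_DATETIME"], format="%d%b%y:%H:%M:%S")
    return df
```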
{
@@ -203,8 +202,7 @@
"def df_to_parquet(\n",
" x: pd.DataFrame, target_dir: str, chunk_size: int = 1000000, **parquet_wargs\n",
") -> None:\n",
" \"\"\"\n",
" Write pd.DataFrame to parquet format.\n",
" \"\"\"Write pd.DataFrame to parquet format.\n",
"\n",
" Args:\n",
" x (pd.DataFrame): input dataframe.\n",
@@ -222,7 +220,7 @@
" slc.to_parquet(output_path, **parquet_wargs)\n",
"\n",
" # log in w & b\n",
" dataset.add_reference(output_path, name=f\"raw_parquet_{chunk:04d}\")\n"
" dataset.add_reference(output_path, name=f\"raw_parquet_{chunk:04d}\")"
]
},
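The tail of `df_to_parquet` above shows per-chunk writes plus one W&B reference per part file. A minimal sketch of the chunking logic, assuming the part-file naming implied by the visible `add_reference` call:

```python
import pandas as pd

def df_to_parquet_sketch(
    x: pd.DataFrame, target_dir: str, chunk_size: int = 1_000_000, **parquet_kwargs
) -> None:
    """Write a DataFrame as numbered parquet part files (sketch)."""
    for chunk, start in enumerate(range(0, len(x), chunk_size)):
        slc = x.iloc[start : start + chunk_size]
        output_path = f"{target_dir}part_{chunk:04d}.parquet"  # naming assumed
        slc.to_parquet(output_path, **parquet_kwargs)
```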
{
@@ -805,7 +803,7 @@
"client = Client()\n",
"\n",
"df = import_data(FILE_PATH_INPUT)\n",
"df_to_parquet(df, FILE_PATH_OUTPUT)\n"
"df_to_parquet(df, FILE_PATH_OUTPUT)"
]
},
{
@@ -833,7 +831,7 @@
"source": [
"# Log the artifact to save it as an output of this run\n",
"run.log_artifact(dataset)\n",
"wandb.finish()\n"
"wandb.finish()"
]
}
],
62 changes: 30 additions & 32 deletions notebooks/2.0a-mb-data-preprocessing-supervised.ipynb
@@ -21,7 +21,7 @@
"from pandas._testing.asserters import assert_almost_equal\n",
"from tqdm.auto import tqdm\n",
"\n",
"sys.path.append(\"..\")\n"
"sys.path.append(\"..\")"
]
},
{
@@ -34,7 +34,7 @@
"source": [
"EXCHANGE = \"cboe\" # \"ise\"\n",
"STRATEGY = \"transfer\" # \"supervised\"\n",
"max_i = 50 if EXCHANGE == \"ise\" else 38 # number of partial files\n"
"max_i = 50 if EXCHANGE == \"ise\" else 38 # number of partial files"
]
},
{
@@ -53,7 +53,7 @@
"source": [
"# connect to weights and biases\n",
"run = wandb.init(project=\"thesis\", job_type=\"dataset-creation\", entity=\"fbv\")\n",
"dataset = wandb.Artifact(name=f\"{EXCHANGE}_{STRATEGY}_raw\", type=\"preprocessed_data\")\n"
"dataset = wandb.Artifact(name=f\"{EXCHANGE}_{STRATEGY}_raw\", type=\"preprocessed_data\")"
]
},
{
@@ -67,7 +67,7 @@
"source": [
"os.environ[\"GCLOUD_PROJECT\"] = \"flowing-mantis-239216\"\n",
"\n",
"fs = gcsfs.GCSFileSystem(project=\"thesis\")\n"
"fs = gcsfs.GCSFileSystem(project=\"thesis\")"
]
},
{
@@ -99,7 +99,7 @@
"source": [
"files = [\n",
" f\"gs://thesis-bucket-option-trade-classification/data/preprocessed/{'unmatched' if STRATEGY == 'unsupervised' else 'matched'}_{EXCHANGE}_quotes_min_mem_usage_extended_part_{i:04d}.parquet\"\n",
" for i in range(0, max_i)\n",
" for i in range(max_i)\n",
"]\n",
"\n",
"columns = [\n",
@@ -130,7 +130,7 @@
"dfs = [pd.read_parquet(gc_file, columns=columns) for gc_file in tqdm(files)]\n",
"df = pd.concat(dfs)\n",
"\n",
"del dfs\n"
"del dfs"
]
},
{
@@ -146,7 +146,7 @@
},
"outputs": [],
"source": [
"df.memory_usage(deep=True).sum()\n"
"df.memory_usage(deep=True).sum()"
]
},
{
@@ -157,7 +157,7 @@
},
"outputs": [],
"source": [
"len(df)\n"
"len(df)"
]
},
{
@@ -207,7 +207,7 @@
" assert_almost_equal(\n",
" stats_trade_size.values.tolist(), [18.14, 5.0, 223.24], atol=0.1\n",
" )\n",
" assert_almost_equal(stats_buy_trades, 0.4500, atol=0.01)\n"
" assert_almost_equal(stats_buy_trades, 0.4500, atol=0.01)"
]
},
{
@@ -227,7 +227,7 @@
},
"outputs": [],
"source": [
"df.sort_values(by=\"QUOTE_DATETIME\", inplace=True)\n"
"df.sort_values(by=\"QUOTE_DATETIME\", inplace=True)"
]
},
{
@@ -263,7 +263,7 @@
"\n",
"if EXCHANGE == \"cboe\" and STRATEGY == \"transfer\":\n",
" # use everything after *ISE* validation set for transfer learning\n",
" test_range = df.QUOTE_DATETIME.between(\"2015-11-06 00:00:01\", \"2017-10-31 23:59:00\")\n"
" test_range = df.QUOTE_DATETIME.between(\"2015-11-06 00:00:01\", \"2017-10-31 23:59:00\")"
]
},
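Only the CBOE transfer-learning test window is visible in this hunk. A sketch of the time-ordered split it belongs to; the train/validation bounds below are placeholders, not values from the diff:

```python
# taken from the diff: everything after the ISE validation set becomes the test set
test_range = df.QUOTE_DATETIME.between("2015-11-06 00:00:01", "2017-10-31 23:59:00")

# hypothetical earlier windows for the supervised setting; between() is
# inclusive on both ends, so the windows must not overlap
train_range = df.QUOTE_DATETIME.between("2005-05-02", "2013-10-24 23:59:59")
val_range = df.QUOTE_DATETIME.between("2013-10-25", "2015-11-05 23:59:59")

train, val, test = df[train_range], df[val_range], df[test_range]
```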
{
@@ -280,7 +280,6 @@
"outputs": [],
"source": [
"if STRATEGY == \"supervised\":\n",
"\n",
" train = df[train_range]\n",
"\n",
" len_train = len(train)\n",
@@ -317,7 +316,7 @@
"\n",
" output_path = f\"gs://thesis-bucket-option-trade-classification/data/preprocessed/{EXCHANGE}_{STRATEGY}_test.parquet\"\n",
" test.to_parquet(output_path)\n",
" dataset.add_reference(output_path, name=\"test_set\")\n"
" dataset.add_reference(output_path, name=\"test_set\")"
]
},
{
@@ -347,7 +346,7 @@
"# Log the artifact to save it as an output of this run\n",
"run.log_artifact(dataset)\n",
"\n",
"wandb.finish()\n"
"wandb.finish()"
]
},
{
@@ -370,7 +369,7 @@
"val = pd.read_parquet(\n",
" \"gs://thesis-bucket-option-trade-classification/data/ise_log_standardized/val_set.parquet\",\n",
" engine=\"fastparquet\",\n",
")\n"
")"
]
},
{
@@ -382,7 +381,7 @@
"val = pd.read_parquet(\n",
" \"gs://thesis-bucket-option-trade-classification/data/ise_log_standardized/val_set_20.parquet\",\n",
" engine=\"fastparquet\",\n",
")\n"
")"
]
},
{
@@ -392,7 +391,7 @@
"outputs": [],
"source": [
"y_train = train[\"buy_sell\"]\n",
"X_train = train.drop(columns=[\"buy_sell\"])\n"
"X_train = train.drop(columns=[\"buy_sell\"])"
]
},
{
@@ -402,7 +401,7 @@
"outputs": [],
"source": [
"y_val = val[\"buy_sell\"]\n",
"X_val = val.drop(columns=[\"buy_sell\"])\n"
"X_val = val.drop(columns=[\"buy_sell\"])"
]
},
{
@@ -411,7 +410,7 @@
"metadata": {},
"outputs": [],
"source": [
"X_train.head()\n"
"X_train.head()"
]
},
{
@@ -431,7 +430,6 @@
" timestamp = np.linspace(0, 1, length)\n",
" # keep weight fixed\n",
" for strategy in [\"uniform\", \"exponential\"]:\n",
"\n",
" if strategy == \"uniform\":\n",
" weight = np.ones(length)\n",
" else:\n",
@@ -474,7 +472,7 @@
" \"strategy\": strategy,\n",
" }\n",
" print(res)\n",
" results_p.append(res)\n"
" results_p.append(res)"
]
},
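The learning-curve loop compares uniform against time-decayed sample weights across training-set sizes, but the exponential branch is collapsed in this hunk. A sketch of the two weighting schemes, where the exponential form and decay rate are assumptions, not values from the diff:

```python
import numpy as np

length = 10_000                        # hypothetical number of trades
timestamp = np.linspace(0, 1, length)  # 0 = oldest observation, 1 = newest

# uniform: every observation contributes equally
weight_uniform = np.ones(length)

# exponential (assumed form): recent observations near timestamp = 1
# receive the largest weights, old ones decay toward exp(-decay)
decay = 5.0
weight_exponential = np.exp(decay * (timestamp - 1.0))
```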
{
@@ -483,7 +481,7 @@
"metadata": {},
"outputs": [],
"source": [
"results_df = pd.DataFrame(results_p)\n"
"results_df = pd.DataFrame(results_p)"
]
},
{
@@ -492,7 +490,7 @@
"metadata": {},
"outputs": [],
"source": [
"results_df\n"
"results_df"
]
},
{
@@ -501,7 +499,7 @@
"metadata": {},
"outputs": [],
"source": [
"results_df.to_csv(\"learning_curves_gbm_default_params.csv\")\n"
"results_df.to_csv(\"learning_curves_gbm_default_params.csv\")"
]
},
{
@@ -533,7 +531,7 @@
"data = pd.read_parquet(\n",
" \"gs://thesis-bucket-option-trade-classification/data/classical_size_features_log_normalized/train_set_extended_60.parquet\",\n",
" engine=\"fastparquet\",\n",
")\n"
")"
]
},
{
@@ -546,7 +544,7 @@
"source": [
"# try to predict last 10 % in training set using first 10 % of features. Accuracy should be above 50 %.\n",
"label = data[\"buy_sell\"]\n",
"data.drop(columns=[\"buy_sell\"], inplace=True)\n"
"data.drop(columns=[\"buy_sell\"], inplace=True)"
]
},
{
@@ -563,7 +561,7 @@
"X_train = data.iloc[0 : len(data) // 10, :]\n",
"X_test = data.iloc[-len(data) // 10 :, :]\n",
"\n",
"del label, data\n"
"del label, data"
]
},
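This cell sets up a drift check: fit on the earliest tenth of trades, score on the latest tenth; accuracy well above 0.5 would suggest the features stay informative over time. A sketch that includes the label slicing collapsed in the hunk (the slicing of `label` is assumed to mirror the visible slicing of `data`):

```python
n = len(data) // 10
X_train, y_train = data.iloc[:n], label.iloc[:n]    # earliest 10 %
X_test, y_test = data.iloc[-n:], label.iloc[-n:]    # latest 10 %
```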
{
@@ -578,7 +576,7 @@
},
"outputs": [],
"source": [
"y_train.shape\n"
"y_train.shape"
]
},
{
@@ -599,7 +597,7 @@
" \"eval_metric\": \"Accuracy\",\n",
" \"iterations\": 1000,\n",
" \"early_stopping_rounds\": 100,\n",
"}\n"
"}"
]
},
{
@@ -610,7 +608,7 @@
},
"outputs": [],
"source": [
"columns = X_train.columns\n"
"columns = X_train.columns"
]
},
{
@@ -644,7 +642,7 @@
" model = CatBoostClassifier(**params)\n",
" model.fit(X_train[[col]], y_train, eval_set=(X_test[[col]], y_test))\n",
" acc = model.score(X_test[[col]], y_test)\n",
" results.append([col, acc])\n"
" results.append([col, acc])"
]
},
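The loop above trains one CatBoost model per single-feature slice, so each feature's standalone predictive accuracy can be ranked. A sketch reassembled from the visible cells; the logging and seed options elided from the params dict in the diff are omitted here:

```python
import pandas as pd
from catboost import CatBoostClassifier

# params visible in the diff above
params = {"eval_metric": "Accuracy", "iterations": 1000, "early_stopping_rounds": 100}

results = []
for col in X_train.columns:
    model = CatBoostClassifier(**params)
    # early stopping monitors accuracy on the held-out slice
    model.fit(X_train[[col]], y_train, eval_set=(X_test[[col]], y_test))
    results.append([col, model.score(X_test[[col]], y_test)])

results_df = pd.DataFrame(results, columns=["feature", "accuracy"]).sort_values("accuracy")
```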
{
@@ -661,7 +659,7 @@
"outputs": [],
"source": [
"results_df = pd.DataFrame(results, columns=[\"feature\", \"accuracy\"])\n",
"results_df.sort_values(by=\"accuracy\")\n"
"results_df.sort_values(by=\"accuracy\")"
]
},
{