Add chapter on discussion + various fixes💬 (#420)

* Cleanup front matter + list of (...) * Improve headlines and captions✍️ * Fix introduction + bibliography⛩️ * Rework related works chapter 🧙 * Rework chapter on rule based approaches🚀 * Rework problem framing 🚀 * Start with conclusion 🧙 * Add first part of conclusion 🧙 * Extend conclusion 🔚 * fix some typos 🔚 * Rewrite outlook and extend conclusion 🔚 * Extend conclusion 🔚 * Finish conclusion and outlook 🔚 * Cleanup notes 📑 * Cleanup notes 📑 * Rework feature set definition🧙 * Add notes on feature sets and discussion✍️ * Add notes on feature set and discussion🍁 * Adjust train-test-split🍕 * Fix some todos🚀 * Close more todos 🚀 * Rework table📑 * Improve table🥊 * Extend description of feature set 2 * Feature set definition🧃 * Fix appendix📑 * cut clutter 🧑‍🌾 * Restructure discussion✍️ * cleanup 🧹 * Restructure text ✍️ * Fix several typos / proof-reading🕶️ * Prepare analysis for discussion 🗨️ * Restructure points in discussion 🗨️ * Weave in discussion into supervised results 🚀 * Discuss results of supervised classifiers🧙 * Extend robustness checks for discussion🥊 * Cleanup random todos 🧹 * shorten text✍️ * Shorten paper 🧹 * Shorten paper + other fixes ✍️ * Improve hyperparam plots 🧙 * Prepare final discussion of classical rules * Add pre-train loss 🚀 * Finalize chapter on classical results 🚀 * Finish chapter on training / hyperparam search🚀 * Shorten evaluation / hyperparam part 🚀 * Finish rework of SAGE chapter 🚀 * Cleanup notebooks 🚀 * Add clear names to embeddings 💤 * Rewrite semi-supervised results 🚀 * complete discussion of semi-supervised results 🗨️ * add generic discussion ✍️ * Improve introduction 🚀 * Properly refer to as GSU🚀
KarelZe · Jul 3, 2023 · 32cf8a9 · 32cf8a9
1 parent b9db779
commit 32cf8a9
Show file tree

Hide file tree

Showing 49 changed files with 1,455 additions and 3,437 deletions.
diff --git a/notebooks/4.0f-mb-results-own-rule.ipynb b/notebooks/4.0f-mb-results-own-rule.ipynb
diff --git a/notebooks/6.0a-mb-visualizations.ipynb b/notebooks/6.0a-mb-visualizations.ipynb
diff --git a/notebooks/6.0e-mb-viz-universal.ipynb b/notebooks/6.0e-mb-viz-universal.ipynb
diff --git a/notebooks/6.0h-mb-viz-embeddings.ipynb b/notebooks/6.0h-mb-viz-embeddings.ipynb
@@ -8,14 +8,17 @@
    },
    "outputs": [],
    "source": [
+    "import gcsfs\n",
+    "import google.auth\n",
+    "\n",
     "\n",
     "import json\n",
     "import os\n",
     "import pickle\n",
     "import sys\n",
     "from pathlib import Path\n",
     "\n",
-    "from adjustText import adjust_text\n",
+    "# from adjustText import adjust_text\n",
     "\n",
     "import numpy as np\n",
     "import pandas as pd\n",
@@ -25,6 +28,111 @@
     "import wandb"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "os.environ[\"GCLOUD_PROJECT\"] = \"flowing-mantis-239216\"\n",
+    "credentials, _ = google.auth.default()\n",
+    "fs = gcsfs.GCSFileSystem(project=\"thesis\", token=credentials)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "fs.get(\"gs://thesis-bucket-option-trade-classification/data/raw/matched_samples_ise_quotes_extended.csv\", \"ise_matched.csv\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sec_id_root = pd.read_csv(\"ise_matched.csv\",usecols=[\"ROOT\", \"secid_OM\"])\n",
+    "sec_id_root = sec_id_root.drop_duplicates(keep=\"last\",subset=\"ROOT\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "security_names = pd.read_csv('../data/security_name.csv')\n",
+    "security_names = security_names[[\"secid\", \"issuer\"]].drop_duplicates(subset=\"secid\", keep=\"last\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "root_clearname = sec_id_root.merge(security_names, left_on=\"secid_OM\", right_on=\"secid\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "label = pd.read_csv('../models/metadata.tsv', sep='\\t', header=None).rename({0:\"label\"},axis=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "label_merged = label.merge(root_clearname, left_on=\"label\", right_on=\"ROOT\", how=\"left\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "label_commented = label_merged[\"label\"]\n",
+    "\n",
+    "commented_label = label_merged[\"label\"].astype(str) + \" (\" + label_merged[\"issuer\"].astype(str) + \")\"\n",
+    "# keep the first 8 labels (issue type and option type) unchanged; append the issuer only to the remaining labels\n",
+    "label_commented.iloc[8:] = commented_label.iloc[8:]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "label_commented.to_csv('../models/metadata_clearlabels.tsv',sep=\"\\t\")"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -349,9 +457,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "thesis",
+   "display_name": "myenv",
    "language": "python",
-   "name": "thesis"
+   "name": "myenv"
   },
   "language_info": {
    "codemirror_mode": {