From 3d4080e7a372caf97276c91af3d9b0ded0073eff Mon Sep 17 00:00:00 2001
From: Guoxuan Xu <guoxuan.xu8@gmail.com>
Date: Fri, 1 Nov 2024 22:20:33 -0700
Subject: [PATCH] updating competitor analysis

---
 .../eda_competitor_analysis.ipynb             | 201 ++++++++++++++++++
 1 file changed, 201 insertions(+)
 create mode 100644 impact_evaluation/eda_competitor_analysis.ipynb

diff --git a/impact_evaluation/eda_competitor_analysis.ipynb b/impact_evaluation/eda_competitor_analysis.ipynb
new file mode 100644
index 0000000..17d27d5
--- /dev/null
+++ b/impact_evaluation/eda_competitor_analysis.ipynb
@@ -0,0 +1,201 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Competitor Analysis"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 49,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/opt/anaconda3/envs/tongConsultinInc/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3508: DtypeWarning: Columns (3,13,14) have mixed types.Specify dtype option on import or set low_memory=False.\n",
+      "  exec(code_obj, self.user_global_ns, self.user_ns)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# import data\n",
+    "\n",
+    "candidates_rt = pd.read_csv('../data/temproary_data/candidate_roundTrip_route.csv')\n",
+    "candidates_rt['round_trip_route_IATA'] = candidates_rt['round_trip_route_IATA'].apply(eval)\n",
+    "roundTrips = pd.read_csv('../data/temproary_data/round_trip_flights.csv')\n",
+    "roundTrips['round_trip_route_IATA'] = roundTrips['round_trip_route_IATA'].apply(eval)\n",
+    "roundTrips['inbound_FL_DATE'] = pd.to_datetime(roundTrips['inbound_FL_DATE'])\n",
+    "roundTrips['outbound_FL_DATE'] = pd.to_datetime(roundTrips['outbound_FL_DATE'])\n",
+    "airports_info = pd.read_csv('../data/cleaned_data/Airport_Codes.csv')\n",
+    "tickets_info = pd.read_csv('../data/cleaned_data/Tickets.csv')\n",
+    "\n",
+    "\n",
+    "tickets_info = tickets_info.assign(\n",
+    "        sorted_route=tickets_info.apply(\n",
+    "            lambda x: tuple(\n",
+    "                sorted([x[\"ORIGIN_AIRPORT_IATA_CODE\"], x[\"DEST_AIRPORT_IATA_CODE\"]])\n",
+    "            ),\n",
+    "            axis=1,\n",
+    "        )\n",
+    "    )\n",
+    "\n",
+    "round_trip_profit = pd.read_csv('../data/temproary_data/roundTrip_profit.csv')\n",
+    "round_trip_profit['round_trip_route_IATA'] = round_trip_profit['round_trip_route_IATA'].apply(eval)\n",
+    "avg_ticket_price = pd.read_csv('../data/temproary_data/average_ticket_price.csv')\n",
+    "all_flights = pd.read_csv('../data/original_data/Flights.csv')\n",
+    "\n",
+    "\n",
+    "# gather relevant data\n",
+    "\n",
+    "airports_can = {i for pair in candidates_rt['round_trip_route_IATA'].values for i in pair}\n",
+    "\n",
+    "candidate_roundTrips = roundTrips[roundTrips['round_trip_route_IATA'].isin(candidates_rt['round_trip_route_IATA'])]\n",
+    "candidate_airports = airports_info[airports_info['AIRPORT_IATA_CODE'].apply(lambda x: x in airports_can)]\n",
+    "candidate_tickets = tickets_info[tickets_info['sorted_route'].isin(candidates_rt['round_trip_route_IATA'])]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Competitors associated with each candidate round-trip route"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 53,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# find the operating carriers associated with the top 10 round-trip routes\n",
+    "roundTrip_op = candidate_roundTrips.groupby('round_trip_route_IATA')['inbound_OP_CARRIER'].apply(lambda x: set(x.value_counts().index)).reset_index()\n",
+    "roundTrip_op['op_count'] = roundTrip_op['inbound_OP_CARRIER'].apply(len)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 61,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "operating_carrier = pd.DataFrame(pd.Series(list(set([j for i in roundTrip_op['inbound_OP_CARRIER'].values for j in i])), name='OP_CARRIER'))\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# The proportion of cancelled flights associated with each operating carrier"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 100,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "all_flights['round_trip'] = all_flights.apply(lambda row : tuple(sorted([row['ORIGIN'], row['DESTINATION']])), axis=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 116,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "op_prop_cancel = all_flights[(all_flights['round_trip'].isin(candidates_rt['round_trip_route_IATA'])) & (all_flights['OP_CARRIER'].isin(operating_carrier['OP_CARRIER']))].groupby(['OP_CARRIER', 'round_trip'])['CANCELLED'].mean().reset_index().rename(columns={'CANCELLED': 'prop_cancel'})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 124,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "op_prop_cancel.to_csv('../data/temproary_data/op_prop_cancel.csv', index=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Average delay rate associated with each operating carrier"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 118,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "all_flights = pd.concat(\n",
+    "    [candidate_roundTrips[['inbound_DEP_DELAY', 'inbound_ARR_DELAY', 'inbound_OP_CARRIER' ,'round_trip_route_IATA']].rename(columns={'inbound_DEP_DELAY':'DEP_DELAY', 'inbound_ARR_DELAY':'ARR_DELAY', 'inbound_OP_CARRIER': 'OP_CARRIER'}, inplace= False),\n",
+    "    candidate_roundTrips[['outbound_DEP_DELAY', 'outbound_ARR_DELAY', 'outbound_OP_CARRIER' ,'round_trip_route_IATA']].rename(columns={'outbound_DEP_DELAY':'DEP_DELAY', 'outbound_ARR_DELAY':'ARR_DELAY', 'outbound_OP_CARRIER': 'OP_CARRIER'}, inplace= False)],\n",
+    "    axis=0,\n",
+    "    ignore_index=True\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 123,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/var/folders/0y/gbwmzjp93k12t06yhk8_2p7h0000gn/T/ipykernel_34231/2552485323.py:1: FutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.\n",
+      "  all_flights.groupby(['OP_CARRIER', 'round_trip_route_IATA'])['DEP_DELAY', 'ARR_DELAY'].mean().reset_index().to_csv('../data/temproary_data/op_average_delay.csv', index=False)\n"
+     ]
+    }
+   ],
+   "source": [
+    "all_flights.groupby(['OP_CARRIER', 'round_trip_route_IATA'])['DEP_DELAY', 'ARR_DELAY'].mean().reset_index().to_csv('../data/temproary_data/op_average_delay.csv', index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "tongConsultinInc",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.19"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}