diff --git a/impact_evaluation/eda_competitor_analysis.ipynb b/impact_evaluation/eda_competitor_analysis.ipynb
new file mode 100644
index 0000000..17d27d5
--- /dev/null
+++ b/impact_evaluation/eda_competitor_analysis.ipynb
@@ -0,0 +1,244 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Competitor Analysis\n",
+    "\n",
+    "For each candidate round-trip route, find the operating carriers that fly it, then export each carrier's cancellation proportion and average departure/arrival delay per route."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import ast\n",
+    "from pathlib import Path\n",
+    "\n",
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Data locations; 'temproary_data' is kept exactly as the directory is spelled on disk.\n",
+    "DATA_DIR = Path('../data')\n",
+    "TEMP_DIR = DATA_DIR / 'temproary_data'\n",
+    "CLEANED_DIR = DATA_DIR / 'cleaned_data'\n",
+    "ORIGINAL_DIR = DATA_DIR / 'original_data'\n",
+    "ROUTE_COL = 'round_trip_route_IATA'\n",
+    "\n",
+    "\n",
+    "def read_route_csv(path):\n",
+    "    \"\"\"Read a CSV and parse its stringified ROUTE_COL values back into Python objects.\n",
+    "\n",
+    "    ast.literal_eval only accepts Python literals, so unlike eval it cannot\n",
+    "    run code that happens to be embedded in the file.\n",
+    "    \"\"\"\n",
+    "    frame = pd.read_csv(path)\n",
+    "    frame[ROUTE_COL] = frame[ROUTE_COL].apply(ast.literal_eval)\n",
+    "    return frame\n",
+    "\n",
+    "\n",
+    "def route_keys(frame, origin_col, dest_col):\n",
+    "    \"\"\"Return a Series of direction-agnostic route keys: sorted (origin, destination) tuples.\"\"\"\n",
+    "    return pd.Series(\n",
+    "        [tuple(sorted(pair)) for pair in zip(frame[origin_col], frame[dest_col])],\n",
+    "        index=frame.index,\n",
+    "        dtype=object,\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Load data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "candidates_rt = read_route_csv(TEMP_DIR / 'candidate_roundTrip_route.csv')\n",
+    "roundTrips = read_route_csv(TEMP_DIR / 'round_trip_flights.csv')\n",
+    "roundTrips['inbound_FL_DATE'] = pd.to_datetime(roundTrips['inbound_FL_DATE'])\n",
+    "roundTrips['outbound_FL_DATE'] = pd.to_datetime(roundTrips['outbound_FL_DATE'])\n",
+    "round_trip_profit = read_route_csv(TEMP_DIR / 'roundTrip_profit.csv')\n",
+    "avg_ticket_price = pd.read_csv(TEMP_DIR / 'average_ticket_price.csv')\n",
+    "\n",
+    "airports_info = pd.read_csv(CLEANED_DIR / 'Airport_Codes.csv')\n",
+    "tickets_info = pd.read_csv(CLEANED_DIR / 'Tickets.csv')\n",
+    "tickets_info['sorted_route'] = route_keys(\n",
+    "    tickets_info, 'ORIGIN_AIRPORT_IATA_CODE', 'DEST_AIRPORT_IATA_CODE'\n",
+    ")\n",
+    "\n",
+    "all_flights = pd.read_csv(ORIGINAL_DIR / 'Flights.csv')\n",
+    "all_flights['round_trip'] = route_keys(all_flights, 'ORIGIN', 'DESTINATION')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Gather data relevant to the candidate routes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "candidate_routes = candidates_rt[ROUTE_COL]\n",
+    "airports_can = {code for route in candidate_routes for code in route}\n",
+    "\n",
+    "candidate_roundTrips = roundTrips[roundTrips[ROUTE_COL].isin(candidate_routes)]\n",
+    "candidate_airports = airports_info[airports_info['AIRPORT_IATA_CODE'].isin(airports_can)]\n",
+    "candidate_tickets = tickets_info[tickets_info['sorted_route'].isin(candidate_routes)]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Competitors associated with each candidate round-trip route"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Find the operating carriers associated with each candidate round-trip route.\n",
+    "# NOTE(review): only inbound-leg carriers are collected here; confirm outbound carriers always match.\n",
+    "roundTrip_op = (\n",
+    "    candidate_roundTrips.groupby(ROUTE_COL)['inbound_OP_CARRIER']\n",
+    "    .apply(lambda carriers: set(carriers.dropna().unique()))\n",
+    "    .reset_index()\n",
+    ")\n",
+    "roundTrip_op['op_count'] = roundTrip_op['inbound_OP_CARRIER'].apply(len)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sorted so the row order is reproducible across kernel restarts (set order is not).\n",
+    "operating_carrier = pd.DataFrame(\n",
+    "    {'OP_CARRIER': sorted(set().union(*roundTrip_op['inbound_OP_CARRIER']))}\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Proportion of cancelled flights for each operating carrier"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "on_candidate_route = all_flights['round_trip'].isin(candidate_routes)\n",
+    "by_competitor = all_flights['OP_CARRIER'].isin(operating_carrier['OP_CARRIER'])\n",
+    "\n",
+    "op_prop_cancel = (\n",
+    "    all_flights[on_candidate_route & by_competitor]\n",
+    "    .groupby(['OP_CARRIER', 'round_trip'])['CANCELLED']\n",
+    "    .mean()\n",
+    "    .reset_index()\n",
+    "    .rename(columns={'CANCELLED': 'prop_cancel'})\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "op_prop_cancel.to_csv(TEMP_DIR / 'op_prop_cancel.csv', index=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Average delay associated with each operating carrier"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Stack inbound and outbound legs into one long table with shared column names.\n",
+    "# Named candidate_legs (not all_flights) so the raw flights frame loaded earlier is not overwritten.\n",
+    "leg_fields = ['DEP_DELAY', 'ARR_DELAY', 'OP_CARRIER']\n",
+    "candidate_legs = pd.concat(\n",
+    "    [\n",
+    "        candidate_roundTrips[[f'{leg}_{field}' for field in leg_fields] + [ROUTE_COL]]\n",
+    "        .rename(columns={f'{leg}_{field}': field for field in leg_fields})\n",
+    "        for leg in ('inbound', 'outbound')\n",
+    "    ],\n",
+    "    axis=0,\n",
+    "    ignore_index=True,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Select the delay columns with a list ([[...]]); indexing a groupby with bare multiple keys is deprecated.\n",
+    "op_average_delay = (\n",
+    "    candidate_legs.groupby(['OP_CARRIER', ROUTE_COL])[['DEP_DELAY', 'ARR_DELAY']]\n",
+    "    .mean()\n",
+    "    .reset_index()\n",
+    ")\n",
+    "op_average_delay.to_csv(TEMP_DIR / 'op_average_delay.csv', index=False)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "tongConsultinInc",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.19"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}