# --- impact_evaluation/eda_delay_analysis.ipynb ---

# %%
import ast

import numpy as np
import pandas as pd

# %%
# import data

# Column holding a round-trip route as a tuple of two IATA codes.
ROUTE_COL = 'round_trip_route_IATA'

# NOTE(review): the 'temproary_data' spelling is kept byte-for-byte because it
# is a runtime path; presumably it matches the directory name on disk.
candidates_rt = pd.read_csv('../data/temproary_data/candidate_roundTrip_route.csv')
roundTrips = pd.read_csv('../data/temproary_data/round_trip_flights.csv')
airports_info = pd.read_csv('../data/cleaned_data/Airport_Codes.csv')
tickets_info = pd.read_csv('../data/cleaned_data/Tickets.csv')

# The CSVs store each route tuple as its string repr. Parse it with
# ast.literal_eval (accepts Python literals only) rather than eval, which would
# execute whatever expression the file happens to contain.
candidates_rt[ROUTE_COL] = candidates_rt[ROUTE_COL].apply(ast.literal_eval)

# The flights table must be parsed as well: comparing its raw strings with the
# parsed candidate tuples in .isin() below would never match, leaving
# candidate_roundTrips empty.
roundTrips[ROUTE_COL] = roundTrips[ROUTE_COL].apply(ast.literal_eval)

# Direction-independent route key for every ticket: the (origin, destination)
# pair sorted alphabetically, so A->B and B->A share one key. Built with a
# plain zip instead of a row-wise DataFrame.apply(axis=1).
tickets_info = tickets_info.assign(
    sorted_route=pd.Series(
        [
            tuple(sorted(endpoints))
            for endpoints in zip(
                tickets_info["ORIGIN_AIRPORT_IATA_CODE"],
                tickets_info["DEST_AIRPORT_IATA_CODE"],
            )
        ],
        index=tickets_info.index,
        dtype=object,
    )
)

# %%
# gather relevant data

candidate_routes = candidates_rt[ROUTE_COL]

# Every airport code that appears in at least one candidate route.
airports_can = {code for route in candidate_routes for code in route}

# Restrict each table to the rows that belong to a candidate route / airport.
candidate_roundTrips = roundTrips[roundTrips[ROUTE_COL].isin(candidate_routes)]
candidate_airports = airports_info[airports_info['AIRPORT_IATA_CODE'].isin(airports_can)]
candidate_tickets = tickets_info[tickets_info['sorted_route'].isin(candidate_routes)]

# %% [markdown]
# ## Delay analysis

# %%
# (analysis cell is still empty in the notebook)
# --- impact_evaluation/eval_finding_top10_RRcandidate.ipynb (removed by this diff) ---

# %% [markdown]
# # Main statistical analysis
# - narrow the candidates down to 10 using four metrics (total profit, weekly profit, monthly profit, flight frequency)

# %%
import ast

import numpy as np
import pandas as pd

# Round trips with per-trip profit. The route column is stored in the CSV as
# the string repr of a tuple, so it is parsed back with ast.literal_eval
# (literals only) instead of eval, which would execute arbitrary expressions
# found in the file.
roundTrip_profit_g = pd.read_csv('../data/temproary_data/roundTrip_profit.csv')
roundTrip_profit_g['round_trip_route_IATA'] = roundTrip_profit_g['round_trip_route_IATA'].apply(ast.literal_eval)
roundTrip_profit_g['inbound_FL_DATE'] = pd.to_datetime(roundTrip_profit_g['inbound_FL_DATE'])
roundTrip_profit_g['outbound_FL_DATE'] = pd.to_datetime(roundTrip_profit_g['outbound_FL_DATE'])

# All round-trip flights; used for the flight-frequency metric.
roundTrip_fre_g = pd.read_csv('../data/temproary_data/round_trip_flights.csv')
roundTrip_fre_g['round_trip_route_IATA'] = roundTrip_fre_g['round_trip_route_IATA'].apply(ast.literal_eval)

# %% [markdown]
# ## Aggregating all the factors and assigning a rank to each factor per round-trip route
# Factors considered:
# - flight frequency
# - flight profit (total profit - total cost)
# - TODO: weekly profit

# %%
# ---- configuration ----------------------------------------------------------
ROUTE_COL = 'round_trip_route_IATA'

# A flight counts as delayed when its delay value is at least this large.
DELAY_THRESHOLD = 15

# Weight applied to each rank column in the combined score.
# NOTE(review): the weights sum to 1.9 and the score is divided by the number
# of rank columns (4), so `avg_ranking` is a scaled weighted sum rather than a
# true weighted average. The scale does not change the ordering, and the
# formula is kept so existing values stay identical.
RANK_WEIGHTS = {
    'round_trip_flights_count_ranking': 0.2,
    'round_trip_profit_ranking': 0.7,
    'monthly_avg_profit_ranking': 0.5,
    'weekly_avg_profit_ranking': 0.5,
}

# Number of routes kept as candidates.
TOP_N = 10


# ---- helpers ----------------------------------------------------------------
def add_rank(frame, value_col, ascending=False):
    """Return `frame` plus a `<value_col>_ranking` column.

    Ranks use method='min', so tied values share the lowest rank. With
    ascending=False the largest value gets rank 1.
    """
    ranking = frame[value_col].rank(ascending=ascending, method='min')
    return frame.assign(**{f'{value_col}_ranking': ranking})


def avg_profit_per_period(trips, period, value_name):
    """Per route: sum `profit` within each period, then average those sums.

    `period` is a Series aligned with `trips` (e.g. month or ISO week number).
    Returns a frame with ROUTE_COL and `value_name`.
    """
    return (
        trips.assign(_period=period)
        .groupby(ROUTE_COL)
        .apply(lambda route_trips: route_trips.groupby('_period')['profit'].sum().mean())
        .reset_index()
        # groupby.apply returns an unnamed Series, which reset_index labels 0
        .rename(columns={0: value_name})
    )


def delay_rate(flights, flag_col, rate_name):
    """Share of flagged flights per route, ranked so the lowest rate is rank 1."""
    rates = (
        flights.groupby(ROUTE_COL)[flag_col]
        .mean()
        .reset_index()
        .rename(columns={flag_col: rate_name})
    )
    return add_rank(rates, rate_name, ascending=True)


# ---- total profit -----------------------------------------------------------
roundTrip_profit = (
    roundTrip_profit_g.groupby(ROUTE_COL)['profit']
    .sum()
    .reset_index()
    .rename(columns={'profit': 'round_trip_profit'})
)
roundTrip_profit = add_rank(roundTrip_profit, 'round_trip_profit')
ranked_routes = roundTrip_profit[ROUTE_COL]

# ---- flight frequency -------------------------------------------------------
# Counts non-null outbound_OCCUPANCY_RATE values per route, restricted to the
# routes that have profit data. (The earlier sort of this frame was dropped:
# neither the rank nor the merge below depends on row order.)
roundTrip_fre = (
    roundTrip_fre_g.groupby(ROUTE_COL)['outbound_OCCUPANCY_RATE']
    .count()
    .reset_index()
    .rename(columns={'outbound_OCCUPANCY_RATE': 'round_trip_flights_count'})
)
roundTrip_fre = roundTrip_fre[roundTrip_fre[ROUTE_COL].isin(ranked_routes)]
roundTrip_fre = add_rank(roundTrip_fre, 'round_trip_flights_count')

# ---- monthly / weekly average profit ----------------------------------------
# A round trip is assigned to the period of whichever leg flies first.
trip_start = np.minimum(roundTrip_profit_g['inbound_FL_DATE'], roundTrip_profit_g['outbound_FL_DATE'])

roundTrip_profit_m = avg_profit_per_period(roundTrip_profit_g, trip_start.dt.month, 'monthly_avg_profit')
roundTrip_profit_m = roundTrip_profit_m[roundTrip_profit_m[ROUTE_COL].isin(ranked_routes)]
roundTrip_profit_m = add_rank(roundTrip_profit_m, 'monthly_avg_profit')

roundTrip_profit_w = avg_profit_per_period(
    roundTrip_profit_g, trip_start.dt.isocalendar().week, 'weekly_avg_profit'
)
roundTrip_profit_w = roundTrip_profit_w[roundTrip_profit_w[ROUTE_COL].isin(ranked_routes)]
roundTrip_profit_w = add_rank(roundTrip_profit_w, 'weekly_avg_profit')

# ---- number of distinct operating carriers per route ------------------------
roundTrip_op = (
    roundTrip_profit_g.groupby(ROUTE_COL)
    .apply(
        lambda route_trips: len(
            set(route_trips['inbound_OP_CARRIER']) | set(route_trips['outbound_OP_CARRIER'])
        )
    )
    .reset_index()
    .rename(columns={0: 'round_trip_op_count'})
)

# ---- proportion of delayed flights per route --------------------------------
# Stack the inbound and outbound legs so every flight is one row.
all_flights = pd.concat(
    [
        roundTrip_profit_g[[f'{leg}_DEP_DELAY', f'{leg}_ARR_DELAY', ROUTE_COL]].rename(
            columns={f'{leg}_DEP_DELAY': 'DEP_DELAY', f'{leg}_ARR_DELAY': 'ARR_DELAY'}
        )
        for leg in ('inbound', 'outbound')
    ],
    axis=0,
)
all_flights = all_flights[all_flights[ROUTE_COL].isin(ranked_routes)]

# Plain boolean flags. The previous lambda returned True / False / 0, where
# False and 0 are the same value for the mean, so the rates are unchanged;
# missing delays still count as "not delayed". Using assign avoids writing into
# a filtered slice.
all_flights = all_flights.assign(
    is_dep_delay=all_flights['DEP_DELAY'].ge(DELAY_THRESHOLD),
    is_arr_delay=all_flights['ARR_DELAY'].ge(DELAY_THRESHOLD),
)

roundTrip_dep_delay_rate = delay_rate(all_flights, 'is_dep_delay', 'dep_delay_rate')
roundTrip_arr_delay_rate = delay_rate(all_flights, 'is_arr_delay', 'arr_delay_rate')

# TODO: calculate the average delay rate for each round trip route

# ---- merge the ranks --------------------------------------------------------
# Inner join on frequency (a route needs both profit and flight data), then
# left joins so the remaining metrics cannot drop routes.
round_trip_info = roundTrip_profit.merge(roundTrip_fre, on=ROUTE_COL, how='inner')
for metric_frame in (
    roundTrip_profit_m,
    roundTrip_profit_w,
    roundTrip_op,
    roundTrip_dep_delay_rate,
    roundTrip_arr_delay_rate,
):
    round_trip_info = round_trip_info.merge(metric_frame, on=ROUTE_COL, how='left')

# Each metric frame has one row per route, so the merged frame must as well.
assert round_trip_info[ROUTE_COL].is_unique, "duplicate routes after merging metrics"

# ---- aggregate the ranking --------------------------------------------------
# Delay-rate ranks are reported but are not part of the combined score.
weighted_rank_sum = sum(round_trip_info[col] * weight for col, weight in RANK_WEIGHTS.items())
round_trip_info = round_trip_info.assign(avg_ranking=weighted_rank_sum / len(RANK_WEIGHTS))

candidate_roundTrip_route = round_trip_info.sort_values(by='avg_ranking', ascending=True).head(TOP_N)
\n", - " | round_trip_route_IATA | \n", - "round_trip_profit | \n", - "round_trip_profit_ranking | \n", - "round_trip_flights_count | \n", - "round_trip_flights_count_ranking | \n", - "monthly_avg_profit | \n", - "monthly_avg_profit_ranking | \n", - "weekly_avg_profit | \n", - "weekly_avg_profit_ranking | \n", - "round_trip_op_count | \n", - "dep_delay_rate | \n", - "dep_delay_rate_ranking | \n", - "arr_delay_rate | \n", - "arr_delay_rate_ranking | \n", - "avg_ranking | \n", - "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1978 | \n", - "(JFK, LAX) | \n", - "1.065010e+08 | \n", - "1.0 | \n", - "3140 | \n", - "4.0 | \n", - "3.550033e+07 | \n", - "1.0 | \n", - "8.192385e+06 | \n", - "1.0 | \n", - "4 | \n", - "0.153025 | \n", - "924.0 | \n", - "0.172134 | \n", - "1153.0 | \n", - "0.625 | \n", - "
2004 | \n", - "(JFK, SFO) | \n", - "4.670737e+07 | \n", - "8.0 | \n", - "1842 | \n", - "21.0 | \n", - "1.556912e+07 | \n", - "8.0 | \n", - "3.592874e+06 | \n", - "8.0 | \n", - "4 | \n", - "0.226384 | \n", - "2131.0 | \n", - "0.256515 | \n", - "2277.0 | \n", - "4.450 | \n", - "
1109 | \n", - "(DCA, ORD) | \n", - "4.445894e+07 | \n", - "9.0 | \n", - "1764 | \n", - "24.0 | \n", - "1.481965e+07 | \n", - "9.0 | \n", - "3.419918e+06 | \n", - "9.0 | \n", - "6 | \n", - "0.189059 | \n", - "1576.0 | \n", - "0.213435 | \n", - "1801.0 | \n", - "5.025 | \n", - "
128 | \n", - "(ATL, CLT) | \n", - "4.716123e+07 | \n", - "7.0 | \n", - "1534 | \n", - "43.0 | \n", - "1.572041e+07 | \n", - "7.0 | \n", - "3.627787e+06 | \n", - "7.0 | \n", - "3 | \n", - "0.124185 | \n", - "466.0 | \n", - "0.140482 | \n", - "584.0 | \n", - "5.125 | \n", - "
1555 | \n", - "(EWR, SFO) | \n", - "5.828642e+07 | \n", - "3.0 | \n", - "1199 | \n", - "94.0 | \n", - "1.942881e+07 | \n", - "3.0 | \n", - "4.483570e+06 | \n", - "3.0 | \n", - "2 | \n", - "0.277731 | \n", - "2544.0 | \n", - "0.297331 | \n", - "2536.0 | \n", - "5.975 | \n", - "
1093 | \n", - "(DCA, LGA) | \n", - "4.069849e+07 | \n", - "12.0 | \n", - "1674 | \n", - "35.0 | \n", - "1.356616e+07 | \n", - "12.0 | \n", - "3.130653e+06 | \n", - "12.0 | \n", - "2 | \n", - "0.222222 | \n", - "2081.0 | \n", - "0.296595 | \n", - "2533.0 | \n", - "6.850 | \n", - "
1283 | \n", - "(DFW, IAH) | \n", - "3.786681e+07 | \n", - "15.0 | \n", - "1432 | \n", - "57.0 | \n", - "1.262227e+07 | \n", - "15.0 | \n", - "2.912832e+06 | \n", - "15.0 | \n", - "7 | \n", - "0.182961 | \n", - "1471.0 | \n", - "0.246159 | \n", - "2178.0 | \n", - "9.225 | \n", - "
2392 | \n", - "(MSP, ORD) | \n", - "3.276681e+07 | \n", - "25.0 | \n", - "1705 | \n", - "31.0 | \n", - "1.092227e+07 | \n", - "25.0 | \n", - "2.520524e+06 | \n", - "25.0 | \n", - "9 | \n", - "0.226979 | \n", - "2136.0 | \n", - "0.248094 | \n", - "2199.0 | \n", - "12.175 | \n", - "
1382 | \n", - "(DSM, ORD) | \n", - "3.788485e+07 | \n", - "14.0 | \n", - "947 | \n", - "161.0 | \n", - "1.262828e+07 | \n", - "14.0 | \n", - "2.914219e+06 | \n", - "14.0 | \n", - "8 | \n", - "0.240232 | \n", - "2280.0 | \n", - "0.293559 | \n", - "2519.0 | \n", - "14.000 | \n", - "
831 | \n", - "(CLT, GSP) | \n", - "5.746858e+07 | \n", - "4.0 | \n", - "772 | \n", - "253.0 | \n", - "1.915619e+07 | \n", - "4.0 | \n", - "4.420660e+06 | \n", - "4.0 | \n", - "1 | \n", - "0.139896 | \n", - "703.0 | \n", - "0.123705 | \n", - "363.0 | \n", - "14.350 | \n", - "