diff --git a/data_eda.ipynb b/data_eda.ipynb
deleted file mode 100644
index 3622756..0000000
--- a/data_eda.ipynb
+++ /dev/null
@@ -1,1034 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 279,
- "metadata": {},
- "outputs": [],
- "source": [
- "import pandas as pd\n",
- "import numpy as np\n",
- "import re"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 280,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/opt/anaconda3/envs/tongConsultinInc/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3508: DtypeWarning: Columns (3,13,14) have mixed types.Specify dtype option on import or set low_memory=False.\n",
- " exec(code_obj, self.user_global_ns, self.user_ns)\n"
- ]
- }
- ],
- "source": [
- "# Load the data\n",
- "routes = pd.read_csv('data/Flights.csv')\n",
- "ticket_price = pd.read_csv('data/Tickets.csv')\n",
- "airportsInfo = pd.read_csv('data/Airport_Codes.csv')"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Flights\n",
- "- FL_DATE: string to stanarded datetime object\n",
- "- ORIGIN_CITY_NAME: split into city and state\n",
- "- DEST_CITY_NAME: split into city and state\n",
- "- AIR_TIME: \n",
- " - Two; NAN; negative number; number in str\n",
- "- DISTANCE: to float"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 281,
- "metadata": {},
- "outputs": [],
- "source": [
- "routes['FL_DATE'] = pd.to_datetime(routes['FL_DATE'])\n",
- "\n",
- "# \n",
- "routes['ORIGIN_STATE_NAME'] = routes['ORIGIN_CITY_NAME'].str.split(', ').str[1]\n",
- "routes['ORIGIN_CITY_NAME'] = routes['ORIGIN_CITY_NAME'].str.split(', ').str[0]\n",
- "routes['DEST_STATE_NAME'] = routes['DEST_CITY_NAME'].str.split(', ').str[1]\n",
- "routes['DEST_CITY_NAME'] = routes['DEST_CITY_NAME'].str.split(', ').str[0]\n",
- "\n",
- "# air time column adjustments\n",
- "routes['AIR_TIME'] = routes['AIR_TIME'].apply(lambda x: 2.0 if x == 'Two' else x)\n",
- "routes['AIR_TIME'] = routes['AIR_TIME'].apply(lambda x: np.nan if x == 'NAN' or x == '$$$' else x)\n",
- "routes['AIR_TIME'] = routes['AIR_TIME'].apply(lambda x: 121.0 if x == '121.0' else x)\n",
- "routes['AIR_TIME'] = routes['AIR_TIME'].astype(float)\n",
- "\n",
- "# clean and convert distance to float\n",
- "def distance_to_float(val):\n",
- " try:\n",
- " float_val = float(val)\n",
- " if float_val < 0:\n",
- " return -1 * float_val\n",
- " return float_val\n",
- " except:\n",
- " return np.nan\n",
- " \n",
- "routes['DISTANCE'] = routes['DISTANCE'].apply(distance_to_float)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 282,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " FL_DATE | \n",
- " OP_CARRIER | \n",
- " TAIL_NUM | \n",
- " OP_CARRIER_FL_NUM | \n",
- " ORIGIN_AIRPORT_ID | \n",
- " ORIGIN | \n",
- " ORIGIN_CITY_NAME | \n",
- " DEST_AIRPORT_ID | \n",
- " DESTINATION | \n",
- " DEST_CITY_NAME | \n",
- " DEP_DELAY | \n",
- " ARR_DELAY | \n",
- " CANCELLED | \n",
- " AIR_TIME | \n",
- " DISTANCE | \n",
- " OCCUPANCY_RATE | \n",
- " ORIGIN_STATE_NAME | \n",
- " DEST_STATE_NAME | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 2019-03-02 | \n",
- " WN | \n",
- " N955WN | \n",
- " 4591 | \n",
- " 14635 | \n",
- " RSW | \n",
- " Fort Myers | \n",
- " 11042 | \n",
- " CLE | \n",
- " Cleveland | \n",
- " -8.0 | \n",
- " -6.0 | \n",
- " 0.0 | \n",
- " 143.0 | \n",
- " 1025.0 | \n",
- " 0.970000 | \n",
- " FL | \n",
- " OH | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 2019-03-02 | \n",
- " WN | \n",
- " N8686A | \n",
- " 3231 | \n",
- " 14635 | \n",
- " RSW | \n",
- " Fort Myers | \n",
- " 11066 | \n",
- " CMH | \n",
- " Columbus | \n",
- " 1.0 | \n",
- " 5.0 | \n",
- " 0.0 | \n",
- " 135.0 | \n",
- " 930.0 | \n",
- " 0.550000 | \n",
- " FL | \n",
- " OH | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 2019-03-02 | \n",
- " WN | \n",
- " N201LV | \n",
- " 3383 | \n",
- " 14635 | \n",
- " RSW | \n",
- " Fort Myers | \n",
- " 11066 | \n",
- " CMH | \n",
- " Columbus | \n",
- " 0.0 | \n",
- " 4.0 | \n",
- " 0.0 | \n",
- " 132.0 | \n",
- " 930.0 | \n",
- " 0.910000 | \n",
- " FL | \n",
- " OH | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 2019-03-02 | \n",
- " WN | \n",
- " N413WN | \n",
- " 5498 | \n",
- " 14635 | \n",
- " RSW | \n",
- " Fort Myers | \n",
- " 11066 | \n",
- " CMH | \n",
- " Columbus | \n",
- " 11.0 | \n",
- " 14.0 | \n",
- " 0.0 | \n",
- " 136.0 | \n",
- " 930.0 | \n",
- " 0.670000 | \n",
- " FL | \n",
- " OH | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 2019-03-02 | \n",
- " WN | \n",
- " N7832A | \n",
- " 6933 | \n",
- " 14635 | \n",
- " RSW | \n",
- " Fort Myers | \n",
- " 11259 | \n",
- " DAL | \n",
- " Dallas | \n",
- " 0.0 | \n",
- " -17.0 | \n",
- " 0.0 | \n",
- " 151.0 | \n",
- " 1005.0 | \n",
- " 0.620000 | \n",
- " FL | \n",
- " TX | \n",
- "
\n",
- " \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " 1915881 | \n",
- " 2019-03-23 | \n",
- " AA | \n",
- " N903NN | \n",
- " 1433 | \n",
- " 15370 | \n",
- " TUL | \n",
- " Tulsa | \n",
- " 11057 | \n",
- " CLT | \n",
- " Charlotte | \n",
- " -9.0 | \n",
- " -6.0 | \n",
- " 0.0 | \n",
- " 112.0 | \n",
- " NaN | \n",
- " 0.794884 | \n",
- " OK | \n",
- " NC | \n",
- "
\n",
- " \n",
- " 1915882 | \n",
- " 2019-03-24 | \n",
- " AA | \n",
- " N965AN | \n",
- " 1433 | \n",
- " 15370 | \n",
- " TUL | \n",
- " Tulsa | \n",
- " 11057 | \n",
- " CLT | \n",
- " Charlotte | \n",
- " -2.0 | \n",
- " -1.0 | \n",
- " 0.0 | \n",
- " 106.0 | \n",
- " NaN | \n",
- " 0.538399 | \n",
- " OK | \n",
- " NC | \n",
- "
\n",
- " \n",
- " 1915883 | \n",
- " 2019-03-25 | \n",
- " AA | \n",
- " N979NN | \n",
- " 1433 | \n",
- " 15370 | \n",
- " TUL | \n",
- " Tulsa | \n",
- " 11057 | \n",
- " CLT | \n",
- " Charlotte | \n",
- " -8.0 | \n",
- " -25.0 | \n",
- " 0.0 | \n",
- " 106.0 | \n",
- " NaN | \n",
- " 0.955579 | \n",
- " OK | \n",
- " NC | \n",
- "
\n",
- " \n",
- " 1915884 | \n",
- " 2019-03-26 | \n",
- " AA | \n",
- " N872NN | \n",
- " 1433 | \n",
- " 15370 | \n",
- " TUL | \n",
- " Tulsa | \n",
- " 11057 | \n",
- " CLT | \n",
- " Charlotte | \n",
- " -9.0 | \n",
- " -6.0 | \n",
- " 0.0 | \n",
- " 112.0 | \n",
- " NaN | \n",
- " 0.595344 | \n",
- " OK | \n",
- " NC | \n",
- "
\n",
- " \n",
- " 1915885 | \n",
- " 2019-03-27 | \n",
- " AA | \n",
- " N945AN | \n",
- " 1433 | \n",
- " 15370 | \n",
- " TUL | \n",
- " Tulsa | \n",
- " 11057 | \n",
- " CLT | \n",
- " Charlotte | \n",
- " -8.0 | \n",
- " 5.0 | \n",
- " 0.0 | \n",
- " 117.0 | \n",
- " NaN | \n",
- " 0.350192 | \n",
- " OK | \n",
- " NC | \n",
- "
\n",
- " \n",
- "
\n",
- "
1915886 rows × 18 columns
\n",
- "
"
- ],
- "text/plain": [
- " FL_DATE OP_CARRIER TAIL_NUM OP_CARRIER_FL_NUM ORIGIN_AIRPORT_ID \\\n",
- "0 2019-03-02 WN N955WN 4591 14635 \n",
- "1 2019-03-02 WN N8686A 3231 14635 \n",
- "2 2019-03-02 WN N201LV 3383 14635 \n",
- "3 2019-03-02 WN N413WN 5498 14635 \n",
- "4 2019-03-02 WN N7832A 6933 14635 \n",
- "... ... ... ... ... ... \n",
- "1915881 2019-03-23 AA N903NN 1433 15370 \n",
- "1915882 2019-03-24 AA N965AN 1433 15370 \n",
- "1915883 2019-03-25 AA N979NN 1433 15370 \n",
- "1915884 2019-03-26 AA N872NN 1433 15370 \n",
- "1915885 2019-03-27 AA N945AN 1433 15370 \n",
- "\n",
- " ORIGIN ORIGIN_CITY_NAME DEST_AIRPORT_ID DESTINATION DEST_CITY_NAME \\\n",
- "0 RSW Fort Myers 11042 CLE Cleveland \n",
- "1 RSW Fort Myers 11066 CMH Columbus \n",
- "2 RSW Fort Myers 11066 CMH Columbus \n",
- "3 RSW Fort Myers 11066 CMH Columbus \n",
- "4 RSW Fort Myers 11259 DAL Dallas \n",
- "... ... ... ... ... ... \n",
- "1915881 TUL Tulsa 11057 CLT Charlotte \n",
- "1915882 TUL Tulsa 11057 CLT Charlotte \n",
- "1915883 TUL Tulsa 11057 CLT Charlotte \n",
- "1915884 TUL Tulsa 11057 CLT Charlotte \n",
- "1915885 TUL Tulsa 11057 CLT Charlotte \n",
- "\n",
- " DEP_DELAY ARR_DELAY CANCELLED AIR_TIME DISTANCE OCCUPANCY_RATE \\\n",
- "0 -8.0 -6.0 0.0 143.0 1025.0 0.970000 \n",
- "1 1.0 5.0 0.0 135.0 930.0 0.550000 \n",
- "2 0.0 4.0 0.0 132.0 930.0 0.910000 \n",
- "3 11.0 14.0 0.0 136.0 930.0 0.670000 \n",
- "4 0.0 -17.0 0.0 151.0 1005.0 0.620000 \n",
- "... ... ... ... ... ... ... \n",
- "1915881 -9.0 -6.0 0.0 112.0 NaN 0.794884 \n",
- "1915882 -2.0 -1.0 0.0 106.0 NaN 0.538399 \n",
- "1915883 -8.0 -25.0 0.0 106.0 NaN 0.955579 \n",
- "1915884 -9.0 -6.0 0.0 112.0 NaN 0.595344 \n",
- "1915885 -8.0 5.0 0.0 117.0 NaN 0.350192 \n",
- "\n",
- " ORIGIN_STATE_NAME DEST_STATE_NAME \n",
- "0 FL OH \n",
- "1 FL OH \n",
- "2 FL OH \n",
- "3 FL OH \n",
- "4 FL TX \n",
- "... ... ... \n",
- "1915881 OK NC \n",
- "1915882 OK NC \n",
- "1915883 OK NC \n",
- "1915884 OK NC \n",
- "1915885 OK NC \n",
- "\n",
- "[1915886 rows x 18 columns]"
- ]
- },
- "execution_count": 282,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "routes"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Tickets\n",
- "- YEAR to int\n",
- "- clean itin_fare"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 283,
- "metadata": {},
- "outputs": [],
- "source": [
- "# year column to int year\n",
- "ticket_price['YEAR'] = ticket_price['YEAR'].astype(int)\n",
- "\n",
- "# clean and convert price to float\n",
- "def find_number(text):\n",
- " if type(text) != str:\n",
- " return np.nan\n",
- " re_result = re.search(r'[\\d\\.]+', text)\n",
- " if re_result is not None:\n",
- " return float(re_result.group(0))\n",
- " return np.nan\n",
- "\n",
- "ticket_price['ITIN_FARE'] = ticket_price['ITIN_FARE'].apply(find_number)\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 284,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " ITIN_ID | \n",
- " YEAR | \n",
- " QUARTER | \n",
- " ORIGIN | \n",
- " ORIGIN_COUNTRY | \n",
- " ORIGIN_STATE_ABR | \n",
- " ORIGIN_STATE_NM | \n",
- " ROUNDTRIP | \n",
- " REPORTING_CARRIER | \n",
- " PASSENGERS | \n",
- " ITIN_FARE | \n",
- " DESTINATION | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 201912723049 | \n",
- " 2019 | \n",
- " 1 | \n",
- " ABI | \n",
- " US | \n",
- " TX | \n",
- " Texas | \n",
- " 1.0 | \n",
- " MQ | \n",
- " 1.0 | \n",
- " 736.0 | \n",
- " DAB | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 201912723085 | \n",
- " 2019 | \n",
- " 1 | \n",
- " ABI | \n",
- " US | \n",
- " TX | \n",
- " Texas | \n",
- " 1.0 | \n",
- " MQ | \n",
- " 1.0 | \n",
- " 570.0 | \n",
- " COS | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 201912723491 | \n",
- " 2019 | \n",
- " 1 | \n",
- " ABI | \n",
- " US | \n",
- " TX | \n",
- " Texas | \n",
- " 1.0 | \n",
- " MQ | \n",
- " 1.0 | \n",
- " 564.0 | \n",
- " MCO | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 201912723428 | \n",
- " 2019 | \n",
- " 1 | \n",
- " ABI | \n",
- " US | \n",
- " TX | \n",
- " Texas | \n",
- " 1.0 | \n",
- " MQ | \n",
- " 1.0 | \n",
- " 345.0 | \n",
- " LGA | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 201912723509 | \n",
- " 2019 | \n",
- " 1 | \n",
- " ABI | \n",
- " US | \n",
- " TX | \n",
- " Texas | \n",
- " 0.0 | \n",
- " MQ | \n",
- " 1.0 | \n",
- " 309.0 | \n",
- " MGM | \n",
- "
\n",
- " \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " 1167280 | \n",
- " 201911284909 | \n",
- " 2019 | \n",
- " 1 | \n",
- " YAK | \n",
- " US | \n",
- " AK | \n",
- " Alaska | \n",
- " 0.0 | \n",
- " AS | \n",
- " 1.0 | \n",
- " 244.0 | \n",
- " ANC | \n",
- "
\n",
- " \n",
- " 1167281 | \n",
- " 201911284959 | \n",
- " 2019 | \n",
- " 1 | \n",
- " YAK | \n",
- " US | \n",
- " AK | \n",
- " Alaska | \n",
- " 1.0 | \n",
- " AS | \n",
- " 1.0 | \n",
- " 371.0 | \n",
- " JNU | \n",
- "
\n",
- " \n",
- " 1167282 | \n",
- " 201911284940 | \n",
- " 2019 | \n",
- " 1 | \n",
- " YAK | \n",
- " US | \n",
- " AK | \n",
- " Alaska | \n",
- " 0.0 | \n",
- " AS | \n",
- " 1.0 | \n",
- " 271.0 | \n",
- " JNU | \n",
- "
\n",
- " \n",
- " 1167283 | \n",
- " 201911284914 | \n",
- " 2019 | \n",
- " 1 | \n",
- " YAK | \n",
- " US | \n",
- " AK | \n",
- " Alaska | \n",
- " 0.0 | \n",
- " AS | \n",
- " 1.0 | \n",
- " 603.0 | \n",
- " ANC | \n",
- "
\n",
- " \n",
- " 1167284 | \n",
- " 201911284952 | \n",
- " 2019 | \n",
- " 1 | \n",
- " YAK | \n",
- " US | \n",
- " AK | \n",
- " Alaska | \n",
- " 1.0 | \n",
- " AS | \n",
- " 1.0 | \n",
- " 299.0 | \n",
- " JNU | \n",
- "
\n",
- " \n",
- "
\n",
- "
1167285 rows × 12 columns
\n",
- "
"
- ],
- "text/plain": [
- " ITIN_ID YEAR QUARTER ORIGIN ORIGIN_COUNTRY ORIGIN_STATE_ABR \\\n",
- "0 201912723049 2019 1 ABI US TX \n",
- "1 201912723085 2019 1 ABI US TX \n",
- "2 201912723491 2019 1 ABI US TX \n",
- "3 201912723428 2019 1 ABI US TX \n",
- "4 201912723509 2019 1 ABI US TX \n",
- "... ... ... ... ... ... ... \n",
- "1167280 201911284909 2019 1 YAK US AK \n",
- "1167281 201911284959 2019 1 YAK US AK \n",
- "1167282 201911284940 2019 1 YAK US AK \n",
- "1167283 201911284914 2019 1 YAK US AK \n",
- "1167284 201911284952 2019 1 YAK US AK \n",
- "\n",
- " ORIGIN_STATE_NM ROUNDTRIP REPORTING_CARRIER PASSENGERS ITIN_FARE \\\n",
- "0 Texas 1.0 MQ 1.0 736.0 \n",
- "1 Texas 1.0 MQ 1.0 570.0 \n",
- "2 Texas 1.0 MQ 1.0 564.0 \n",
- "3 Texas 1.0 MQ 1.0 345.0 \n",
- "4 Texas 0.0 MQ 1.0 309.0 \n",
- "... ... ... ... ... ... \n",
- "1167280 Alaska 0.0 AS 1.0 244.0 \n",
- "1167281 Alaska 1.0 AS 1.0 371.0 \n",
- "1167282 Alaska 0.0 AS 1.0 271.0 \n",
- "1167283 Alaska 0.0 AS 1.0 603.0 \n",
- "1167284 Alaska 1.0 AS 1.0 299.0 \n",
- "\n",
- " DESTINATION \n",
- "0 DAB \n",
- "1 COS \n",
- "2 MCO \n",
- "3 LGA \n",
- "4 MGM \n",
- "... ... \n",
- "1167280 ANC \n",
- "1167281 JNU \n",
- "1167282 JNU \n",
- "1167283 ANC \n",
- "1167284 JNU \n",
- "\n",
- "[1167285 rows x 12 columns]"
- ]
- },
- "execution_count": 284,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "ticket_price"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## AirportsInfo\n",
- "- COORDINATES into atomic data set\n",
- " - first one is longitude\n",
- " - second one is latitude"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 285,
- "metadata": {},
- "outputs": [],
- "source": [
- "# clean coordinates\n",
- "\n",
- "airportsInfo['COORDINATES_LONGITUDE'] = airportsInfo['COORDINATES'].apply(lambda x: x.split(', ')[0]).astype(float)\n",
- "airportsInfo['COORDINATES_LATITUDE'] = airportsInfo['COORDINATES'].apply(lambda x: x.split(', ')[1]).astype(float)\n",
- "airportsInfo.drop(columns=['COORDINATES'], inplace=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 286,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "RangeIndex: 55369 entries, 0 to 55368\n",
- "Data columns (total 9 columns):\n",
- " # Column Non-Null Count Dtype \n",
- "--- ------ -------------- ----- \n",
- " 0 TYPE 55369 non-null object \n",
- " 1 NAME 55369 non-null object \n",
- " 2 ELEVATION_FT 48354 non-null float64\n",
- " 3 CONTINENT 27526 non-null object \n",
- " 4 ISO_COUNTRY 55122 non-null object \n",
- " 5 MUNICIPALITY 49663 non-null object \n",
- " 6 IATA_CODE 9182 non-null object \n",
- " 7 COORDINATES_LONGITUDE 55369 non-null float64\n",
- " 8 COORDINATES_LATITUDE 55369 non-null float64\n",
- "dtypes: float64(3), object(6)\n",
- "memory usage: 3.8+ MB\n"
- ]
- }
- ],
- "source": [
- "airportsInfo.info()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 287,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " TYPE | \n",
- " NAME | \n",
- " ELEVATION_FT | \n",
- " CONTINENT | \n",
- " ISO_COUNTRY | \n",
- " MUNICIPALITY | \n",
- " IATA_CODE | \n",
- " COORDINATES_LONGITUDE | \n",
- " COORDINATES_LATITUDE | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " heliport | \n",
- " Total Rf Heliport | \n",
- " 11.0 | \n",
- " NaN | \n",
- " US | \n",
- " Bensalem | \n",
- " NaN | \n",
- " -74.933601 | \n",
- " 40.070801 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " small_airport | \n",
- " Aero B Ranch Airport | \n",
- " 3435.0 | \n",
- " NaN | \n",
- " US | \n",
- " Leoti | \n",
- " NaN | \n",
- " -101.473911 | \n",
- " 38.704022 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " small_airport | \n",
- " Lowell Field | \n",
- " 450.0 | \n",
- " NaN | \n",
- " US | \n",
- " Anchor Point | \n",
- " NaN | \n",
- " -151.695999 | \n",
- " 59.949200 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " small_airport | \n",
- " Epps Airpark | \n",
- " 820.0 | \n",
- " NaN | \n",
- " US | \n",
- " Harvest | \n",
- " NaN | \n",
- " -86.770302 | \n",
- " 34.864799 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " closed | \n",
- " Newport Hospital & Clinic Heliport | \n",
- " 237.0 | \n",
- " NaN | \n",
- " US | \n",
- " Newport | \n",
- " NaN | \n",
- " -91.254898 | \n",
- " 35.608700 | \n",
- "
\n",
- " \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " 55364 | \n",
- " medium_airport | \n",
- " Yingkou Lanqi Airport | \n",
- " 0.0 | \n",
- " AS | \n",
- " CN | \n",
- " Yingkou | \n",
- " YKH | \n",
- " 122.358600 | \n",
- " 40.542524 | \n",
- "
\n",
- " \n",
- " 55365 | \n",
- " medium_airport | \n",
- " Shenyang Dongta Airport | \n",
- " NaN | \n",
- " AS | \n",
- " CN | \n",
- " Shenyang | \n",
- " NaN | \n",
- " 123.496002 | \n",
- " 41.784401 | \n",
- "
\n",
- " \n",
- " 55366 | \n",
- " heliport | \n",
- " Sealand Helipad | \n",
- " 40.0 | \n",
- " EU | \n",
- " GB | \n",
- " Sealand | \n",
- " NaN | \n",
- " 1.482500 | \n",
- " 51.894444 | \n",
- "
\n",
- " \n",
- " 55367 | \n",
- " small_airport | \n",
- " Glorioso Islands Airstrip | \n",
- " 11.0 | \n",
- " AF | \n",
- " TF | \n",
- " Grande Glorieuse | \n",
- " NaN | \n",
- " 47.296389 | \n",
- " -11.584278 | \n",
- "
\n",
- " \n",
- " 55368 | \n",
- " small_airport | \n",
- " Satsuma IÅjima Airport | \n",
- " 338.0 | \n",
- " AS | \n",
- " JP | \n",
- " Mishima-Mura | \n",
- " NaN | \n",
- " 130.270556 | \n",
- " 30.784722 | \n",
- "
\n",
- " \n",
- "
\n",
- "
55369 rows × 9 columns
\n",
- "
"
- ],
- "text/plain": [
- " TYPE NAME ELEVATION_FT \\\n",
- "0 heliport Total Rf Heliport 11.0 \n",
- "1 small_airport Aero B Ranch Airport 3435.0 \n",
- "2 small_airport Lowell Field 450.0 \n",
- "3 small_airport Epps Airpark 820.0 \n",
- "4 closed Newport Hospital & Clinic Heliport 237.0 \n",
- "... ... ... ... \n",
- "55364 medium_airport Yingkou Lanqi Airport 0.0 \n",
- "55365 medium_airport Shenyang Dongta Airport NaN \n",
- "55366 heliport Sealand Helipad 40.0 \n",
- "55367 small_airport Glorioso Islands Airstrip 11.0 \n",
- "55368 small_airport Satsuma IÅjima Airport 338.0 \n",
- "\n",
- " CONTINENT ISO_COUNTRY MUNICIPALITY IATA_CODE \\\n",
- "0 NaN US Bensalem NaN \n",
- "1 NaN US Leoti NaN \n",
- "2 NaN US Anchor Point NaN \n",
- "3 NaN US Harvest NaN \n",
- "4 NaN US Newport NaN \n",
- "... ... ... ... ... \n",
- "55364 AS CN Yingkou YKH \n",
- "55365 AS CN Shenyang NaN \n",
- "55366 EU GB Sealand NaN \n",
- "55367 AF TF Grande Glorieuse NaN \n",
- "55368 AS JP Mishima-Mura NaN \n",
- "\n",
- " COORDINATES_LONGITUDE COORDINATES_LATITUDE \n",
- "0 -74.933601 40.070801 \n",
- "1 -101.473911 38.704022 \n",
- "2 -151.695999 59.949200 \n",
- "3 -86.770302 34.864799 \n",
- "4 -91.254898 35.608700 \n",
- "... ... ... \n",
- "55364 122.358600 40.542524 \n",
- "55365 123.496002 41.784401 \n",
- "55366 1.482500 51.894444 \n",
- "55367 47.296389 -11.584278 \n",
- "55368 130.270556 30.784722 \n",
- "\n",
- "[55369 rows x 9 columns]"
- ]
- },
- "execution_count": 287,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "airportsInfo"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "tongConsultinInc",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.8.19"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/impact_evaluation/eda_average_ticketPrice.ipynb b/impact_evaluation/eda_average_ticketPrice.ipynb
new file mode 100644
index 0000000..409150b
--- /dev/null
+++ b/impact_evaluation/eda_average_ticketPrice.ipynb
@@ -0,0 +1,276 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## eval_impact_of_tickets_associations"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 200,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import matplotlib.pyplot as plt"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 201,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Load data\n",
+ "\n",
+ "tickets = pd.read_csv('../data/cleaned_data/Tickets.csv').dropna()\n",
+ "ignore_column_tickets = ['ITIN_ID', 'YEAR', 'QUARTER', 'ORIGIN_COUNTRY', 'ORIGIN_STATE_ABR', 'ORIGIN_STATE_NM', 'ROUNDTRIP']\n",
+ "# tickets = tickets.drop(columns=ignore_column_tickets)\n",
+ "# # remove tickets that visit airports outside of the US\n",
+ "# tickets = tickets[(tickets['DEST_AIRPORT_IATA'].isin(airports['IATA_CODE'])) & (tickets['ORIGIN_AIRPORT_IATA'].isin(airports['IATA_CODE']))]\n",
+ "# tickets = tickets.assign(sorted_route = tickets.apply(lambda x : tuple(sorted([x['ORIGIN_AIRPORT_IATA'], x['DEST_AIRPORT_IATA']])), axis=1))\n",
+ "# tickets = tickets.drop(['ORIGIN_AIRPORT_IATA', 'DEST_AIRPORT_IATA'], axis=1)\n",
+ "\n",
+ "tickets = tickets.assign(\n",
+ " sorted_route=tickets.apply(\n",
+ " lambda x: tuple(\n",
+ " sorted([x[\"ORIGIN_AIRPORT_IATA_CODE\"], x[\"DEST_AIRPORT_IATA_CODE\"]])\n",
+ " ),\n",
+ " axis=1,\n",
+ " )\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Checking how many tickets are associated with each round-trip route\n",
+ "\n",
+ "It is critical to check the number of tickets associated with each round-trip route because we calculate the average ticket price from the tickets associated with that route. In particular, for round-trip routes associated with only a single ticket, the analysis would be highly biased because the average would rely on that one ticket alone."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 202,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "roundtrip_route_distribution = tickets.groupby('sorted_route')['ITIN_ID'].count().value_counts(normalize=True)\n",
+ "roundtrip_route_distributions_more_than_10 = roundtrip_route_distribution[roundtrip_route_distribution.index > 9].sum()\n",
+ "roundtrip_route_distribution = roundtrip_route_distribution.reset_index().loc[:8]\n",
+ "roundtrip_route_distribution.loc[9] = {'index' : '10 or more', 'ITIN_ID' : roundtrip_route_distributions_more_than_10}\n",
+ "roundtrip_route_distribution.columns = ['# of tickets on roundtrip route', 'Percetange of roundtrip routes']\n",
+ "\n",
+ "roundtrip_route_distribution['# of tickets on roundtrip route'] = pd.Series(['1 ticket', '2 tickets', '3 tickets', '4 tickets', '5 tickets', '6 tickets', '7 tickets', '8 tickets', '9 tickets', '10 or more'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 240,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ "