From 3dcde65e39eeeba8f43d3b19db3dbd776022e339 Mon Sep 17 00:00:00 2001 From: Guoxuan Xu Date: Mon, 14 Oct 2024 20:36:54 -0700 Subject: [PATCH] data evaluation notebook --- data_eda.ipynb | 1034 ----------------- .../eda_average_ticketPrice.ipynb | 276 +++++ .../eval_impact_of_zero ticket price.ipynb | 129 ++ 3 files changed, 405 insertions(+), 1034 deletions(-) delete mode 100644 data_eda.ipynb create mode 100644 impact_evaluation/eda_average_ticketPrice.ipynb create mode 100644 impact_evaluation/eval_impact_of_zero ticket price.ipynb diff --git a/data_eda.ipynb b/data_eda.ipynb deleted file mode 100644 index 3622756..0000000 --- a/data_eda.ipynb +++ /dev/null @@ -1,1034 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 279, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "import re" - ] - }, - { - "cell_type": "code", - "execution_count": 280, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/opt/anaconda3/envs/tongConsultinInc/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3508: DtypeWarning: Columns (3,13,14) have mixed types.Specify dtype option on import or set low_memory=False.\n", - " exec(code_obj, self.user_global_ns, self.user_ns)\n" - ] - } - ], - "source": [ - "# Load the data\n", - "routes = pd.read_csv('data/Flights.csv')\n", - "ticket_price = pd.read_csv('data/Tickets.csv')\n", - "airportsInfo = pd.read_csv('data/Airport_Codes.csv')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Flights\n", - "- FL_DATE: string to stanarded datetime object\n", - "- ORIGIN_CITY_NAME: split into city and state\n", - "- DEST_CITY_NAME: split into city and state\n", - "- AIR_TIME: \n", - " - Two; NAN; negative number; number in str\n", - "- DISTANCE: to float" - ] - }, - { - "cell_type": "code", - "execution_count": 281, - "metadata": {}, - "outputs": [], - "source": [ - "routes['FL_DATE'] = 
pd.to_datetime(routes['FL_DATE'])\n", - "\n", - "# \n", - "routes['ORIGIN_STATE_NAME'] = routes['ORIGIN_CITY_NAME'].str.split(', ').str[1]\n", - "routes['ORIGIN_CITY_NAME'] = routes['ORIGIN_CITY_NAME'].str.split(', ').str[0]\n", - "routes['DEST_STATE_NAME'] = routes['DEST_CITY_NAME'].str.split(', ').str[1]\n", - "routes['DEST_CITY_NAME'] = routes['DEST_CITY_NAME'].str.split(', ').str[0]\n", - "\n", - "# air time column adjustments\n", - "routes['AIR_TIME'] = routes['AIR_TIME'].apply(lambda x: 2.0 if x == 'Two' else x)\n", - "routes['AIR_TIME'] = routes['AIR_TIME'].apply(lambda x: np.nan if x == 'NAN' or x == '$$$' else x)\n", - "routes['AIR_TIME'] = routes['AIR_TIME'].apply(lambda x: 121.0 if x == '121.0' else x)\n", - "routes['AIR_TIME'] = routes['AIR_TIME'].astype(float)\n", - "\n", - "# clean and convert distance to float\n", - "def distance_to_float(val):\n", - " try:\n", - " float_val = float(val)\n", - " if float_val < 0:\n", - " return -1 * float_val\n", - " return float_val\n", - " except:\n", - " return np.nan\n", - " \n", - "routes['DISTANCE'] = routes['DISTANCE'].apply(distance_to_float)" - ] - }, - { - "cell_type": "code", - "execution_count": 282, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
FL_DATEOP_CARRIERTAIL_NUMOP_CARRIER_FL_NUMORIGIN_AIRPORT_IDORIGINORIGIN_CITY_NAMEDEST_AIRPORT_IDDESTINATIONDEST_CITY_NAMEDEP_DELAYARR_DELAYCANCELLEDAIR_TIMEDISTANCEOCCUPANCY_RATEORIGIN_STATE_NAMEDEST_STATE_NAME
02019-03-02WNN955WN459114635RSWFort Myers11042CLECleveland-8.0-6.00.0143.01025.00.970000FLOH
12019-03-02WNN8686A323114635RSWFort Myers11066CMHColumbus1.05.00.0135.0930.00.550000FLOH
22019-03-02WNN201LV338314635RSWFort Myers11066CMHColumbus0.04.00.0132.0930.00.910000FLOH
32019-03-02WNN413WN549814635RSWFort Myers11066CMHColumbus11.014.00.0136.0930.00.670000FLOH
42019-03-02WNN7832A693314635RSWFort Myers11259DALDallas0.0-17.00.0151.01005.00.620000FLTX
.........................................................
19158812019-03-23AAN903NN143315370TULTulsa11057CLTCharlotte-9.0-6.00.0112.0NaN0.794884OKNC
19158822019-03-24AAN965AN143315370TULTulsa11057CLTCharlotte-2.0-1.00.0106.0NaN0.538399OKNC
19158832019-03-25AAN979NN143315370TULTulsa11057CLTCharlotte-8.0-25.00.0106.0NaN0.955579OKNC
19158842019-03-26AAN872NN143315370TULTulsa11057CLTCharlotte-9.0-6.00.0112.0NaN0.595344OKNC
19158852019-03-27AAN945AN143315370TULTulsa11057CLTCharlotte-8.05.00.0117.0NaN0.350192OKNC
\n", - "

1915886 rows × 18 columns

\n", - "
" - ], - "text/plain": [ - " FL_DATE OP_CARRIER TAIL_NUM OP_CARRIER_FL_NUM ORIGIN_AIRPORT_ID \\\n", - "0 2019-03-02 WN N955WN 4591 14635 \n", - "1 2019-03-02 WN N8686A 3231 14635 \n", - "2 2019-03-02 WN N201LV 3383 14635 \n", - "3 2019-03-02 WN N413WN 5498 14635 \n", - "4 2019-03-02 WN N7832A 6933 14635 \n", - "... ... ... ... ... ... \n", - "1915881 2019-03-23 AA N903NN 1433 15370 \n", - "1915882 2019-03-24 AA N965AN 1433 15370 \n", - "1915883 2019-03-25 AA N979NN 1433 15370 \n", - "1915884 2019-03-26 AA N872NN 1433 15370 \n", - "1915885 2019-03-27 AA N945AN 1433 15370 \n", - "\n", - " ORIGIN ORIGIN_CITY_NAME DEST_AIRPORT_ID DESTINATION DEST_CITY_NAME \\\n", - "0 RSW Fort Myers 11042 CLE Cleveland \n", - "1 RSW Fort Myers 11066 CMH Columbus \n", - "2 RSW Fort Myers 11066 CMH Columbus \n", - "3 RSW Fort Myers 11066 CMH Columbus \n", - "4 RSW Fort Myers 11259 DAL Dallas \n", - "... ... ... ... ... ... \n", - "1915881 TUL Tulsa 11057 CLT Charlotte \n", - "1915882 TUL Tulsa 11057 CLT Charlotte \n", - "1915883 TUL Tulsa 11057 CLT Charlotte \n", - "1915884 TUL Tulsa 11057 CLT Charlotte \n", - "1915885 TUL Tulsa 11057 CLT Charlotte \n", - "\n", - " DEP_DELAY ARR_DELAY CANCELLED AIR_TIME DISTANCE OCCUPANCY_RATE \\\n", - "0 -8.0 -6.0 0.0 143.0 1025.0 0.970000 \n", - "1 1.0 5.0 0.0 135.0 930.0 0.550000 \n", - "2 0.0 4.0 0.0 132.0 930.0 0.910000 \n", - "3 11.0 14.0 0.0 136.0 930.0 0.670000 \n", - "4 0.0 -17.0 0.0 151.0 1005.0 0.620000 \n", - "... ... ... ... ... ... ... \n", - "1915881 -9.0 -6.0 0.0 112.0 NaN 0.794884 \n", - "1915882 -2.0 -1.0 0.0 106.0 NaN 0.538399 \n", - "1915883 -8.0 -25.0 0.0 106.0 NaN 0.955579 \n", - "1915884 -9.0 -6.0 0.0 112.0 NaN 0.595344 \n", - "1915885 -8.0 5.0 0.0 117.0 NaN 0.350192 \n", - "\n", - " ORIGIN_STATE_NAME DEST_STATE_NAME \n", - "0 FL OH \n", - "1 FL OH \n", - "2 FL OH \n", - "3 FL OH \n", - "4 FL TX \n", - "... ... ... 
\n", - "1915881 OK NC \n", - "1915882 OK NC \n", - "1915883 OK NC \n", - "1915884 OK NC \n", - "1915885 OK NC \n", - "\n", - "[1915886 rows x 18 columns]" - ] - }, - "execution_count": 282, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "routes" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Tickets\n", - "- YEAR to int\n", - "- clean itin_fare" - ] - }, - { - "cell_type": "code", - "execution_count": 283, - "metadata": {}, - "outputs": [], - "source": [ - "# year column to int year\n", - "ticket_price['YEAR'] = ticket_price['YEAR'].astype(int)\n", - "\n", - "# clean and convert price to float\n", - "def find_number(text):\n", - " if type(text) != str:\n", - " return np.nan\n", - " re_result = re.search(r'[\\d\\.]+', text)\n", - " if re_result is not None:\n", - " return float(re_result.group(0))\n", - " return np.nan\n", - "\n", - "ticket_price['ITIN_FARE'] = ticket_price['ITIN_FARE'].apply(find_number)\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 284, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ITIN_IDYEARQUARTERORIGINORIGIN_COUNTRYORIGIN_STATE_ABRORIGIN_STATE_NMROUNDTRIPREPORTING_CARRIERPASSENGERSITIN_FAREDESTINATION
020191272304920191ABIUSTXTexas1.0MQ1.0736.0DAB
120191272308520191ABIUSTXTexas1.0MQ1.0570.0COS
220191272349120191ABIUSTXTexas1.0MQ1.0564.0MCO
320191272342820191ABIUSTXTexas1.0MQ1.0345.0LGA
420191272350920191ABIUSTXTexas0.0MQ1.0309.0MGM
.......................................
116728020191128490920191YAKUSAKAlaska0.0AS1.0244.0ANC
116728120191128495920191YAKUSAKAlaska1.0AS1.0371.0JNU
116728220191128494020191YAKUSAKAlaska0.0AS1.0271.0JNU
116728320191128491420191YAKUSAKAlaska0.0AS1.0603.0ANC
116728420191128495220191YAKUSAKAlaska1.0AS1.0299.0JNU
\n", - "

1167285 rows × 12 columns

\n", - "
" - ], - "text/plain": [ - " ITIN_ID YEAR QUARTER ORIGIN ORIGIN_COUNTRY ORIGIN_STATE_ABR \\\n", - "0 201912723049 2019 1 ABI US TX \n", - "1 201912723085 2019 1 ABI US TX \n", - "2 201912723491 2019 1 ABI US TX \n", - "3 201912723428 2019 1 ABI US TX \n", - "4 201912723509 2019 1 ABI US TX \n", - "... ... ... ... ... ... ... \n", - "1167280 201911284909 2019 1 YAK US AK \n", - "1167281 201911284959 2019 1 YAK US AK \n", - "1167282 201911284940 2019 1 YAK US AK \n", - "1167283 201911284914 2019 1 YAK US AK \n", - "1167284 201911284952 2019 1 YAK US AK \n", - "\n", - " ORIGIN_STATE_NM ROUNDTRIP REPORTING_CARRIER PASSENGERS ITIN_FARE \\\n", - "0 Texas 1.0 MQ 1.0 736.0 \n", - "1 Texas 1.0 MQ 1.0 570.0 \n", - "2 Texas 1.0 MQ 1.0 564.0 \n", - "3 Texas 1.0 MQ 1.0 345.0 \n", - "4 Texas 0.0 MQ 1.0 309.0 \n", - "... ... ... ... ... ... \n", - "1167280 Alaska 0.0 AS 1.0 244.0 \n", - "1167281 Alaska 1.0 AS 1.0 371.0 \n", - "1167282 Alaska 0.0 AS 1.0 271.0 \n", - "1167283 Alaska 0.0 AS 1.0 603.0 \n", - "1167284 Alaska 1.0 AS 1.0 299.0 \n", - "\n", - " DESTINATION \n", - "0 DAB \n", - "1 COS \n", - "2 MCO \n", - "3 LGA \n", - "4 MGM \n", - "... ... 
\n", - "1167280 ANC \n", - "1167281 JNU \n", - "1167282 JNU \n", - "1167283 ANC \n", - "1167284 JNU \n", - "\n", - "[1167285 rows x 12 columns]" - ] - }, - "execution_count": 284, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ticket_price" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## AirportsInfo\n", - "- COORDINATES into atomic data set\n", - " - first one is longitude\n", - " - second one is latitude" - ] - }, - { - "cell_type": "code", - "execution_count": 285, - "metadata": {}, - "outputs": [], - "source": [ - "# clean coordinates\n", - "\n", - "airportsInfo['COORDINATES_LONGITUDE'] = airportsInfo['COORDINATES'].apply(lambda x: x.split(', ')[0]).astype(float)\n", - "airportsInfo['COORDINATES_LATITUDE'] = airportsInfo['COORDINATES'].apply(lambda x: x.split(', ')[1]).astype(float)\n", - "airportsInfo.drop(columns=['COORDINATES'], inplace=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 286, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "RangeIndex: 55369 entries, 0 to 55368\n", - "Data columns (total 9 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 TYPE 55369 non-null object \n", - " 1 NAME 55369 non-null object \n", - " 2 ELEVATION_FT 48354 non-null float64\n", - " 3 CONTINENT 27526 non-null object \n", - " 4 ISO_COUNTRY 55122 non-null object \n", - " 5 MUNICIPALITY 49663 non-null object \n", - " 6 IATA_CODE 9182 non-null object \n", - " 7 COORDINATES_LONGITUDE 55369 non-null float64\n", - " 8 COORDINATES_LATITUDE 55369 non-null float64\n", - "dtypes: float64(3), object(6)\n", - "memory usage: 3.8+ MB\n" - ] - } - ], - "source": [ - "airportsInfo.info()" - ] - }, - { - "cell_type": "code", - "execution_count": 287, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
TYPENAMEELEVATION_FTCONTINENTISO_COUNTRYMUNICIPALITYIATA_CODECOORDINATES_LONGITUDECOORDINATES_LATITUDE
0heliportTotal Rf Heliport11.0NaNUSBensalemNaN-74.93360140.070801
1small_airportAero B Ranch Airport3435.0NaNUSLeotiNaN-101.47391138.704022
2small_airportLowell Field450.0NaNUSAnchor PointNaN-151.69599959.949200
3small_airportEpps Airpark820.0NaNUSHarvestNaN-86.77030234.864799
4closedNewport Hospital & Clinic Heliport237.0NaNUSNewportNaN-91.25489835.608700
..............................
55364medium_airportYingkou Lanqi Airport0.0ASCNYingkouYKH122.35860040.542524
55365medium_airportShenyang Dongta AirportNaNASCNShenyangNaN123.49600241.784401
55366heliportSealand Helipad40.0EUGBSealandNaN1.48250051.894444
55367small_airportGlorioso Islands Airstrip11.0AFTFGrande GlorieuseNaN47.296389-11.584278
55368small_airportSatsuma Iōjima Airport338.0ASJPMishima-MuraNaN130.27055630.784722
\n", - "

55369 rows × 9 columns

\n", - "
" - ], - "text/plain": [ - " TYPE NAME ELEVATION_FT \\\n", - "0 heliport Total Rf Heliport 11.0 \n", - "1 small_airport Aero B Ranch Airport 3435.0 \n", - "2 small_airport Lowell Field 450.0 \n", - "3 small_airport Epps Airpark 820.0 \n", - "4 closed Newport Hospital & Clinic Heliport 237.0 \n", - "... ... ... ... \n", - "55364 medium_airport Yingkou Lanqi Airport 0.0 \n", - "55365 medium_airport Shenyang Dongta Airport NaN \n", - "55366 heliport Sealand Helipad 40.0 \n", - "55367 small_airport Glorioso Islands Airstrip 11.0 \n", - "55368 small_airport Satsuma Iōjima Airport 338.0 \n", - "\n", - " CONTINENT ISO_COUNTRY MUNICIPALITY IATA_CODE \\\n", - "0 NaN US Bensalem NaN \n", - "1 NaN US Leoti NaN \n", - "2 NaN US Anchor Point NaN \n", - "3 NaN US Harvest NaN \n", - "4 NaN US Newport NaN \n", - "... ... ... ... ... \n", - "55364 AS CN Yingkou YKH \n", - "55365 AS CN Shenyang NaN \n", - "55366 EU GB Sealand NaN \n", - "55367 AF TF Grande Glorieuse NaN \n", - "55368 AS JP Mishima-Mura NaN \n", - "\n", - " COORDINATES_LONGITUDE COORDINATES_LATITUDE \n", - "0 -74.933601 40.070801 \n", - "1 -101.473911 38.704022 \n", - "2 -151.695999 59.949200 \n", - "3 -86.770302 34.864799 \n", - "4 -91.254898 35.608700 \n", - "... ... ... 
\n", - "55364 122.358600 40.542524 \n", - "55365 123.496002 41.784401 \n", - "55366 1.482500 51.894444 \n", - "55367 47.296389 -11.584278 \n", - "55368 130.270556 30.784722 \n", - "\n", - "[55369 rows x 9 columns]" - ] - }, - "execution_count": 287, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "airportsInfo" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "tongConsultinInc", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.19" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/impact_evaluation/eda_average_ticketPrice.ipynb b/impact_evaluation/eda_average_ticketPrice.ipynb new file mode 100644 index 0000000..409150b --- /dev/null +++ b/impact_evaluation/eda_average_ticketPrice.ipynb @@ -0,0 +1,276 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## eval_impact_of_tickets_associations" + ] + }, + { + "cell_type": "code", + "execution_count": 200, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "code", + "execution_count": 201, + "metadata": {}, + "outputs": [], + "source": [ + "# Load data\n", + "\n", + "tickets = pd.read_csv('../data/cleaned_data/Tickets.csv').dropna()\n", + "ignore_column_tickets = ['ITIN_ID', 'YEAR', 'QUARTER', 'ORIGIN_COUNTRY', 'ORIGIN_STATE_ABR', 'ORIGIN_STATE_NM', 'ROUNDTRIP']\n", + "# tickets = tickets.drop(columns=ignore_column_tickets)\n", + "# # remove tickets that visit airports outside of the US\n", + "# tickets = 
tickets[(tickets['DEST_AIRPORT_IATA'].isin(airports['IATA_CODE'])) & (tickets['ORIGIN_AIRPORT_IATA'].isin(airports['IATA_CODE']))]\n", + "# tickets = tickets.assign(sorted_route = tickets.apply(lambda x : tuple(sorted([x['ORIGIN_AIRPORT_IATA'], x['DEST_AIRPORT_IATA']])), axis=1))\n", + "# tickets = tickets.drop(['ORIGIN_AIRPORT_IATA', 'DEST_AIRPORT_IATA'], axis=1)\n", + "\n", + "tickets = tickets.assign(\n", + " sorted_route=tickets.apply(\n", + " lambda x: tuple(\n", + " sorted([x[\"ORIGIN_AIRPORT_IATA_CODE\"], x[\"DEST_AIRPORT_IATA_CODE\"]])\n", + " ),\n", + " axis=1,\n", + " )\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Checking how many tickets are associated with each round-trip route\n", + "\n", + "It is critical to check the number of tickets associated with each round-trip route because we calculate the average ticket price from the tickets associated with that route. Especially for round-trip routes associated with only a single ticket, the analysis would be highly biased, because the average would rely on that one ticket alone" + ] + }, + { + "cell_type": "code", + "execution_count": 202, + "metadata": {}, + "outputs": [], + "source": [ + "roundtrip_route_distribution = tickets.groupby('sorted_route')['ITIN_ID'].count().value_counts(normalize=True)\n", + "roundtrip_route_distributions_more_than_10 = roundtrip_route_distribution[roundtrip_route_distribution.index > 9].sum()\n", + "roundtrip_route_distribution = roundtrip_route_distribution.reset_index().loc[:8]\n", + "roundtrip_route_distribution.loc[9] = {'index' : '10 or more', 'ITIN_ID' : roundtrip_route_distributions_more_than_10}\n", + "roundtrip_route_distribution.columns = ['# of tickets on roundtrip route', 'Percetange of roundtrip routes']\n", + "\n", + "roundtrip_route_distribution['# of tickets on roundtrip route'] = pd.Series(['1 ticket', '2 tickets', '3 tickets', '4 tickets', '5 tickets', '6 tickets', '7 tickets', '8 tickets', '9 tickets', 
'10 or more'])" + ] + }, + { + "cell_type": "code", + "execution_count": 240, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(10, 6))\n", + "plt.pie(roundtrip_route_distribution['Percetange of roundtrip routes'], labels=roundtrip_route_distribution['# of tickets on roundtrip route'], autopct='%1.1f%%', startangle=140)\n", + "\n", + "plt.title('Distribution of roundtrip routes categorized by the number of assoicated tickets');" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Investigate the round-trip routes associated with 3 or fewer tickets\n", + "- We hypothesize that the unexpected distribution of single-ticket associations might be due to certain factors\n", + "- Potential candidates\n", + " - OP_CARRIER\n", + " - ORIGIN_AIRPORT_IATA_CODE\n", + " - ORIGIN_STATE_ABR\n", + " - DEST_AIRPORT_IATA_CODE\n", + " - ONE_PASSENGERS_FARE" + ] + }, + { + "cell_type": "code", + "execution_count": 252, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "x = tickets.groupby('sorted_route').filter(lambda x: len(x) > 3)['OP_CARRIER'].value_counts(True)\n", + "y = tickets.groupby('sorted_route').filter(lambda x: len(x) <= 3)['OP_CARRIER'].value_counts(True)\n", + "df = pd.DataFrame({'x': x, 'y': y}).sort_values('y', ascending=False)\n", + "ax = df.plot(kind='bar', figsize=(10, 6))\n", + "plt.title('Distribution of Operation carrier for roundtrip tickets')\n", + "plt.xlabel('Operation Carrier')\n", + "plt.ylabel('Percentage of roundtrip tickets')\n", + "plt.legend(['Tickets with 3 or more tickets on a single round-trip route', 'Tickets with 3 or less tickets on a single round-trip route'])\n", + "plt.grid(axis='y')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Conducting Hypothesis Testing " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We hypothesize that low ticket association (round-trip routes associated with 3 or fewer tickets) is related to the operating carrier, and we will conduct a hypothesis test on this question.\n", + "\n", + "Null Hypothesis: There is no association between the operating carrier and low ticket association. 
Thus, any difference between the distributions is due to random chance.\n", + "Alternative Hypothesis: There is an association between the operating carrier and ticket association.\n", + "\n", + "Test statistic: The total variation distance between the distribution of operating carriers of tickets on routes with 3 or fewer associated tickets and the distribution of operating carriers of tickets on routes with more than 3 associated tickets" + ] + }, + { + "cell_type": "code", + "execution_count": 255, + "metadata": {}, + "outputs": [], + "source": [ + "# finding sample statistic\n", + "def tvd(series1, series2):\n", + " ser_diff = series1.combine(series2, lambda x, y: y if pd.isnull(x) else x if pd.isnull(y) else x - y)\n", + " return 0.5 * np.sum(np.abs(ser_diff))" + ] + }, + { + "cell_type": "code", + "execution_count": 288, + "metadata": {}, + "outputs": [], + "source": [ + "observed_tvd = tvd(tickets.groupby('sorted_route').filter(lambda x: len(x) > 3)['OP_CARRIER'].value_counts(True), tickets.groupby('sorted_route').filter(lambda x: len(x) <= 3)['OP_CARRIER'].value_counts(True))" + ] + }, + { + "cell_type": "code", + "execution_count": 311, + "metadata": {}, + "outputs": [], + "source": [ + "sim_times = 1000\n", + "\n", + "sim_tvd = []\n", + "for i in range(sim_times):\n", + " perm_tickets = tickets.assign(OP_CARRIER = np.random.permutation(tickets['OP_CARRIER']))\n", + " route_less_association = perm_tickets.groupby('sorted_route')['ITIN_ID'].count()\n", + " route_less_association = pd.Series(route_less_association[route_less_association <= 3].index)\n", + "\n", + " dis1 = perm_tickets[perm_tickets['sorted_route'].isin(route_less_association)]['OP_CARRIER'].value_counts(True)\n", + " dis2 = perm_tickets[~perm_tickets['sorted_route'].isin(route_less_association)]['OP_CARRIER'].value_counts(True)\n", + "\n", + "\n", + " sim_tvd.append(tvd(dis1, dis2))" + ] + }, + { + "cell_type": "code", + "execution_count": 328, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0.5, 1.0, 
'Distribution of simulated TVD')" + ] + }, + "execution_count": 328, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.hist(sim_tvd, density=True, bins=20)\n", + "plt.axvline(observed_tvd, color='r')\n", + "\n", + "plt.legend(['Observed statistic', 'Simulated statistic'])\n", + "plt.title('Distribution of simulated TVD')\n", + "\n", + "# plt.grid(axis='both')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The empirical p-value is 0, so we reject the null hypothesis" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Result" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There might be an association between the operating carrier and the number of tickets associated with a round-trip route" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "tongConsultinInc", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.19" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/impact_evaluation/eval_impact_of_zero ticket price.ipynb b/impact_evaluation/eval_impact_of_zero ticket price.ipynb new file mode 100644 index 0000000..bec50ce --- /dev/null +++ b/impact_evaluation/eval_impact_of_zero ticket price.ipynb @@ -0,0 +1,129 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Evaluating the impact of removing tickets with a price of 0" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "ori_tickets = pd.read_csv('../data/cleaned_data/Tickets.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [], + "source": [ + "free_tickets_proportion = round(ori_tickets['ITIN_FARE'].isnull().sum() / len(ori_tickets) * 
100, 2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Compare the data before and after removing tickets with a price of 0\n", + "- We will evaluate the distributions of ORIGIN_AIRPORT_IATA_CODE, ORIGIN_STATE_ABR, OP_CARRIER, and DEST_AIRPORT_IATA_CODE" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [], + "source": [ + "def tvd(series1, series2):\n", + " ser_diff = series1.combine(series2, lambda x, y: y if pd.isnull(x) else x if pd.isnull(y) else x - y)\n", + " return 0.5 * np.sum(np.abs(ser_diff))" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [], + "source": [ + "ori_tickets_no_free = ori_tickets[ori_tickets['ITIN_FARE'].notnull()]" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [], + "source": [ + "tvd_across_columns = {}\n", + "for col in ['ORIGIN_AIRPORT_IATA_CODE', 'ORIGIN_STATE_ABR', 'OP_CARRIER', 'DEST_AIRPORT_IATA_CODE']:\n", + " tvd_across_columns[col] = tvd(ori_tickets[col].value_counts(normalize=True), ori_tickets_no_free[col].value_counts(normalize=True))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Conclusion" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.07% of the ticket has price of 0 assoicated with it which is really low\n", + "TVD of orginal and modified ticket data on ORIGIN_AIRPORT_IATA_CODE is 0.00018956667384382902\n", + "TVD of orginal and modified ticket data on ORIGIN_STATE_ABR is 8.62530261284656e-05\n", + "TVD of orginal and modified ticket data on OP_CARRIER is 6.253813875461848e-05\n", + "TVD of orginal and modified ticket data on DEST_AIRPORT_IATA_CODE is 0.00016569140934213444\n" + ] + } + ], + "source": [ + "print(f\"{free_tickets_proportion}% of the ticket has price of 0 assoicated with it which is really low\")\n", + "for k, v in 
tvd_across_columns.items():\n", + " print(f\"TVD of orginal and modified ticket data on {k} is {v}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Each of the above values is less than 0.01, so we can conclude that removing tickets with a price of 0 has an extremely low impact on the data set" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "tongConsultinInc", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.19" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}