diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml new file mode 100644 index 0000000..6d0498e --- /dev/null +++ b/.github/workflows/ci.yaml @@ -0,0 +1,42 @@ +name: CI + +on: + push: + pull_request: + +jobs: + run-tests: + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, windows-latest, macos-latest] + python-version: + - "3.7" + - "3.8" + - "3.9" + - "3.10" + - "3.11" + - "3.12-dev" + - "pypy-3.7" + - "pypy-3.8" + - "pypy-3.9" + + name: Test + runs-on: ${{ matrix.os }} + + steps: + - name: Checkout code + uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + + - name: Run tests + run: pytest \ No newline at end of file diff --git a/data_eda.ipynb b/data_eda.ipynb new file mode 100644 index 0000000..3622756 --- /dev/null +++ b/data_eda.ipynb @@ -0,0 +1,1034 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 279, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import re" + ] + }, + { + "cell_type": "code", + "execution_count": 280, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/anaconda3/envs/tongConsultinInc/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3508: DtypeWarning: Columns (3,13,14) have mixed types.Specify dtype option on import or set low_memory=False.\n", + " exec(code_obj, self.user_global_ns, self.user_ns)\n" + ] + } + ], + "source": [ + "# Load the data\n", + "routes = pd.read_csv('data/Flights.csv')\n", + "ticket_price = pd.read_csv('data/Tickets.csv')\n", + "airportsInfo = pd.read_csv('data/Airport_Codes.csv')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Flights\n", + "- FL_DATE: string to stanarded datetime object\n", + "- ORIGIN_CITY_NAME: split into city and state\n", + "- DEST_CITY_NAME: split into city and state\n", + "- AIR_TIME: \n", + " - Two; NAN; negative number; number in str\n", + "- DISTANCE: to float" + ] + }, + { + "cell_type": "code", + "execution_count": 281, + "metadata": {}, + "outputs": [], + "source": [ + "routes['FL_DATE'] = pd.to_datetime(routes['FL_DATE'])\n", + "\n", + "# \n", + "routes['ORIGIN_STATE_NAME'] = routes['ORIGIN_CITY_NAME'].str.split(', ').str[1]\n", + "routes['ORIGIN_CITY_NAME'] = routes['ORIGIN_CITY_NAME'].str.split(', ').str[0]\n", + "routes['DEST_STATE_NAME'] = routes['DEST_CITY_NAME'].str.split(', ').str[1]\n", + "routes['DEST_CITY_NAME'] = routes['DEST_CITY_NAME'].str.split(', ').str[0]\n", + "\n", + "# air time column adjustments\n", + "routes['AIR_TIME'] = routes['AIR_TIME'].apply(lambda x: 2.0 if x == 'Two' else x)\n", + "routes['AIR_TIME'] = routes['AIR_TIME'].apply(lambda x: np.nan if x == 'NAN' or x == '$$$' else x)\n", + "routes['AIR_TIME'] = routes['AIR_TIME'].apply(lambda x: 121.0 if x == '121.0' else x)\n", + "routes['AIR_TIME'] = routes['AIR_TIME'].astype(float)\n", + "\n", + "# clean and convert distance to float\n", + "def distance_to_float(val):\n", + " try:\n", + " float_val = float(val)\n", + " if float_val < 0:\n", + " return -1 * float_val\n", + " return float_val\n", + " except:\n", + " return np.nan\n", + " \n", + "routes['DISTANCE'] = routes['DISTANCE'].apply(distance_to_float)" + ] + }, + { + "cell_type": "code", + "execution_count": 282, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
FL_DATEOP_CARRIERTAIL_NUMOP_CARRIER_FL_NUMORIGIN_AIRPORT_IDORIGINORIGIN_CITY_NAMEDEST_AIRPORT_IDDESTINATIONDEST_CITY_NAMEDEP_DELAYARR_DELAYCANCELLEDAIR_TIMEDISTANCEOCCUPANCY_RATEORIGIN_STATE_NAMEDEST_STATE_NAME
02019-03-02WNN955WN459114635RSWFort Myers11042CLECleveland-8.0-6.00.0143.01025.00.970000FLOH
12019-03-02WNN8686A323114635RSWFort Myers11066CMHColumbus1.05.00.0135.0930.00.550000FLOH
22019-03-02WNN201LV338314635RSWFort Myers11066CMHColumbus0.04.00.0132.0930.00.910000FLOH
32019-03-02WNN413WN549814635RSWFort Myers11066CMHColumbus11.014.00.0136.0930.00.670000FLOH
42019-03-02WNN7832A693314635RSWFort Myers11259DALDallas0.0-17.00.0151.01005.00.620000FLTX
.........................................................
19158812019-03-23AAN903NN143315370TULTulsa11057CLTCharlotte-9.0-6.00.0112.0NaN0.794884OKNC
19158822019-03-24AAN965AN143315370TULTulsa11057CLTCharlotte-2.0-1.00.0106.0NaN0.538399OKNC
19158832019-03-25AAN979NN143315370TULTulsa11057CLTCharlotte-8.0-25.00.0106.0NaN0.955579OKNC
19158842019-03-26AAN872NN143315370TULTulsa11057CLTCharlotte-9.0-6.00.0112.0NaN0.595344OKNC
19158852019-03-27AAN945AN143315370TULTulsa11057CLTCharlotte-8.05.00.0117.0NaN0.350192OKNC
\n", + "

1915886 rows × 18 columns

\n", + "
" + ], + "text/plain": [ + " FL_DATE OP_CARRIER TAIL_NUM OP_CARRIER_FL_NUM ORIGIN_AIRPORT_ID \\\n", + "0 2019-03-02 WN N955WN 4591 14635 \n", + "1 2019-03-02 WN N8686A 3231 14635 \n", + "2 2019-03-02 WN N201LV 3383 14635 \n", + "3 2019-03-02 WN N413WN 5498 14635 \n", + "4 2019-03-02 WN N7832A 6933 14635 \n", + "... ... ... ... ... ... \n", + "1915881 2019-03-23 AA N903NN 1433 15370 \n", + "1915882 2019-03-24 AA N965AN 1433 15370 \n", + "1915883 2019-03-25 AA N979NN 1433 15370 \n", + "1915884 2019-03-26 AA N872NN 1433 15370 \n", + "1915885 2019-03-27 AA N945AN 1433 15370 \n", + "\n", + " ORIGIN ORIGIN_CITY_NAME DEST_AIRPORT_ID DESTINATION DEST_CITY_NAME \\\n", + "0 RSW Fort Myers 11042 CLE Cleveland \n", + "1 RSW Fort Myers 11066 CMH Columbus \n", + "2 RSW Fort Myers 11066 CMH Columbus \n", + "3 RSW Fort Myers 11066 CMH Columbus \n", + "4 RSW Fort Myers 11259 DAL Dallas \n", + "... ... ... ... ... ... \n", + "1915881 TUL Tulsa 11057 CLT Charlotte \n", + "1915882 TUL Tulsa 11057 CLT Charlotte \n", + "1915883 TUL Tulsa 11057 CLT Charlotte \n", + "1915884 TUL Tulsa 11057 CLT Charlotte \n", + "1915885 TUL Tulsa 11057 CLT Charlotte \n", + "\n", + " DEP_DELAY ARR_DELAY CANCELLED AIR_TIME DISTANCE OCCUPANCY_RATE \\\n", + "0 -8.0 -6.0 0.0 143.0 1025.0 0.970000 \n", + "1 1.0 5.0 0.0 135.0 930.0 0.550000 \n", + "2 0.0 4.0 0.0 132.0 930.0 0.910000 \n", + "3 11.0 14.0 0.0 136.0 930.0 0.670000 \n", + "4 0.0 -17.0 0.0 151.0 1005.0 0.620000 \n", + "... ... ... ... ... ... ... \n", + "1915881 -9.0 -6.0 0.0 112.0 NaN 0.794884 \n", + "1915882 -2.0 -1.0 0.0 106.0 NaN 0.538399 \n", + "1915883 -8.0 -25.0 0.0 106.0 NaN 0.955579 \n", + "1915884 -9.0 -6.0 0.0 112.0 NaN 0.595344 \n", + "1915885 -8.0 5.0 0.0 117.0 NaN 0.350192 \n", + "\n", + " ORIGIN_STATE_NAME DEST_STATE_NAME \n", + "0 FL OH \n", + "1 FL OH \n", + "2 FL OH \n", + "3 FL OH \n", + "4 FL TX \n", + "... ... ... \n", + "1915881 OK NC \n", + "1915882 OK NC \n", + "1915883 OK NC \n", + "1915884 OK NC \n", + "1915885 OK NC \n", + "\n", + "[1915886 rows x 18 columns]" + ] + }, + "execution_count": 282, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "routes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Tickets\n", + "- YEAR to int\n", + "- clean itin_fare" + ] + }, + { + "cell_type": "code", + "execution_count": 283, + "metadata": {}, + "outputs": [], + "source": [ + "# year column to int year\n", + "ticket_price['YEAR'] = ticket_price['YEAR'].astype(int)\n", + "\n", + "# clean and convert price to float\n", + "def find_number(text):\n", + " if type(text) != str:\n", + " return np.nan\n", + " re_result = re.search(r'[\\d\\.]+', text)\n", + " if re_result is not None:\n", + " return float(re_result.group(0))\n", + " return np.nan\n", + "\n", + "ticket_price['ITIN_FARE'] = ticket_price['ITIN_FARE'].apply(find_number)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 284, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ITIN_IDYEARQUARTERORIGINORIGIN_COUNTRYORIGIN_STATE_ABRORIGIN_STATE_NMROUNDTRIPREPORTING_CARRIERPASSENGERSITIN_FAREDESTINATION
020191272304920191ABIUSTXTexas1.0MQ1.0736.0DAB
120191272308520191ABIUSTXTexas1.0MQ1.0570.0COS
220191272349120191ABIUSTXTexas1.0MQ1.0564.0MCO
320191272342820191ABIUSTXTexas1.0MQ1.0345.0LGA
420191272350920191ABIUSTXTexas0.0MQ1.0309.0MGM
.......................................
116728020191128490920191YAKUSAKAlaska0.0AS1.0244.0ANC
116728120191128495920191YAKUSAKAlaska1.0AS1.0371.0JNU
116728220191128494020191YAKUSAKAlaska0.0AS1.0271.0JNU
116728320191128491420191YAKUSAKAlaska0.0AS1.0603.0ANC
116728420191128495220191YAKUSAKAlaska1.0AS1.0299.0JNU
\n", + "

1167285 rows × 12 columns

\n", + "
" + ], + "text/plain": [ + " ITIN_ID YEAR QUARTER ORIGIN ORIGIN_COUNTRY ORIGIN_STATE_ABR \\\n", + "0 201912723049 2019 1 ABI US TX \n", + "1 201912723085 2019 1 ABI US TX \n", + "2 201912723491 2019 1 ABI US TX \n", + "3 201912723428 2019 1 ABI US TX \n", + "4 201912723509 2019 1 ABI US TX \n", + "... ... ... ... ... ... ... \n", + "1167280 201911284909 2019 1 YAK US AK \n", + "1167281 201911284959 2019 1 YAK US AK \n", + "1167282 201911284940 2019 1 YAK US AK \n", + "1167283 201911284914 2019 1 YAK US AK \n", + "1167284 201911284952 2019 1 YAK US AK \n", + "\n", + " ORIGIN_STATE_NM ROUNDTRIP REPORTING_CARRIER PASSENGERS ITIN_FARE \\\n", + "0 Texas 1.0 MQ 1.0 736.0 \n", + "1 Texas 1.0 MQ 1.0 570.0 \n", + "2 Texas 1.0 MQ 1.0 564.0 \n", + "3 Texas 1.0 MQ 1.0 345.0 \n", + "4 Texas 0.0 MQ 1.0 309.0 \n", + "... ... ... ... ... ... \n", + "1167280 Alaska 0.0 AS 1.0 244.0 \n", + "1167281 Alaska 1.0 AS 1.0 371.0 \n", + "1167282 Alaska 0.0 AS 1.0 271.0 \n", + "1167283 Alaska 0.0 AS 1.0 603.0 \n", + "1167284 Alaska 1.0 AS 1.0 299.0 \n", + "\n", + " DESTINATION \n", + "0 DAB \n", + "1 COS \n", + "2 MCO \n", + "3 LGA \n", + "4 MGM \n", + "... ... \n", + "1167280 ANC \n", + "1167281 JNU \n", + "1167282 JNU \n", + "1167283 ANC \n", + "1167284 JNU \n", + "\n", + "[1167285 rows x 12 columns]" + ] + }, + "execution_count": 284, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ticket_price" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## AirportsInfo\n", + "- COORDINATES into atomic data set\n", + " - first one is longitude\n", + " - second one is latitude" + ] + }, + { + "cell_type": "code", + "execution_count": 285, + "metadata": {}, + "outputs": [], + "source": [ + "# clean coordinates\n", + "\n", + "airportsInfo['COORDINATES_LONGITUDE'] = airportsInfo['COORDINATES'].apply(lambda x: x.split(', ')[0]).astype(float)\n", + "airportsInfo['COORDINATES_LATITUDE'] = airportsInfo['COORDINATES'].apply(lambda x: x.split(', ')[1]).astype(float)\n", + "airportsInfo.drop(columns=['COORDINATES'], inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 286, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 55369 entries, 0 to 55368\n", + "Data columns (total 9 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 TYPE 55369 non-null object \n", + " 1 NAME 55369 non-null object \n", + " 2 ELEVATION_FT 48354 non-null float64\n", + " 3 CONTINENT 27526 non-null object \n", + " 4 ISO_COUNTRY 55122 non-null object \n", + " 5 MUNICIPALITY 49663 non-null object \n", + " 6 IATA_CODE 9182 non-null object \n", + " 7 COORDINATES_LONGITUDE 55369 non-null float64\n", + " 8 COORDINATES_LATITUDE 55369 non-null float64\n", + "dtypes: float64(3), object(6)\n", + "memory usage: 3.8+ MB\n" + ] + } + ], + "source": [ + "airportsInfo.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 287, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TYPENAMEELEVATION_FTCONTINENTISO_COUNTRYMUNICIPALITYIATA_CODECOORDINATES_LONGITUDECOORDINATES_LATITUDE
0heliportTotal Rf Heliport11.0NaNUSBensalemNaN-74.93360140.070801
1small_airportAero B Ranch Airport3435.0NaNUSLeotiNaN-101.47391138.704022
2small_airportLowell Field450.0NaNUSAnchor PointNaN-151.69599959.949200
3small_airportEpps Airpark820.0NaNUSHarvestNaN-86.77030234.864799
4closedNewport Hospital & Clinic Heliport237.0NaNUSNewportNaN-91.25489835.608700
..............................
55364medium_airportYingkou Lanqi Airport0.0ASCNYingkouYKH122.35860040.542524
55365medium_airportShenyang Dongta AirportNaNASCNShenyangNaN123.49600241.784401
55366heliportSealand Helipad40.0EUGBSealandNaN1.48250051.894444
55367small_airportGlorioso Islands Airstrip11.0AFTFGrande GlorieuseNaN47.296389-11.584278
55368small_airportSatsuma Iōjima Airport338.0ASJPMishima-MuraNaN130.27055630.784722
\n", + "

55369 rows × 9 columns

\n", + "
" + ], + "text/plain": [ + " TYPE NAME ELEVATION_FT \\\n", + "0 heliport Total Rf Heliport 11.0 \n", + "1 small_airport Aero B Ranch Airport 3435.0 \n", + "2 small_airport Lowell Field 450.0 \n", + "3 small_airport Epps Airpark 820.0 \n", + "4 closed Newport Hospital & Clinic Heliport 237.0 \n", + "... ... ... ... \n", + "55364 medium_airport Yingkou Lanqi Airport 0.0 \n", + "55365 medium_airport Shenyang Dongta Airport NaN \n", + "55366 heliport Sealand Helipad 40.0 \n", + "55367 small_airport Glorioso Islands Airstrip 11.0 \n", + "55368 small_airport Satsuma Iōjima Airport 338.0 \n", + "\n", + " CONTINENT ISO_COUNTRY MUNICIPALITY IATA_CODE \\\n", + "0 NaN US Bensalem NaN \n", + "1 NaN US Leoti NaN \n", + "2 NaN US Anchor Point NaN \n", + "3 NaN US Harvest NaN \n", + "4 NaN US Newport NaN \n", + "... ... ... ... ... \n", + "55364 AS CN Yingkou YKH \n", + "55365 AS CN Shenyang NaN \n", + "55366 EU GB Sealand NaN \n", + "55367 AF TF Grande Glorieuse NaN \n", + "55368 AS JP Mishima-Mura NaN \n", + "\n", + " COORDINATES_LONGITUDE COORDINATES_LATITUDE \n", + "0 -74.933601 40.070801 \n", + "1 -101.473911 38.704022 \n", + "2 -151.695999 59.949200 \n", + "3 -86.770302 34.864799 \n", + "4 -91.254898 35.608700 \n", + "... ... ... \n", + "55364 122.358600 40.542524 \n", + "55365 123.496002 41.784401 \n", + "55366 1.482500 51.894444 \n", + "55367 47.296389 -11.584278 \n", + "55368 130.270556 30.784722 \n", + "\n", + "[55369 rows x 9 columns]" + ] + }, + "execution_count": 287, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "airportsInfo" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "tongConsultinInc", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.19" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/data_preprocessing.py b/data_preprocessing.py new file mode 100644 index 0000000..6baba43 --- /dev/null +++ b/data_preprocessing.py @@ -0,0 +1,61 @@ +import pandas as pd +import numpy as np +import re + + +def str_to_float(text): + """ + convert a string to a float. if the input is Two, convert it to 2.0 + --- + text: str to be converted + --- + return: converted float or np.nan if conversion fails + >>> str_to_float("1.0") + 1.0 + >>> str_to_float("1") + 1.0 + >>> str_to_float("1.0.0") + nan + >>> str_to_float("Two") + 2.0 + >>> str_to_float("-1.0") + 1.0 + >>> str_to_float("1.0-") + nan + >>> str_to_float("20000.00") + 20000.0 + """ + try: + return abs(float(text)) + except: + if text == "Two": + return 2.0 + return np.nan + + +def main(): + # loading data + flights = pd.read_csv("data/original_data/Flights.csv") + tickets = pd.read_csv("data/original_data/Tickets.csv") + airport_codes = pd.read_csv("data/original_data/Airport_Codes.csv") + + # cleaning flights data + flights["FL_DATE"] = pd.to_datetime(flights["FL_DATE"]) + + # finding the specific city and state name + split_ORIGIN_CITY_STATE = flights["ORIGIN_CITY_NAME"].str.split(", ") + split_DEST_CITY_STATE = flights["DEST_CITY_NAME"].str.split(", ") + + flights["ORIGIN_CITY_NAME"] = split_ORIGIN_CITY_STATE.str[0] + flights["ORIGIN_STATE_NAME"] = split_ORIGIN_CITY_STATE.str[1] + + flights["DEST_STATE_NAME"] = split_DEST_CITY_STATE.str[1] + flights["DEST_CITY_NAME"] = split_DEST_CITY_STATE.str[0] + + flights["DISTANCE"] = flights["DISTANCE"].apply(str_to_float) + + flights.to_csv("data/cleaned_data/flights.csv", index=False) + + +if __name__ == "__main__": + main() diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..7f07acf --- /dev/null +++ b/requirements.txt @@ -0,0 +1,161 @@ +anyio @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_a17a7759g2/croot/anyio_1706220182417/work +appnope @ file:///Users/ktietz/demo/mc3/conda-bld/appnope_1629146036738/work +argon2-cffi @ file:///opt/conda/conda-bld/argon2-cffi_1645000214183/work +argon2-cffi-bindings @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/croot-wbf5edig/argon2-cffi-bindings_1644845754377/work +asttokens @ file:///opt/conda/conda-bld/asttokens_1646925590279/work +async-lru @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_02efro5ps8/croot/async-lru_1699554529181/work +attrs @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_224434dqzl/croot/attrs_1695717839274/work +Babel @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_00k1rl2pus/croot/babel_1671781944131/work +backcall @ file:///home/ktietz/src/ci/backcall_1611930011877/work +beautifulsoup4 @ file:///tmp/build/80754af9/beautifulsoup4_1631874778482/work +bleach @ file:///opt/conda/conda-bld/bleach_1641577558959/work +brotlipy==0.7.0 +cachetools==5.3.3 +certifi @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_3bzbkiv4h_/croot/certifi_1707229182618/work/certifi +cffi @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_7a9c7wyorr/croot/cffi_1714483157752/work +charset-normalizer @ file:///tmp/build/80754af9/charset-normalizer_1630003229654/work +click==8.1.7 +comm @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_3doui0bmzb/croot/comm_1709322861485/work +cryptography @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_31zgxm62w8/croot/cryptography_1714660690857/work +cycler @ file:///tmp/build/80754af9/cycler_1637851556182/work +debugpy @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_563_nwtkoc/croot/debugpy_1690905063850/work +decorator @ file:///opt/conda/conda-bld/decorator_1643638310831/work +defusedxml @ file:///tmp/build/80754af9/defusedxml_1615228127516/work +dill==0.3.8 +docopt @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_5alx0ctp1q/croots/recipe/docopt_1663662430075/work +entrypoints @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_croot-jb01gaox/entrypoints_1650293758411/work +exceptiongroup @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_b2258scr33/croot/exceptiongroup_1706031391815/work +execnet @ file:///tmp/build/80754af9/execnet_1623921183358/work +executing @ file:///opt/conda/conda-bld/executing_1646925071911/work +fastjsonschema @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_43a0jaiddu/croots/recipe/python-fastjsonschema_1661368628129/work +fonttools @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_60c8ux4mkl/croot/fonttools_1713551354374/work +google-api-core==2.19.0 +google-api-python-client==2.129.0 +google-auth==2.29.0 +google-auth-httplib2==0.2.0 +google-auth-oauthlib==1.2.0 +googleapis-common-protos==1.63.0 +greenlet @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_516imz09pb/croot/greenlet_1702059966336/work +gspread==6.1.2 +httplib2==0.22.0 +idna @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_a12xpo84t2/croot/idna_1714398852854/work +importlib-metadata @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_5498c88e7n/croot/importlib_metadata-suite_1704813534254/work +importlib-resources @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_77lzrsh8mp/croot/importlib_resources-suite_1704281852961/work +iniconfig @ file:///home/linux1/recipes/ci/iniconfig_1610983019677/work +ipykernel @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_f428_5tjvx/croot/ipykernel_1705933835534/work +ipysheet==0.7.0 +ipython @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_84r7osg3nm/croot/ipython_1691532095330/work +ipython-genutils @ file:///tmp/build/80754af9/ipython_genutils_1606773439826/work +ipywidgets==8.1.3 +jedi @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/croot-f1t6hma6/jedi_1644315882177/work +Jinja2 @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_7dognxkzoy/croot/jinja2_1706733627811/work +joblib @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_f75pzkv6n1/croot/joblib_1713976769255/work +json5 @ file:///tmp/build/80754af9/json5_1624432770122/work +jsonschema @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_27o3go8sqa/croot/jsonschema_1699041627313/work +jsonschema-specifications @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_d38pclgu95/croot/jsonschema-specifications_1699032390832/work +jupyter-events @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_14ldd9s4d0/croot/jupyter_events_1699282481406/work +jupyter-lsp @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_ae9br5v37x/croot/jupyter-lsp-meta_1699978259353/work +jupyter_client @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_aen57n2aow/croot/jupyter_client_1676329104065/work +jupyter_core @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_782yoyc_98/croot/jupyter_core_1698937318631/work +jupyter_server @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_281pz9vly5/croot/jupyter_server_1699466465530/work +jupyter_server_terminals @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_e7ryd60iuw/croot/jupyter_server_terminals_1686870731283/work +jupyterlab @ file:///home/conda/feedstock_root/build_artifacts/jupyterlab_1694728214446/work +jupyterlab-pygments @ file:///tmp/build/80754af9/jupyterlab_pygments_1601490720602/work +jupyterlab_server @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_315a64u22w/croot/jupyterlab_server_1699555438434/work +jupyterlab_widgets==3.0.11 +kiwisolver @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_93o8te804v/croot/kiwisolver_1672387163224/work +lxml @ file:///Users/runner/miniforge3/conda-bld/lxml_1704590488726/work +markdown-it-py==3.0.0 +MarkupSafe @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_a84ni4pci8/croot/markupsafe_1704206002077/work +matplotlib @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_croot-_3usbnl1/matplotlib-suite_1647506475477/work +matplotlib-inline @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_f6fdc0hldi/croots/recipe/matplotlib-inline_1662014472341/work +mdurl==0.1.2 +mistune @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_17ya6k1sbs/croots/recipe/mistune_1661496228719/work +mypy-extensions==0.4.3 +nbclient @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_626hpwnurm/croot/nbclient_1698934218848/work +nbconvert @ file:///home/conda/feedstock_root/build_artifacts/nbconvert-meta_1693331710275/work +nbformat @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_cbnf5nccgk/croot/nbformat_1694616744196/work +nest-asyncio @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_310vb5e2a0/croot/nest-asyncio_1708532678212/work +notebook @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_dfmids47bo/croots/recipe/notebook_1659083663569/work +notebook_shim @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_d6_ze10f45/croot/notebook-shim_1699455897525/work +numpy @ file:///Users/ktietz/ci_310/numpy_and_numpy_base_1644255524335/work +oauthlib==3.2.2 +olefile @ file:///Users/ktietz/demo/mc3/conda-bld/olefile_1629805411829/work +otter-grader==3.1.4 +overrides @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_70s80guh9g/croot/overrides_1699371144462/work +packaging @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_a6lqg7at4g/croot/packaging_1710807410750/work +pandas==1.3.3 +pandas-tutor==2.0.3 +pandocfilters @ file:///opt/conda/conda-bld/pandocfilters_1643405455980/work +parso @ file:///opt/conda/conda-bld/parso_1641458642106/work +patsy==0.5.3 +pdfkit==1.0.0 +pexpect @ file:///tmp/build/80754af9/pexpect_1605563209008/work +pickleshare @ file:///tmp/build/80754af9/pickleshare_1606932040724/work +Pillow @ file:///Users/runner/miniforge3/conda-bld/pillow_1630696687447/work +pkgutil_resolve_name @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_98lcqyhajf/croot/pkgutil-resolve-name_1704297463060/work +platformdirs @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_a8u4fy8k9o/croot/platformdirs_1692205661656/work +plotly @ file:///home/conda/feedstock_root/build_artifacts/plotly_1694802097009/work +pluggy @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_croot-w6jyveby/pluggy_1648109277227/work +pooch @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_1bocfs80f4/croot/pooch_1695850117888/work +prometheus-client @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_25sgeyk0j5/croots/recipe/prometheus_client_1659455103277/work +prompt-toolkit @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_c63v4kqjzr/croot/prompt-toolkit_1704404354115/work +proto-plus==1.23.0 +protobuf==4.25.3 +psutil @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_1310b568-21f4-4cb0-b0e3-2f3d31e39728k9coaga5/croots/recipe/psutil_1656431280844/work +ptyprocess @ file:///tmp/build/80754af9/ptyprocess_1609355006118/work/dist/ptyprocess-0.7.0-py2.py3-none-any.whl +pure-eval @ file:///opt/conda/conda-bld/pure_eval_1646925070566/work +pyasn1==0.6.0 +pyasn1_modules==0.4.0 +pycparser @ file:///tmp/build/80754af9/pycparser_1636541352034/work +pydantic==1.10.2 +Pygments @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_29bs9f_dh9/croot/pygments_1684279974747/work +pyOpenSSL @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_94bn0tgaw5/croot/pyopenssl_1708381744097/work +pyparsing @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_3b_3vxnd07/croots/recipe/pyparsing_1661452540919/work +PyPDF2==3.0.1 +PySocks @ file:///Users/ktietz/Code/oss/ci_pkgs/pysocks_1626781349491/work +pytest @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_75ehl8i878/croot/pytest_1690474711033/work +pytest-xdist @ file:///home/conda/feedstock_root/build_artifacts/pytest-xdist_1684499835847/work +python-dateutil @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_6e_fc8xema/croot/python-dateutil_1715108793034/work +python-json-logger @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_c3baq2ko4j/croot/python-json-logger_1683823815343/work +python-on-whales==0.71.0 +pytz @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_a4b76c83ik/croot/pytz_1713974318928/work +PyYAML @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_a8_sdgulmz/croot/pyyaml_1698096054705/work +pyzmq @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_cbtlm0pib_/croot/pyzmq_1709318330127/work +referencing @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_5cz64gsx70/croot/referencing_1699012046031/work +requests @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_b3tnputioh/croot/requests_1707355573919/work +requests-oauthlib==2.0.0 +rfc3339-validator @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_76ae5cu30h/croot/rfc3339-validator_1683077051957/work +rfc3986-validator @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_d0l5zd97kt/croot/rfc3986-validator_1683058998431/work +rich==13.7.1 +rpds-py @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_f8jkozoefm/croot/rpds-py_1698945944860/work +rsa==4.9 +scikit-learn @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/croot-ic3nfbzi/scikit-learn_1642621481325/work +scipy==1.10.1 +seaborn @ file:///tmp/build/80754af9/seaborn_1629307859561/work +Send2Trash @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_5b31f0zzlv/croot/send2trash_1699371144121/work +shellingham==1.5.4 +six @ file:///tmp/build/80754af9/six_1644875935023/work +sniffio @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_1573pknjrg/croot/sniffio_1705431298885/work +soupsieve @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_9798xzs_03/croot/soupsieve_1696347567192/work +SQLAlchemy @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_da4pw1i5_7/croot/sqlalchemy_1695720907140/work +stack-data @ file:///opt/conda/conda-bld/stack_data_1646927590127/work +statsmodels @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_94gox2humz/croot/statsmodels_1676644453811/work +tenacity @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_0ew5sfng29/croot/tenacity_1682972282256/work +terminado @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_fcfvyc0an2/croot/terminado_1671751835701/work +threadpoolctl @ file:///Users/ktietz/demo/mc3/conda-bld/threadpoolctl_1629802263681/work +tinycss2 @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_fcw5_i306t/croot/tinycss2_1668168825117/work +tomli @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_d0e5ffbf-5cf1-45be-8693-c5dff8108a2awhthtjlq/croots/recipe/tomli_1657175508477/work +tornado @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_3a5nrn2jeh/croot/tornado_1696936974091/work +tqdm @ file:///tmp/build/80754af9/tqdm_1635330843403/work +traitlets @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_6301rd5qbe/croot/traitlets_1671143894285/work +typer==0.12.3 +typing_extensions==4.1.1 +unicodedata2 @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_a3epjto7gs/croot/unicodedata2_1713212955584/work +uritemplate==4.1.1 +urllib3==1.26.7 +wcwidth @ file:///Users/ktietz/demo/mc3/conda-bld/wcwidth_1629357192024/work +webencodings==0.5.1 +websocket-client @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_d37u7gqts8/croot/websocket-client_1715878310260/work +widgetsnbextension==4.0.11 +zipp @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_31jm3q76eq/croot/zipp_1704206913245/work