Skip to content

Commit

Permalink
Fixed bugs and updated Streamlit app
Browse files Browse the repository at this point in the history
  • Loading branch information
carlosfab committed Oct 4, 2023
1 parent b1c0cb6 commit 783f91c
Show file tree
Hide file tree
Showing 19 changed files with 371 additions and 45 deletions.
44 changes: 22 additions & 22 deletions notebooks/12_simulated_feature_pipeline.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,8 @@
" \"\"\"\n",
" \n",
" # Calculate equivalent date range from a year ago\n",
" from_date_ = from_date - timedelta(weeks=52)\n",
" to_date_ = to_date - timedelta(weeks=52)\n",
" from_date_ = from_date - timedelta(days=7*52)\n",
" to_date_ = to_date - timedelta(days=7*52)\n",
"\n",
" print(f\"Fetching raw data from {from_date_} to {to_date_}\")\n",
"\n",
Expand All @@ -84,7 +84,7 @@
" rides = pd.concat([rides, rides_2])\n",
"\n",
" # Shift the data by 52 weeks to make it look recent\n",
" rides[\"pickup_datetime\"] += timedelta(weeks=52)\n",
" rides[\"pickup_datetime\"] += timedelta(days=7*52)\n",
" \n",
" # Sort the dataframe by location and datetime\n",
" rides.sort_values(by=[\"pickup_location_id\", \"pickup_datetime\"], inplace=True)\n",
Expand All @@ -101,7 +101,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Fetching raw data from 2022-09-03 11:00:00 to 2022-10-01 11:00:00\n",
"Fetching raw data from 2022-09-07 02:00:00 to 2022-10-05 02:00:00\n",
"File 2022-09 was already in local storage\n",
"File 2022-10 was already in local storage\n"
]
Expand All @@ -125,7 +125,7 @@
"See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n",
"DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n",
"See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n",
"100%|██████████| 265/265 [00:00<00:00, 272.82it/s]\n"
"100%|██████████| 265/265 [00:01<00:00, 254.10it/s]\n"
]
},
{
Expand Down Expand Up @@ -157,32 +157,32 @@
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2023-09-02 11:00:00</td>\n",
" <td>1</td>\n",
" <td>2023-09-06 02:00:00</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2023-09-02 12:00:00</td>\n",
" <td>1</td>\n",
" <td>2023-09-06 03:00:00</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2023-09-02 13:00:00</td>\n",
" <td>1</td>\n",
" <td>2023-09-06 04:00:00</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2023-09-02 14:00:00</td>\n",
" <td>3</td>\n",
" <td>2023-09-06 05:00:00</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2023-09-02 15:00:00</td>\n",
" <td>6</td>\n",
" <td>2023-09-06 06:00:00</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
Expand All @@ -191,11 +191,11 @@
],
"text/plain": [
" pickup_hour rides pickup_location_id\n",
"0 2023-09-02 11:00:00 1 1\n",
"1 2023-09-02 12:00:00 1 1\n",
"2 2023-09-02 13:00:00 1 1\n",
"3 2023-09-02 14:00:00 3 1\n",
"4 2023-09-02 15:00:00 6 1"
"0 2023-09-06 02:00:00 0 1\n",
"1 2023-09-06 03:00:00 0 1\n",
"2 2023-09-06 04:00:00 0 1\n",
"3 2023-09-06 05:00:00 0 1\n",
"4 2023-09-06 06:00:00 3 1"
]
},
"execution_count": 5,
Expand Down Expand Up @@ -253,7 +253,7 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "88ad1c2efb6d439aa09b2ace324b9b04",
"model_id": "b50ca4ac755e467b8846d43ede1f7e60",
"version_major": 2,
"version_minor": 0
},
Expand All @@ -276,7 +276,7 @@
{
"data": {
"text/plain": [
"(<hsfs.core.job.Job at 0x14ca6ec20>, None)"
"(<hsfs.core.job.Job at 0x143a28a00>, None)"
]
},
"execution_count": 7,
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ pydeck = "^0.8.0"
geopandas = "^0.12.2"
discordwebhook = "^1.0.3"
fire = "^0.5.0"
schema = "^0.7.5"

[tool.poetry.dev-dependencies]
pytest = "^5.2"
Expand Down
Binary file added src/.DS_Store
Binary file not shown.
1 change: 1 addition & 0 deletions src/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
__version__ = '0.1.0'
Binary file added src/__pycache__/__init__.cpython-310.pyc
Binary file not shown.
Binary file added src/__pycache__/app_component.cpython-310.pyc
Binary file not shown.
Binary file added src/__pycache__/config.cpython-310.pyc
Binary file not shown.
Binary file added src/__pycache__/data.cpython-310.pyc
Binary file not shown.
Binary file added src/__pycache__/data_split.cpython-310.pyc
Binary file not shown.
Binary file added src/__pycache__/feature_store_api.cpython-310.pyc
Binary file not shown.
Binary file added src/__pycache__/inference.cpython-310.pyc
Binary file not shown.
Binary file added src/__pycache__/model.cpython-310.pyc
Binary file not shown.
Binary file added src/__pycache__/paths.cpython-310.pyc
Binary file not shown.
Binary file added src/__pycache__/plot.cpython-310.pyc
Binary file not shown.
41 changes: 31 additions & 10 deletions src/app.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
# Imports
from datetime import datetime
import pandas as pd
import streamlit as st
import app_component as ac
from src.inference import load_batch_of_features_from_store

# Page configuration
st.set_page_config(
Expand All @@ -14,21 +17,48 @@
unsafe_allow_html=True
)


# Header Section
home_title = "Taxi Demand Prediction"
st.markdown(f"""# {home_title} <span style=color:#2E9BF5><font size=5>Web App</font></span>""", unsafe_allow_html=True)

st.markdown("""\n""")

loading_info = st.empty()

st.markdown("#### Greetings 🚖")
st.write(
"""
Welcome to the Taxi Demand Predictor Hub, where cutting-edge Machine Learning meets urban mobility needs. This web application is an integral component of an end-to-end Machine Learning Project. For those interested in the technical aspects, the project repository offers comprehensive insights, including explanatory notebooks, source code, and automation utilities.
Welcome to the Taxi Demand Predictor Hub, where cutting-edge Machine Learning meets urban mobility needs. For those interested in the technical aspects, the project repository offers comprehensive insights.
"""
)

# App Component
ac.robo_avatar_component()

# Sidebar
progress_bar = st.sidebar.header(":gear: Project Progress")
progress_bar = st.sidebar.progress(0)

# constant for number of steps in progress bar
N_STEPS = 7

# STEP 1 - load shape data for NYC taxi zones
with loading_info, st.spinner("Downloading data... this may take a while! \n Don't worry, this is a one-time thing. :wink:"):
shape_data = ac.load_shape_data_file()
st.sidebar.write(":white_check_mark: Shape data download complete!")
progress_bar.progress(1/N_STEPS)

# STEP 2 - Fetch batch of inference data
with loading_info, st.spinner("Fetching data from Feature Store..."):
current_date = pd.to_datetime(datetime.utcnow()).floor('H')
features = load_batch_of_features_from_store(current_date)
st.sidebar.write(":white_check_mark: Inference data fetched!")
progress_bar.progress(2/N_STEPS)

ac.render_contact()


# Real-world Machine Learning Section
st.markdown("\n")
st.markdown("#### Real-World Machine Learning 🛠")
Expand All @@ -49,12 +79,3 @@

# Repository Link Button
st.link_button(":star: Star the Repository!", "https://github.com/carlosfab/taxi_demand_predictor", type='secondary', use_container_width=True)

# Sidebar
st.sidebar.image("header.gif", use_column_width=True)
st.sidebar.title("Contact")
st.sidebar.info(
"""
Carlos at [sigmoidal.ai](https://sigmoidal.ai/en) | [GitHub](https://github.com/carlosfab) | [Twitter](https://twitter.com/carlos_melo_py) | [YouTube](https://www.youtube.com/@CarlosMeloSigmoidal) | [Instagram](http://instagram.com/carlos_melo.py) | [LinkedIn](http://linkedin.com/in/carlos-melo-data-science/)
"""
)
43 changes: 42 additions & 1 deletion src/app_component.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
# Standard Libraries
import random as r
import random
import zipfile
import requests

# External Libraries
import streamlit as st
import geopandas as gpd
import streamlit.components.v1 as c
import src.config as config
from src.paths import DATA_DIR


def robo_avatar_component():
Expand Down Expand Up @@ -51,3 +56,39 @@ def render_cta():
st.write("Let's connect!")
st_button(url="https://twitter.com/carlos_melo_py", label="Twitter", font_awesome_icon="fa-twitter")
st_button(url="http://linkedin.com/in/carlos-melo-data-science/", label="LinkedIn", font_awesome_icon="fa-linkedin")


def render_contact():
    """Render the author's contact section in the Streamlit sidebar."""
    contact_links = """
    Carlos at [sigmoidal.ai](https://sigmoidal.ai/en) | [GitHub](https://github.com/carlosfab) | [Twitter](https://twitter.com/carlos_melo_py) | [YouTube](https://www.youtube.com/@CarlosMeloSigmoidal) | [Instagram](http://instagram.com/carlos_melo.py) | [LinkedIn](http://linkedin.com/in/carlos-melo-data-science/)
    """
    st.sidebar.title("Contact")
    st.sidebar.info(contact_links)


def load_shape_data_file() -> gpd.GeoDataFrame:
    """Download (once) and load the shapefile with the NYC taxi zones.

    The zip archive is fetched from the NYC TLC CDN only if the extracted
    shapefile is not already present on disk, so repeated calls reuse the
    local copy (the app UI advertises the download as a one-time operation).

    Returns:
        gpd.GeoDataFrame: Shape data for the NYC taxi zones.

    Raises:
        Exception: If the shapefile archive cannot be downloaded.
    """
    url_path = "https://d37ci6vzurychx.cloudfront.net/misc/taxi_zones.zip"
    zip_path = DATA_DIR / 'taxi_zones.zip'
    shapefile_path = DATA_DIR / 'taxi_zones' / 'taxi_zones.shp'

    # Skip the network round-trip when the data was already extracted.
    if not shapefile_path.exists():
        # A timeout prevents the Streamlit app from hanging indefinitely
        # on a stalled connection (requests has no default timeout).
        response = requests.get(url_path, timeout=60)

        if response.status_code == 200:
            with open(zip_path, 'wb') as f:
                f.write(response.content)
        else:
            raise Exception(f"Could not download data from {url_path}")

        # Extract the archive next to where it was saved.
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(DATA_DIR / 'taxi_zones')

    # Load and return the shape data.
    return gpd.read_file(shapefile_path)
11 changes: 11 additions & 0 deletions src/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,18 @@

# Constants
HOPSWORKS_PROJECT_NAME = 'taxi_demand_api'

FEATURE_GROUP_NAME = 'time_series_hourly_feature_group'
FEATURE_GROUP_VERSION = 1

FEATURE_VIEW_NAME = 'time_series_hourly_feature_view'
FEATURE_VIEW_VERSION = 1

N_FEATURES = 24 * 28

MODEL_NAME = "taxi_demand_predictor_next_hour"
MODEL_VERSION = 1

FEATURE_GROUP_MODEL_PREDICTIONS = 'model_predictions_feature_group'
FEATURE_VIEW_MODEL_PREDICTIONS = 'model_predictions_feature_view'
FEATURE_VIEW_MONITORING = 'predictions_vs_actuals_for_monitoring_feature_view'
58 changes: 56 additions & 2 deletions src/feature_store_api.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,56 @@
def get_feature_store():
pass
# --- Import libraries ---
from typing import Optional
import hsfs
import hopsworks

# Import local configurations
import src.config as config

# --- Function Definitions ---


def get_feature_store() -> hsfs.feature_store.FeatureStore:
    """Log in to Hopsworks and return a handle to the project's feature store.

    The Hopsworks project name and API key are read from ``src.config``.

    Returns:
        hsfs.feature_store.FeatureStore: Handle to the project's feature store.
    """
    # Authenticate against Hopsworks with the configured credentials.
    hopsworks_project = hopsworks.login(
        project=config.HOPSWORKS_PROJECT_NAME,
        api_key_value=config.HOPSWORKS_API_KEY,
    )
    return hopsworks_project.get_feature_store()


def get_feature_group(
    name: str,
    version: Optional[int] = 1
) -> hsfs.feature_group.FeatureGroup:
    """Return a handle to the requested feature group in the feature store.

    Args:
        name (str): Name of the feature group.
        version (Optional[int], optional): Version of the feature group.
            Defaults to 1.

    Returns:
        hsfs.feature_group.FeatureGroup: Handle to the feature group.
    """
    # Resolve the feature store first, then look the group up by name/version.
    return get_feature_store().get_feature_group(
        name=name,
        version=version,
    )
Loading

0 comments on commit 783f91c

Please sign in to comment.