Pull requests to update population-city dataset #7

Closed · wants to merge 7 commits
41 changes: 41 additions & 0 deletions .github/workflows/update.yaml
@@ -0,0 +1,41 @@
name: Population-city pipeline

on:
  push:
    branches:
      - master
  schedule:
    - cron: '0 0 1 * *'   # monthly: 00:00 UTC on the 1st

# the workflow commits updated data back to this repo, so the token needs write access
permissions:
  contents: write

jobs:
  build:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v3
      - name: Setup
        uses: actions/setup-python@v3
        with:
          python-version: "3.x"
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
      - name: Run pipeline
        run: |
          make format
          make run
          make clean

      - name: Commit and push updated data in this repo
        uses: mikeal/publish-to-github-action@master
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
.venv
__pycache__/*
34,284 changes: 17,151 additions & 17,133 deletions data/unsd-citypopulation-year-both.csv

Large diffs are not rendered by default.

56,908 changes: 28,470 additions & 28,438 deletions data/unsd-citypopulation-year-fm.csv

Large diffs are not rendered by default.

27 changes: 27 additions & 0 deletions makefile
@@ -0,0 +1,27 @@
SHELL = /bin/bash

# run style formatting
.PHONY: format
format:
	black .

.PHONY: run
run:
	python3 scripts/process.py

# Cleaning
.PHONY: clean
clean:
	find . -type f -name "*.DS_Store" -ls -delete
	find . | grep -E "(__pycache__|\.pyc|\.pyo)" | xargs rm -rf
	find . | grep -E "\.pytest_cache" | xargs rm -rf
	find . | grep -E "\.ipynb_checkpoints" | xargs rm -rf
	find . | grep -E "\.trash" | xargs rm -rf
	rm -f .coverage

.PHONY: help
help:
	@echo "Commands:"
	@echo "format : executes style formatting."
	@echo "clean  : deletes all unnecessary files."
	@echo "run    : starts running the pipeline."
5 changes: 5 additions & 0 deletions requirements.txt
@@ -0,0 +1,5 @@
selenium
beautifulsoup4
pandas
black
html5lib
Binary file added scripts/__pycache__/scraper.cpython-310.pyc
Binary file not shown.
116 changes: 116 additions & 0 deletions scripts/process.py
@@ -0,0 +1,116 @@
from scraper import download_data
import pandas as pd
import sqlite3
import csv
import os


def load_to_DB():
    """
    Creates a table and loads the preprocessed data into it, staged for
    the transform step.
    """
    df = pd.read_csv("./output.csv")

    # define the sqlite database path
    db_file = "population-cities.sqlite"

    # create the database connection
    conn = sqlite3.connect(db_file)

    table_name = "population"
    df.to_sql(table_name, conn, if_exists="replace", index=False)
    conn.close()

    print("Data successfully loaded")


def update_data():
    """Fetches the transformed data from the DB and appends it to the data files."""
    # transform and update unsd-citypopulation-year-fm.csv
    OUTFILE = "./data/unsd-citypopulation-year-fm.csv"
    DBFILE = "population-cities.sqlite"
    HEADERS = [
        "Country or Area",
        "Year",
        "Area",
        "Sex",
        "City",
        "City Type",
        "Record Type",
        "Reliability",
        "Source Year",
        "Value",
        "Value Footnotes",
    ]
    conn = sqlite3.connect(DBFILE)
    c = conn.cursor()
    sql = """SELECT * FROM population
             WHERE sex = 'Male' OR sex = 'Female'
          """
    with open(OUTFILE, "a", newline="") as outcsv:
        writer = csv.writer(outcsv, lineterminator="\n")
        # headers already exist in the appended-to file, so they are not rewritten
        # writer.writerow(HEADERS)
        for row in c.execute(sql):
            writer.writerow(row)

    # transform and update unsd-citypopulation-year-both.csv
    OUTFILE = "./data/unsd-citypopulation-year-both.csv"
    c = conn.cursor()
    sql = """SELECT * FROM population
             WHERE sex = 'Both Sexes'
          """
    with open(OUTFILE, "a", newline="") as outcsv:
        writer = csv.writer(outcsv, lineterminator="\n")
        # writer.writerow(HEADERS)
        for row in c.execute(sql):
            writer.writerow(row)
    conn.close()
    print("Files have been successfully updated")


def remove_duplicates_in_csv(file_path: str):
    """
    Checks for and removes duplicate rows among the last 100 records,
    since the file keeps growing with each run.

    Args:
        file_path: path to the file to update
    """
    df = pd.read_csv(file_path)

    # flag every row that duplicates an earlier one...
    dup_mask = df.duplicated(keep="first")
    # ...then only drop the flagged rows that fall within the last 100
    dup_mask.iloc[:-100] = False
    df = df[~dup_mask]

    # update file
    df.to_csv(file_path, index=False)

    print(f"Duplicate records removed from the last 100 rows of {file_path}.")


def run():
    # Download data
    download_data()

    # load preprocessed data to DB
    load_to_DB()

    # update the UN data
    update_data()

    # check and remove any duplicated records from the last 100 records of unsd-citypopulation-year-fm.csv
    remove_duplicates_in_csv("./data/unsd-citypopulation-year-fm.csv")

    # check and remove any duplicated records from unsd-citypopulation-year-both.csv
    remove_duplicates_in_csv("./data/unsd-citypopulation-year-both.csv")

    # clear preprocessed csv file and db file for next run
    os.remove("./output.csv")
    os.remove("./population-cities.sqlite")


if __name__ == "__main__":
    run()
81 changes: 81 additions & 0 deletions scripts/scraper.py
@@ -0,0 +1,81 @@
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
Review comments on the BeautifulSoup import:

Member: @judeleonard where are you using it?

Member: @judeleonard you imported BeautifulSoup but I couldn't find where you are using it.

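For context on the thread above: pd.read_html(table_html, flavor="bs4") parses the table with BeautifulSoup under the hood, so the direct import is never referenced by name in this script. A minimal sketch of what an explicit use could look like — the helper below is illustrative only, not part of this PR:

def table_headers(table_html):
    # parse the raw <table> markup and pull out its header cells
    soup = BeautifulSoup(table_html, "html5lib")
    return [th.get_text(strip=True) for th in soup.find_all("th")]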
import pandas as pd


# Set up the Selenium webdriver for Chrome
chromeOptions = Options()
# Options.headless is deprecated in recent Selenium releases; pass the flag instead
chromeOptions.add_argument("--headless")
driver = webdriver.Chrome(options=chromeOptions)

# website url
url = "http://data.un.org/Data.aspx?d=POP&f=tableCode:240"


def download_data():
    print("Downloading Data...")
    try:
        # Open the website
        driver.get(url)

        # Wait for the page to load
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "table"))
        )

        # Get the page source after it's fully loaded
        page_source = driver.page_source

        # Find all table elements on the webpage
        tables = driver.find_elements(By.TAG_NAME, "table")

        dataframes = []
        for table in tables:
            table_html = table.get_attribute("outerHTML")
            df = pd.read_html(table_html, flavor="bs4")[0]
            print(df)
            dataframes.append(df)

        combined_df = pd.concat(dataframes, ignore_index=True)

        # cleaning the data to remove rows with no information
        rows_to_delete = [0, 1, 2]
        # Delete selected rows
        combined_df = combined_df.drop(rows_to_delete)
        # rename the "Unnamed" field to "Value Footnotes"
        combined_df.rename(columns={"Unnamed: 10": "Value Footnotes"}, inplace=True)
        # convert some fields to the appropriate datatype
        combined_df[["Year", "Source Year", "Value Footnotes"]] = combined_df[
            ["Year", "Source Year", "Value Footnotes"]
        ].astype("Int64")
        # select final preprocessed data
        final_df = combined_df[
            [
                "Country or Area",
                "Year",
                "Area",
                "Sex",
                "City",
                "City type",
                "Record Type",
                "Reliability",
                "Source Year",
                "Value",
                "Value Footnotes",
            ]
        ]
        # Save the DataFrame to a CSV file
        csv_filename = "./output.csv"
        final_df.to_csv(csv_filename, index=False)
        print(f"DataFrame saved to {csv_filename}")

    except Exception as e:
        print(f"An error occurred: {str(e)}")

    finally:
        # Close the WebDriver
        driver.quit()