Pull requests to update population-city dataset #7

Closed · wants to merge 7 commits
41 changes: 41 additions & 0 deletions .github/workflows/update.yaml
@@ -0,0 +1,41 @@
name: Population-city pipeline

on:
  push:
    branches:
      - master
  schedule:
    - cron: '0 0 1 * *'   # monthly: 00:00 UTC on the 1st

# the workflow commits updated data back to this repo, so the token needs write access
permissions:
  contents: write

jobs:
  build:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v3
      - name: Setup
        uses: actions/setup-python@v3
        with:
          python-version: "3.x"
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
      - name: Run pipeline
        run: |
          make format
          make run
          make clean

      - name: Commit and push updated data in this repo
        uses: mikeal/publish-to-github-action@master
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
.venv
__pycache__/*
34,284 changes: 17,151 additions & 17,133 deletions data/unsd-citypopulation-year-both.csv

Large diffs are not rendered by default.

56,908 changes: 28,470 additions & 28,438 deletions data/unsd-citypopulation-year-fm.csv

Large diffs are not rendered by default.

27 changes: 27 additions & 0 deletions makefile
@@ -0,0 +1,27 @@
SHELL = /bin/bash

# run style formatting
.PHONY: format
format:
	black .

.PHONY: run
run:
	python3 scripts/process.py

# Cleaning
.PHONY: clean
clean:
	find . -type f -name "*.DS_Store" -ls -delete
	find . | grep -E "(__pycache__|\.pyc|\.pyo)" | xargs rm -rf
	find . | grep -E "\.pytest_cache" | xargs rm -rf
	find . | grep -E "\.ipynb_checkpoints" | xargs rm -rf
	find . | grep -E "\.trash" | xargs rm -rf
	rm -f .coverage

.PHONY: help
help:
	@echo "Commands:"
	@echo "format : executes style formatting."
	@echo "clean  : deletes all unnecessary files."
	@echo "run    : starts running the pipeline."
5 changes: 5 additions & 0 deletions requirements.txt
@@ -0,0 +1,5 @@
selenium
beautifulsoup4
pandas
black
html5lib
Binary file added scripts/__pycache__/scraper.cpython-310.pyc
Binary file not shown.
116 changes: 116 additions & 0 deletions scripts/process.py
@@ -0,0 +1,116 @@
from scraper import download_data
import pandas as pd
import sqlite3
import csv
import os


def load_to_DB():
    """
    Creates a table and loads the preprocessed data into it, staged for
    the transform step.
    """
    df = pd.read_csv("./output.csv")

    # define the sqlite database path
    db_file = "population-cities.sqlite"

    # create the database connection
    conn = sqlite3.connect(db_file)

    table_name = "population"
    df.to_sql(table_name, conn, if_exists="replace", index=False)
    conn.close()

    print("Data successfully loaded")


def update_data():
    """Fetches the transformed data from the DB and appends it to the data files."""
    # transform and update unsd-citypopulation-year-fm.csv
    OUTFILE = "./data/unsd-citypopulation-year-fm.csv"
    DBFILE = "population-cities.sqlite"
    HEADERS = [
        "Country or Area",
        "Year",
        "Area",
        "Sex",
        "City",
        "City Type",
        "Record Type",
        "Reliability",
        "Source Year",
        "Value",
        "Value Footnotes",
    ]
    conn = sqlite3.connect(DBFILE)
    c = conn.cursor()
    sql = """SELECT * FROM population
             WHERE sex = 'Male' OR sex = 'Female'
          """
    with open(OUTFILE, "a", newline="") as outcsv:
        writer = csv.writer(outcsv, lineterminator="\n")
        # headers already exist in the appended-to file, so they are not rewritten
        # writer.writerow(HEADERS)
        for row in c.execute(sql):
            writer.writerow(row)

    # transform and update unsd-citypopulation-year-both.csv
    OUTFILE = "./data/unsd-citypopulation-year-both.csv"
    c = conn.cursor()
    sql = """SELECT * FROM population
             WHERE sex = 'Both Sexes'
          """
    with open(OUTFILE, "a", newline="") as outcsv:
        writer = csv.writer(outcsv, lineterminator="\n")
        # writer.writerow(HEADERS)
        for row in c.execute(sql):
            writer.writerow(row)
    conn.close()
    print("Files have been successfully updated")


def remove_duplicates_in_csv(file_path: str):
    """
    Checks for and removes duplicate rows among the last 100 records,
    since the file keeps growing with each run.

    Args:
        file_path: path to the file to update
    """
    df = pd.read_csv(file_path)

    # flag every row that duplicates an earlier one...
    dup_mask = df.duplicated(keep="first")
    # ...then only drop the flagged rows that fall within the last 100
    dup_mask.iloc[:-100] = False
    df = df[~dup_mask]

    # update file
    df.to_csv(file_path, index=False)

    print(f"Duplicate records removed from the last 100 rows of {file_path}.")


def run():
    # Download data
    download_data()

    # load preprocessed data to DB
    load_to_DB()

    # update the UN data
    update_data()

    # check and remove any duplicated records from the last 100 records of unsd-citypopulation-year-fm.csv
    remove_duplicates_in_csv("./data/unsd-citypopulation-year-fm.csv")

    # check and remove any duplicated records from unsd-citypopulation-year-both.csv
    remove_duplicates_in_csv("./data/unsd-citypopulation-year-both.csv")

    # clear preprocessed csv file and db file for next run
    os.remove("./output.csv")
    os.remove("./population-cities.sqlite")


if __name__ == "__main__":
    run()
81 changes: 81 additions & 0 deletions scripts/scraper.py
@@ -0,0 +1,81 @@
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
Review comments on the BeautifulSoup import:

Member: @judeleonard where are you using it?

Member: @judeleonard you imported BeautifulSoup but I couldn't find where you are using it.

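For context on the thread above: pd.read_html(table_html, flavor="bs4") parses the table with BeautifulSoup under the hood, so the direct import is never referenced by name in this script. A minimal sketch of what an explicit use could look like — the helper below is illustrative only, not part of this PR:

def table_headers(table_html):
    # parse the raw <table> markup and pull out its header cells
    soup = BeautifulSoup(table_html, "html5lib")
    return [th.get_text(strip=True) for th in soup.find_all("th")]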
import pandas as pd


# Set up the Selenium webdriver for Chrome
chromeOptions = Options()
# Options.headless is deprecated in recent Selenium releases; pass the flag instead
chromeOptions.add_argument("--headless")
driver = webdriver.Chrome(options=chromeOptions)

# website url
url = "http://data.un.org/Data.aspx?d=POP&f=tableCode:240"


def download_data():
    print("Downloading Data...")
    try:
        # Open the website
        driver.get(url)

        # Wait for the page to load
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "table"))
        )

        # Get the page source after it's fully loaded
        page_source = driver.page_source

        # Find all table elements on the webpage
        tables = driver.find_elements(By.TAG_NAME, "table")

        dataframes = []
        for table in tables:
            table_html = table.get_attribute("outerHTML")
            df = pd.read_html(table_html, flavor="bs4")[0]
            print(df)
            dataframes.append(df)

        combined_df = pd.concat(dataframes, ignore_index=True)

        # cleaning the data to remove rows with no information
        rows_to_delete = [0, 1, 2]
        # Delete selected rows
        combined_df = combined_df.drop(rows_to_delete)
        # rename the "Unnamed" field to "Value Footnotes"
        combined_df.rename(columns={"Unnamed: 10": "Value Footnotes"}, inplace=True)
        # convert some fields to the appropriate datatype
        combined_df[["Year", "Source Year", "Value Footnotes"]] = combined_df[
            ["Year", "Source Year", "Value Footnotes"]
        ].astype("Int64")
        # select final preprocessed data
        final_df = combined_df[
            [
                "Country or Area",
                "Year",
                "Area",
                "Sex",
                "City",
                "City type",
                "Record Type",
                "Reliability",
                "Source Year",
                "Value",
                "Value Footnotes",
            ]
        ]
        # Save the DataFrame to a CSV file
        csv_filename = "./output.csv"
        final_df.to_csv(csv_filename, index=False)
        print(f"DataFrame saved to {csv_filename}")

    except Exception as e:
        print(f"An error occurred: {str(e)}")

    finally:
        # Close the WebDriver
        driver.quit()