Skip to content

Commit

Permalink
deploy: Switched main DB initialization to Python script.
Browse files Browse the repository at this point in the history
- Replaced `postgis/postgis` Docker image with a two-stage Docker build: first
  stage installs GDAL on Ubuntu 22.04, second stage uses `postgres:16.0` and
  copies over GDAL binaries.
- Introduced `.dockerignore` to ignore Python virtual environment files.
- Removed old bash script for DB initialization.
- Added Python script `init-regions-table.py` for hierarchical DB
  initialization from GADM GeoPackage.

How `init-regions-table.py` Works:

1. Reads GeoPackage file and connects to it as an SQLite database.
2. Fetches hierarchical region information (GID and NAME for different levels).
3. Reads PostgreSQL credentials from `.env` file.
4. Connects to the PostgreSQL DB and initializes `regions` table if it doesn't exist.
5. Inserts hierarchical regions into the `regions` table with proper parent-child relationships.
6. Creates indexes on the `regions` table for faster lookups.

- Requirements for the Python script are added in `requirements.txt`.

Signed-off-by: Nikolay Martyanov <[email protected]>
  • Loading branch information
OhmSpectator committed Oct 7, 2023
1 parent d10d902 commit eb36122
Show file tree
Hide file tree
Showing 6 changed files with 196 additions and 29 deletions.
1 change: 1 addition & 0 deletions deployment/.dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
init-db/venv
66 changes: 52 additions & 14 deletions deployment/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,29 +1,67 @@
# NOTE(review): this span is a GitHub diff rendering — removed lines from the
# pre-change Dockerfile are interleaved with the added ones, so it is not a
# valid standalone Dockerfile (e.g. the two CMD lines at the end are the old
# and new versions of the same line).
FROM postgis/postgis:latest
# First stage: Use Ubuntu to install GDAL, it's used by Fiona
FROM ubuntu:22.04 AS gdal-stage
RUN apt-get update && \
apt-get install -y gdal-bin libgdal-dev libarmadillo10

# Read the .env file
# NOTE(review): the four ARG lines below appear to belong to the removed
# (pre-change) version; GADM_FILE is re-declared later before it is used.
ARG POSTGRES_DB
ARG POSTGRES_USER
ARG POSTGRES_PASSWORD
ARG GADM_FILE
# Second stage: Use PostgreSQL image
FROM postgres:16.0

# Copy GDAL files from first stage, and set GDAL environment variables
# NOTE(review): copying binaries/libraries built on Ubuntu 22.04 into the
# Debian-based postgres:16.0 image assumes compatible glibc and sonames —
# TODO confirm this works across base-image updates.
COPY --from=gdal-stage /usr/bin/gdal* /usr/bin/
COPY --from=gdal-stage /usr/lib/libgdal* /usr/lib/
COPY --from=gdal-stage /usr/include/gdal /usr/include/gdal
ENV GDAL_CONFIG=/usr/bin/gdal-config
ENV CPLUS_INCLUDE_PATH=/usr/include/gdal
ENV C_INCLUDE_PATH=/usr/include/gdal

# Copy other libraries from first stage, also needed by Fiona
# NOTE(review): hard-coded sonames (libarmadillo.so.10, libpoppler.so.118)
# will silently break when the Ubuntu stage bumps these packages.
COPY --from=gdal-stage /usr/lib/libarmadillo.so.10 /usr/lib/
COPY --from=gdal-stage /usr/lib/x86_64-linux-gnu/libpoppler.so.118 /usr/lib/x86_64-linux-gnu/

# Install python3 and pip3
RUN apt-get update && \
apt-get install -y python3 python3-pip python3-venv libpq-dev libheif1 && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

# Copy initialization scripts and GPKG file
# (init-db.sh still exists in this commit — it was modified, not deleted.)
COPY ./init-db.sh /docker-entrypoint-initdb.d/init-db.sh

# Copy the GADM file to the container
COPY $GADM_FILE /tmp/gadm.gpkg
# Copy the init script to the container
COPY ./init-db/* /tmp/


# Create a virtual environment and activate it
RUN python3 -m venv /venv
ENV PATH="/venv/bin:$PATH"

# Install python requirements
# NOTE(review): no --no-cache-dir, so pip's download cache stays in the layer.
RUN pip3 install -r /tmp/requirements.txt
# Backticks run gdal-config at build time so the Python GDAL binding version
# matches the GDAL copied from the first stage.
RUN pip install GDAL==`gdal-config --version`

# Make the init script executable
RUN chmod +x /docker-entrypoint-initdb.d/init-db.sh

# Install GDAL
RUN apt-get update && apt-get install -y gdal-bin
# Copy the GADM file to the container
ARG GADM_FILE
COPY $GADM_FILE /tmp/gadm.gpkg

# Run PostgreSQL on default port
# (EXPOSE is documentation only; publishing happens in docker-compose.)
EXPOSE 5432

RUN apt-get update && apt-get install -y locales && \
localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
# NOTE(review): legacy space-separated ENV form; key=value is preferred.
ENV LANG en_US.utf8


# Use the .env file to set the environment variables
ENV POSTGRES_DB=$POSTGRES_DB
ENV POSTGRES_USER=$POSTGRES_USER
ENV POSTGRES_PASSWORD=$POSTGRES_PASSWORD
# NOTE(review): $DB_NAME/$DB_USER/$DB_PASSWORD are not declared as ARGs in the
# visible lines, so these ENVs would expand to empty strings at build time —
# TODO confirm the values are meant to come from docker-compose instead.
ENV POSTGRES_DB=$DB_NAME
ENV POSTGRES_USER=$DB_USER
ENV POSTGRES_PASSWORD=$DB_PASSWORD

# Copy the .env file to the container
# NOTE(review): baking .env (DB credentials) into an image layer is a
# secret-leak risk; prefer runtime environment or BuildKit secret mounts.
COPY .env /tmp/.env

USER postgres

CMD ["postgres"]
CMD ["postgres"]
13 changes: 5 additions & 8 deletions deployment/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,15 @@ services:
build:
context: .
args:
# NOTE(review): diff rendering — the three POSTGRES_* build args below appear
# to be the removed (pre-change) lines; only GADM_FILE remains after this
# commit (matches "5 additions & 8 deletions" in the diff summary).
POSTGRES_DB: ${POSTGRES_DB}
POSTGRES_USER: ${POSTGRES_USER}
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
GADM_FILE: ${GADM_FILE}
environment:
POSTGRES_DB: ${POSTGRES_DB}
POSTGRES_USER: ${POSTGRES_USER}
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
# New lines: container environment now sourced from DB_* variables in .env.
POSTGRES_DB: ${DB_NAME}
POSTGRES_USER: ${DB_USER}
POSTGRES_PASSWORD: ${DB_PASSWORD}
ports:
- "5432:5432"
volumes:
# NOTE(review): renaming the volume postgis_data -> postgres_data orphans any
# data in the old volume — presumably intentional since the engine changed
# from PostGIS to plain Postgres, but worth confirming.
- postgis_data:/var/lib/postgresql/data
- postgres_data:/var/lib/postgresql/data

volumes:
postgis_data:
postgres_data:
9 changes: 2 additions & 7 deletions deployment/init-db.sh
Original file line number Diff line number Diff line change
@@ -1,13 +1,8 @@
#!/bin/bash
set -e

# NOTE(review): diff rendering — the psql/ogr2ogr lines below are the removed
# (pre-change) body; the python3 call is their replacement in this commit.
# Connect to the PostgreSQL database and create the 'gadm' schema if it doesn't exist
echo "Creating gadm schema..."
psql -U "$POSTGRES_USER" -d "$POSTGRES_DB" -c "CREATE SCHEMA IF NOT EXISTS gadm;"

# Import GPKG file into PostGIS, specify the schema as 'gadm'
echo "Importing GPKG file into PostGIS..."
ogr2ogr -f "PostgreSQL" PG:"dbname=$POSTGRES_DB user=$POSTGRES_USER password=$POSTGRES_PASSWORD" "/tmp/gadm.gpkg" -lco SCHEMA=gadm
# NOTE(review): the Python script reads credentials via load_dotenv(), which
# searches from its working directory — verify it actually finds /tmp/.env
# when launched by the postgres docker-entrypoint initdb hook.
echo "Creating database..."
python3 /tmp/init-regions-table.py /tmp/gadm.gpkg

echo "Done."

133 changes: 133 additions & 0 deletions deployment/init-db/init-regions-table.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
"""Initialize the `regions` table in PostgreSQL from a GADM GeoPackage.

Reads the hierarchical region data (GID_0..GID_5 / NAME_0..NAME_5 columns)
from the single layer of a GADM GeoPackage — opened directly as an SQLite
database — and inserts each region into the `regions` table with its
parent-child link, creating parents before children.

Usage: python init-regions-table.py <path-to-gadm.gpkg>

PostgreSQL credentials are read from a `.env` file (DB_NAME, DB_USER,
DB_PASSWORD) via python-dotenv.
"""
from datetime import datetime
import os
import sqlite3
import sys

from dotenv import load_dotenv
import fiona
import psycopg2

# Check that the GeoPackage file was provided.
if len(sys.argv) < 2:
    print("Usage: python init-regions-table.py <path-to-gadm.gpkg>")
    sys.exit(1)

# A GeoPackage is an SQLite database, so read it with the sqlite3 driver.
gadm_file = sys.argv[1]
conn_gpkg = sqlite3.connect(gadm_file)
cur_gpkg = conn_gpkg.cursor()

# Read the DB credentials from .env.
load_dotenv()
db_name = os.getenv("DB_NAME")
db_user = os.getenv("DB_USER")
db_password = os.getenv("DB_PASSWORD")

# Check that the DB credentials were provided.
if db_name is None or db_user is None or db_password is None:
    print("Error: DB_NAME, DB_USER, and DB_PASSWORD must be provided in .env")
    sys.exit(1)

print("Connecting to the database...")

# Connect to the PostgreSQL database (host/port default to the local socket).
conn_pg = psycopg2.connect(f"dbname={db_name} user={db_user} password={db_password}")
cur_pg = conn_pg.cursor()

# Create the regions table if it doesn't exist.
# parentRegionId is NULL for top-level (level-0) regions.
cur_pg.execute("""
    CREATE TABLE IF NOT EXISTS regions (
        id SERIAL PRIMARY KEY,
        name VARCHAR(255) NOT NULL,
        parentRegionId INTEGER REFERENCES regions(id),
        hasSubregions BOOLEAN NOT NULL
    )
""")

# Get the layer name from the GeoPackage.
layers = fiona.listlayers(gadm_file)

# We expect only one layer in the GeoPackage; warn if there are more.
if len(layers) != 1:
    print(f"Warning: Expected only one layer in GeoPackage, found {len(layers)}. Using first layer.")

layer_name = layers[0]

# Number of levels in the GADM hierarchy (GID_0 .. GID_5).
num_levels = 6

# Columns to fetch from the GeoPackage: GID_0..GID_5, NAME_0..NAME_5.
columns = [f'GID_{i}' for i in range(num_levels)] + [f'NAME_{i}' for i in range(num_levels)]

# Fetch the relevant columns. The layer name comes from the file and is used
# as an SQL identifier, so quote it rather than interpolating it bare.
cur_gpkg.execute(f'SELECT {",".join(columns)} FROM "{layer_name}"')
rows = cur_gpkg.fetchall()

num_rows = len(rows)
print(f"Processing {num_rows} rows...")
# Width used to right-align row counts in progress messages.
max_row_digits = len(str(num_rows))

# Print a progress message every 1% of rows. Clamp to at least 1 so the
# modulo below cannot divide by zero when there are fewer than 100 rows.
rows_in_one_percent = max(1, num_rows // 100)

timestamp = datetime.now()
timestamp_start = timestamp

# Maps a GADM GID to the id of the already-inserted `regions` row, so each
# region is inserted only once even though it appears in many rows.
existing_gids = {}


# Loop through the rows from the SQLite cursor.
for i, row in enumerate(rows):
    if i % rows_in_one_percent == 0 and i > 0:
        # Progress message with an ETA estimated from the average
        # processing time of the rows handled so far.
        time_now = datetime.now()
        time_diff = (time_now - timestamp).total_seconds()
        total_time_diff = (time_now - timestamp_start).total_seconds()
        estimated_time_left = (total_time_diff / (i / num_rows)) - total_time_diff
        print(f"Handled {int(i / rows_in_one_percent):3d}% ({i:{max_row_digits}} rows) - last batch in {time_diff:.2f} seconds. Estimated time left: {estimated_time_left:.2f} seconds")
        timestamp = datetime.now()

    row_dict = dict(zip(columns, row))

    parent_region_id = None
    # Recreate the regions, starting from the highest level.
    for level in range(num_levels):
        gid = row_dict[f'GID_{level}']
        if not gid:
            # An empty GID means this row has no deeper levels.
            break
        name = row_dict[f'NAME_{level}']

        region_id = existing_gids.get(gid)
        if region_id is None:
            # First time we see this region: insert it.
            # hasSubregions is decided from this first row only —
            # NOTE(review): assumes GADM rows always fill GID_{level+1}
            # whenever the region has children; confirm against the data.
            has_subregions = level < num_levels - 1 and bool(row_dict[f'GID_{level + 1}'])
            # Use query parameters to give the database driver a chance to escape the values.
            query = "INSERT INTO regions (name, hasSubregions, parentRegionId) VALUES (%s, %s, %s) RETURNING id"
            params = (name, has_subregions, parent_region_id)
            cur_pg.execute(query, params)
            region_id = cur_pg.fetchone()[0]
            existing_gids[gid] = region_id

        # This region becomes the parent for the next (deeper) level.
        parent_region_id = region_id

print("Done, in total: ", datetime.now() - timestamp_start)

print("Creating indexes...")
# Index for fast lookups of children by parent.
cur_pg.execute("CREATE INDEX IF NOT EXISTS parent_region_idx ON regions (parentRegionId)")
print("Done")

# Commit the changes and close the connections.
conn_pg.commit()
cur_gpkg.close()
conn_gpkg.close()
cur_pg.close()
conn_pg.close()
3 changes: 3 additions & 0 deletions deployment/init-db/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
psycopg2-binary
python-dotenv
fiona

0 comments on commit eb36122

Please sign in to comment.