-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
deploy: Switched main DB initialization to Python script.
- Replaced `postgis/postgis` Docker image with a two-stage Docker build: first stage installs GDAL on Ubuntu 22.04, second stage uses `postgres:16.0` and copies over GDAL binaries. - Introduced `.dockerignore` to ignore Python virtual environment files. - Removed old bash script for DB initialization. - Added Python script `init-regions-table.py` for hierarchical DB initialization from GADM GeoPackage. How `init-regions-table.py` Works: 1. Reads GeoPackage file and connects to it as an SQLite database. 2. Fetches hierarchical region information (GID and NAME for different levels). 3. Reads PostgreSQL credentials from `.env` file. 4. Connects to the PostgreSQL DB and initializes `regions` table if it doesn't exist. 5. Inserts hierarchical regions into the `regions` table with proper parent-child relationships. 6. Creates indexes on the `regions` table for faster lookups. - Requirements for the Python script are added in `requirements.txt`. Signed-off-by: Nikolay Martyanov <[email protected]>
- Loading branch information
1 parent
d10d902
commit eb36122
Showing
6 changed files
with
196 additions
and
29 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
init-db/venv |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,29 +1,67 @@ | ||
# First stage: install GDAL on Ubuntu; its binaries and libraries are copied
# into the final image below (Fiona needs gdal-config and the GDAL headers
# at pip-install time).
FROM ubuntu:22.04 AS gdal-stage
RUN apt-get update && \
    apt-get install -y gdal-bin libgdal-dev libarmadillo10

# Second stage: the runtime image is plain PostgreSQL 16.
FROM postgres:16.0

# Copy GDAL files from the first stage and point build tooling at them.
COPY --from=gdal-stage /usr/bin/gdal* /usr/bin/
COPY --from=gdal-stage /usr/lib/libgdal* /usr/lib/
COPY --from=gdal-stage /usr/include/gdal /usr/include/gdal
ENV GDAL_CONFIG=/usr/bin/gdal-config
ENV CPLUS_INCLUDE_PATH=/usr/include/gdal
ENV C_INCLUDE_PATH=/usr/include/gdal

# Extra shared libraries from the first stage that GDAL/Fiona link against.
COPY --from=gdal-stage /usr/lib/libarmadillo.so.10 /usr/lib/
COPY --from=gdal-stage /usr/lib/x86_64-linux-gnu/libpoppler.so.118 /usr/lib/x86_64-linux-gnu/

# Install python3 and pip3; clean apt metadata in the same layer so it does
# not persist in the image.
RUN apt-get update && \
    apt-get install -y python3 python3-pip python3-venv libpq-dev libheif1 && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# First-start initialization hook picked up by the postgres entrypoint.
COPY ./init-db.sh /docker-entrypoint-initdb.d/init-db.sh
# Python init script and its requirements file.
COPY ./init-db/* /tmp/

# Create a virtual environment and put it first on PATH.
RUN python3 -m venv /venv
ENV PATH="/venv/bin:$PATH"

# Install python requirements (--no-cache-dir keeps the layer small).
RUN pip3 install --no-cache-dir -r /tmp/requirements.txt
# The GDAL binding version must match the gdal-config copied from stage one.
RUN pip install --no-cache-dir GDAL==`gdal-config --version`

# Make the init script executable
RUN chmod +x /docker-entrypoint-initdb.d/init-db.sh

# Copy the GADM GeoPackage into the container.
ARG GADM_FILE
COPY $GADM_FILE /tmp/gadm.gpkg

# Documentation only: PostgreSQL listens on the default port.
EXPOSE 5432

# Generate the UTF-8 locale used by initdb.
RUN apt-get update && apt-get install -y locales && \
    localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8 && \
    rm -rf /var/lib/apt/lists/*
ENV LANG=en_US.utf8

# FIX: these build args were referenced below but never declared in this
# stage, so the ENV values silently expanded to empty strings.
ARG DB_NAME
ARG DB_USER
ARG DB_PASSWORD
# NOTE(review): credentials passed via ARG/ENV are baked into the image and
# visible through `docker history` / `docker inspect`; prefer supplying them
# at runtime (compose env_file, secrets) for any image that leaves this host.
ENV POSTGRES_DB=$DB_NAME
ENV POSTGRES_USER=$DB_USER
ENV POSTGRES_PASSWORD=$DB_PASSWORD

# NOTE(review): the .env file ends up in an image layer — confirm this image
# is never pushed to a shared registry, or mount the file at runtime instead.
COPY .env /tmp/.env

USER postgres

CMD ["postgres"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,13 +1,8 @@ | ||
#!/bin/bash
# First-start initialization hook for the postgres image: delegates schema
# creation and data loading to the Python script copied in by the Dockerfile.
# -e: abort on error; -u: unset vars are errors; pipefail: pipelines fail
# if any member fails.
set -euo pipefail

echo "Creating database..."
# Run from /tmp so the script's load_dotenv() can find the /tmp/.env file
# the Dockerfile copies in (load_dotenv searches the working directory).
cd /tmp
python3 /tmp/init-regions-table.py /tmp/gadm.gpkg

echo "Done."
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,133 @@ | ||
"""Initialize the PostgreSQL ``regions`` table from a GADM GeoPackage.

Reads the hierarchical region columns (GID_0..GID_5 / NAME_0..NAME_5) out of
the GeoPackage — opened directly as an SQLite database — and inserts each
region into a ``regions`` table with parent/child links, creating the table
and its lookup index if they do not exist.

Usage: python init-regions-table.py <path-to-gadm.gpkg>
DB credentials are read from a .env file (DB_NAME, DB_USER, DB_PASSWORD).
"""
from datetime import datetime
import os
import sqlite3
import sys

from dotenv import load_dotenv
import fiona
import psycopg2

# Check that the GeoPackage file was provided
if len(sys.argv) < 2:
    print("Usage: python init-regions-table.py <path-to-gadm.gpkg>")
    sys.exit(1)

# A GeoPackage is an SQLite database, so connect to it directly.
gadm_file = sys.argv[1]
conn_gpkg = sqlite3.connect(gadm_file)
cur_gpkg = conn_gpkg.cursor()

# Read the DB credentials from .env. The Dockerfile copies the env file to
# /tmp/.env; fall back to it when the working directory has none —
# NOTE(review): assumes the deploy layout from this commit's Dockerfile.
load_dotenv()
if os.getenv("DB_NAME") is None and os.path.exists("/tmp/.env"):
    load_dotenv("/tmp/.env")
db_name = os.getenv("DB_NAME")
db_user = os.getenv("DB_USER")
db_password = os.getenv("DB_PASSWORD")

# Check that the DB credentials were provided
if db_name is None or db_user is None or db_password is None:
    print("Error: DB_NAME, DB_USER, and DB_PASSWORD must be provided in .env")
    sys.exit(1)

print("Connecting to the database...")

# Connect to the PostgreSQL database
conn_pg = psycopg2.connect(f"dbname={db_name} user={db_user} password={db_password}")
cur_pg = conn_pg.cursor()

# Create the regions table if it doesn't exist. parentRegionId is NULL for
# top-level (country) rows.
cur_pg.execute("""
    CREATE TABLE IF NOT EXISTS regions (
        id SERIAL PRIMARY KEY,
        name VARCHAR(255) NOT NULL,
        parentRegionId INTEGER REFERENCES regions(id),
        hasSubregions BOOLEAN NOT NULL
    )
""")

# Get the layer name from the GeoPackage; GADM ships a single layer, so warn
# (but continue with the first one) if that assumption breaks.
layers = fiona.listlayers(gadm_file)
if len(layers) != 1:
    print(f"Warning: Expected only one layer in GeoPackage, found {len(layers)}. Using first layer.")
layer_name = layers[0]

# Number of levels in the GADM hierarchy (GID_0 .. GID_5).
num_levels = 6

# Columns to fetch: GID_0..GID_5 followed by NAME_0..NAME_5.
columns = [f'GID_{i}' for i in range(num_levels)] + [f'NAME_{i}' for i in range(num_levels)]

# FIX: quote the layer name as an SQL identifier (identifiers cannot be bound
# as query parameters) so names with spaces or quotes don't break the SELECT.
quoted_layer = '"' + layer_name.replace('"', '""') + '"'
cur_gpkg.execute(f"SELECT {','.join(columns)} FROM {quoted_layer}")
rows = cur_gpkg.fetchall()

num_rows = len(rows)
print(f"Processing {num_rows} rows...")
# Digits in the row count, used to align the progress message.
max_row_digits = len(str(num_rows))

# Print a progress message every ~1% of rows.
# FIX: guard against num_rows < 100, which previously made this 0 and raised
# ZeroDivisionError in the modulo below.
rows_in_one_percent = max(1, num_rows // 100)

timestamp = datetime.now()
timestamp_start = timestamp

# Cache GID -> regions.id so ancestors shared by many rows are inserted once.
existing_gids = {}

# Loop through the rows from the SQLite cursor
for i, row in enumerate(rows):
    if i % rows_in_one_percent == 0 and i > 0:
        # Progress report: batch duration and an ETA extrapolated from the
        # average time per row so far.
        time_now = datetime.now()
        time_diff = (time_now - timestamp).total_seconds()
        total_time_diff = (time_now - timestamp_start).total_seconds()
        estimated_time_left = (total_time_diff / (i / num_rows)) - total_time_diff
        print(f"Handled {int(i / rows_in_one_percent):3d}% ({i:{max_row_digits}} rows) - last batch in {time_diff:.2f} seconds. Estimated time left: {estimated_time_left:.2f} seconds")
        timestamp = datetime.now()

    row_dict = dict(zip(columns, row))

    # Walk the hierarchy from country (level 0) downwards, inserting regions
    # not seen yet and remembering each id for its children.
    parent_region_id = None
    for level in range(num_levels):
        gid = row_dict[f'GID_{level}']
        if not gid:
            # Empty GID: this row has no deeper subdivisions.
            break
        name = row_dict[f'NAME_{level}']

        if gid not in existing_gids:
            # A region has subregions iff the next level's GID is present.
            has_subregions = level < num_levels - 1 and bool(row_dict[f'GID_{level + 1}'])
            # Use query parameters so the driver escapes the values.
            cur_pg.execute(
                "INSERT INTO regions (name, hasSubregions, parentRegionId) VALUES (%s, %s, %s) RETURNING id",
                (name, has_subregions, parent_region_id),
            )
            region_id = cur_pg.fetchone()[0]
            existing_gids[gid] = region_id
        else:
            # Region already inserted by an earlier row; reuse its id.
            region_id = existing_gids[gid]

        parent_region_id = region_id

print("Done, in total: ", datetime.now() - timestamp_start)

print("Creating indexes...")
# Speeds up child-region lookups by parent.
cur_pg.execute("CREATE INDEX IF NOT EXISTS parent_region_idx ON regions (parentRegionId)")
print("Done")

# Commit everything in one transaction and close the connections.
conn_pg.commit()
cur_gpkg.close()
conn_gpkg.close()
cur_pg.close()
conn_pg.close()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
psycopg2-binary | ||
python-dotenv | ||
fiona |