From 259891e60aa180ce04d4ba85917bf36beb815dd4 Mon Sep 17 00:00:00 2001
From: Forest Gregg
Date: Wed, 27 Apr 2022 13:45:59 -0400
Subject: [PATCH] start of new scraper cookie cutter

---
 .../templates/new-scraper/cookiecutter.json   |  3 +
 .../workflows/publish_docker_image.yml        | 19 ++++
 .../{{cookiecutter.module_name}}/Dockerfile   | 37 ++++++++++
 .../{{cookiecutter.module_name}}/Makefile     |  2 +
 .../docker-compose.yml                        | 67 +++++++++++++++++++
 .../scraped/.gitkeep                          |  0
 .../tests/docker-compose.yml                  |  7 ++
 .../{{cookiecutter.module_name}}/__init__.py  | 39 +++++++++++
 8 files changed, 174 insertions(+)
 create mode 100644 docker/templates/new-scraper/cookiecutter.json
 create mode 100644 docker/templates/new-scraper/{{cookiecutter.module_name}}/.github/workflows/publish_docker_image.yml
 create mode 100644 docker/templates/new-scraper/{{cookiecutter.module_name}}/Dockerfile
 create mode 100644 docker/templates/new-scraper/{{cookiecutter.module_name}}/Makefile
 create mode 100644 docker/templates/new-scraper/{{cookiecutter.module_name}}/docker-compose.yml
 create mode 100644 docker/templates/new-scraper/{{cookiecutter.module_name}}/scraped/.gitkeep
 create mode 100644 docker/templates/new-scraper/{{cookiecutter.module_name}}/tests/docker-compose.yml
 create mode 100644 docker/templates/new-scraper/{{cookiecutter.module_name}}/{{cookiecutter.module_name}}/__init__.py

diff --git a/docker/templates/new-scraper/cookiecutter.json b/docker/templates/new-scraper/cookiecutter.json
new file mode 100644
index 0000000..d21b8cd
--- /dev/null
+++ b/docker/templates/new-scraper/cookiecutter.json
@@ -0,0 +1,3 @@
+{
+  "module_name": "my_new_scraper"
+}
diff --git a/docker/templates/new-scraper/{{cookiecutter.module_name}}/.github/workflows/publish_docker_image.yml b/docker/templates/new-scraper/{{cookiecutter.module_name}}/.github/workflows/publish_docker_image.yml
new file mode 100644
index 0000000..5fdb613
--- /dev/null
+++ b/docker/templates/new-scraper/{{cookiecutter.module_name}}/.github/workflows/publish_docker_image.yml
@@ -0,0 +1,19 @@
+name: Publish Docker image
+on:
+  workflow_dispatch:
+jobs:
+  push_to_registry:
+    name: Push Docker image to GitHub Packages
+    runs-on: ubuntu-latest
+    steps:
+      - name: Login to GitHub Container Registry
+        uses: docker/login-action@v1
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      - name: Push to GitHub Packages
+        uses: docker/build-push-action@v2
+        with:
+          push: true
+          tags: ghcr.io/${{ github.repository }}:latest
diff --git a/docker/templates/new-scraper/{{cookiecutter.module_name}}/Dockerfile b/docker/templates/new-scraper/{{cookiecutter.module_name}}/Dockerfile
new file mode 100644
index 0000000..9717bb4
--- /dev/null
+++ b/docker/templates/new-scraper/{{cookiecutter.module_name}}/Dockerfile
@@ -0,0 +1,37 @@
+# Extend the base Python image
+# See https://hub.docker.com/_/python for version options
+# N.b., there are many options for Python images. We used the plain
+# version number in the pilot. YMMV. See this post for a discussion of
+# some options and their pros and cons:
+# https://pythonspeed.com/articles/base-image-python-docker-images/
+FROM python:3.7
+
+# Give ourselves some credit
+LABEL maintainer "DataMade "
+
+# Install any additional OS-level packages you need via apt-get. RUN statements
+# add additional layers to your image, increasing its final size. Keep your
+# image small by combining related commands into one RUN statement, e.g.,
+#
+#     RUN apt-get update && \
+#         apt-get install -y python-pip
+#
+# Read more on Dockerfile best practices at the source:
+# https://docs.docker.com/develop/develop-images/dockerfile_best-practices
+
+# Inside the container, create an app directory and switch into it
+RUN mkdir /app
+WORKDIR /app
+
+# Copy the requirements file into the app directory, and install them. Copy
+# only the requirements file, so Docker can cache this build step. Otherwise,
+# the requirements must be reinstalled every time you build the image after
+# the app code changes. See this post for further discussion of strategies
+# for building lean and efficient containers:
+# https://blog.realkinetic.com/building-minimal-docker-containers-for-python-applications-37d0272c52f3
+COPY ./requirements.txt /app/requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the contents of the current host directory (i.e., our app code) into
+# the container.
+COPY . /app
diff --git a/docker/templates/new-scraper/{{cookiecutter.module_name}}/Makefile b/docker/templates/new-scraper/{{cookiecutter.module_name}}/Makefile
new file mode 100644
index 0000000..bface2b
--- /dev/null
+++ b/docker/templates/new-scraper/{{cookiecutter.module_name}}/Makefile
@@ -0,0 +1,2 @@
+scrape :
+	python -m {{cookiecutter.module_name}} scraped
diff --git a/docker/templates/new-scraper/{{cookiecutter.module_name}}/docker-compose.yml b/docker/templates/new-scraper/{{cookiecutter.module_name}}/docker-compose.yml
new file mode 100644
index 0000000..a60f016
--- /dev/null
+++ b/docker/templates/new-scraper/{{cookiecutter.module_name}}/docker-compose.yml
@@ -0,0 +1,67 @@
+version: '2.4'
+
+services:
+  app:
+    image: {{cookiecutter.app_name}}
+    container_name: {{cookiecutter.app_name}}
+    build: .
+    # Allow container to be attached to, e.g., to access the pdb shell
+    stdin_open: true
+    tty: true
+    ports:
+      # Map ports on your computer to ports on your container. This allows you,
+      # e.g., to visit your containerized application in a browser on your
+      # computer.
+      - 8000:8000
+    depends_on:
+      postgres:
+        condition: service_healthy
+    volumes:
+      # Mount the development directory as a volume into the container, so
+      # Docker automatically recognizes your changes.
+      - .:/app{% if cookiecutter.local_settings != 'None' %}
+      - ${PWD}/{{cookiecutter.local_settings}}:/app/{{cookiecutter.local_settings|replace(".example", "")}}{% endif %}
+    command: {{cookiecutter.run_command}}{% if cookiecutter.auto_migrate == 'True' %}
+
+  migration:
+    container_name: {{cookiecutter.app_name}}-migration
+    image: {{cookiecutter.app_name}}:latest
+    depends_on:
+      # Declaring this dependency ensures that your application image is built
+      # before migrations are run, and that your application and migrations can
+      # be run from the same image, rather than creating purpose-specific
+      # copies.
+      - app
+    volumes:
+      # These should generally be the same as your application volumes.
+      - .:/app{% if cookiecutter.local_settings != 'None' %}
+      - ${PWD}/{{cookiecutter.local_settings}}:/app/{{cookiecutter.local_settings|replace(".example", "")}}{% endif %}
+    command: {{cookiecutter.migrate_command}}
+{% endif %}
+  postgres:
+    container_name: {{cookiecutter.app_name}}-postgres
+    image: {% if cookiecutter.postgis == 'True' %}mdillon/postgis{% else %}postgres{% endif %}:{{cookiecutter.pg_version}}
+    healthcheck:
+      test: ["CMD-SHELL", "pg_isready -U postgres"]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+    environment:
+      # The default Postgres image exposes a number of environmental variables
+      # that allow you to configure the container's behavior, without writing
+      # any additional code. Specify the name of your database, and any other
+      # variables, here. https://hub.docker.com/_/postgres/#environment-variables
+      - POSTGRES_DB={{cookiecutter.pg_db}}
+      - POSTGRES_PASSWORD=postgres
+    volumes:
+      # By default, Postgres instantiates an anonymous volume. Use a named
+      # one, so your data persists beyond the life of the container. See this
+      # post for a discussion of the pitfalls of Postgres and anonymous
+      # volumes: https://linuxhint.com/run_postgresql_docker_compose/
+      - {{cookiecutter.app_name}}-db-data:/var/lib/postgresql/data
+    ports:
+      - 32001:5432
+
+volumes:
+  # Declare your named volume for Postgres.
+  {{cookiecutter.app_name}}-db-data:
diff --git a/docker/templates/new-scraper/{{cookiecutter.module_name}}/scraped/.gitkeep b/docker/templates/new-scraper/{{cookiecutter.module_name}}/scraped/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/docker/templates/new-scraper/{{cookiecutter.module_name}}/tests/docker-compose.yml b/docker/templates/new-scraper/{{cookiecutter.module_name}}/tests/docker-compose.yml
new file mode 100644
index 0000000..9874183
--- /dev/null
+++ b/docker/templates/new-scraper/{{cookiecutter.module_name}}/tests/docker-compose.yml
@@ -0,0 +1,7 @@
+version: '2.4'
+
+services:
+  app:
+    # Don't restart the service when the command exits
+    restart: "no"
+    command: pytest -sxv
diff --git a/docker/templates/new-scraper/{{cookiecutter.module_name}}/{{cookiecutter.module_name}}/__init__.py b/docker/templates/new-scraper/{{cookiecutter.module_name}}/{{cookiecutter.module_name}}/__init__.py
new file mode 100644
index 0000000..6080f19
--- /dev/null
+++ b/docker/templates/new-scraper/{{cookiecutter.module_name}}/{{cookiecutter.module_name}}/__init__.py
@@ -0,0 +1,39 @@
+import scrapelib
+import lxml.html
+
+class Scraper(scrapelib.Scraper):
+    '''Rename me to something more descriptive'''
+
+    def _spider(self):
+        '''yield lxml.html pages'''
+        ...
+
+
+    def scrape(self):
+        '''yield dictionaries of data'''
+        for page in self._spider():
+            ...
+
+
+if __name__ == '__main__':
+    import argparse
+    import pathlib
+    import json
+
+    parser = argparse.ArgumentParser(description='Scrape your site')
+    parser.add_argument('output_dir', type=pathlib.Path)
+
+    args = parser.parse_args()
+
+    scraper = Scraper()
+
+    for result in scraper.scrape():
+        result_id = result['id']
+        file_name = f'{result_id}.json'
+        file_path = args.output_dir / file_name
+        with file_path.open('w') as f:
+            json.dump(result, f)
+
+
+
+
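
For reference, a minimal sketch of how the generated {{cookiecutter.module_name}}/__init__.py skeleton might be filled in, following the conventions above (_spider yields parsed lxml.html pages, scrape yields dictionaries that include an 'id' key). The class name, URL, and XPath expressions below are hypothetical placeholders, not part of the template:

    import scrapelib
    import lxml.html

    class ExampleScraper(scrapelib.Scraper):
        '''Hypothetical scraper for a simple press-release index page.'''

        def _spider(self):
            # Fetch the index page and yield it as a parsed lxml.html document.
            # scrapelib.Scraper layers throttling and retries on top of
            # requests, so self.get works like requests.get.
            response = self.get('https://example.com/press-releases')
            yield lxml.html.fromstring(response.text)

        def scrape(self):
            # Turn each page into one dictionary per record.
            for page in self._spider():
                for article in page.xpath('//article'):
                    yield {
                        'id': article.get('id'),
                        'title': article.findtext('.//h2'),
                    }

The command-line block at the bottom of the module writes each dictionary to the output directory as <id>.json, so every record should carry a unique 'id'.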