From a4f79211b1aea464e59397f126610af3e1eac9f0 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Sun, 9 Jun 2024 16:08:56 +0200 Subject: [PATCH 001/111] GHA Docker: build, tag and push cuda ml variants, too --- .github/workflows/docker-image.yml | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml index 2a1886959d..074e2d3f87 100644 --- a/.github/workflows/docker-image.yml +++ b/.github/workflows/docker-image.yml @@ -31,16 +31,24 @@ jobs: - name: Build the Docker image run: make docker - name: Build the Docker image with GPU support - run: make docker-cuda + run: make docker-cuda + - name: Build the Docker images with GPU support and ML frameworks + run: make docker-cuda-tf1 docker-cuda-tf2 docker-cuda-torch - name: Alias Docker images # default tag uses docker.io, so tag post-hoc run: | docker tag ocrd/core ${{ env.GHCRIO_DOCKER_TAG }} docker tag ocrd/core-cuda ${{ env.GHCRIO_DOCKER_TAG }}-cuda + docker tag ocrd/core-cuda-tf1 ${{ env.GHCRIO_DOCKER_TAG }}-cuda-tf1 + docker tag ocrd/core-cuda-tf2 ${{ env.GHCRIO_DOCKER_TAG }}-cuda-tf2 + docker tag ocrd/core-cuda-torch ${{ env.GHCRIO_DOCKER_TAG }}-cuda-torch - name: Smoke Test that ocrd --help works run: | docker run --rm ${{ env.GHCRIO_DOCKER_TAG }} ocrd --version docker run --rm ${{ env.GHCRIO_DOCKER_TAG }}-cuda ocrd --version + docker run --rm ${{ env.GHCRIO_DOCKER_TAG }}-cuda-tf1 ocrd --version + docker run --rm ${{ env.GHCRIO_DOCKER_TAG }}-cuda-tf2 ocrd --version + docker run --rm ${{ env.GHCRIO_DOCKER_TAG }}-cuda-torch ocrd --version - name: Login to GitHub Container Registry uses: docker/login-action@v3 with: @@ -56,16 +64,29 @@ jobs: run: | docker push ${{ env.GHCRIO_DOCKER_TAG }}:latest docker push ${{ env.GHCRIO_DOCKER_TAG }}-cuda:latest + docker push ${{ env.GHCRIO_DOCKER_TAG }}-cuda-tf1:latest + docker push ${{ env.GHCRIO_DOCKER_TAG }}-cuda-tf2:latest + docker push ${{ env.GHCRIO_DOCKER_TAG }}-cuda-torch:latest - name: Push images to Docker Hub run: | docker tag ${{ env.GHCRIO_DOCKER_TAG }} ${{ env.DOCKERIO_DOCKER_TAG }} docker tag ${{ env.GHCRIO_DOCKER_TAG }}-cuda ${{ env.DOCKERIO_DOCKER_TAG }}-cuda + docker tag ${{ env.GHCRIO_DOCKER_TAG }}-cuda-tf1 ${{ env.DOCKERIO_DOCKER_TAG }}-cuda-tf1 + docker tag ${{ env.GHCRIO_DOCKER_TAG }}-cuda-tf2 ${{ env.DOCKERIO_DOCKER_TAG }}-cuda-tf2 + docker tag ${{ env.GHCRIO_DOCKER_TAG }}-cuda-torch ${{ env.DOCKERIO_DOCKER_TAG }}-cuda-torch docker push ${{ env.DOCKERIO_DOCKER_TAG }}:latest docker push ${{ env.DOCKERIO_DOCKER_TAG }}-cuda:latest - docker push ${{ env.DOCKERIO_DOCKER_TAG }}:latest - docker push ${{ env.DOCKERIO_DOCKER_TAG }}-cuda:latest + docker push ${{ env.DOCKERIO_DOCKER_TAG }}-cuda-tf1:latest + docker push ${{ env.DOCKERIO_DOCKER_TAG }}-cuda-tf2:latest + docker push ${{ env.DOCKERIO_DOCKER_TAG }}-cuda-torch:latest docker tag ${{ env.DOCKERIO_DOCKER_TAG }} ${{ env.DOCKERIO_DOCKER_TAG }}:$(git describe --tags --abbrev=0) docker tag ${{ env.DOCKERIO_DOCKER_TAG }}-cuda ${{ env.DOCKERIO_DOCKER_TAG }}-cuda:$(git describe --tags --abbrev=0) + docker tag ${{ env.DOCKERIO_DOCKER_TAG }}-cuda-tf1 ${{ env.DOCKERIO_DOCKER_TAG }}-cuda-tf1:$(git describe --tags --abbrev=0) + docker tag ${{ env.DOCKERIO_DOCKER_TAG }}-cuda-tf2 ${{ env.DOCKERIO_DOCKER_TAG }}-cuda-tf2:$(git describe --tags --abbrev=0) + docker tag ${{ env.DOCKERIO_DOCKER_TAG }}-cuda-torch ${{ env.DOCKERIO_DOCKER_TAG }}-cuda-torch:$(git describe --tags --abbrev=0) docker push 
${{ env.DOCKERIO_DOCKER_TAG }}:$(git describe --tags --abbrev=0) docker push ${{ env.DOCKERIO_DOCKER_TAG }}-cuda:$(git describe --tags --abbrev=0) + docker push ${{ env.DOCKERIO_DOCKER_TAG }}-cuda-tf1:$(git describe --tags --abbrev=0) + docker push ${{ env.DOCKERIO_DOCKER_TAG }}-cuda-tf2:$(git describe --tags --abbrev=0) + docker push ${{ env.DOCKERIO_DOCKER_TAG }}-cuda-torch:$(git describe --tags --abbrev=0) From 44a56bfab0cbd9d0a056132cc611ab4cbeba8682 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Mon, 10 Jun 2024 14:51:04 +0200 Subject: [PATCH 002/111] docker: try to build for non-x86 platforms, too include ARMv7 (Cortex-A...), ARMv8 (M1...), Power9 (HPC) --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 1b4ef47bd2..4cfe02cae7 100644 --- a/Makefile +++ b/Makefile @@ -394,7 +394,7 @@ docker-cuda-torch: DOCKER_FILE = Dockerfile.cuda-torch docker-cuda-torch: docker-cuda docker docker-cuda docker-cuda-tf1 docker-cuda-tf2 docker-cuda-torch: - docker build --progress=plain -f $(DOCKER_FILE) -t $(DOCKER_TAG) --target ocrd_core_base --build-arg BASE_IMAGE=$(DOCKER_BASE_IMAGE) $(DOCKER_ARGS) . + docker build --progress=plain --platform linux/amd64,linux/arm/v7,linux/arm64/v8,linux/powerpc64le -f $(DOCKER_FILE) -t $(DOCKER_TAG) --target ocrd_core_base --build-arg BASE_IMAGE=$(DOCKER_BASE_IMAGE) $(DOCKER_ARGS) . # Build wheels and source dist and twine upload them pypi: build From 56ce641926a210eb75444555a5e7ceb9f5aba02a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 10 Jun 2024 16:26:13 +0200 Subject: [PATCH 003/111] CD: fix typo in docker arch, use buildx directly, pass platforms via variable --- Makefile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 4cfe02cae7..11509e4600 100644 --- a/Makefile +++ b/Makefile @@ -48,6 +48,7 @@ help: @echo " DOCKER_TAG Docker target image tag. Default: '$(DOCKER_TAG)'." @echo " DOCKER_BASE_IMAGE Docker source image tag. Default: '$(DOCKER_BASE_IMAGE)'." @echo " DOCKER_ARGS Additional arguments to docker build. Default: '$(DOCKER_ARGS)'" + @echo " DOCKER_PLATFORMS OS/architecture combinations to (cross-)build for. Default: '$(DOCKER_PLATFORMS)'" @echo " PIP_INSTALL pip install command. Default: $(PIP_INSTALL)" @echo " PYTEST_ARGS arguments for pytest. Default: $(PYTEST_ARGS)" @@ -362,6 +363,7 @@ pyclean: # Additional arguments to docker build. Default: '$(DOCKER_ARGS)' DOCKER_ARGS = +DOCKER_PLATFORMS = linux/amd64,linux/arm/v7,linux/arm64/v8,linux/ppc64le # Build docker image docker: DOCKER_BASE_IMAGE = ubuntu:20.04 @@ -393,8 +395,8 @@ docker-cuda-torch: DOCKER_FILE = Dockerfile.cuda-torch docker-cuda-torch: docker-cuda -docker docker-cuda docker-cuda-tf1 docker-cuda-tf2 docker-cuda-torch: - docker build --progress=plain --platform linux/amd64,linux/arm/v7,linux/arm64/v8,linux/powerpc64le -f $(DOCKER_FILE) -t $(DOCKER_TAG) --target ocrd_core_base --build-arg BASE_IMAGE=$(DOCKER_BASE_IMAGE) $(DOCKER_ARGS) . +docker docker-cuda docker-cuda-tf1 docker-cuda-tf2 docker-cuda-torch: + docker buildx build --progress=plain --platform $(DOCKER_PLATFORMS) -f $(DOCKER_FILE) -t $(DOCKER_TAG) --target ocrd_core_base --build-arg BASE_IMAGE=$(DOCKER_BASE_IMAGE) $(DOCKER_ARGS) . 
# Build wheels and source dist and twine upload them pypi: build From b2ae951a628b2a1e24e7c257eba5591d6ef2ccc1 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 12 Jun 2024 18:23:19 +0200 Subject: [PATCH 004/111] ensure deps (opencv-python-headless / lxml) can be compiled --- Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 11509e4600..e6c500b8cc 100644 --- a/Makefile +++ b/Makefile @@ -55,7 +55,7 @@ help: # END-EVAL # pip install command. Default: $(PIP_INSTALL) -PIP_INSTALL ?= $(PIP) install +PIP_INSTALL ?= $(PIP) install --no-build-isolation PIP_INSTALL_CONFIG_OPTION ?= .PHONY: deps-cuda deps-ubuntu deps-test @@ -163,7 +163,7 @@ deps-torch: # Dependencies for deployment in an ubuntu/debian linux deps-ubuntu: - apt-get install -y python3 imagemagick libgeos-dev + apt-get install -y python3 imagemagick libgeos-dev libxml2-dev libxslt-dev libssl-dev # Install test python deps via pip deps-test: @@ -186,7 +186,7 @@ install: #build $(PIP) config set global.no-binary shapely # Install with pip install -e -install-dev: PIP_INSTALL = $(PIP) install -e +install-dev: PIP_INSTALL = $(PIP) install --no-build-isolation -e install-dev: PIP_INSTALL_CONFIG_OPTION = --config-settings editable_mode=strict install-dev: uninstall $(MAKE) install From d978c93ea80fba9b35b6a32b16a44f279176ab72 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 14 Jun 2024 13:57:23 +0200 Subject: [PATCH 005/111] docker: move smoke test from CD to Dockerfile --- .github/workflows/docker-image.yml | 29 +++++++++++------------------ Dockerfile | 2 ++ 2 files changed, 13 insertions(+), 18 deletions(-) diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml index 074e2d3f87..73a782c646 100644 --- a/.github/workflows/docker-image.yml +++ b/.github/workflows/docker-image.yml @@ -28,6 +28,17 @@ jobs: - # Activate cache export feature to reduce build time of images name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: Log in to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERIO_USERNAME }} + password: ${{ secrets.DOCKERIO_PASSWORD }} - name: Build the Docker image run: make docker - name: Build the Docker image with GPU support @@ -42,24 +53,6 @@ jobs: docker tag ocrd/core-cuda-tf1 ${{ env.GHCRIO_DOCKER_TAG }}-cuda-tf1 docker tag ocrd/core-cuda-tf2 ${{ env.GHCRIO_DOCKER_TAG }}-cuda-tf2 docker tag ocrd/core-cuda-torch ${{ env.GHCRIO_DOCKER_TAG }}-cuda-torch - - name: Smoke Test that ocrd --help works - run: | - docker run --rm ${{ env.GHCRIO_DOCKER_TAG }} ocrd --version - docker run --rm ${{ env.GHCRIO_DOCKER_TAG }}-cuda ocrd --version - docker run --rm ${{ env.GHCRIO_DOCKER_TAG }}-cuda-tf1 ocrd --version - docker run --rm ${{ env.GHCRIO_DOCKER_TAG }}-cuda-tf2 ocrd --version - docker run --rm ${{ env.GHCRIO_DOCKER_TAG }}-cuda-torch ocrd --version - - name: Login to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - name: Log in to Docker Hub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKERIO_USERNAME }} - password: ${{ secrets.DOCKERIO_PASSWORD }} - name: Push images to Github Container Registry run: | docker push ${{ env.GHCRIO_DOCKER_TAG }}:latest diff --git a/Dockerfile 
b/Dockerfile index fd57e5014d..144ae774dc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -38,6 +38,8 @@ RUN python3 -m venv /usr/local \ && hash -r \ && make install-dev \ && eval $FIXUP +# Smoke Test +RUN ocrd --version WORKDIR /data From be89806d27fcef0116726bd4f3e59d157215b5e2 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 14 Jun 2024 14:13:22 +0200 Subject: [PATCH 006/111] =?UTF-8?q?docker:=20refactor=20for=20multi-platfo?= =?UTF-8?q?rm=20multi-tag=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - make build backend configurable (DOCKER_BUILD), default to docker build again - on CD, user docker buildx build --push --platform ..., so we can build and publish multi-platform images - make tags configurable and multi-valued (DOCKER_TAG), so multiple registries can be pushed to without extra aliasing/pushing --- .github/workflows/docker-image.yml | 49 +++++------------------------- Makefile | 25 +++++++-------- 2 files changed, 21 insertions(+), 53 deletions(-) diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml index 73a782c646..a1173f3ae6 100644 --- a/.github/workflows/docker-image.yml +++ b/.github/workflows/docker-image.yml @@ -6,8 +6,7 @@ on: workflow_dispatch: # run manually env: - GHCRIO_DOCKER_TAG: ghcr.io/ocr-d/core - DOCKERIO_DOCKER_TAG: docker.io/ocrd/core + PLATFORMS: linux/amd64,linux/arm/v7,linux/arm64/v8,linux/ppc64le jobs: @@ -17,8 +16,14 @@ jobs: permissions: packages: write contents: read - + env: + DOCKER_BASE_TAG: ghcr.io/ocr-d docker.io/ocrd + DOCKER_BUILD: docker buildx build --progress=plain --platform ${{ env.PLATFORMS }} --push steps: + - name: Export variables + run: | + echo "DOCKER_BASE_TAG='${{ env.DOCKER_BASE_TAG }}'" >> $GITHUB_ENV + echo "DOCKER_BUILD='${{ env.DOCKER_BUILD }}'" >> $GITHUB_ENV - name: Checkout uses: actions/checkout@v4 with: @@ -45,41 +50,3 @@ jobs: run: make docker-cuda - name: Build the Docker images with GPU support and ML frameworks run: make docker-cuda-tf1 docker-cuda-tf2 docker-cuda-torch - - name: Alias Docker images - # default tag uses docker.io, so tag post-hoc - run: | - docker tag ocrd/core ${{ env.GHCRIO_DOCKER_TAG }} - docker tag ocrd/core-cuda ${{ env.GHCRIO_DOCKER_TAG }}-cuda - docker tag ocrd/core-cuda-tf1 ${{ env.GHCRIO_DOCKER_TAG }}-cuda-tf1 - docker tag ocrd/core-cuda-tf2 ${{ env.GHCRIO_DOCKER_TAG }}-cuda-tf2 - docker tag ocrd/core-cuda-torch ${{ env.GHCRIO_DOCKER_TAG }}-cuda-torch - - name: Push images to Github Container Registry - run: | - docker push ${{ env.GHCRIO_DOCKER_TAG }}:latest - docker push ${{ env.GHCRIO_DOCKER_TAG }}-cuda:latest - docker push ${{ env.GHCRIO_DOCKER_TAG }}-cuda-tf1:latest - docker push ${{ env.GHCRIO_DOCKER_TAG }}-cuda-tf2:latest - docker push ${{ env.GHCRIO_DOCKER_TAG }}-cuda-torch:latest - - name: Push images to Docker Hub - run: | - docker tag ${{ env.GHCRIO_DOCKER_TAG }} ${{ env.DOCKERIO_DOCKER_TAG }} - docker tag ${{ env.GHCRIO_DOCKER_TAG }}-cuda ${{ env.DOCKERIO_DOCKER_TAG }}-cuda - docker tag ${{ env.GHCRIO_DOCKER_TAG }}-cuda-tf1 ${{ env.DOCKERIO_DOCKER_TAG }}-cuda-tf1 - docker tag ${{ env.GHCRIO_DOCKER_TAG }}-cuda-tf2 ${{ env.DOCKERIO_DOCKER_TAG }}-cuda-tf2 - docker tag ${{ env.GHCRIO_DOCKER_TAG }}-cuda-torch ${{ env.DOCKERIO_DOCKER_TAG }}-cuda-torch - docker push ${{ env.DOCKERIO_DOCKER_TAG }}:latest - docker push ${{ env.DOCKERIO_DOCKER_TAG }}-cuda:latest - docker push ${{ env.DOCKERIO_DOCKER_TAG }}-cuda-tf1:latest - docker push ${{ env.DOCKERIO_DOCKER_TAG }}-cuda-tf2:latest - docker push 
${{ env.DOCKERIO_DOCKER_TAG }}-cuda-torch:latest - docker tag ${{ env.DOCKERIO_DOCKER_TAG }} ${{ env.DOCKERIO_DOCKER_TAG }}:$(git describe --tags --abbrev=0) - docker tag ${{ env.DOCKERIO_DOCKER_TAG }}-cuda ${{ env.DOCKERIO_DOCKER_TAG }}-cuda:$(git describe --tags --abbrev=0) - docker tag ${{ env.DOCKERIO_DOCKER_TAG }}-cuda-tf1 ${{ env.DOCKERIO_DOCKER_TAG }}-cuda-tf1:$(git describe --tags --abbrev=0) - docker tag ${{ env.DOCKERIO_DOCKER_TAG }}-cuda-tf2 ${{ env.DOCKERIO_DOCKER_TAG }}-cuda-tf2:$(git describe --tags --abbrev=0) - docker tag ${{ env.DOCKERIO_DOCKER_TAG }}-cuda-torch ${{ env.DOCKERIO_DOCKER_TAG }}-cuda-torch:$(git describe --tags --abbrev=0) - docker push ${{ env.DOCKERIO_DOCKER_TAG }}:$(git describe --tags --abbrev=0) - docker push ${{ env.DOCKERIO_DOCKER_TAG }}-cuda:$(git describe --tags --abbrev=0) - docker push ${{ env.DOCKERIO_DOCKER_TAG }}-cuda-tf1:$(git describe --tags --abbrev=0) - docker push ${{ env.DOCKERIO_DOCKER_TAG }}-cuda-tf2:$(git describe --tags --abbrev=0) - docker push ${{ env.DOCKERIO_DOCKER_TAG }}-cuda-torch:$(git describe --tags --abbrev=0) - diff --git a/Makefile b/Makefile index e6c500b8cc..18dc8f8e2d 100644 --- a/Makefile +++ b/Makefile @@ -362,41 +362,42 @@ pyclean: .PHONY: docker docker-cuda # Additional arguments to docker build. Default: '$(DOCKER_ARGS)' -DOCKER_ARGS = -DOCKER_PLATFORMS = linux/amd64,linux/arm/v7,linux/arm64/v8,linux/ppc64le +DOCKER_ARGS ?= +DOCKER_BASE_TAG ?= ocrd +DOCKER_BUILD ?= docker build --progress=plain # Build docker image docker: DOCKER_BASE_IMAGE = ubuntu:20.04 -docker: DOCKER_TAG = ocrd/core +docker: DOCKER_TAG = $(DOCKER_BASE_TAG)/core docker: DOCKER_FILE = Dockerfile # Build extended sets for maximal layer sharing -docker-cuda: DOCKER_BASE_IMAGE = ocrd/core -docker-cuda: DOCKER_TAG = ocrd/core-cuda +docker-cuda: DOCKER_BASE_IMAGE = $(DOCKER_BASE_TAG)/core +docker-cuda: DOCKER_TAG = $(DOCKER_BASE_TAG)/core-cuda docker-cuda: DOCKER_FILE = Dockerfile.cuda docker-cuda: docker -docker-cuda-tf1: DOCKER_BASE_IMAGE = ocrd/core-cuda -docker-cuda-tf1: DOCKER_TAG = ocrd/core-cuda-tf1 +docker-cuda-tf1: DOCKER_BASE_IMAGE = $(DOCKER_BASE_TAG)/core-cuda +docker-cuda-tf1: DOCKER_TAG = $(DOCKER_BASE_TAG)/core-cuda-tf1 docker-cuda-tf1: DOCKER_FILE = Dockerfile.cuda-tf1 docker-cuda-tf1: docker-cuda -docker-cuda-tf2: DOCKER_BASE_IMAGE = ocrd/core-cuda -docker-cuda-tf2: DOCKER_TAG = ocrd/core-cuda-tf2 +docker-cuda-tf2: DOCKER_BASE_IMAGE = $(DOCKER_BASE_TAG)/core-cuda +docker-cuda-tf2: DOCKER_TAG = $(DOCKER_BASE_TAG)/core-cuda-tf2 docker-cuda-tf2: DOCKER_FILE = Dockerfile.cuda-tf2 docker-cuda-tf2: docker-cuda -docker-cuda-torch: DOCKER_BASE_IMAGE = ocrd/core-cuda -docker-cuda-torch: DOCKER_TAG = ocrd/core-cuda-torch +docker-cuda-torch: DOCKER_BASE_IMAGE = $(DOCKER_BASE_TAG)/core-cuda +docker-cuda-torch: DOCKER_TAG = $(DOCKER_BASE_TAG)/core-cuda-torch docker-cuda-torch: DOCKER_FILE = Dockerfile.cuda-torch docker-cuda-torch: docker-cuda docker docker-cuda docker-cuda-tf1 docker-cuda-tf2 docker-cuda-torch: - docker buildx build --progress=plain --platform $(DOCKER_PLATFORMS) -f $(DOCKER_FILE) -t $(DOCKER_TAG) --target ocrd_core_base --build-arg BASE_IMAGE=$(DOCKER_BASE_IMAGE) $(DOCKER_ARGS) . + $(DOCKER_BUILD) -f $(DOCKER_FILE) $(DOCKER_TAG:%=-t %) --target ocrd_core_base --build-arg BASE_IMAGE=$(DOCKER_BASE_IMAGE) $(DOCKER_ARGS) . 
# Build wheels and source dist and twine upload them pypi: build From bc9d73352804757214f856d880508b043cc30e32 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 19 Jun 2024 13:09:50 +0200 Subject: [PATCH 007/111] deps: revert b2ae951a (no-build-isolation) --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 18dc8f8e2d..224aba220f 100644 --- a/Makefile +++ b/Makefile @@ -55,7 +55,7 @@ help: # END-EVAL # pip install command. Default: $(PIP_INSTALL) -PIP_INSTALL ?= $(PIP) install --no-build-isolation +PIP_INSTALL ?= $(PIP) install PIP_INSTALL_CONFIG_OPTION ?= .PHONY: deps-cuda deps-ubuntu deps-test @@ -186,7 +186,7 @@ install: #build $(PIP) config set global.no-binary shapely # Install with pip install -e -install-dev: PIP_INSTALL = $(PIP) install --no-build-isolation -e +install-dev: PIP_INSTALL = $(PIP) install -e install-dev: PIP_INSTALL_CONFIG_OPTION = --config-settings editable_mode=strict install-dev: uninstall $(MAKE) install From 67f77ae4ca8da49ba7b55384aa1ad5c74b2abf1a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 19 Jun 2024 13:13:13 +0200 Subject: [PATCH 008/111] CD: disable target arm/v7 due to build problems --- .github/workflows/docker-image.yml | 4 +++- Makefile | 1 - 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml index a1173f3ae6..0c6501020f 100644 --- a/.github/workflows/docker-image.yml +++ b/.github/workflows/docker-image.yml @@ -6,7 +6,9 @@ on: workflow_dispatch: # run manually env: - PLATFORMS: linux/amd64,linux/arm/v7,linux/arm64/v8,linux/ppc64le + # FIXME: linux/arm/v7 disabled as long as scikit-build/cmake-python-distributions#503 is unresolved + # PLATFORMS: linux/amd64,linux/arm/v7,linux/arm64/v8,linux/ppc64le + PLATFORMS: linux/amd64,linux/arm64/v8,linux/ppc64le jobs: diff --git a/Makefile b/Makefile index 224aba220f..52bb649e88 100644 --- a/Makefile +++ b/Makefile @@ -48,7 +48,6 @@ help: @echo " DOCKER_TAG Docker target image tag. Default: '$(DOCKER_TAG)'." @echo " DOCKER_BASE_IMAGE Docker source image tag. Default: '$(DOCKER_BASE_IMAGE)'." @echo " DOCKER_ARGS Additional arguments to docker build. Default: '$(DOCKER_ARGS)'" - @echo " DOCKER_PLATFORMS OS/architecture combinations to (cross-)build for. Default: '$(DOCKER_PLATFORMS)'" @echo " PIP_INSTALL pip install command. Default: $(PIP_INSTALL)" @echo " PYTEST_ARGS arguments for pytest. 
Default: $(PYTEST_ARGS)" From 721fdbc8a0ed1c10fa4aef66ae15cd157e24ee6f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 25 Jun 2024 10:42:51 +0200 Subject: [PATCH 009/111] docker: fix for multi-valued DOCKER_BASE_TAG --- Makefile | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index 52bb649e88..c80dffffe3 100644 --- a/Makefile +++ b/Makefile @@ -367,30 +367,30 @@ DOCKER_BUILD ?= docker build --progress=plain # Build docker image docker: DOCKER_BASE_IMAGE = ubuntu:20.04 -docker: DOCKER_TAG = $(DOCKER_BASE_TAG)/core +docker: DOCKER_TAG = $(DOCKER_BASE_TAG:%=%/core) docker: DOCKER_FILE = Dockerfile # Build extended sets for maximal layer sharing docker-cuda: DOCKER_BASE_IMAGE = $(DOCKER_BASE_TAG)/core -docker-cuda: DOCKER_TAG = $(DOCKER_BASE_TAG)/core-cuda +docker-cuda: DOCKER_TAG = $(DOCKER_BASE_TAG:%=%/core-cuda) docker-cuda: DOCKER_FILE = Dockerfile.cuda docker-cuda: docker docker-cuda-tf1: DOCKER_BASE_IMAGE = $(DOCKER_BASE_TAG)/core-cuda -docker-cuda-tf1: DOCKER_TAG = $(DOCKER_BASE_TAG)/core-cuda-tf1 +docker-cuda-tf1: DOCKER_TAG = $(DOCKER_BASE_TAG:%=%/core-cuda-tf1) docker-cuda-tf1: DOCKER_FILE = Dockerfile.cuda-tf1 docker-cuda-tf1: docker-cuda docker-cuda-tf2: DOCKER_BASE_IMAGE = $(DOCKER_BASE_TAG)/core-cuda -docker-cuda-tf2: DOCKER_TAG = $(DOCKER_BASE_TAG)/core-cuda-tf2 +docker-cuda-tf2: DOCKER_TAG = $(DOCKER_BASE_TAG:%=%/core-cuda-tf2) docker-cuda-tf2: DOCKER_FILE = Dockerfile.cuda-tf2 docker-cuda-tf2: docker-cuda docker-cuda-torch: DOCKER_BASE_IMAGE = $(DOCKER_BASE_TAG)/core-cuda -docker-cuda-torch: DOCKER_TAG = $(DOCKER_BASE_TAG)/core-cuda-torch +docker-cuda-torch: DOCKER_TAG = $(DOCKER_BASE_TAG:%=%/core-cuda-torch) docker-cuda-torch: DOCKER_FILE = Dockerfile.cuda-torch docker-cuda-torch: docker-cuda From 8c8f0422ae8fb2c0c572ba1c536521424127065b Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 25 Jun 2024 15:54:57 +0200 Subject: [PATCH 010/111] :memo: changelog --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 049f804b3e..73251f9e11 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,10 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +Fixed: + + * GHA Docker: build docker.io first, then tag ghcr.io + ## [2.66.0] - 2024-06-07 Fixed: From 79c61e303c87f229d5c96aedc0da31ef82b0f5d3 Mon Sep 17 00:00:00 2001 From: kba Date: Wed, 26 Jun 2024 14:03:54 +0200 Subject: [PATCH 011/111] :package: v2.66.1 --- CHANGELOG.md | 6 +++++- VERSION | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 73251f9e11..46a9b83d7a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,9 @@ Versioned according to [Semantic Versioning](http://semver.org/). 
## Unreleased +## [2.66.1] - 2024-06-26 + + Fixed: * GHA Docker: build docker.io first, then tag ghcr.io @@ -2096,8 +2099,9 @@ Fixed ## [0.0.1] - 2018-04-17 Initial Release -] + +[2.66.1]: ../../compare/v2.66.1..v2.66.0 [2.66.0]: ../../compare/v2.66.0..v2.65.0 [2.65.0]: ../../compare/v2.65.0..v2.64.1 [2.64.1]: ../../compare/v2.64.1..v2.64.0 diff --git a/VERSION b/VERSION index 3d6ac35b13..64fb9e5e48 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.66.0 +2.66.1 From cd2e0b01051d295d5bd9565b3d947a063ce7c7db Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Thu, 27 Jun 2024 00:16:40 +0200 Subject: [PATCH 012/111] workspace: document mets_server_url kwarg --- src/ocrd/resolver.py | 3 +++ src/ocrd/workspace.py | 16 ++++++++++------ 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/src/ocrd/resolver.py b/src/ocrd/resolver.py index 4b8fe6b217..fa98c82d0e 100644 --- a/src/ocrd/resolver.py +++ b/src/ocrd/resolver.py @@ -174,6 +174,9 @@ def workspace_from_url( By default existing ``mets.xml`` will raise an exception. download (boolean, False): Whether to also download all the files referenced by the METS src_baseurl (string, None): Base URL for resolving relative file locations + mets_server_url (string, None): URI of TCP or local path of UDS for METS server handling + the `OcrdMets` of the workspace. By default the METS will be read from and written to + the filesystem directly. **kwargs (): Passed on to ``OcrdMets.find_files`` if download == True Download (clone) :py:attr:`mets_url` to ``mets.xml`` in :py:attr:`dst_dir`, unless diff --git a/src/ocrd/workspace.py b/src/ocrd/workspace.py index 41ea8e9005..2fb3dd283f 100644 --- a/src/ocrd/workspace.py +++ b/src/ocrd/workspace.py @@ -64,13 +64,17 @@ class Workspace(): :py:class:`ocrd.resolver.Resolver`. Args: - - directory (string) : Filesystem folder to work in + resolver (:py:class:`ocrd.Resolver`) : `Resolver` instance + directory (string) : Filesystem path to work in mets (:py:class:`ocrd_models.ocrd_mets.OcrdMets`) : `OcrdMets` representing this workspace. - Loaded from `'mets.xml'` if `None`. - mets_basename (string) : Basename of the METS XML file. Default: Last URL segment of the mets_url. - overwrite_mode (boolean) : Whether to force add operations on this workspace globally - baseurl (string) : Base URL to prefix to relative URL. + If `None`, then loaded from ``directory``/``mets_basename`` + or delegated to ``mets_server_url``. + mets_basename (string, mets.xml) : Basename of the METS XML file in the workspace directory. + mets_server_url (string, None) : URI of TCP or local path of UDS for METS server handling the + `OcrdMets` of this workspace. If `None`, then the METS will be read from and written to + the filesystem directly. + baseurl (string, None) : Base URL to prefix to relative URL. 
+ overwrite_mode (boolean, False) : Whether to force add operations on this workspace globally """ def __init__( From 9428c68468769af36c36cba6ab327188fb239c49 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Thu, 27 Jun 2024 00:24:40 +0200 Subject: [PATCH 013/111] task_sequence: add mets_server_url kwarg --- src/ocrd/task_sequence.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/ocrd/task_sequence.py b/src/ocrd/task_sequence.py index da691fbc1d..ea8f21042b 100644 --- a/src/ocrd/task_sequence.py +++ b/src/ocrd/task_sequence.py @@ -115,9 +115,9 @@ def validate_tasks(tasks, workspace, page_id=None, overwrite=False): return report -def run_tasks(mets, log_level, page_id, task_strs, overwrite=False): +def run_tasks(mets, log_level, page_id, task_strs, overwrite=False, mets_server_url=None): resolver = Resolver() - workspace = resolver.workspace_from_url(mets) + workspace = resolver.workspace_from_url(mets, mets_server_url=mets_server_url) log = getLogger('ocrd.task_sequence.run_tasks') tasks = [ProcessorTask.parse(task_str) for task_str in task_strs] @@ -139,7 +139,8 @@ def run_tasks(mets, log_level, page_id, task_strs, overwrite=False): overwrite=overwrite, input_file_grp=','.join(task.input_file_grps), output_file_grp=','.join(task.output_file_grps), - parameter=json.dumps(task.parameters) + parameter=json.dumps(task.parameters), + mets_server_url=mets_server_url ) # check return code @@ -149,7 +150,8 @@ def run_tasks(mets, log_level, page_id, task_strs, overwrite=False): log.info("Finished processing task '%s'", task) # reload mets - workspace.reload_mets() + if mets_server_url is None: + workspace.reload_mets() # check output file groups are in mets for output_file_grp in task.output_file_grps: From 8c61d3435519f2a28081b77ba958e06b0e0e55a1 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Thu, 27 Jun 2024 00:24:56 +0200 Subject: [PATCH 014/111] ocrd process: add --mets-server-url option --- src/ocrd/cli/process.py | 5 +++-- src/ocrd/cli/workspace.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/ocrd/cli/process.py b/src/ocrd/cli/process.py index b71b74d096..9dcd562644 100644 --- a/src/ocrd/cli/process.py +++ b/src/ocrd/cli/process.py @@ -19,14 +19,15 @@ @click.command('process') @ocrd_loglevel @click.option('-m', '--mets', help="METS to process", default=DEFAULT_METS_BASENAME) +@click.option('-U', '--mets-server-url', help="TCP host URI or UDS path of METS server") @click.option('-g', '--page-id', help="ID(s) of the pages to process") @click.option('--overwrite', is_flag=True, default=False, help="Remove output pages/images if they already exist") @click.argument('tasks', nargs=-1, required=True) -def process_cli(log_level, mets, page_id, tasks, overwrite): +def process_cli(log_level, mets, mets_server_url, page_id, tasks, overwrite): """ Process a series of tasks """ initLogging() log = getLogger('ocrd.cli.process') - run_tasks(mets, log_level, page_id, tasks, overwrite) + run_tasks(mets, log_level, page_id, tasks, overwrite=overwrite, mets_server_url=mets_server_url) log.info("Finished") diff --git a/src/ocrd/cli/workspace.py b/src/ocrd/cli/workspace.py index 52f48b7c09..4c262ec48d 100644 --- a/src/ocrd/cli/workspace.py +++ b/src/ocrd/cli/workspace.py @@ -48,7 +48,7 @@ def __init__(self, directory, mets_url, mets_basename=DEFAULT_METS_BASENAME, met @click.option('-d', '--directory', envvar='WORKSPACE_DIR', 
type=click.Path(file_okay=False), metavar='WORKSPACE_DIR', help='Changes the workspace folder location [default: METS_URL directory or .]"') @click.option('-M', '--mets-basename', default=None, help='METS file basename. Deprecated, use --mets/--directory') @click.option('-m', '--mets', default=None, help='The path/URL of the METS file [default: WORKSPACE_DIR/mets.xml]', metavar="METS_URL") -@click.option('-U', '--mets-server-url', 'mets_server_url', help="TCP host of METS server") +@click.option('-U', '--mets-server-url', 'mets_server_url', help="TCP host URI or UDS path of METS server") @click.option('--backup', default=False, help="Backup mets.xml whenever it is saved.", is_flag=True) @click.pass_context def workspace_cli(ctx, directory, mets, mets_basename, mets_server_url, backup): From 4944ffea75f97ce43e5f62f344a9fef45bf85a95 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 27 Jun 2024 14:20:59 +0200 Subject: [PATCH 015/111] config: fix confused descriptions of XDG_{DATA,CONFIG}_HOME --- src/ocrd_utils/config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ocrd_utils/config.py b/src/ocrd_utils/config.py index 5c99800cc8..b3a3e9537d 100644 --- a/src/ocrd_utils/config.py +++ b/src/ocrd_utils/config.py @@ -178,12 +178,12 @@ def _ocrd_download_timeout_parser(val): default=(True, lambda: Path.home())) config.add("XDG_DATA_HOME", - description="Directory to look for `./ocrd/resources.yml` (i.e. `ocrd resmgr` user database)", + description="Directory to look for `./ocrd-resources/*` (i.e. `ocrd resmgr` data location)", parser=lambda val: Path(val), default=(True, lambda: Path(config.HOME, '.local/share'))) config.add("XDG_CONFIG_HOME", - description="Directory to look for `./ocrd-resources/*` (i.e. `ocrd resmgr` data location)", + description="Directory to look for `./ocrd/resources.yml` (i.e. 
`ocrd resmgr` user database)", parser=lambda val: Path(val), default=(True, lambda: Path(config.HOME, '.config'))) From 3a6ff97263cff850f72a3d494f046922117f4ffd Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 27 Jun 2024 14:22:17 +0200 Subject: [PATCH 016/111] test resolve resource candidates: fix (configured XDG_DATA_HOME instead of fixed path) --- tests/utils/test_os.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/utils/test_os.py b/tests/utils/test_os.py index a651ef608f..99c50ba0d3 100644 --- a/tests/utils/test_os.py +++ b/tests/utils/test_os.py @@ -11,6 +11,7 @@ redirect_stderr_and_stdout_to_file, guess_media_type, ) +from ocrd_utils import config class TestOsUtils(TestCase): @@ -34,7 +35,7 @@ def dehomify(s): self.assertEqual(cands, [join(x, fname) for x in [ dehomify(join(getcwd())), dehomify(self.tempdir_path), - '$HOME/.local/share/ocrd-resources/ocrd-dummy', + join(config.XDG_DATA_HOME, 'ocrd-resources', 'ocrd-dummy'), '/usr/local/share/ocrd-resources/ocrd-dummy', ]]) From ff1d71bf70fa781c8054451180798cba64556a28 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 27 Jun 2024 14:52:43 +0200 Subject: [PATCH 017/111] test ocrd process: simplify, add actual dummy test --- tests/cli/test_process.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/tests/cli/test_process.py b/tests/cli/test_process.py index d0e5dc2129..dd131f8f8e 100644 --- a/tests/cli/test_process.py +++ b/tests/cli/test_process.py @@ -1,3 +1,5 @@ +from contextlib import ExitStack + from ocrd.cli import process_cli from ocrd_utils import pushd_popd, disableLogging @@ -5,12 +7,24 @@ class TestLogCli(TestCase): + def setUp(self): + super().setUp() + # make sure we get an isolated temporary copy of the testdata each time + # as long as we are not using pytest but unittest, we need to manage contexts + # (enterContext is only supported starting with py311) + with ExitStack() as stack: + self.workdir = stack.enter_context(copy_of_directory(assets.path_to('kant_aufklaerung_1784/data'))) + stack.enter_context(pushd_popd(self.workdir)) + self.addCleanup(stack.pop_all().close) + def test_cli_process_smoke(self): - disableLogging() - with copy_of_directory(assets.path_to('kant_aufklaerung_1784/data')) as wsdir: - with pushd_popd(wsdir): - with self.assertRaisesRegex(Exception, "Executable not found in PATH: ocrd-foo"): - self.invoke_cli(process_cli, ['foo']) + with self.assertRaisesRegex(Exception, "Executable not found in PATH: ocrd-foo"): + self.invoke_cli(process_cli, ['foo']) + + def test_cli_process_dummy(self): + code, out, err = self.invoke_cli(process_cli, ['dummy -I OCR-D-GT-PAGE -O OCR-D-DUMMY']) + print(code, out, err) + self.assertFalse(code) if __name__ == '__main__': main(__file__) From da7e96076f192ba506a70208501ddeeaa167bf63 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 27 Jun 2024 16:45:31 +0200 Subject: [PATCH 018/111] ocrd process: avoid temp copy of workspace --- src/ocrd/task_sequence.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/ocrd/task_sequence.py b/src/ocrd/task_sequence.py index ea8f21042b..85e30b177c 100644 --- a/src/ocrd/task_sequence.py +++ b/src/ocrd/task_sequence.py @@ -117,7 +117,9 @@ def validate_tasks(tasks, workspace, page_id=None, overwrite=False): def run_tasks(mets, log_level, page_id, task_strs, overwrite=False, mets_server_url=None): resolver = Resolver() - workspace = resolver.workspace_from_url(mets, mets_server_url=mets_server_url) + workdir, mets, 
basename, _ = resolver.resolve_mets_arguments(None, mets, None) + workspace = resolver.workspace_from_url(mets, workdir, mets_basename=basename, + mets_server_url=mets_server_url) log = getLogger('ocrd.task_sequence.run_tasks') tasks = [ProcessorTask.parse(task_str) for task_str in task_strs] From 2888033ea2079bce752e1e1855c6a155bf144c5b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 27 Jun 2024 16:46:00 +0200 Subject: [PATCH 019/111] test ocrd process: test for METS server option, too --- tests/cli/test_process.py | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/tests/cli/test_process.py b/tests/cli/test_process.py index dd131f8f8e..32fc02fed0 100644 --- a/tests/cli/test_process.py +++ b/tests/cli/test_process.py @@ -1,11 +1,18 @@ +from os.path import exists +from os import remove, getcwd +from time import sleep from contextlib import ExitStack +from multiprocessing import Process, set_start_method +# necessary for macos +set_start_method("fork") +from ocrd import Resolver, Workspace, OcrdMetsServer from ocrd.cli import process_cli -from ocrd_utils import pushd_popd, disableLogging +from ocrd_utils import pushd_popd from tests.base import CapturingTestCase as TestCase, main, assets, copy_of_directory -class TestLogCli(TestCase): +class TestCli(TestCase): def setUp(self): super().setUp() @@ -25,6 +32,28 @@ def test_cli_process_dummy(self): code, out, err = self.invoke_cli(process_cli, ['dummy -I OCR-D-GT-PAGE -O OCR-D-DUMMY']) print(code, out, err) self.assertFalse(code) + self.assertTrue(exists('OCR-D-DUMMY')) + + def test_cli_process_mets_server(self): + # stolen from test_mets_server.fixture_start_mets_server ... + def _start_mets_server(*args, **kwargs): + mets_server = OcrdMetsServer(*args, **kwargs) + mets_server.startup() + if exists('mets.sock'): + remove('mets.sock') + ws = Workspace(Resolver(), getcwd()) + p = Process(target=_start_mets_server, kwargs={'workspace': ws, 'url': 'mets.sock'}) + p.daemon = True + p.start() + sleep(1) # sleep to start up server + self.assertTrue(exists('mets.sock')) + code, out, err = self.invoke_cli(process_cli, ['-U', 'mets.sock', 'dummy -I OCR-D-GT-PAGE -O OCR-D-DUMMY']) + print(code, out, err) + self.assertFalse(code) + self.assertTrue(exists('OCR-D-DUMMY')) + p.terminate() + ws.reload_mets() + self.assertIn('OCR-D-DUMMY', ws.mets.file_groups) if __name__ == '__main__': main(__file__) From ed5201424b430adc4c8e4b5a1d49382a509d7213 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 27 Jun 2024 17:42:52 +0200 Subject: [PATCH 020/111] follow-up 3a6ff972: avoid comparing 'HOME' --- tests/utils/test_os.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/utils/test_os.py b/tests/utils/test_os.py index 99c50ba0d3..2bdf33d2ae 100644 --- a/tests/utils/test_os.py +++ b/tests/utils/test_os.py @@ -35,7 +35,7 @@ def dehomify(s): self.assertEqual(cands, [join(x, fname) for x in [ dehomify(join(getcwd())), dehomify(self.tempdir_path), - join(config.XDG_DATA_HOME, 'ocrd-resources', 'ocrd-dummy'), + dehomify(join(config.XDG_DATA_HOME, 'ocrd-resources', 'ocrd-dummy')), '/usr/local/share/ocrd-resources/ocrd-dummy', ]]) From e4c04aba99212689ec03c1e3558d564e9b0fd21e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 27 Jun 2024 17:44:16 +0200 Subject: [PATCH 021/111] suppress interfering env XDG_DATA_HOME when downloading model resource --- tests/test_resource_manager.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_resource_manager.py 
b/tests/test_resource_manager.py index 221b0a3af1..0c24a44833 100644 --- a/tests/test_resource_manager.py +++ b/tests/test_resource_manager.py @@ -23,6 +23,8 @@ def test_resources_manager_config_default(monkeypatch, tmp_path): monkeypatch.setenv('HOME', str(tmp_path)) if 'XDG_CONFIG_HOME' in os.environ: monkeypatch.delenv('XDG_CONFIG_HOME', raising=False) + if 'XDG_DATA_HOME' in os.environ: + monkeypatch.delenv('XDG_DATA_HOME', raising=False) # act mgr = OcrdResourceManager() From e979511a61c359750ddca62973480d9fb4af2501 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 27 Jun 2024 17:45:11 +0200 Subject: [PATCH 022/111] ensure does not ocrd_utils.config get initialized before mocking env --- tests/test_resource_manager.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/tests/test_resource_manager.py b/tests/test_resource_manager.py index 0c24a44833..e76073e347 100644 --- a/tests/test_resource_manager.py +++ b/tests/test_resource_manager.py @@ -2,8 +2,11 @@ from pathlib import Path # import pdb -from ocrd.resource_manager import OcrdResourceManager -from ocrd_utils import config +# avoid importing early (before mocking environ) +# because ocrd_utils.config module-level init is crucial: +#from ocrd.resource_manager import OcrdResourceManager +#from ocrd_utils import config + from ocrd_utils.os import get_ocrd_tool_json from pytest import raises, fixture @@ -27,12 +30,14 @@ def test_resources_manager_config_default(monkeypatch, tmp_path): monkeypatch.delenv('XDG_DATA_HOME', raising=False) # act + from ocrd.resource_manager import OcrdResourceManager mgr = OcrdResourceManager() # assert default_config_dir = os.path.join(os.environ['HOME'], '.config', 'ocrd') f = Path(default_config_dir) / CONST_RESOURCE_YML assert os.environ['HOME'] == str(tmp_path) + from ocrd_utils import config assert config.HOME == tmp_path assert Path.home() == tmp_path assert f == mgr.user_list @@ -56,6 +61,7 @@ def test_resources_manager_from_environment(tmp_path, monkeypatch): monkeypatch.setenv('HOME', str(tmp_path)) # act + from ocrd.resource_manager import OcrdResourceManager mgr = OcrdResourceManager() # assert @@ -74,6 +80,7 @@ def test_resources_manager_from_environment(tmp_path, monkeypatch): def test_resources_manager_config_explicite(tmp_path): # act + from ocrd.resource_manager import OcrdResourceManager mgr = OcrdResourceManager(xdg_config_home=str(tmp_path / 'config'), xdg_data_home=str(tmp_path / 'data')) # assert @@ -90,6 +97,7 @@ def test_resources_manager_config_explicite(tmp_path): def test_resources_manager_config_explicit_invalid(tmp_path): # act + from ocrd.resource_manager import OcrdResourceManager (tmp_path / 'ocrd').mkdir() (tmp_path / 'ocrd' / CONST_RESOURCE_YML).write_text('::INVALID::') @@ -100,6 +108,7 @@ def test_resources_manager_config_explicit_invalid(tmp_path): def test_find_resources(tmp_path): # act + from ocrd.resource_manager import OcrdResourceManager f = tmp_path / 'ocrd-foo' / 'foo.bar' f.parent.mkdir() f.write_text('foobar') @@ -112,6 +121,7 @@ def test_find_resources(tmp_path): assert 'ocrd-foo' in [x for x, _ in mgr.list_available(url='http://foo/bar')] def test_parameter_usage(tmp_path): + from ocrd.resource_manager import OcrdResourceManager mgr = OcrdResourceManager(xdg_config_home=tmp_path) assert mgr.parameter_usage('foo.bar') == 'foo.bar' assert mgr.parameter_usage('foo.bar', 'without-extension') == 'foo' @@ -119,21 +129,25 @@ def test_parameter_usage(tmp_path): mgr.parameter_usage('foo.bar', 'baz') def 
test_default_resource_dir(tmp_path): + from ocrd.resource_manager import OcrdResourceManager mgr = OcrdResourceManager(xdg_data_home=tmp_path) assert mgr.xdg_config_home != mgr.xdg_data_home assert mgr.default_resource_dir == str(mgr.xdg_data_home / 'ocrd-resources') def test_list_available0(tmp_path): + from ocrd.resource_manager import OcrdResourceManager mgr = OcrdResourceManager(xdg_data_home=tmp_path) res = mgr.list_available() assert len(res) > 0 def test_list_available_with_unknown_executable(tmp_path): + from ocrd.resource_manager import OcrdResourceManager mgr = OcrdResourceManager(xdg_data_home=tmp_path) res = mgr.list_available(executable="ocrd-non-existing-processor") assert len(res[0][1]) == 0 def test_date_as_string(tmp_path): + from ocrd.resource_manager import OcrdResourceManager mgr = OcrdResourceManager(xdg_data_home=tmp_path) test_list = tmp_path / 'test-list.yml' with open(test_list, 'w', encoding='utf-8') as fout: @@ -150,6 +164,7 @@ def test_date_as_string(tmp_path): mgr.list_available(executable='ocrd-eynollah-segment') def test_download_archive(tmp_path): + from ocrd.resource_manager import OcrdResourceManager mgr = OcrdResourceManager(xdg_data_home=tmp_path) for archive_type in ('.zip', '.tar.gz', '.tar.xz'): mgr.download( From 6558019337838b0bae3da950d48a421d424e59a3 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 27 Jun 2024 17:48:43 +0200 Subject: [PATCH 023/111] test_mets_server: ignore if forking as already been configured --- tests/test_mets_server.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/test_mets_server.py b/tests/test_mets_server.py index da0b958946..58ff6e2a9b 100644 --- a/tests/test_mets_server.py +++ b/tests/test_mets_server.py @@ -6,8 +6,11 @@ from itertools import repeat from multiprocessing import Process, Pool, Pipe, set_start_method -# necessary for macos -set_start_method("fork") +try: + # necessary for macos + set_start_method("fork") +except RuntimeError: + pass from shutil import rmtree, copytree from os import remove, stat as os_stat from os.path import exists From 494d52b3fe9b24ec08c1f119510c27b1d5f190c7 Mon Sep 17 00:00:00 2001 From: joschrew <91774427+joschrew@users.noreply.github.com> Date: Wed, 3 Jul 2024 11:52:40 +0200 Subject: [PATCH 024/111] Remove unused socket file on metsserver startup --- src/ocrd/mets_server.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index d7edec5ec1..7e06c58f92 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -528,6 +528,9 @@ async def add_file( # Create socket and change to world-readable and -writable to avoid permission errors self.log.debug(f"chmod 0o677 {self.url}") server = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + if Path(self.url).exists() and not is_socket_in_use(self.url): + # remove leftover unused socket which blocks startup + Path(self.url).unlink() server.bind(self.url) # creates the socket file atexit.register(self.shutdown) server.close() @@ -541,3 +544,16 @@ async def add_file( self.log.debug("Starting uvicorn") uvicorn.run(app, **uvicorn_kwargs) + + +def is_socket_in_use(socket_path): + if Path(socket_path).exists(): + client = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + try: + client.connect(socket_path) + except OSError: + return False + client.close() + return True + else: + return False From fdc50f9803d44440731868a41db30d9d4ee5b693 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 3 Jul 2024 17:10:12 +0200 Subject: 
[PATCH 025/111] remove temp dir --- src/ocrd/resource_manager.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/ocrd/resource_manager.py b/src/ocrd/resource_manager.py index c668028e9c..34d990afaf 100644 --- a/src/ocrd/resource_manager.py +++ b/src/ocrd/resource_manager.py @@ -336,6 +336,9 @@ def download( copytree(path_in_archive, str(fpath)) else: copy(path_in_archive, str(fpath)) + if Path(tempdir).exists(): + log.info("Removing temp dir: %s" % tempdir) + rmtree(tempdir) return fpath def _dedup_database(self, database=None, dedup_key='name'): From fd21e972136b2f956530e6c90acad0f01deb148a Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 3 Jul 2024 17:26:20 +0200 Subject: [PATCH 026/111] refactor prints to p3 --- src/ocrd/resource_manager.py | 43 ++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/src/ocrd/resource_manager.py b/src/ocrd/resource_manager.py index 34d990afaf..a4b834ef80 100644 --- a/src/ocrd/resource_manager.py +++ b/src/ocrd/resource_manager.py @@ -24,7 +24,8 @@ from ocrd_utils.os import get_processor_resource_types, list_all_resources, pushd_popd, get_ocrd_tool_json from .constants import RESOURCE_LIST_FILENAME, RESOURCE_USER_LIST_COMMENT -class OcrdResourceManager(): + +class OcrdResourceManager: """ Managing processor resources @@ -81,7 +82,7 @@ def load_resource_list(self, list_filename, database=None): report = OcrdResourceListValidator.validate(list_loaded) if not report.is_valid: self.log.error('\n'.join(report.errors)) - raise ValueError("Resource list %s is invalid!" % (list_filename)) + raise ValueError(f"Resource list {list_filename} is invalid!") for executable, resource_list in list_loaded.items(): if executable not in database: database[executable] = [] @@ -176,7 +177,8 @@ def add_to_user_database(self, executable, res_filename, url=None, resource_type Add a stub entry to the user resource.yml """ res_name = Path(res_filename).name - self.log.info("%s resource '%s' (%s) not a known resource, creating stub in %s'", executable, res_name, str(res_filename), self.user_list) + self.log.info(f"{executable} resource '{res_name}' ({str(res_filename)}) not a known resource, " + f"creating stub in {self.user_list}'") if Path(res_filename).is_dir(): res_size = directory_size(res_filename) else: @@ -190,7 +192,7 @@ def add_to_user_database(self, executable, res_filename, url=None, resource_type resdict = { 'name': res_name, 'url': url if url else '???', - 'description': 'Found at %s on %s' % (self.resource_dir_to_location(res_filename), datetime.now()), + 'description': f'Found at {self.resource_dir_to_location(res_filename)} on {datetime.now()}', 'version_range': '???', 'type': resource_type, 'size': res_size @@ -223,11 +225,11 @@ def parameter_usage(self, name, usage='as-is'): return name elif usage == 'without-extension': return Path(name).stem - raise ValueError("No such usage '%s'" % usage) + raise ValueError(f"No such usage '{usage}'") def _download_impl(self, url, filename, progress_cb=None, size=None): log = getLogger('ocrd.resource_manager._download_impl') - log.info("Downloading %s to %s" % (url, filename)) + log.info(f"Downloading {url} to {filename}") with open(filename, 'wb') as f: gdrive_file_id, is_gdrive_download_link = gparse_url(url, warning=False) if gdrive_file_id: @@ -248,7 +250,7 @@ def _download_impl(self, url, filename, progress_cb=None, size=None): def _copy_impl(self, src_filename, filename, progress_cb=None): log = getLogger('ocrd.resource_manager._copy_impl') - log.info("Copying 
%s to %s", src_filename, filename) + log.info(f"Copying {src_filename} to {filename}") if Path(src_filename).is_dir(): log.info(f"Copying recursively from {src_filename} to {filename}") for child in Path(src_filename).rglob('*'): @@ -276,16 +278,8 @@ def _copy_impl(self, src_filename, filename, progress_cb=None): # TODO Proper caching (make head request for size, If-Modified etc) def download( - self, - executable, - url, - basedir, - overwrite=False, - no_subdir=False, - name=None, - resource_type='file', - path_in_archive='.', - progress_cb=None, + self, executable, url, basedir, overwrite=False, no_subdir=False, name=None, resource_type='file', + path_in_archive='.', progress_cb=None, ): """ Download a resource by URL @@ -299,12 +293,13 @@ def download( is_url = url.startswith('https://') or url.startswith('http://') if fpath.exists(): if not overwrite: - raise FileExistsError("%s %s already exists but --overwrite is not set" % ('Directory' if fpath.is_dir() else 'File', fpath)) + fpath_type = 'Directory' if fpath.is_dir() else 'File' + raise FileExistsError(f"{fpath_type} {fpath} already exists but --overwrite is not set") if fpath.is_dir(): - log.info("Removing existing target directory {fpath}") + log.info(f"Removing existing target directory {fpath}") rmtree(str(fpath)) else: - log.info("Removing existing target file {fpath}") + log.info(f"Removing existing target file {fpath}") unlink(str(fpath)) destdir.mkdir(parents=True, exist_ok=True) if resource_type in ('file', 'directory'): @@ -322,7 +317,7 @@ def download( Path('out').mkdir() with pushd_popd('out'): mimetype = guess_media_type(f'../{archive_fname}', fallback='application/octet-stream') - log.info("Extracting %s archive to %s/out" % (mimetype, tempdir)) + log.info(f"Extracting {mimetype} archive to {tempdir}/out") if mimetype == 'application/zip': with ZipFile(f'../{archive_fname}', 'r') as zipf: zipf.extractall() @@ -330,14 +325,14 @@ def download( with open_tarfile(f'../{archive_fname}', 'r:*') as tar: tar.extractall() else: - raise RuntimeError("Unable to handle extraction of %s archive %s" % (mimetype, url)) - log.info("Copying '%s' from archive to %s" % (path_in_archive, fpath)) + raise RuntimeError(f"Unable to handle extraction of {mimetype} archive {url}") + log.info(f"Copying '{path_in_archive}' from archive to {fpath}") if Path(path_in_archive).is_dir(): copytree(path_in_archive, str(fpath)) else: copy(path_in_archive, str(fpath)) if Path(tempdir).exists(): - log.info("Removing temp dir: %s" % tempdir) + log.info(f"Removing temp dir {tempdir}") rmtree(tempdir) return fpath From 3ef31c477f3120eca9c4a554750b4904dfce51a5 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 3 Jul 2024 17:27:38 +0200 Subject: [PATCH 027/111] make methods static --- src/ocrd/resource_manager.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/ocrd/resource_manager.py b/src/ocrd/resource_manager.py index a4b834ef80..da20aa8b70 100644 --- a/src/ocrd/resource_manager.py +++ b/src/ocrd/resource_manager.py @@ -220,14 +220,16 @@ def resource_dir_to_location(self, resource_path): 'cwd' if resource_path.startswith(getcwd()) else \ resource_path - def parameter_usage(self, name, usage='as-is'): + @staticmethod + def parameter_usage(name, usage='as-is'): if usage == 'as-is': return name elif usage == 'without-extension': return Path(name).stem raise ValueError(f"No such usage '{usage}'") - def _download_impl(self, url, filename, progress_cb=None, size=None): + @staticmethod + def _download_impl(url, filename, 
progress_cb=None, size=None): log = getLogger('ocrd.resource_manager._download_impl') log.info(f"Downloading {url} to {filename}") with open(filename, 'wb') as f: @@ -248,7 +250,8 @@ def _download_impl(self, url, filename, progress_cb=None, size=None): progress_cb(len(data)) f.write(data) - def _copy_impl(self, src_filename, filename, progress_cb=None): + @staticmethod + def _copy_impl(src_filename, filename, progress_cb=None): log = getLogger('ocrd.resource_manager._copy_impl') log.info(f"Copying {src_filename} to {filename}") if Path(src_filename).is_dir(): From a97615bf14794b8822850888a674e3eedd6c9d2a Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 3 Jul 2024 17:31:10 +0200 Subject: [PATCH 028/111] refactor: remove format from str --- src/ocrd/resource_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd/resource_manager.py b/src/ocrd/resource_manager.py index da20aa8b70..7d5d8fc302 100644 --- a/src/ocrd/resource_manager.py +++ b/src/ocrd/resource_manager.py @@ -236,7 +236,7 @@ def _download_impl(url, filename, progress_cb=None, size=None): gdrive_file_id, is_gdrive_download_link = gparse_url(url, warning=False) if gdrive_file_id: if not is_gdrive_download_link: - url = "https://drive.google.com/uc?id={id}".format(id=gdrive_file_id) + url = f"https://drive.google.com/uc?id={gdrive_file_id}" try: with requests.get(url, stream=True) as r: if "Content-Disposition" not in r.headers: From b0a7e9211f7a3a7883efb6aeffafc41e8e65eafa Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Wed, 3 Jul 2024 22:25:16 +0200 Subject: [PATCH 029/111] CI: store pytest results, use flake8 github formatting --- .github/workflows/unit-test.yml | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/.github/workflows/unit-test.yml b/.github/workflows/unit-test.yml index 2b8e3d5b82..2ab5ee46c4 100644 --- a/.github/workflows/unit-test.yml +++ b/.github/workflows/unit-test.yml @@ -50,14 +50,18 @@ jobs: make install deps-test - name: Test with pytest run: | - make test benchmark + mkdir -p test-results + make test benchmark PYTEST_ARGS=--junitxml=test-results/test.xml + - uses: test-summary/action@v2 + with: + paths: "test-results/test.xml" - name: test to ensure that --editable install works run: | make install-dev; ocrd --version - name: Lint with flake8 run: | - python -m pip install flake8 + python -m pip install flake8 flake8-github # stop the build if there are Python syntax errors or undefined names - flake8 src --count --exit-zero --select=E9,F63,F7,F82 --show-source --statistics + flake8 src --count --exit-zero --select=E9,F63,F7,F82 --show-source --statistics --format=github # exit-zero treats all errors as warnings. 
The GitHub editor is 127 chars wide - flake8 src --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + flake8 src --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics --format=github From 9b1455e3589fc0585f7dcc8864ab2d2d53714c5c Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 4 Jul 2024 02:27:50 +0200 Subject: [PATCH 030/111] ocrd_utils.get_ocrd_tool/moduledir: try loading from ocrd-all-*.json in dist --- src/ocrd_utils/os.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/src/ocrd_utils/os.py b/src/ocrd_utils/os.py index 1b3ab4e73d..18463de0c0 100644 --- a/src/ocrd_utils/os.py +++ b/src/ocrd_utils/os.py @@ -34,6 +34,7 @@ from .constants import EXT_TO_MIME from .config import config from .logging import getLogger +from .introspect import resource_string def abspath(url): """ @@ -79,12 +80,16 @@ def get_ocrd_tool_json(executable): """ Get the ``ocrd-tool`` description of ``executable``. """ + ocrd_tool = {} executable_name = Path(executable).name try: - ocrd_tool = loads(run([executable, '--dump-json'], stdout=PIPE).stdout) - except (JSONDecodeError, OSError) as e: - getLogger('ocrd.utils.get_ocrd_tool_json').error(f'{executable} --dump-json produced invalid JSON: {e}') - ocrd_tool = {} + ocrd_all_tool = loads(resource_string('ocrd', 'ocrd-all-tool.json')) + ocrd_tool = ocrd_all_tool[executable] + except (JSONDecodeError, OSError, KeyError): + try: + ocrd_tool = loads(run([executable, '--dump-json'], stdout=PIPE).stdout) + except (JSONDecodeError, OSError) as e: + getLogger('ocrd.utils.get_ocrd_tool_json').error(f'{executable} --dump-json produced invalid JSON: {e}') if 'resource_locations' not in ocrd_tool: ocrd_tool['resource_locations'] = ['data', 'cwd', 'system', 'module'] return ocrd_tool @@ -93,9 +98,13 @@ def get_ocrd_tool_json(executable): def get_moduledir(executable): moduledir = None try: - moduledir = run([executable, '--dump-module-dir'], encoding='utf-8', stdout=PIPE).stdout.rstrip('\n') - except (JSONDecodeError, OSError) as e: - getLogger('ocrd.utils.get_moduledir').error(f'{executable} --dump-module-dir failed: {e}') + ocrd_all_moduledir = loads(resource_string('ocrd', 'ocrd-all-module-dir.json')) + moduledir = ocrd_all_moduledir[executable] + except (JSONDecodeError, OSError, KeyError): + try: + moduledir = run([executable, '--dump-module-dir'], encoding='utf-8', stdout=PIPE).stdout.rstrip('\n') + except (JSONDecodeError, OSError) as e: + getLogger('ocrd.utils.get_moduledir').error(f'{executable} --dump-module-dir failed: {e}') return moduledir def list_resource_candidates(executable, fname, cwd=getcwd(), moduled=None, xdg_data_home=None): From 92613dbde0492f8fe97a68ed1fc6530faeaf98a4 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Thu, 4 Jul 2024 10:47:00 +0200 Subject: [PATCH 031/111] fix: recursive copy of dirs --- src/ocrd/resource_manager.py | 55 +++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 23 deletions(-) diff --git a/src/ocrd/resource_manager.py b/src/ocrd/resource_manager.py index 7d5d8fc302..4cb2bab4c9 100644 --- a/src/ocrd/resource_manager.py +++ b/src/ocrd/resource_manager.py @@ -1,6 +1,6 @@ from pathlib import Path from os.path import join -from os import environ, listdir, getcwd, path, unlink +from os import environ, listdir, makedirs, getcwd, path, unlink from shutil import copytree, rmtree, copy from fnmatch import filter as apply_glob from datetime import datetime @@ -250,34 +250,43 @@ def _download_impl(url, filename, 
progress_cb=None, size=None): progress_cb(len(data)) f.write(data) + @staticmethod + def _copy_file(src, dst, progress_cb=None): + log = getLogger('ocrd.resource_manager._copy_file') + log.info(f"Copying file {src} to {dst}") + with open(dst, 'wb') as f_out, open(src, 'rb') as f_in: + while True: + chunk = f_in.read(4096) + if chunk: + f_out.write(chunk) + if progress_cb: + progress_cb(len(chunk)) + else: + break + + @staticmethod + def _copy_dir(src, dst, progress_cb=None): + log = getLogger('ocrd.resource_manager._copy_dir') + log.info(f"Copying dir recursively from {src} to {dst}") + if not Path(src).is_dir(): + raise ValueError(f"The source is not a directory: {src}") + makedirs(name=dst, exist_ok=True) + for child in Path(src).rglob('*'): + child_dst = Path(dst) / child.relative_to(src) + child_dst.parent.mkdir(parents=True, exist_ok=True) + if Path(child).is_dir(): + OcrdResourceManager._copy_dir(child, child_dst, progress_cb) + else: + OcrdResourceManager._copy_file(child, child_dst, progress_cb) + @staticmethod def _copy_impl(src_filename, filename, progress_cb=None): log = getLogger('ocrd.resource_manager._copy_impl') log.info(f"Copying {src_filename} to {filename}") if Path(src_filename).is_dir(): - log.info(f"Copying recursively from {src_filename} to {filename}") - for child in Path(src_filename).rglob('*'): - child_dst = Path(filename) / child.relative_to(src_filename) - child_dst.parent.mkdir(parents=True, exist_ok=True) - with open(child_dst, 'wb') as f_out, open(child, 'rb') as f_in: - while True: - chunk = f_in.read(4096) - if chunk: - f_out.write(chunk) - if progress_cb: - progress_cb(len(chunk)) - else: - break + OcrdResourceManager._copy_dir(src_filename, filename, progress_cb) else: - with open(filename, 'wb') as f_out, open(src_filename, 'rb') as f_in: - while True: - chunk = f_in.read(4096) - if chunk: - f_out.write(chunk) - if progress_cb: - progress_cb(len(chunk)) - else: - break + OcrdResourceManager._copy_file(src_filename, filename, progress_cb) # TODO Proper caching (make head request for size, If-Modified etc) def download( From 73702557de1cd77735a34164e22e93384c0fc3bf Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Thu, 4 Jul 2024 10:47:23 +0200 Subject: [PATCH 032/111] add test for _copy_impl --- tests/test_resource_manager.py | 38 ++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/tests/test_resource_manager.py b/tests/test_resource_manager.py index 221b0a3af1..a41c79c504 100644 --- a/tests/test_resource_manager.py +++ b/tests/test_resource_manager.py @@ -1,5 +1,6 @@ import os from pathlib import Path +from shutil import rmtree # import pdb from ocrd.resource_manager import OcrdResourceManager @@ -12,11 +13,13 @@ CONST_RESOURCE_YML = 'resources.yml' CONST_RESOURCE_URL_LAYOUT = 'https://github.com/tesseract-ocr/tessdata_best/raw/main/bos.traineddata' + @fixture(autouse=True) def drop_get_ocrd_tool_json_cache(): get_ocrd_tool_json.cache_clear() yield + def test_resources_manager_config_default(monkeypatch, tmp_path): # arrange @@ -85,6 +88,7 @@ def test_resources_manager_config_explicite(tmp_path): assert fpath.exists() assert mgr.add_to_user_database(proc, fpath) + def test_resources_manager_config_explicit_invalid(tmp_path): # act @@ -95,6 +99,7 @@ def test_resources_manager_config_explicit_invalid(tmp_path): with raises(ValueError, match='is invalid'): OcrdResourceManager(xdg_config_home=tmp_path) + def test_find_resources(tmp_path): # act @@ -109,6 +114,7 @@ def test_find_resources(tmp_path): assert 'ocrd-foo' in [x for 
x, _ in mgr.list_available()] assert 'ocrd-foo' in [x for x, _ in mgr.list_available(url='http://foo/bar')] + def test_parameter_usage(tmp_path): mgr = OcrdResourceManager(xdg_config_home=tmp_path) assert mgr.parameter_usage('foo.bar') == 'foo.bar' @@ -116,21 +122,25 @@ def test_parameter_usage(tmp_path): with raises(ValueError, match='No such usage'): mgr.parameter_usage('foo.bar', 'baz') + def test_default_resource_dir(tmp_path): mgr = OcrdResourceManager(xdg_data_home=tmp_path) assert mgr.xdg_config_home != mgr.xdg_data_home assert mgr.default_resource_dir == str(mgr.xdg_data_home / 'ocrd-resources') + def test_list_available0(tmp_path): mgr = OcrdResourceManager(xdg_data_home=tmp_path) res = mgr.list_available() assert len(res) > 0 + def test_list_available_with_unknown_executable(tmp_path): mgr = OcrdResourceManager(xdg_data_home=tmp_path) res = mgr.list_available(executable="ocrd-non-existing-processor") assert len(res[0][1]) == 0 + def test_date_as_string(tmp_path): mgr = OcrdResourceManager(xdg_data_home=tmp_path) test_list = tmp_path / 'test-list.yml' @@ -147,6 +157,7 @@ def test_date_as_string(tmp_path): mgr.load_resource_list(test_list) mgr.list_available(executable='ocrd-eynollah-segment') + def test_download_archive(tmp_path): mgr = OcrdResourceManager(xdg_data_home=tmp_path) for archive_type in ('.zip', '.tar.gz', '.tar.xz'): @@ -163,5 +174,32 @@ def test_download_archive(tmp_path): assert filecontent_path.read_text() == '1\n' +def test_copy_impl(tmp_path): + root_dir = "./mgr_copy_impl_test" + root_dir_copied = "./mgr_copy_impl_test_copied" + rmtree(path=root_dir, ignore_errors=True) + rmtree(path=root_dir_copied, ignore_errors=True) + + def _create_test_folder(test_dir: str, letter: str) -> str: + os.makedirs(name=f"./{test_dir}/{letter}", exist_ok=True) + file_path = f"./{test_dir}/{letter}/{letter}.txt" + with open(f"{file_path}", "w") as file: + file.write(f"{letter}") + return file_path + + file1_path = _create_test_folder(test_dir=root_dir, letter="a") + file2_path = _create_test_folder(test_dir=root_dir, letter="b") + file3_path = _create_test_folder(test_dir=root_dir, letter="c") + + mgr = OcrdResourceManager(xdg_data_home=tmp_path) + mgr._copy_impl(src_filename=root_dir, filename=root_dir_copied) + + assert Path(file1_path).exists() + assert Path(file2_path).exists() + assert Path(file3_path).exists() + rmtree(path='./mgr_copy_impl_test', ignore_errors=True) + rmtree(path='./mgr_copy_impl_test_copied', ignore_errors=True) + + if __name__ == "__main__": main(__file__) From 3b0e7d6364d03b7bfd6990483b034b9e0edf7bde Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Thu, 4 Jul 2024 11:19:22 +0200 Subject: [PATCH 033/111] fix: optimize folder creation --- src/ocrd/resource_manager.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/ocrd/resource_manager.py b/src/ocrd/resource_manager.py index 4cb2bab4c9..e079295c60 100644 --- a/src/ocrd/resource_manager.py +++ b/src/ocrd/resource_manager.py @@ -270,10 +270,9 @@ def _copy_dir(src, dst, progress_cb=None): log.info(f"Copying dir recursively from {src} to {dst}") if not Path(src).is_dir(): raise ValueError(f"The source is not a directory: {src}") - makedirs(name=dst, exist_ok=True) + Path(dst).mkdir(parents=True, exist_ok=True) for child in Path(src).rglob('*'): child_dst = Path(dst) / child.relative_to(src) - child_dst.parent.mkdir(parents=True, exist_ok=True) if Path(child).is_dir(): OcrdResourceManager._copy_dir(child, child_dst, progress_cb) else: From ec4cb209ea5a525bd23aa9c42dd505b6e80d3a33 Mon 
Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Thu, 4 Jul 2024 11:19:44 +0200 Subject: [PATCH 034/111] fix: test copied, not created --- tests/test_resource_manager.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/test_resource_manager.py b/tests/test_resource_manager.py index a41c79c504..8272896df8 100644 --- a/tests/test_resource_manager.py +++ b/tests/test_resource_manager.py @@ -175,30 +175,30 @@ def test_download_archive(tmp_path): def test_copy_impl(tmp_path): - root_dir = "./mgr_copy_impl_test" - root_dir_copied = "./mgr_copy_impl_test_copied" + root_dir = f"{tmp_path}/mgr_copy_impl_test" + root_dir_copied = f"{tmp_path}/mgr_copy_impl_test_copied" rmtree(path=root_dir, ignore_errors=True) rmtree(path=root_dir_copied, ignore_errors=True) def _create_test_folder(test_dir: str, letter: str) -> str: - os.makedirs(name=f"./{test_dir}/{letter}", exist_ok=True) - file_path = f"./{test_dir}/{letter}/{letter}.txt" + Path(f"{test_dir}/{letter}").mkdir(parents=True, exist_ok=True) + file_path = f"{test_dir}/{letter}/{letter}.txt" with open(f"{file_path}", "w") as file: file.write(f"{letter}") return file_path - file1_path = _create_test_folder(test_dir=root_dir, letter="a") - file2_path = _create_test_folder(test_dir=root_dir, letter="b") - file3_path = _create_test_folder(test_dir=root_dir, letter="c") + _create_test_folder(test_dir=root_dir, letter="a") + _create_test_folder(test_dir=root_dir, letter="b") + _create_test_folder(test_dir=root_dir, letter="c") mgr = OcrdResourceManager(xdg_data_home=tmp_path) mgr._copy_impl(src_filename=root_dir, filename=root_dir_copied) - assert Path(file1_path).exists() - assert Path(file2_path).exists() - assert Path(file3_path).exists() - rmtree(path='./mgr_copy_impl_test', ignore_errors=True) - rmtree(path='./mgr_copy_impl_test_copied', ignore_errors=True) + assert Path(f"{root_dir_copied}/a/a.txt").exists() + assert Path(f"{root_dir_copied}/b/b.txt").exists() + assert Path(f"{root_dir_copied}/c/c.txt").exists() + rmtree(path=root_dir, ignore_errors=True) + rmtree(path=root_dir_copied, ignore_errors=True) if __name__ == "__main__": From d1400f4defad378b529e9c35fd54b804403263b0 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Thu, 4 Jul 2024 12:08:41 +0200 Subject: [PATCH 035/111] skip some executables and error prints --- src/ocrd_utils/os.py | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/src/ocrd_utils/os.py b/src/ocrd_utils/os.py index 1b3ab4e73d..0fb8ed52b0 100644 --- a/src/ocrd_utils/os.py +++ b/src/ocrd_utils/os.py @@ -75,27 +75,38 @@ def unzip_file_to_dir(path_to_zip, output_directory): z.close() @lru_cache() -def get_ocrd_tool_json(executable): +def get_ocrd_tool_json(executable, skip_error_print: bool = False): """ Get the ``ocrd-tool`` description of ``executable``. 
""" + ocrd_tool = {} executable_name = Path(executable).name - try: - ocrd_tool = loads(run([executable, '--dump-json'], stdout=PIPE).stdout) - except (JSONDecodeError, OSError) as e: - getLogger('ocrd.utils.get_ocrd_tool_json').error(f'{executable} --dump-json produced invalid JSON: {e}') - ocrd_tool = {} + # TODO: Potentially these executables should be moved as a constant somewhere else + skip_executables = ["ocrd-cis-data", "ocrd-import", "ocrd-make"] + if executable_name not in skip_executables: + try: + ocrd_tool = loads(run([executable, '--dump-json'], stdout=PIPE).stdout) + except (JSONDecodeError, OSError) as e: + if not skip_error_print: + getLogger('ocrd.utils.get_ocrd_tool_json').error(f'{executable} --dump-json produced invalid JSON: {e}') + except Exception as e: + if not skip_error_print: + getLogger('ocrd.utils.get_ocrd_tool_json').error(f'{executable} --dump-json failed: {e}') if 'resource_locations' not in ocrd_tool: ocrd_tool['resource_locations'] = ['data', 'cwd', 'system', 'module'] return ocrd_tool @lru_cache() -def get_moduledir(executable): +def get_moduledir(executable, skip_error_print: bool = False): moduledir = None try: moduledir = run([executable, '--dump-module-dir'], encoding='utf-8', stdout=PIPE).stdout.rstrip('\n') except (JSONDecodeError, OSError) as e: - getLogger('ocrd.utils.get_moduledir').error(f'{executable} --dump-module-dir failed: {e}') + if not skip_error_print: + getLogger('ocrd.utils.get_moduledir').error(f'{executable} --dump-module-dir failed: {e}') + except Exception as e: + if not skip_error_print: + getLogger('ocrd.utils.get_moduledir').error(f'{executable} --dump-module-dir failed: {e}') return moduledir def list_resource_candidates(executable, fname, cwd=getcwd(), moduled=None, xdg_data_home=None): From cbd108b0dcc6562eaa465eddb4c18f3b955a296e Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Thu, 4 Jul 2024 12:09:06 +0200 Subject: [PATCH 036/111] adapt the call in the resource manager --- src/ocrd/resource_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd/resource_manager.py b/src/ocrd/resource_manager.py index e079295c60..5529568ce1 100644 --- a/src/ocrd/resource_manager.py +++ b/src/ocrd/resource_manager.py @@ -102,7 +102,7 @@ def list_available(self, executable=None, dynamic=True, name=None, database=None for exec_dir in environ['PATH'].split(':'): for exec_path in Path(exec_dir).glob(f'{executable}'): self.log.debug(f"Inspecting '{exec_path} --dump-json' for resources") - ocrd_tool = get_ocrd_tool_json(exec_path) + ocrd_tool = get_ocrd_tool_json(exec_path, True) for resdict in ocrd_tool.get('resources', ()): if exec_path.name not in database: database[exec_path.name] = [] From 8c48f11832af2cd1d3f344a6cc0bf3bbb88abea8 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Thu, 4 Jul 2024 13:56:24 +0200 Subject: [PATCH 037/111] remove: skip_error_print --- src/ocrd_utils/os.py | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/src/ocrd_utils/os.py b/src/ocrd_utils/os.py index 0fb8ed52b0..8b3c73fc63 100644 --- a/src/ocrd_utils/os.py +++ b/src/ocrd_utils/os.py @@ -75,38 +75,27 @@ def unzip_file_to_dir(path_to_zip, output_directory): z.close() @lru_cache() -def get_ocrd_tool_json(executable, skip_error_print: bool = False): +def get_ocrd_tool_json(executable): """ Get the ``ocrd-tool`` description of ``executable``. 
""" ocrd_tool = {} executable_name = Path(executable).name - # TODO: Potentially these executables should be moved as a constant somewhere else - skip_executables = ["ocrd-cis-data", "ocrd-import", "ocrd-make"] - if executable_name not in skip_executables: - try: - ocrd_tool = loads(run([executable, '--dump-json'], stdout=PIPE).stdout) - except (JSONDecodeError, OSError) as e: - if not skip_error_print: - getLogger('ocrd.utils.get_ocrd_tool_json').error(f'{executable} --dump-json produced invalid JSON: {e}') - except Exception as e: - if not skip_error_print: - getLogger('ocrd.utils.get_ocrd_tool_json').error(f'{executable} --dump-json failed: {e}') + try: + ocrd_tool = loads(run([executable, '--dump-json'], stdout=PIPE).stdout) + except (JSONDecodeError, OSError) as e: + getLogger('ocrd.utils.get_ocrd_tool_json').error(f'{executable} --dump-json produced invalid JSON: {e}') if 'resource_locations' not in ocrd_tool: ocrd_tool['resource_locations'] = ['data', 'cwd', 'system', 'module'] return ocrd_tool @lru_cache() -def get_moduledir(executable, skip_error_print: bool = False): +def get_moduledir(executable): moduledir = None try: moduledir = run([executable, '--dump-module-dir'], encoding='utf-8', stdout=PIPE).stdout.rstrip('\n') except (JSONDecodeError, OSError) as e: - if not skip_error_print: - getLogger('ocrd.utils.get_moduledir').error(f'{executable} --dump-module-dir failed: {e}') - except Exception as e: - if not skip_error_print: - getLogger('ocrd.utils.get_moduledir').error(f'{executable} --dump-module-dir failed: {e}') + getLogger('ocrd.utils.get_moduledir').error(f'{executable} --dump-module-dir failed: {e}') return moduledir def list_resource_candidates(executable, fname, cwd=getcwd(), moduled=None, xdg_data_home=None): From 72cc19483df95b06a96213519c19a61ef109faf0 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Thu, 4 Jul 2024 13:56:50 +0200 Subject: [PATCH 038/111] remove: flag for printing --- src/ocrd/resource_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd/resource_manager.py b/src/ocrd/resource_manager.py index 5529568ce1..e079295c60 100644 --- a/src/ocrd/resource_manager.py +++ b/src/ocrd/resource_manager.py @@ -102,7 +102,7 @@ def list_available(self, executable=None, dynamic=True, name=None, database=None for exec_dir in environ['PATH'].split(':'): for exec_path in Path(exec_dir).glob(f'{executable}'): self.log.debug(f"Inspecting '{exec_path} --dump-json' for resources") - ocrd_tool = get_ocrd_tool_json(exec_path, True) + ocrd_tool = get_ocrd_tool_json(exec_path) for resdict in ocrd_tool.get('resources', ()): if exec_path.name not in database: database[exec_path.name] = [] From a2b89417cb7c282b2e4a7c3975312142ae8d900e Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Thu, 4 Jul 2024 15:02:43 +0200 Subject: [PATCH 039/111] add skipping in resmgr --- src/ocrd/resource_manager.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/ocrd/resource_manager.py b/src/ocrd/resource_manager.py index e079295c60..ef274b339a 100644 --- a/src/ocrd/resource_manager.py +++ b/src/ocrd/resource_manager.py @@ -99,8 +99,15 @@ def list_available(self, executable=None, dynamic=True, name=None, database=None if not executable: return database.items() if dynamic: + skip_executables = ["ocrd-cis-data", "ocrd-import", "ocrd-make"] for exec_dir in environ['PATH'].split(':'): for exec_path in Path(exec_dir).glob(f'{executable}'): + if not exec_path.name.startswith('ocrd-'): + self.log.debug(f"Not in 'ocrd-*' format, skipping '{exec_path}'") + 
continue + if exec_path.name in skip_executables: + self.log.debug(f"No dump json available, skipping '{exec_path}'") + continue self.log.debug(f"Inspecting '{exec_path} --dump-json' for resources") ocrd_tool = get_ocrd_tool_json(exec_path) for resdict in ocrd_tool.get('resources', ()): From dbd97532f21b8a33e1b5b3d41de35497ae70ded8 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Thu, 4 Jul 2024 15:17:10 +0200 Subject: [PATCH 040/111] revert previous change --- src/ocrd_utils/os.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd_utils/os.py b/src/ocrd_utils/os.py index 8b3c73fc63..1b3ab4e73d 100644 --- a/src/ocrd_utils/os.py +++ b/src/ocrd_utils/os.py @@ -79,12 +79,12 @@ def get_ocrd_tool_json(executable): """ Get the ``ocrd-tool`` description of ``executable``. """ - ocrd_tool = {} executable_name = Path(executable).name try: ocrd_tool = loads(run([executable, '--dump-json'], stdout=PIPE).stdout) except (JSONDecodeError, OSError) as e: getLogger('ocrd.utils.get_ocrd_tool_json').error(f'{executable} --dump-json produced invalid JSON: {e}') + ocrd_tool = {} if 'resource_locations' not in ocrd_tool: ocrd_tool['resource_locations'] = ['data', 'cwd', 'system', 'module'] return ocrd_tool From 1dc0ff703226c9a3c0f1901b88cd8cbecb95403a Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Thu, 4 Jul 2024 15:20:57 +0200 Subject: [PATCH 041/111] Update src/ocrd/resource_manager.py Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- src/ocrd/resource_manager.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/ocrd/resource_manager.py b/src/ocrd/resource_manager.py index ef274b339a..d2b57bc361 100644 --- a/src/ocrd/resource_manager.py +++ b/src/ocrd/resource_manager.py @@ -103,8 +103,7 @@ def list_available(self, executable=None, dynamic=True, name=None, database=None for exec_dir in environ['PATH'].split(':'): for exec_path in Path(exec_dir).glob(f'{executable}'): if not exec_path.name.startswith('ocrd-'): - self.log.debug(f"Not in 'ocrd-*' format, skipping '{exec_path}'") - continue + self.log.warning(f"OCR-D processor executable '{exec_path}' has no 'ocrd-' prefix") if exec_path.name in skip_executables: self.log.debug(f"No dump json available, skipping '{exec_path}'") continue From d7dc87cc6a1926d796f6449c099ae51ae09edb33 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Thu, 4 Jul 2024 15:21:17 +0200 Subject: [PATCH 042/111] Update src/ocrd/resource_manager.py Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- src/ocrd/resource_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd/resource_manager.py b/src/ocrd/resource_manager.py index d2b57bc361..0f16ba8ef6 100644 --- a/src/ocrd/resource_manager.py +++ b/src/ocrd/resource_manager.py @@ -105,7 +105,7 @@ def list_available(self, executable=None, dynamic=True, name=None, database=None if not exec_path.name.startswith('ocrd-'): self.log.warning(f"OCR-D processor executable '{exec_path}' has no 'ocrd-' prefix") if exec_path.name in skip_executables: - self.log.debug(f"No dump json available, skipping '{exec_path}'") + self.log.debug(f"Not an OCR-D processor CLI, skipping '{exec_path}'") continue self.log.debug(f"Inspecting '{exec_path} --dump-json' for resources") ocrd_tool = get_ocrd_tool_json(exec_path) From 7e5946e1cbf65baefcd3601fc4a67dbc35e7a327 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Thu, 4 Jul 2024 16:09:00 +0200 Subject: [PATCH 043/111] remove: unnecessary rmtree --- 
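A minimal sketch of the rationale behind dropping the cleanup, assuming download() already unpacks archives inside a context-managed temporary directory (not visible in this hunk; the names below are illustrative): the directory is deleted when the block exits, so a trailing rmtree() is redundant.

from pathlib import Path
from tempfile import TemporaryDirectory

with TemporaryDirectory() as tempdir:
    archive = Path(tempdir) / 'resource.tar.gz'
    # fetch the archive and copy path_in_archive out of tempdir here
# on leaving the block the directory is removed automatically, even if an exception was raised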
src/ocrd/resource_manager.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/ocrd/resource_manager.py b/src/ocrd/resource_manager.py index 7d5d8fc302..b927ae9be7 100644 --- a/src/ocrd/resource_manager.py +++ b/src/ocrd/resource_manager.py @@ -334,9 +334,6 @@ def download( copytree(path_in_archive, str(fpath)) else: copy(path_in_archive, str(fpath)) - if Path(tempdir).exists(): - log.info(f"Removing temp dir {tempdir}") - rmtree(tempdir) return fpath def _dedup_database(self, database=None, dedup_key='name'): From d5173ada7d052c107c04da8732ccd30f61c4d9a1 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 5 Jul 2024 10:59:32 +0200 Subject: [PATCH 044/111] skip downloading of existing models --- src/ocrd/resource_manager.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ocrd/resource_manager.py b/src/ocrd/resource_manager.py index b927ae9be7..5ac876ccc2 100644 --- a/src/ocrd/resource_manager.py +++ b/src/ocrd/resource_manager.py @@ -297,7 +297,8 @@ def download( if fpath.exists(): if not overwrite: fpath_type = 'Directory' if fpath.is_dir() else 'File' - raise FileExistsError(f"{fpath_type} {fpath} already exists but --overwrite is not set") + log.warning(f"{fpath_type} {fpath} already exists but --overwrite is not set, skipping the download") + # raise FileExistsError(f"{fpath_type} {fpath} already exists but --overwrite is not set") if fpath.is_dir(): log.info(f"Removing existing target directory {fpath}") rmtree(str(fpath)) From a0206a6a2c7e504b6fd732bb06c81c6dd2498797 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 5 Jul 2024 11:13:36 +0200 Subject: [PATCH 045/111] reduce nests inside _download_impl --- src/ocrd/resource_manager.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/ocrd/resource_manager.py b/src/ocrd/resource_manager.py index 5ac876ccc2..e05471af88 100644 --- a/src/ocrd/resource_manager.py +++ b/src/ocrd/resource_manager.py @@ -232,17 +232,17 @@ def parameter_usage(name, usage='as-is'): def _download_impl(url, filename, progress_cb=None, size=None): log = getLogger('ocrd.resource_manager._download_impl') log.info(f"Downloading {url} to {filename}") + gdrive_file_id, is_gdrive_download_link = gparse_url(url, warning=False) + if gdrive_file_id: + if not is_gdrive_download_link: + url = f"https://drive.google.com/uc?id={gdrive_file_id}" + try: + with requests.get(url, stream=True) as r: + if "Content-Disposition" not in r.headers: + url = get_url_from_gdrive_confirmation(r.text) + except RuntimeError as e: + log.warning("Cannot unwrap Google Drive URL: ", e) with open(filename, 'wb') as f: - gdrive_file_id, is_gdrive_download_link = gparse_url(url, warning=False) - if gdrive_file_id: - if not is_gdrive_download_link: - url = f"https://drive.google.com/uc?id={gdrive_file_id}" - try: - with requests.get(url, stream=True) as r: - if "Content-Disposition" not in r.headers: - url = get_url_from_gdrive_confirmation(r.text) - except RuntimeError as e: - log.warning("Cannot unwrap Google Drive URL: ", e) with requests.get(url, stream=True) as r: r.raise_for_status() for data in r.iter_content(chunk_size=4096): From 27503d7b78274ac19b573391706cdc4da5f51673 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 5 Jul 2024 11:25:11 +0200 Subject: [PATCH 046/111] fix: remove unfinished download files --- src/ocrd/resource_manager.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/src/ocrd/resource_manager.py b/src/ocrd/resource_manager.py index 
e05471af88..f8979d15e7 100644 --- a/src/ocrd/resource_manager.py +++ b/src/ocrd/resource_manager.py @@ -242,13 +242,18 @@ def _download_impl(url, filename, progress_cb=None, size=None): url = get_url_from_gdrive_confirmation(r.text) except RuntimeError as e: log.warning("Cannot unwrap Google Drive URL: ", e) - with open(filename, 'wb') as f: - with requests.get(url, stream=True) as r: - r.raise_for_status() - for data in r.iter_content(chunk_size=4096): - if progress_cb: - progress_cb(len(data)) - f.write(data) + try: + with open(filename, 'wb') as f: + with requests.get(url, stream=True) as r: + r.raise_for_status() + for data in r.iter_content(chunk_size=4096): + if progress_cb: + progress_cb(len(data)) + f.write(data) + except Exception as e: + rmtree(filename, ignore_errors=True) + Path(filename).unlink(missing_ok=True) + raise e @staticmethod def _copy_impl(src_filename, filename, progress_cb=None): From ceafe49f9813faa6778affc46d3bfb00457f3ec1 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 5 Jul 2024 11:29:05 +0200 Subject: [PATCH 047/111] fix: extend try block scope --- src/ocrd/resource_manager.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/ocrd/resource_manager.py b/src/ocrd/resource_manager.py index f8979d15e7..dd41cdcd84 100644 --- a/src/ocrd/resource_manager.py +++ b/src/ocrd/resource_manager.py @@ -232,17 +232,17 @@ def parameter_usage(name, usage='as-is'): def _download_impl(url, filename, progress_cb=None, size=None): log = getLogger('ocrd.resource_manager._download_impl') log.info(f"Downloading {url} to {filename}") - gdrive_file_id, is_gdrive_download_link = gparse_url(url, warning=False) - if gdrive_file_id: - if not is_gdrive_download_link: - url = f"https://drive.google.com/uc?id={gdrive_file_id}" - try: - with requests.get(url, stream=True) as r: - if "Content-Disposition" not in r.headers: - url = get_url_from_gdrive_confirmation(r.text) - except RuntimeError as e: - log.warning("Cannot unwrap Google Drive URL: ", e) try: + gdrive_file_id, is_gdrive_download_link = gparse_url(url, warning=False) + if gdrive_file_id: + if not is_gdrive_download_link: + url = f"https://drive.google.com/uc?id={gdrive_file_id}" + try: + with requests.get(url, stream=True) as r: + if "Content-Disposition" not in r.headers: + url = get_url_from_gdrive_confirmation(r.text) + except RuntimeError as e: + log.warning("Cannot unwrap Google Drive URL: ", e) with open(filename, 'wb') as f: with requests.get(url, stream=True) as r: r.raise_for_status() From 5f68c84800ae6cdda73498cff7cf9c0ddfd5ad74 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 5 Jul 2024 12:45:40 +0200 Subject: [PATCH 048/111] fix: return after the warning --- src/ocrd/resource_manager.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ocrd/resource_manager.py b/src/ocrd/resource_manager.py index dd41cdcd84..eaca9de94a 100644 --- a/src/ocrd/resource_manager.py +++ b/src/ocrd/resource_manager.py @@ -304,6 +304,7 @@ def download( fpath_type = 'Directory' if fpath.is_dir() else 'File' log.warning(f"{fpath_type} {fpath} already exists but --overwrite is not set, skipping the download") # raise FileExistsError(f"{fpath_type} {fpath} already exists but --overwrite is not set") + return if fpath.is_dir(): log.info(f"Removing existing target directory {fpath}") rmtree(str(fpath)) From 2d71442f39a8d85689db5392e447e540f0ea09e5 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 5 Jul 2024 12:47:37 +0200 Subject: [PATCH 049/111] return fpath not empty --- 
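A small sketch of the calling pattern this enables (function and argument names here are illustrative, not the actual resource manager API): returning the existing path instead of None when the download is skipped lets callers use the return value unconditionally.

from pathlib import Path

def ensure_resource(fpath: Path, fetch) -> Path:
    """Return fpath in every case, fetching it only if it does not exist yet."""
    if fpath.exists():
        # skip the download, but still hand back a usable path
        return fpath
    fetch(fpath)
    return fpath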
src/ocrd/resource_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd/resource_manager.py b/src/ocrd/resource_manager.py index eaca9de94a..c74940c161 100644 --- a/src/ocrd/resource_manager.py +++ b/src/ocrd/resource_manager.py @@ -304,7 +304,7 @@ def download( fpath_type = 'Directory' if fpath.is_dir() else 'File' log.warning(f"{fpath_type} {fpath} already exists but --overwrite is not set, skipping the download") # raise FileExistsError(f"{fpath_type} {fpath} already exists but --overwrite is not set") - return + return fpath if fpath.is_dir(): log.info(f"Removing existing target directory {fpath}") rmtree(str(fpath)) From b0eb13e5418c5f7ea2c065feace33b6f3a5017b3 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 5 Jul 2024 12:55:26 +0200 Subject: [PATCH 050/111] refactor: spacing --- src/ocrd/cli/resmgr.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/ocrd/cli/resmgr.py b/src/ocrd/cli/resmgr.py index 1289e498e1..6df09fbde6 100644 --- a/src/ocrd/cli/resmgr.py +++ b/src/ocrd/cli/resmgr.py @@ -13,18 +13,12 @@ import requests import click -from ocrd_utils import ( - initLogging, - directory_size, - getLogger, - get_ocrd_tool_json, - get_moduledir, - RESOURCE_LOCATIONS, -) +from ocrd_utils import directory_size, getLogger, get_moduledir, get_ocrd_tool_json, initLogging, RESOURCE_LOCATIONS from ocrd.constants import RESOURCE_USER_LIST_COMMENT from ..resource_manager import OcrdResourceManager + def print_resources(executable, reslist, resmgr): print('%s' % executable) for resdict in reslist: @@ -36,6 +30,7 @@ def print_resources(executable, reslist, resmgr): )) print() + @click.group("resmgr") def resmgr_cli(): """ @@ -43,6 +38,7 @@ def resmgr_cli(): """ initLogging() + @resmgr_cli.command('list-available') @click.option('-D', '--no-dynamic', is_flag=True, default=False, help="Whether to skip looking into each processor's --dump-{json,module-dir} for module-level resources") @click.option('-e', '--executable', help='Show only resources for executable beginning with EXEC', metavar='EXEC', default='ocrd-*') @@ -54,6 +50,7 @@ def list_available(executable, no_dynamic): for executable, reslist in resmgr.list_available(executable=executable, dynamic=not no_dynamic): print_resources(executable, reslist, resmgr) + @resmgr_cli.command('list-installed') @click.option('-e', '--executable', help='Show only resources for executable EXEC', metavar='EXEC') def list_installed(executable=None): @@ -64,6 +61,7 @@ def list_installed(executable=None): for executable, reslist in resmgr.list_installed(executable): print_resources(executable, reslist, resmgr) + @resmgr_cli.command('download') @click.option('-n', '--any-url', help='URL of unregistered resource to download/copy from', default='') @click.option('-D', '--no-dynamic', is_flag=True, default=False, help="Whether to skip looking into each processor's --dump-{json,module-dir} for module-level resources") @@ -172,6 +170,7 @@ def download(any_url, no_dynamic, resource_type, path_in_archive, allow_uninstal log.info(str(exc)) log.info("Use in parameters as '%s'", resmgr.parameter_usage(resdict['name'], usage=resdict.get('parameter_usage', 'as-is'))) + @resmgr_cli.command('migrate') @click.argument('migration', type=click.Choice(['2.37.0'])) def migrate(migration): From a9f96c1159b546ef713641e2b415148a5d0a2f3d Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 5 Jul 2024 13:17:01 +0200 Subject: [PATCH 051/111] refactor: p2 to p3 strings, line limit 120 --- 
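The pattern applied throughout this refactor, shown in simplified form on the print_resources() line (variable values are placeholders for illustration):

name, location, url, description = 'default', 'data', 'https://example.com/model', 'example resource'
# before: old-style %-formatting spread over several arguments
print('- %s @ %s (%s)\n  %s' % (name, location, url, description))
# after: the equivalent f-string, easier to keep under the 120-character line limit
print(f"- {name} @ {location} ({url})\n  {description}")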
src/ocrd/cli/resmgr.py | 76 +++++++++++++++++++++++------------------- 1 file changed, 42 insertions(+), 34 deletions(-) diff --git a/src/ocrd/cli/resmgr.py b/src/ocrd/cli/resmgr.py index 6df09fbde6..1493415601 100644 --- a/src/ocrd/cli/resmgr.py +++ b/src/ocrd/cli/resmgr.py @@ -20,14 +20,10 @@ def print_resources(executable, reslist, resmgr): - print('%s' % executable) + print(f"{executable}") for resdict in reslist: - print('- %s %s (%s)\n %s' % ( - resdict['name'], - '@ %s' % resmgr.resource_dir_to_location(resdict['path']) if 'path' in resdict else '', - resdict['url'], - resdict['description'] - )) + res_loc = resmgr.resource_dir_to_location(resdict['path']) if 'path' in resdict else '' + print(f"- {resdict['name']} @ {res_loc} ({resdict['url']})\n {resdict['description']}") print() @@ -40,8 +36,10 @@ def resmgr_cli(): @resmgr_cli.command('list-available') -@click.option('-D', '--no-dynamic', is_flag=True, default=False, help="Whether to skip looking into each processor's --dump-{json,module-dir} for module-level resources") -@click.option('-e', '--executable', help='Show only resources for executable beginning with EXEC', metavar='EXEC', default='ocrd-*') +@click.option('-D', '--no-dynamic', is_flag=True, default=False, + help="Whether to skip looking into each processor's --dump-{json,module-dir} for module-level resources") +@click.option('-e', '--executable', metavar='EXEC', default='ocrd-*', + help='Show only resources for executable beginning with EXEC', ) def list_available(executable, no_dynamic): """ List available resources @@ -63,16 +61,22 @@ def list_installed(executable=None): @resmgr_cli.command('download') -@click.option('-n', '--any-url', help='URL of unregistered resource to download/copy from', default='') -@click.option('-D', '--no-dynamic', is_flag=True, default=False, help="Whether to skip looking into each processor's --dump-{json,module-dir} for module-level resources") -@click.option('-t', '--resource-type', help='Type of resource', type=click.Choice(['file', 'directory', 'archive']), default='file') -@click.option('-P', '--path-in-archive', help='Path to extract in case of archive type', default='.') -@click.option('-a', '--allow-uninstalled', help="Allow installing resources for uninstalled processors", is_flag=True) +@click.option('-n', '--any-url', default='', help='URL of unregistered resource to download/copy from') +@click.option('-D', '--no-dynamic', default=False, is_flag=True, + help="Whether to skip looking into each processor's --dump-{json,module-dir} for module-level resources") +@click.option('-t', '--resource-type', type=click.Choice(['file', 'directory', 'archive']), default='file', + help='Type of resource',) +@click.option('-P', '--path-in-archive', default='.', help='Path to extract in case of archive type') +@click.option('-a', '--allow-uninstalled', is_flag=True, + help="Allow installing resources for uninstalled processors",) @click.option('-o', '--overwrite', help='Overwrite existing resources', is_flag=True) -@click.option('-l', '--location', help="Where to store resources - defaults to first location in processor's 'resource_locations' list or finally 'data'", type=click.Choice(RESOURCE_LOCATIONS)) +@click.option('-l', '--location', type=click.Choice(RESOURCE_LOCATIONS), + help="Where to store resources - defaults to first location in processor's 'resource_locations' " + "list or finally 'data'") @click.argument('executable', required=True) @click.argument('name', required=False) -def download(any_url, no_dynamic, resource_type, 
path_in_archive, allow_uninstalled, overwrite, location, executable, name): +def download(any_url, no_dynamic, resource_type, path_in_archive, allow_uninstalled, overwrite, location, executable, + name): """ Download resource NAME for processor EXECUTABLE. @@ -89,7 +93,7 @@ def download(any_url, no_dynamic, resource_type, path_in_archive, allow_uninstal log = getLogger('ocrd.cli.resmgr') resmgr = OcrdResourceManager() if executable != '*' and not name: - log.error("Unless EXECUTABLE ('%s') is the '*' wildcard, NAME is required" % executable) + log.error(f"Unless EXECUTABLE ('{executable}') is the '*' wildcard, NAME is required") sys.exit(1) elif executable == '*': executable = None @@ -99,19 +103,21 @@ def download(any_url, no_dynamic, resource_type, path_in_archive, allow_uninstal is_filename = Path(any_url).exists() if any_url else False if executable and not which(executable): if not allow_uninstalled: - log.error("Executable '%s' is not installed. " \ - "To download resources anyway, use the -a/--allow-uninstalled flag", executable) + log.error(f"Executable '{executable}' is not installed. " + f"To download resources anyway, use the -a/--allow-uninstalled flag") sys.exit(1) else: - log.info("Executable %s is not installed, but " \ - "downloading resources anyway", executable) + log.info(f"Executable '{executable}' is not installed, but downloading resources anyway", executable) reslist = resmgr.list_available(executable=executable, dynamic=not no_dynamic, name=name) if not any(r[1] for r in reslist): log.info(f"No resources {name} found in registry for executable {executable}") if executable and name: - reslist = [(executable, [{'url': any_url or '???', 'name': name, - 'type': resource_type, - 'path_in_archive': path_in_archive}])] + reslist = [(executable, [{ + 'url': any_url or '???', + 'name': name, + 'type': resource_type, + 'path_in_archive': path_in_archive}] + )] for this_executable, this_reslist in reslist: for resdict in this_reslist: if 'size' in resdict: @@ -121,15 +127,15 @@ def download(any_url, no_dynamic, resource_type, path_in_archive, allow_uninstal if any_url: resdict['url'] = any_url if resdict['url'] == '???': - log.warning("Cannot download user resource %s", resdict['name']) + log.warning(f"Cannot download user resource {resdict['name']}") continue if resdict['url'].startswith('https://') or resdict['url'].startswith('http://'): - log.info("Downloading %s resource '%s' (%s)", registered, resdict['name'], resdict['url']) + log.info(f"Downloading {registered} resource '{resdict['name']}' ({resdict['url']})") if 'size' not in resdict: with requests.head(resdict['url']) as r: resdict['size'] = int(r.headers.get('content-length', 0)) else: - log.info("Copying %s resource '%s' (%s)", registered, resdict['name'], resdict['url']) + log.info(f"Copying {registered} resource '{resdict['name']}' ({resdict['url']})") urlpath = Path(resdict['url']) resdict['url'] = str(urlpath.resolve()) if Path(urlpath).is_dir(): @@ -139,7 +145,8 @@ def download(any_url, no_dynamic, resource_type, path_in_archive, allow_uninstal if not location: location = get_ocrd_tool_json(this_executable)['resource_locations'][0] elif location not in get_ocrd_tool_json(this_executable)['resource_locations']: - log.error("The selected --location {location} is not in the {this_executable}'s resource search path, refusing to install to invalid location") + log.error(f"The selected --location {location} is not in the {this_executable}'s resource search path, " + f"refusing to install to invalid location") 
sys.exit(1) if location != 'module': basedir = resmgr.location_to_resource_dir(location) @@ -162,13 +169,15 @@ def download(any_url, no_dynamic, resource_type, path_in_archive, allow_uninstal progress_cb=lambda delta: bar.update(delta) ) if registered == 'unregistered': - log.info("%s resource '%s' (%s) not a known resource, creating stub in %s'", this_executable, name, any_url, resmgr.user_list) + log.info(f"{this_executable} resource '{name}' ({any_url}) not a known resource, creating stub " + f"in {resmgr.user_list}'") resmgr.add_to_user_database(this_executable, fpath, url=any_url) resmgr.save_user_list() - log.info("Installed resource %s under %s", resdict['url'], fpath) + log.info(f"Installed resource {resdict['url']} under {fpath}") except FileExistsError as exc: log.info(str(exc)) - log.info("Use in parameters as '%s'", resmgr.parameter_usage(resdict['name'], usage=resdict.get('parameter_usage', 'as-is'))) + log.info(f"Use in parameters as " + f"'{resmgr.parameter_usage(resdict['name'], usage=resdict.get('parameter_usage', 'as-is'))}'") @resmgr_cli.command('migrate') @@ -202,7 +211,6 @@ def migrate(migration): v_out = 'directory' resdict_out[k_out] = v_out yaml_out[executable].append(resdict_out) - resmgr.user_list.write_text(RESOURCE_USER_LIST_COMMENT + - '\n# migrated with ocrd resmgr migrate {migration}\n' + - safe_dump(yaml_out)) + resmgr.user_list.write_text( + RESOURCE_USER_LIST_COMMENT + '\n# migrated with ocrd resmgr migrate {migration}\n' + safe_dump(yaml_out)) log.info(f'Applied migration {migration} to {resmgr.user_list}') From 7ef7d94fd94d0d0aa6cff8ec5589afc49a4eba61 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 5 Jul 2024 13:50:04 +0200 Subject: [PATCH 052/111] fix: the additional param --- src/ocrd/cli/resmgr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd/cli/resmgr.py b/src/ocrd/cli/resmgr.py index 1493415601..6ddc9a91bf 100644 --- a/src/ocrd/cli/resmgr.py +++ b/src/ocrd/cli/resmgr.py @@ -107,7 +107,7 @@ def download(any_url, no_dynamic, resource_type, path_in_archive, allow_uninstal f"To download resources anyway, use the -a/--allow-uninstalled flag") sys.exit(1) else: - log.info(f"Executable '{executable}' is not installed, but downloading resources anyway", executable) + log.info(f"Executable '{executable}' is not installed, but downloading resources anyway") reslist = resmgr.list_available(executable=executable, dynamic=not no_dynamic, name=name) if not any(r[1] for r in reslist): log.info(f"No resources {name} found in registry for executable {executable}") From d309a8102c3a55609d53e6612fd66150dc5315ba Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 5 Jul 2024 14:41:14 +0200 Subject: [PATCH 053/111] fix: set exists_ok for makedirs --- src/ocrd/workspace.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd/workspace.py b/src/ocrd/workspace.py index 41ea8e9005..db090e2933 100644 --- a/src/ocrd/workspace.py +++ b/src/ocrd/workspace.py @@ -422,7 +422,7 @@ def add_file(self, file_grp, content=None, **kwargs) -> Union[OcrdFile, ClientSi # If the local filename has folder components, create those folders local_filename_dir = str(kwargs['local_filename']).rsplit('/', 1)[0] if local_filename_dir != str(kwargs['local_filename']) and not Path(local_filename_dir).is_dir(): - makedirs(local_filename_dir) + makedirs(local_filename_dir, exist_ok=True) # print(kwargs) kwargs["pageId"] = kwargs.pop("page_id") From 441cca90c61c982ebfb304ba2970dd5fa7e33a68 Mon Sep 17 00:00:00 2001 From: kba Date: Fri, 5 
Jul 2024 14:59:59 +0200 Subject: [PATCH 054/111] :memo: changelog --- CHANGELOG.md | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 46a9b83d7a..a3357c6dbf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,8 +5,19 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased -## [2.66.1] - 2024-06-26 +Changed: + + - Additional docker base images with preinstalled tensorflow 1 (`core-cuda-tf1`), tensorflow 2 (`core-cuda-tf2`) and torch (`core-cuda-torch`), #1239 + +Added: + - `ocrd process` does support `-U/--mets-server`, #1243 + +Fixed: + + - `ocrd process`-derived tasks are not run in a temporary directory when not called from within workspace, #1243 + +## [2.66.1] - 2024-06-26 Fixed: From 70e87571d41a9af27d7fbf424d3dc61972e62646 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 5 Jul 2024 15:05:32 +0200 Subject: [PATCH 055/111] fix: increase health check retries --- tests/network/docker-compose.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/network/docker-compose.yml b/tests/network/docker-compose.yml index a5cef49e23..ec45bc7e8a 100644 --- a/tests/network/docker-compose.yml +++ b/tests/network/docker-compose.yml @@ -22,7 +22,7 @@ services: test: echo 'db.runCommand("ping").ok' | mongosh localhost:27017/test --quiet interval: 1s timeout: 3s - retries: 30 + retries: 90 ocrd_network_rabbit_mq: image: "rabbitmq:3.12-management" @@ -42,7 +42,7 @@ services: test: rabbitmq-diagnostics check_port_connectivity interval: 1s timeout: 3s - retries: 30 + retries: 90 ocrd_network_processing_server: image: "ocrd_core_test" From ae93e6375d0e2c97c89fd87fa49a79394f2e973e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 24 Jun 2024 15:00:33 +0200 Subject: [PATCH 056/111] Processor: factor-out show_resource(), delegate to resolve_resource() --- src/ocrd/processor/base.py | 41 +++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 931d945d45..6dad78a3ff 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -131,27 +131,22 @@ def __init__( for res in self.list_all_resources(): print(res) return - if resolve_resource or show_resource: - initLogging() + if resolve_resource: try: - res_fname = self.resolve_resource(resolve_resource or show_resource) + res = self.resolve_resource(resolve_resource) + print(res) + except ResourceNotFoundError as e: + log = getLogger('ocrd.processor.base') + log.critical(e.message) + sys.exit(1) + return + if show_resource: + try: + self.show_resource(show_resource) except ResourceNotFoundError as e: log = getLogger('ocrd.processor.base') log.critical(e.message) sys.exit(1) - if resolve_resource: - print(res_fname) - return - fpath = Path(res_fname) - if fpath.is_dir(): - with pushd_popd(fpath): - fileobj = io.BytesIO() - with tarfile.open(fileobj=fileobj, mode='w:gz') as tarball: - tarball.add('.') - fileobj.seek(0) - copyfileobj(fileobj, sys.stdout.buffer) - else: - sys.stdout.buffer.write(fpath.read_bytes()) return if show_help: self.show_help(subcommand=subcommand) @@ -233,6 +228,7 @@ def resolve_resource(self, val): Args: val (string): resource value to resolve """ + initLogging() executable = self.ocrd_tool['executable'] log = getLogger('ocrd.processor.base') if exists(val): @@ -250,6 +246,19 @@ def resolve_resource(self, val): return ret[0] raise ResourceNotFoundError(val, executable) + def show_resource(self, 
val): + res_fname = self.resolve_resource(val) + fpath = Path(res_fname) + if fpath.is_dir(): + with pushd_popd(fpath): + fileobj = io.BytesIO() + with tarfile.open(fileobj=fileobj, mode='w:gz') as tarball: + tarball.add('.') + fileobj.seek(0) + copyfileobj(fileobj, sys.stdout.buffer) + else: + sys.stdout.buffer.write(fpath.read_bytes()) + def list_all_resources(self): """ List all resources found in the filesystem and matching content-type by filename suffix From 6bdbac5fdf5e207b5873e808854b47057237654d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 11 Jun 2024 00:36:29 +0200 Subject: [PATCH 057/111] fix #274: no default -I / -O --- src/ocrd/decorators/__init__.py | 2 ++ src/ocrd/decorators/ocrd_cli_options.py | 7 ++----- src/ocrd/processor/base.py | 7 ++----- 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/src/ocrd/decorators/__init__.py b/src/ocrd/decorators/__init__.py index 811587a10d..cbeadc8d7b 100644 --- a/src/ocrd/decorators/__init__.py +++ b/src/ocrd/decorators/__init__.py @@ -71,6 +71,8 @@ def ocrd_cli_wrap_processor( initLogging() LOG = getLogger('ocrd.cli_wrap_processor') + assert kwargs['input_file_grp'] is not None + assert kwargs['output_file_grp'] is not None # LOG.info('kwargs=%s' % kwargs) if 'parameter' in kwargs: # Disambiguate parameter file/literal, and resolve file diff --git a/src/ocrd/decorators/ocrd_cli_options.py b/src/ocrd/decorators/ocrd_cli_options.py index f329558388..e640a20032 100644 --- a/src/ocrd/decorators/ocrd_cli_options.py +++ b/src/ocrd/decorators/ocrd_cli_options.py @@ -29,11 +29,8 @@ def cli(mets_url): option('-m', '--mets', help="METS to process", default=DEFAULT_METS_BASENAME), option('-w', '--working-dir', help="Working Directory"), option('-U', '--mets-server-url', help="METS server URL. 
Starts with http:// then TCP, otherwise unix socket path"), - # TODO OCR-D/core#274 - # option('-I', '--input-file-grp', required=True), - # option('-O', '--output-file-grp', required=True), - option('-I', '--input-file-grp', default='INPUT'), - option('-O', '--output-file-grp', default='OUTPUT'), + option('-I', '--input-file-grp', default=None), + option('-O', '--output-file-grp', default=None), option('-g', '--page-id'), option('--overwrite', is_flag=True, default=False), option('--profile', is_flag=True, default=False), diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 6dad78a3ff..c31d992245 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -66,11 +66,8 @@ def __init__( workspace : Workspace, ocrd_tool=None, parameter=None, - # TODO OCR-D/core#274 - # input_file_grp=None, - # output_file_grp=None, - input_file_grp="INPUT", - output_file_grp="OUTPUT", + input_file_grp=None, + output_file_grp=None, page_id=None, resolve_resource=None, show_resource=None, From e2d14c420a239dfd09f7e82af189098069a7896d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 18 Jun 2024 11:19:20 +0200 Subject: [PATCH 058/111] workspace.download: fix typo in exception --- src/ocrd/workspace.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ocrd/workspace.py b/src/ocrd/workspace.py index 41ea8e9005..8ce42a070d 100644 --- a/src/ocrd/workspace.py +++ b/src/ocrd/workspace.py @@ -208,7 +208,7 @@ def download_file(self, f, _recursion_count=0): self.baseurl, f.local_filename) url = '%s/%s' % (self.baseurl, f.local_filename) else: - raise FileNotFoundError(f"'local_filename' {f.local_filename} points to non-existing file," + raise FileNotFoundError(f"'local_filename' {f.local_filename} points to non-existing file, " "and no 'url' to download and no 'baseurl' set on workspace - nothing we can do.") file_path = Path(f.local_filename) self.resolver.download_to_directory(self.directory, url, subdir=file_path.parent, basename=file_path.name) @@ -219,7 +219,7 @@ def download_file(self, f, _recursion_count=0): f.local_filename = self.resolver.download_to_directory(self.directory, f.url, subdir=f.fileGrp, basename=basename) return f # If neither f.local_filename nor f.url is set, fail - raise ValueError("OcrdFile {f} has neither 'url' nor 'local_filename', so cannot be downloaded") + raise ValueError(f"OcrdFile {f} has neither 'url' nor 'local_filename', so cannot be downloaded") def remove_file(self, file_id, force=False, keep_file=False, page_recursive=False, page_same_group=False): """ From 0979abd0fa1be11e10c57c4721608c444ce000c6 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 24 Jun 2024 13:50:36 +0200 Subject: [PATCH 059/111] ocrd_cli_wrap_processor: fix workspace arg (not a kwarg) --- src/ocrd/decorators/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ocrd/decorators/__init__.py b/src/ocrd/decorators/__init__.py index cbeadc8d7b..3d07957021 100644 --- a/src/ocrd/decorators/__init__.py +++ b/src/ocrd/decorators/__init__.py @@ -48,11 +48,11 @@ def ocrd_cli_wrap_processor( **kwargs ): if not sys.argv[1:]: - processorClass(workspace=None, show_help=True) + processorClass(None, show_help=True) sys.exit(1) if dump_json or dump_module_dir or help or version or show_resource or list_resources: processorClass( - workspace=None, + None, dump_json=dump_json, dump_module_dir=dump_module_dir, show_help=help, From 0e694e23d5f546352c96fc64e45825e0cb74e7b9 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky 
Date: Fri, 5 Jul 2024 17:29:55 +0200 Subject: [PATCH 060/111] processor parameter decorator for presets: avoid entering processing context --- src/ocrd/decorators/__init__.py | 6 +++++- src/ocrd/processor/base.py | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/ocrd/decorators/__init__.py b/src/ocrd/decorators/__init__.py index 3d07957021..580a75b0c0 100644 --- a/src/ocrd/decorators/__init__.py +++ b/src/ocrd/decorators/__init__.py @@ -76,7 +76,11 @@ def ocrd_cli_wrap_processor( # LOG.info('kwargs=%s' % kwargs) if 'parameter' in kwargs: # Disambiguate parameter file/literal, and resolve file - disposable = processorClass(workspace=None) + # (but avoid entering processing context of constructor) + class DisposableSubclass(processorClass): + def show_version(self): + pass + disposable = DisposableSubclass(None, show_version=True) def resolve(name): try: return disposable.resolve_resource(name) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index c31d992245..8303413933 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -116,8 +116,6 @@ def __init__( on stdout. """ self.ocrd_tool = ocrd_tool - if parameter is None: - parameter = {} if dump_json: print(json.dumps(ocrd_tool, indent=True)) return @@ -162,6 +160,8 @@ def __init__( self.input_file_grp = input_file_grp self.output_file_grp = output_file_grp self.page_id = None if page_id == [] or page_id is None else page_id + if parameter is None: + parameter = {} parameterValidator = ParameterValidator(ocrd_tool) report = parameterValidator.validate(parameter) if not report.is_valid: From d5cc111e918d6c1af859f4b648b753a644403f29 Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 9 Jul 2024 13:31:42 +0200 Subject: [PATCH 061/111] :memo: changelog --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index a3357c6dbf..917bd64194 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,11 @@ Added: Fixed: - `ocrd process`-derived tasks are not run in a temporary directory when not called from within workspace, #1243 + - regression from #1238 where processors failed that had required parameters, #1255, #1256 + +Removed: + + - Defaults for `-I/--input-file-grp`/`-O/--output-file-grp`, #1256, #274 ## [2.66.1] - 2024-06-26 From 0500504f6c1b2307522295b0f95c998aa88112f6 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 9 Jul 2024 13:35:56 +0200 Subject: [PATCH 062/111] Update src/ocrd/mets_server.py --- src/ocrd/mets_server.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index 7e06c58f92..0d4c0a0785 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -555,5 +555,3 @@ def is_socket_in_use(socket_path): return False client.close() return True - else: - return False From be840f3b7a2bebeb133aadbaa16f482a9ed26310 Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 9 Jul 2024 13:37:29 +0200 Subject: [PATCH 063/111] :memo: changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 917bd64194..d2d1e7d994 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,7 @@ Fixed: - `ocrd process`-derived tasks are not run in a temporary directory when not called from within workspace, #1243 - regression from #1238 where processors failed that had required parameters, #1255, #1256 + - METS Server: Unlink UDS sockert file if it exists before startup, #1244 Removed: From 0f7addf993547da0075ddf2998a0fa0a6aef8d89 Mon Sep 17 00:00:00 
2001 From: kba Date: Thu, 11 Jul 2024 11:52:07 +0200 Subject: [PATCH 064/111] :memo: changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index d2d1e7d994..e2e63b4c78 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ Versioned according to [Semantic Versioning](http://semver.org/). Changed: - Additional docker base images with preinstalled tensorflow 1 (`core-cuda-tf1`), tensorflow 2 (`core-cuda-tf2`) and torch (`core-cuda-torch`), #1239 + - Resource Manager: Skip instead of raise an exception download if target file already exists (unless `--overwrite`), #1246 Added: @@ -18,6 +19,7 @@ Fixed: - `ocrd process`-derived tasks are not run in a temporary directory when not called from within workspace, #1243 - regression from #1238 where processors failed that had required parameters, #1255, #1256 - METS Server: Unlink UDS sockert file if it exists before startup, #1244 + - Resource Manager: Do not create zero-size files for failing downloads, #1201, #1246 Removed: From f2c51a987c90962f4c37990cb4cef24aef8b60fa Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 11 Jul 2024 12:03:32 +0200 Subject: [PATCH 065/111] :memo: changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index e2e63b4c78..9153712c14 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,7 @@ Fixed: - regression from #1238 where processors failed that had required parameters, #1255, #1256 - METS Server: Unlink UDS sockert file if it exists before startup, #1244 - Resource Manager: Do not create zero-size files for failing downloads, #1201, #1246 + - Workspace.add_file: Allow multiple processors to create file group folders simultaneously, #1203, #1253 Removed: From d575540045cca98201eabc216cbd4a038030f8c3 Mon Sep 17 00:00:00 2001 From: kba Date: Mon, 15 Jul 2024 13:58:05 +0200 Subject: [PATCH 066/111] :memo: changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9153712c14..e2e1b04b67 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,7 @@ Fixed: - METS Server: Unlink UDS sockert file if it exists before startup, #1244 - Resource Manager: Do not create zero-size files for failing downloads, #1201, #1246 - Workspace.add_file: Allow multiple processors to create file group folders simultaneously, #1203, #1253 + - Resource Manager: Do not try to run `--dump-json` for known non-processors `ocrd-{cis-data,import,make}`, #1218, #1249 Removed: From 6874eec90372b05de83ed1be1d837706c218c547 Mon Sep 17 00:00:00 2001 From: kba Date: Mon, 15 Jul 2024 13:59:14 +0200 Subject: [PATCH 067/111] :memo: changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index e2e1b04b67..911003255b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,7 @@ Fixed: - Resource Manager: Do not create zero-size files for failing downloads, #1201, #1246 - Workspace.add_file: Allow multiple processors to create file group folders simultaneously, #1203, #1253 - Resource Manager: Do not try to run `--dump-json` for known non-processors `ocrd-{cis-data,import,make}`, #1218, #1249 + - Resource Manager: Properly handle copying of directories, #1237, #1248 Removed: From 87b3529ccae4180328ac75cdd49ab2d17ccc6da5 Mon Sep 17 00:00:00 2001 From: kba Date: Mon, 15 Jul 2024 14:07:30 +0200 Subject: [PATCH 068/111] :memo: changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 911003255b..78d787e16d 
100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ Changed: - Additional docker base images with preinstalled tensorflow 1 (`core-cuda-tf1`), tensorflow 2 (`core-cuda-tf2`) and torch (`core-cuda-torch`), #1239 - Resource Manager: Skip instead of raise an exception download if target file already exists (unless `--overwrite`), #1246 + - Resource Manager: Try to use bundled `ocrd-all-tool.json` if available, #1250, OCR-D/all#444 Added: From 648be59749ab5b7df228f204f196885027d6c322 Mon Sep 17 00:00:00 2001 From: kba Date: Mon, 15 Jul 2024 18:19:10 +0200 Subject: [PATCH 069/111] tests: missing import stmt --- tests/test_resource_manager.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_resource_manager.py b/tests/test_resource_manager.py index dadf43c446..653167e10a 100644 --- a/tests/test_resource_manager.py +++ b/tests/test_resource_manager.py @@ -192,6 +192,7 @@ def test_download_archive(tmp_path): def test_copy_impl(tmp_path): + from ocrd.resource_manager import OcrdResourceManager root_dir = f"{tmp_path}/mgr_copy_impl_test" root_dir_copied = f"{tmp_path}/mgr_copy_impl_test_copied" rmtree(path=root_dir, ignore_errors=True) From f0516380db7dbc4c3263fcf28e9dacd6c7dbda20 Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 16 Jul 2024 13:58:34 +0200 Subject: [PATCH 070/111] tests: disable resmgr --overwrite tests, #1246 --- tests/cli/test_resmgr.py | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/tests/cli/test_resmgr.py b/tests/cli/test_resmgr.py index 6cec6225b8..9c52100cba 100644 --- a/tests/cli/test_resmgr.py +++ b/tests/cli/test_resmgr.py @@ -21,7 +21,7 @@ def test_url_tool_name_unregistered(mgr_with_tmp_path): """ We should add a test for the -n URL TOOL NAME use-case as well (both as an unregistered resource and as URL-override). 
""" - tmp_path, mgr, env = mgr_with_tmp_path + _, mgr, env = mgr_with_tmp_path print(mgr.list_installed(executable)[0][1]) rsrcs_before = len(mgr.list_installed(executable)[0][1]) @@ -33,23 +33,28 @@ def test_url_tool_name_unregistered(mgr_with_tmp_path): rsrcs = mgr.list_installed(executable)[0][1] assert len(rsrcs) == rsrcs_before + 1 - assert rsrcs[0]['name'] == name - assert rsrcs[0]['url'] == url + assert rsrcs[-1]['name'] == name + assert rsrcs[-1]['url'] == url # add resource with different URL but same name url2 = url.replace('dzo', 'bos') - r = runner.invoke(resmgr_cli, ['download', '--allow-uninstalled', '--any-url', url2, executable, name], env=env) - assert 'already exists but --overwrite is not set' in r.output + # + # TODO(kba): Silently skipped since https://github.com/OCR-D/core/commit/d5173ada7d052c107c04da8732ccd30f61c4d9a1 + # so we'd need to check the log output which is not captured by + # CliRunner, even though `mix_stderr == True` + # + # r = runner.invoke(resmgr_cli, ['download', '--allow-uninstalled', '--any-url', url2, executable, name], env=env) + # assert 'already exists but --overwrite is not set' in r.output r = runner.invoke(resmgr_cli, ['download', '--overwrite', '--allow-uninstalled', '--any-url', url2, executable, name], env=env) - assert 'already exists but --overwrite is not set' not in r.output + # assert 'already exists but --overwrite is not set' not in r.output mgr.load_resource_list(mgr.user_list) rsrcs = mgr.list_installed(executable)[0][1] print(rsrcs) assert len(rsrcs) == rsrcs_before + 1 - assert rsrcs[0]['name'] == name - assert rsrcs[0]['url'] == url2 + assert rsrcs[-1]['name'] == name + assert rsrcs[-1]['url'] == url2 def test_directory_copy(mgr_with_tmp_path): """ @@ -76,13 +81,18 @@ def test_directory_copy(mgr_with_tmp_path): assert Path(mgr_path / 'ocrd-resources' / proc).exists() assert directory_size(mgr_path / 'ocrd-resources' / proc / res_name) == 30 - r = runner.invoke( - resmgr_cli, - ['download', '--allow-uninstalled', '--any-url', tmp_path, proc, res_name], - env=env, - catch_exceptions=False - ) - assert 'already exists but --overwrite is not set' in r.output + # + # TODO(kba): Silently skipped since https://github.com/OCR-D/core/commit/d5173ada7d052c107c04da8732ccd30f61c4d9a1 + # so we'd need to check the log output which is not captured by + # CliRunner, even though `mix_stderr == True` + # + # r = runner.invoke( + # resmgr_cli, + # ['download', '--allow-uninstalled', '--any-url', tmp_path, proc, res_name], + # env=env, + # catch_exceptions=False + # ) + # assert 'already exists but --overwrite is not set' in r.output r = runner.invoke( resmgr_cli, ['download', '--overwrite', '--allow-uninstalled', '--any-url', tmp_path, proc, res_name], From 2e60ecd97a69309828be62aad3b868cfce0be9c4 Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 16 Jul 2024 14:12:42 +0200 Subject: [PATCH 071/111] switch to active fork requests_unixsocket2, msabramo/requests-unixsocket#72 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 672f9bc66c..9ee13f4179 100644 --- a/requirements.txt +++ b/requirements.txt @@ -28,7 +28,7 @@ python-magic python-multipart pyyaml requests < 2.30 -requests_unixsocket +requests_unixsocket2 shapely uvicorn uvicorn>=0.17.6 From 1cb607e354db6610251650962ebace3d0e309921 Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 16 Jul 2024 14:18:20 +0200 Subject: [PATCH 072/111] circleci: use schema v2.1 --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 9c5ff83227..24c742aa68 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,4 +1,4 @@ -version: 2 +version: 2.1 orbs: python: circleci/python@2.0.3 From 68ec83e248b2126e38ac882f1da9f167ef0413a8 Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 16 Jul 2024 14:21:05 +0200 Subject: [PATCH 073/111] unpin requests version (< 2.30) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 9ee13f4179..feb18104ac 100644 --- a/requirements.txt +++ b/requirements.txt @@ -27,7 +27,7 @@ pydantic==1.* python-magic python-multipart pyyaml -requests < 2.30 +requests requests_unixsocket2 shapely uvicorn From 8b22cc92b61b6108d3e3e8f9fc01c130be35aa3f Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 16 Jul 2024 15:13:14 +0200 Subject: [PATCH 074/111] ci: try fixing scrutinizer --- .scrutinizer.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.scrutinizer.yml b/.scrutinizer.yml index 4848dca46a..4e510c5d74 100644 --- a/.scrutinizer.yml +++ b/.scrutinizer.yml @@ -11,7 +11,8 @@ build: analysis: dependencies: override: - - sudo make deps-ubuntu + - command: sudo make deps-ubuntu + idle_timeout: 600 - make install tests: override: From 5f2cd6a0542ee71fd37da75c2c28eefa48b21f25 Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 16 Jul 2024 15:24:17 +0200 Subject: [PATCH 075/111] ci: try fixing scrutinizer --- .scrutinizer.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.scrutinizer.yml b/.scrutinizer.yml index 4e510c5d74..c45bc82bc7 100644 --- a/.scrutinizer.yml +++ b/.scrutinizer.yml @@ -7,6 +7,8 @@ build: python: version: 3.8.2 virtualenv: true + variables: + DEBIAN_FRONTEND: noninteractive nodes: analysis: dependencies: From 6136d9c439fd695f8ebe0ad5d8adcca4db9d280e Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 16 Jul 2024 15:52:06 +0200 Subject: [PATCH 076/111] test_os: don't dehomify root --- tests/utils/test_os.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/utils/test_os.py b/tests/utils/test_os.py index 2bdf33d2ae..f6dbde3549 100644 --- a/tests/utils/test_os.py +++ b/tests/utils/test_os.py @@ -27,6 +27,8 @@ def tearDown(self): def test_resolve_basic(self): def dehomify(s): + if ENV['HOME'] == '/' or expanduser('~') == '/': + return s return s.replace(ENV['HOME'], '$HOME').replace(expanduser('~'), '$HOME') fname = 'foo.bar' cands = list_resource_candidates('ocrd-dummy', fname) From 1634d3fe8ff80b1490e685fac5643404cab21825 Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 16 Jul 2024 16:51:55 +0200 Subject: [PATCH 077/111] test_ocrd_mets: tests in docker have OCRD_METS_CACHING=1 always, unset --- tests/model/test_ocrd_mets.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/tests/model/test_ocrd_mets.py b/tests/model/test_ocrd_mets.py index 3bf342b8ef..5062fe2270 100644 --- a/tests/model/test_ocrd_mets.py +++ b/tests/model/test_ocrd_mets.py @@ -61,10 +61,11 @@ def test_unique_identifier_from_nothing(): def test_str(): - mets = OcrdMets(content='', cache_flag=False) - assert str(mets) == 'OcrdMets[cached=False,fileGrps=[],files=[]]' - mets_cached = OcrdMets(content='', cache_flag=True) - assert str(mets_cached) == 'OcrdMets[cached=True,fileGrps=[],files=[]]' + with temp_env_var('OCRD_METS_CACHING', None): + mets = OcrdMets(content='', cache_flag=False) + assert str(mets) == 'OcrdMets[cached=False,fileGrps=[],files=[]]' + mets_cached = OcrdMets(content='', cache_flag=True) + assert 
str(mets_cached) == 'OcrdMets[cached=True,fileGrps=[],files=[]]' def test_file_groups(sbb_sample_01): @@ -383,12 +384,15 @@ def test_invalid_filegrp(): @contextmanager def temp_env_var(k, v): v_before = environ.get(k, None) - environ[k] = v + if v == None: + environ.pop(k, None) + else: + environ[k] = v yield if v_before is not None: environ[k] = v_before else: - del environ[k] + environ.pop(k, None) def test_envvar(): assert OcrdMets(filename=assets.url_of('SBB0000F29300010000/data/mets.xml'), cache_flag=True)._cache_flag From 9bf0e7da84e368b01347230bcad834faacb4fbc3 Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 16 Jul 2024 17:24:56 +0200 Subject: [PATCH 078/111] test_ocrd_mets: tests in docker have OCRD_METS_CACHING=1 always, unset --- tests/model/test_ocrd_mets.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/model/test_ocrd_mets.py b/tests/model/test_ocrd_mets.py index 5062fe2270..739db7625a 100644 --- a/tests/model/test_ocrd_mets.py +++ b/tests/model/test_ocrd_mets.py @@ -395,8 +395,9 @@ def temp_env_var(k, v): environ.pop(k, None) def test_envvar(): - assert OcrdMets(filename=assets.url_of('SBB0000F29300010000/data/mets.xml'), cache_flag=True)._cache_flag - assert not OcrdMets(filename=assets.url_of('SBB0000F29300010000/data/mets.xml'), cache_flag=False)._cache_flag + with temp_env_var('OCRD_METS_CACHING', None): + assert OcrdMets(filename=assets.url_of('SBB0000F29300010000/data/mets.xml'), cache_flag=True)._cache_flag + assert not OcrdMets(filename=assets.url_of('SBB0000F29300010000/data/mets.xml'), cache_flag=False)._cache_flag with temp_env_var('OCRD_METS_CACHING', 'true'): assert OcrdMets(filename=assets.url_of('SBB0000F29300010000/data/mets.xml'), cache_flag=True)._cache_flag assert OcrdMets(filename=assets.url_of('SBB0000F29300010000/data/mets.xml'), cache_flag=False)._cache_flag From 11b76c9265e52af5347991e6825e43da0260bb4d Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 16 Jul 2024 17:36:27 +0200 Subject: [PATCH 079/111] logging: extract get_logging_config_files function, use to skip tests if logging.conf present --- src/ocrd_utils/__init__.py | 1 + src/ocrd_utils/logging.py | 22 ++++++++++++++-------- tests/test_decorators.py | 4 +++- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/src/ocrd_utils/__init__.py b/src/ocrd_utils/__init__.py index d03c2a920c..2055758a89 100644 --- a/src/ocrd_utils/__init__.py +++ b/src/ocrd_utils/__init__.py @@ -169,6 +169,7 @@ disableLogging, getLevelName, getLogger, + get_logging_config_files, initLogging, setOverrideLogLevel, ) diff --git a/src/ocrd_utils/logging.py b/src/ocrd_utils/logging.py index 6245f99b76..bb771fc0ce 100644 --- a/src/ocrd_utils/logging.py +++ b/src/ocrd_utils/logging.py @@ -128,6 +128,19 @@ def setOverrideLogLevel(lvl, silent=not config.OCRD_LOGGING_DEBUG): print(f'[LOGGING] Overriding ocrd log level to {lvl}', file=sys.stderr) ocrd_logger.setLevel(lvl) +def get_logging_config_files(): + """ + Return a list of all ``ocrd_logging.conf`` files found in CWD, HOME or /etc. 
+ """ + CONFIG_PATHS = [ + Path.cwd(), + Path.home(), + Path('/etc'), + ] + return [f for f \ + in [p / 'ocrd_logging.conf' for p in CONFIG_PATHS] \ + if f.exists()] + def initLogging(builtin_only=False, force_reinit=False, silent=not config.OCRD_LOGGING_DEBUG): """ Reset ``ocrd`` logger, read logging configuration if exists, otherwise use basicConfig @@ -164,14 +177,7 @@ def initLogging(builtin_only=False, force_reinit=False, silent=not config.OCRD_L config_file = None if not builtin_only: - CONFIG_PATHS = [ - Path.cwd(), - Path.home(), - Path('/etc'), - ] - config_file = [f for f \ - in [p / 'ocrd_logging.conf' for p in CONFIG_PATHS] \ - if f.exists()] + config_file = get_logging_config_files() if config_file: if len(config_file) > 1 and not silent: print(f"[LOGGING] Multiple logging configuration files found at {config_file}, using first one", file=sys.stderr) diff --git a/tests/test_decorators.py b/tests/test_decorators.py index 3c38f20789..5ab2880053 100644 --- a/tests/test_decorators.py +++ b/tests/test_decorators.py @@ -15,7 +15,7 @@ ocrd_loglevel, ocrd_cli_wrap_processor, ) # pylint: disable=protected-access -from ocrd_utils import pushd_popd, VERSION as OCRD_VERSION, disableLogging, initLogging +from ocrd_utils import pushd_popd, VERSION as OCRD_VERSION, disableLogging, initLogging, get_logging_config_files @click.command() @ocrd_cli_options @@ -60,6 +60,8 @@ def test_loglevel_invalid(self): assert "'foo' is not one of" in err def test_loglevel_override(self): + if get_logging_config_files(): + pytest.skip(f"ocrd_logging.conf found at {get_logging_config_files()}, skipping logging test") import logging disableLogging() assert logging.getLogger('ocrd').getEffectiveLevel() == logging.WARNING From 9ea8e953f5e98844cb2e34aeea813cb92bdb08d5 Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 16 Jul 2024 19:07:37 +0200 Subject: [PATCH 080/111] :fire: bashlib: quote output of parameter preset resolve result --- src/ocrd/lib.bash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd/lib.bash b/src/ocrd/lib.bash index a65ca35cd6..1e3ecfc6eb 100644 --- a/src/ocrd/lib.bash +++ b/src/ocrd/lib.bash @@ -146,7 +146,7 @@ ocrd__parse_argv () { -D|--dump-module-dir) echo $(dirname "$OCRD_TOOL_JSON"); exit ;; -C|--show-resource) ocrd__show_resource "$2"; exit ;; -L|--list-resources) ocrd__list_resources; exit ;; - -p|--parameter) __parameters+=(-p $(ocrd__resolve_resource "$2" 2>/dev/null || echo "$2")) ; shift ;; + -p|--parameter) __parameters+=(-p "$(ocrd__resolve_resource "$2" 2>/dev/null || echo "$2")") ; shift ;; -P|--parameter-override) __parameter_overrides+=(-P "$2" "$3") ; shift ; shift ;; -g|--page-id) ocrd__argv[page_id]=$2 ; shift ;; -O|--output-file-grp) ocrd__argv[output_file_grp]=$2 ; shift ;; From c6e918b8d52f773c4767c4bd95c8cd93cf158990 Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 16 Jul 2024 19:13:18 +0200 Subject: [PATCH 081/111] disable scrutinizer --- .scrutinizer.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.scrutinizer.yml b/.scrutinizer.yml index c45bc82bc7..83cb1c8521 100644 --- a/.scrutinizer.yml +++ b/.scrutinizer.yml @@ -13,9 +13,10 @@ build: analysis: dependencies: override: - - command: sudo make deps-ubuntu - idle_timeout: 600 - - make install + - echo "Skipped" + # - command: sudo make deps-ubuntu + # idle_timeout: 600 + # - make install tests: override: - py-scrutinizer-run From 204b588deb8fcdc34eced8ab5da0440b20293ab4 Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 16 Jul 2024 19:36:08 +0200 Subject: [PATCH 082/111] 
:memo: changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 78d787e16d..ddc92fed8e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,7 @@ Fixed: - Workspace.add_file: Allow multiple processors to create file group folders simultaneously, #1203, #1253 - Resource Manager: Do not try to run `--dump-json` for known non-processors `ocrd-{cis-data,import,make}`, #1218, #1249 - Resource Manager: Properly handle copying of directories, #1237, #1248 + - bashlib: regression in parsing JSON from introducing parameter preset files, #1258 Removed: From 719bbc7d526a9876a816d4bc522e9255bb654152 Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 16 Jul 2024 19:37:08 +0200 Subject: [PATCH 083/111] :package: v2.67.0 --- CHANGELOG.md | 3 +++ VERSION | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ddc92fed8e..3a373e748b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +## [2.67.0] - 2024-07-16 + Changed: - Additional docker base images with preinstalled tensorflow 1 (`core-cuda-tf1`), tensorflow 2 (`core-cuda-tf2`) and torch (`core-cuda-torch`), #1239 @@ -2125,6 +2127,7 @@ Fixed Initial Release +[2.67.0]: ../../compare/v2.67.0..v2.66.1 [2.66.1]: ../../compare/v2.66.1..v2.66.0 [2.66.0]: ../../compare/v2.66.0..v2.65.0 [2.65.0]: ../../compare/v2.65.0..v2.64.1 diff --git a/VERSION b/VERSION index 64fb9e5e48..ed283c8621 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.66.1 +2.67.0 From 0f37b763fc91453b53c1dc1745e1be5a4a465ab6 Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 16 Jul 2024 19:47:46 +0200 Subject: [PATCH 084/111] makefile: missing ;then --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index c80dffffe3..3daffddfc7 100644 --- a/Makefile +++ b/Makefile @@ -151,7 +151,7 @@ deps-tf1: fi deps-tf2: - if $(PYTHON) -c 'import sys; print("%u.%u" % (sys.version_info.major, sys.version_info.minor))' | fgrep 3.8 && \ + if $(PYTHON) -c 'import sys; print("%u.%u" % (sys.version_info.major, sys.version_info.minor))' | fgrep 3.8; then \ $(PIP) install tensorflow; \ else \ $(PIP) install "tensorflow[and-cuda]"; \ From fb29d8608b67635e5916b055a513976826c9b7a7 Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 16 Jul 2024 19:57:15 +0200 Subject: [PATCH 085/111] ci: workaround env interpolation issue --- .github/workflows/docker-image.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml index 0c6501020f..d304336761 100644 --- a/.github/workflows/docker-image.yml +++ b/.github/workflows/docker-image.yml @@ -20,7 +20,9 @@ jobs: contents: read env: DOCKER_BASE_TAG: ghcr.io/ocr-d docker.io/ocrd - DOCKER_BUILD: docker buildx build --progress=plain --platform ${{ env.PLATFORMS }} --push + # TODO(kba): make the interpolation work correctly + # DOCKER_BUILD: docker buildx build --progress=plain --platform ${{ env.PLATFORMS }} --push + DOCKER_BUILD: docker buildx build --progress=plain --platform linux/amd64,linux/arm64/v8,linux/ppc64le --push steps: - name: Export variables run: | From 061228c08e6d10a6aee71fd0e7c6f34b98fdef8f Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 16 Jul 2024 20:05:43 +0200 Subject: [PATCH 086/111] ci: gha env var quote fix --- .github/workflows/docker-image.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/docker-image.yml 
b/.github/workflows/docker-image.yml index d304336761..dc532a4a95 100644 --- a/.github/workflows/docker-image.yml +++ b/.github/workflows/docker-image.yml @@ -26,8 +26,8 @@ jobs: steps: - name: Export variables run: | - echo "DOCKER_BASE_TAG='${{ env.DOCKER_BASE_TAG }}'" >> $GITHUB_ENV - echo "DOCKER_BUILD='${{ env.DOCKER_BUILD }}'" >> $GITHUB_ENV + echo "DOCKER_BASE_TAG=${{ env.DOCKER_BASE_TAG }}" >> $GITHUB_ENV + echo "DOCKER_BUILD=${{ env.DOCKER_BUILD }}" >> $GITHUB_ENV - name: Checkout uses: actions/checkout@v4 with: From a7962143131bf07caa151a47750df01fcedef67a Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 16 Jul 2024 20:41:58 +0200 Subject: [PATCH 087/111] ci: gha: disable ppc build b/c it hangs on dependencies --- .github/workflows/docker-image.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml index dc532a4a95..09428e5836 100644 --- a/.github/workflows/docker-image.yml +++ b/.github/workflows/docker-image.yml @@ -22,7 +22,8 @@ jobs: DOCKER_BASE_TAG: ghcr.io/ocr-d docker.io/ocrd # TODO(kba): make the interpolation work correctly # DOCKER_BUILD: docker buildx build --progress=plain --platform ${{ env.PLATFORMS }} --push - DOCKER_BUILD: docker buildx build --progress=plain --platform linux/amd64,linux/arm64/v8,linux/ppc64le --push + # TODO(kba): Investigate why ppc64le build hangs on "Installing build dependencies" + DOCKER_BUILD: docker buildx build --progress=plain --platform linux/amd64,linux/arm64/v8 --push steps: - name: Export variables run: | From 76323db7947213d3be199b814da79e9705abc705 Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 16 Jul 2024 21:14:14 +0200 Subject: [PATCH 088/111] :fire: make docker: use only one base image --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 3daffddfc7..3d3ee36fcc 100644 --- a/Makefile +++ b/Makefile @@ -396,7 +396,7 @@ docker-cuda-torch: DOCKER_FILE = Dockerfile.cuda-torch docker-cuda-torch: docker-cuda docker docker-cuda docker-cuda-tf1 docker-cuda-tf2 docker-cuda-torch: - $(DOCKER_BUILD) -f $(DOCKER_FILE) $(DOCKER_TAG:%=-t %) --target ocrd_core_base --build-arg BASE_IMAGE=$(DOCKER_BASE_IMAGE) $(DOCKER_ARGS) . + @echo $(DOCKER_BUILD) -f $(DOCKER_FILE) $(DOCKER_TAG:%=-t %) --target ocrd_core_base --build-arg BASE_IMAGE=$(lastword $(DOCKER_BASE_IMAGE)) $(DOCKER_ARGS) . # Build wheels and source dist and twine upload them pypi: build From 6a1058b1bb255dbb524a1fb8cb1502cc6497fdd5 Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 16 Jul 2024 21:16:19 +0200 Subject: [PATCH 089/111] :bug: missed dbg stmt --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 3d3ee36fcc..e9cb9ca169 100644 --- a/Makefile +++ b/Makefile @@ -396,7 +396,7 @@ docker-cuda-torch: DOCKER_FILE = Dockerfile.cuda-torch docker-cuda-torch: docker-cuda docker docker-cuda docker-cuda-tf1 docker-cuda-tf2 docker-cuda-torch: - @echo $(DOCKER_BUILD) -f $(DOCKER_FILE) $(DOCKER_TAG:%=-t %) --target ocrd_core_base --build-arg BASE_IMAGE=$(lastword $(DOCKER_BASE_IMAGE)) $(DOCKER_ARGS) . + $(DOCKER_BUILD) -f $(DOCKER_FILE) $(DOCKER_TAG:%=-t %) --target ocrd_core_base --build-arg BASE_IMAGE=$(lastword $(DOCKER_BASE_IMAGE)) $(DOCKER_ARGS) . 
# Build wheels and source dist and twine upload them pypi: build From c2d6e477c75c9278f0ba5d245ff31d73ce2d3b56 Mon Sep 17 00:00:00 2001 From: kba Date: Wed, 17 Jul 2024 11:46:00 +0200 Subject: [PATCH 090/111] disable arm64 build for now --- .github/workflows/docker-image.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml index 09428e5836..da01c8cb20 100644 --- a/.github/workflows/docker-image.yml +++ b/.github/workflows/docker-image.yml @@ -23,7 +23,8 @@ jobs: # TODO(kba): make the interpolation work correctly # DOCKER_BUILD: docker buildx build --progress=plain --platform ${{ env.PLATFORMS }} --push # TODO(kba): Investigate why ppc64le build hangs on "Installing build dependencies" - DOCKER_BUILD: docker buildx build --progress=plain --platform linux/amd64,linux/arm64/v8 --push + # TODO(kba): Investigate why arm64 fails with .buildkit_qemu_emulator: /usr/local/bin/conda: Invalid ELF image for this architecture + DOCKER_BUILD: docker buildx build --progress=plain --platform linux/amd64 --push steps: - name: Export variables run: | From 92b217e36b68891724efe030542378ca995e4178 Mon Sep 17 00:00:00 2001 From: kba Date: Wed, 17 Jul 2024 12:18:18 +0200 Subject: [PATCH 091/111] :package: v2.67.1 --- CHANGELOG.md | 7 +++++++ VERSION | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3a373e748b..ad18302ad2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,12 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +## [2.67.1] - 2024-07-17 + +Fixed: + + - Build and tests fixed, no functional changes from #1258 + ## [2.67.0] - 2024-07-16 Changed: @@ -2127,6 +2133,7 @@ Fixed Initial Release +[2.67.1]: ../../compare/v2.67.1..v2.67.0 [2.67.0]: ../../compare/v2.67.0..v2.66.1 [2.66.1]: ../../compare/v2.66.1..v2.66.0 [2.66.0]: ../../compare/v2.66.0..v2.65.0 diff --git a/VERSION b/VERSION index ed283c8621..9db2e998b3 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.67.0 +2.67.1 From 7dabe0046038f387254d193e81efc1cbc1122788 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 17 Jul 2024 12:55:09 +0200 Subject: [PATCH 092/111] load tool json locally --- src/ocrd_network/constants.py | 2 +- src/ocrd_network/processing_server.py | 8 ++++---- src/ocrd_network/utils.py | 14 +++++--------- 3 files changed, 10 insertions(+), 14 deletions(-) diff --git a/src/ocrd_network/constants.py b/src/ocrd_network/constants.py index 53dbd9b11b..bfa137d9e0 100644 --- a/src/ocrd_network/constants.py +++ b/src/ocrd_network/constants.py @@ -6,7 +6,7 @@ DOCKER_RABBIT_MQ_FEATURES = "quorum_queue,implicit_default_bindings,classic_mirrored_queue_version" NETWORK_PROTOCOLS = ["http://", "https://"] -OCRD_ALL_JSON_TOOLS_URL = "https://ocr-d.de/js/ocrd-all-tool.json" +OCRD_ALL_TOOL_JSON = "ocrd-all-tool.json" # Used as a placeholder to lock all pages when no page_id is specified SERVER_ALL_PAGES_PLACEHOLDER = "all_pages" diff --git a/src/ocrd_network/processing_server.py b/src/ocrd_network/processing_server.py index dbbdea6475..e142802268 100644 --- a/src/ocrd_network/processing_server.py +++ b/src/ocrd_network/processing_server.py @@ -10,7 +10,7 @@ from ocrd.task_sequence import ProcessorTask from ocrd_utils import initLogging, getLogger -from .constants import AgentType, JobState, OCRD_ALL_JSON_TOOLS_URL, ServerApiTags +from .constants import AgentType, JobState, ServerApiTags from .database import ( initiate_database, db_get_processing_job, @@ -58,7 +58,7 
@@ ) from .tcp_to_uds_mets_proxy import MetsServerProxy from .utils import ( - download_ocrd_all_tool_json, + load_ocrd_all_tool_json, expand_page_ids, generate_id, generate_workflow_content, @@ -90,8 +90,8 @@ def __init__(self, config_path: str, host: str, port: int) -> None: log_file = get_processing_server_logging_file_path(pid=getpid()) configure_file_handler_with_formatter(self.log, log_file=log_file, mode="a") - self.log.info(f"Downloading ocrd all tool json") - self.ocrd_all_tool_json = download_ocrd_all_tool_json(ocrd_all_url=OCRD_ALL_JSON_TOOLS_URL) + self.log.info(f"Loading ocrd all tool json") + self.ocrd_all_tool_json = load_ocrd_all_tool_json() self.hostname = host self.port = port diff --git a/src/ocrd_network/utils.py b/src/ocrd_network/utils.py index e7a07fa9d9..e0f3570a47 100644 --- a/src/ocrd_network/utils.py +++ b/src/ocrd_network/utils.py @@ -3,6 +3,7 @@ from fastapi import UploadFile from functools import wraps from hashlib import md5 +from json import loads from pathlib import Path from re import compile as re_compile, split as re_split from requests import get as requests_get, Session as Session_TCP @@ -14,7 +15,8 @@ from ocrd.resolver import Resolver from ocrd.workspace import Workspace from ocrd.mets_server import MpxReq -from ocrd_utils import config, generate_range, REGEX_PREFIX, safe_filename, getLogger +from ocrd_utils import config, generate_range, REGEX_PREFIX, safe_filename, getLogger, resource_string +from .constants import OCRD_ALL_TOOL_JSON from .rabbitmq_utils import OcrdResultMessage @@ -92,14 +94,8 @@ def is_url_responsive(url: str, tries: int = 1, wait_time: int = 3) -> bool: return False -def download_ocrd_all_tool_json(ocrd_all_url: str): - if not ocrd_all_url: - raise ValueError(f"The URL of ocrd all tool json is empty") - headers = {"Accept": "application/json"} - response = Session_TCP().get(ocrd_all_url, headers=headers) - if not response.status_code == 200: - raise ValueError(f"Failed to download ocrd all tool json from: '{ocrd_all_url}'") - return response.json() +def load_ocrd_all_tool_json(): + return loads(resource_string('ocrd', OCRD_ALL_TOOL_JSON)) def post_to_callback_url(logger, callback_url: str, result_message: OcrdResultMessage): From 083a2cf741fb200e598575b92d33090af67c0b58 Mon Sep 17 00:00:00 2001 From: kba Date: Wed, 17 Jul 2024 13:48:19 +0200 Subject: [PATCH 093/111] restrict multiprocessing.set_start_method call to OSX --- tests/cli/test_process.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/cli/test_process.py b/tests/cli/test_process.py index 32fc02fed0..d123e857bd 100644 --- a/tests/cli/test_process.py +++ b/tests/cli/test_process.py @@ -4,7 +4,9 @@ from contextlib import ExitStack from multiprocessing import Process, set_start_method # necessary for macos -set_start_method("fork") +from sys import platform +if platform == "darwin": + set_start_method("fork") from ocrd import Resolver, Workspace, OcrdMetsServer from ocrd.cli import process_cli From 66e1e8bddf266b45a287de968beee5f7a2cc054e Mon Sep 17 00:00:00 2001 From: kba Date: Fri, 19 Jul 2024 14:57:02 +0200 Subject: [PATCH 094/111] :memo: changelog --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index ad18302ad2..8ba32e427f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,10 @@ Versioned according to [Semantic Versioning](http://semver.org/). 
## Unreleased +Fixed: + + * Run `multiprocessing.set_start_method('fork')` only for OSX, #1261 + ## [2.67.1] - 2024-07-17 Fixed: From 81c0b4c01eb1db54999c5db8feae120df2cc1a7b Mon Sep 17 00:00:00 2001 From: kba Date: Fri, 19 Jul 2024 14:58:28 +0200 Subject: [PATCH 095/111] :package: v2.67.2 --- CHANGELOG.md | 4 ++++ Makefile | 2 +- VERSION | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8ba32e427f..fcd01d11b3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,9 +5,12 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +## [2.67.2] - 2024-07-19 + Fixed: * Run `multiprocessing.set_start_method('fork')` only for OSX, #1261 + * Broken PyPI release, #1262 ## [2.67.1] - 2024-07-17 @@ -2137,6 +2140,7 @@ Fixed Initial Release +[2.67.2]: ../../compare/v2.67.2..v2.67.1 [2.67.1]: ../../compare/v2.67.1..v2.67.0 [2.67.0]: ../../compare/v2.67.0..v2.66.1 [2.66.1]: ../../compare/v2.66.1..v2.66.0 diff --git a/Makefile b/Makefile index e9cb9ca169..0608b0b738 100644 --- a/Makefile +++ b/Makefile @@ -400,7 +400,7 @@ docker docker-cuda docker-cuda-tf1 docker-cuda-tf2 docker-cuda-torch: # Build wheels and source dist and twine upload them pypi: build - twine upload dist/ocrd-$(VERSION)*{tar.gz,whl} + twine upload --verbose dist/ocrd-$(VERSION)*{tar.gz,whl} pypi-workaround: build-workaround for dist in $(BUILD_ORDER);do twine upload dist/$$dist-$(VERSION)*{tar.gz,whl};done diff --git a/VERSION b/VERSION index 9db2e998b3..456312b41f 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.67.1 +2.67.2 From 7382c46ff1eb6825c4e8ea20f1ae5638fece8a92 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Mon, 29 Jul 2024 13:05:24 +0200 Subject: [PATCH 096/111] download tool json if missing --- src/ocrd_network/constants.py | 1 + src/ocrd_network/utils.py | 15 ++++++++++++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/src/ocrd_network/constants.py b/src/ocrd_network/constants.py index bfa137d9e0..f3d2de1247 100644 --- a/src/ocrd_network/constants.py +++ b/src/ocrd_network/constants.py @@ -7,6 +7,7 @@ NETWORK_PROTOCOLS = ["http://", "https://"] OCRD_ALL_TOOL_JSON = "ocrd-all-tool.json" +OCRD_ALL_TOOL_JSON_URL = "https://ocr-d.de/js/ocrd-all-tool.json" # Used as a placeholder to lock all pages when no page_id is specified SERVER_ALL_PAGES_PLACEHOLDER = "all_pages" diff --git a/src/ocrd_network/utils.py b/src/ocrd_network/utils.py index e0f3570a47..56d35558ef 100644 --- a/src/ocrd_network/utils.py +++ b/src/ocrd_network/utils.py @@ -16,7 +16,7 @@ from ocrd.workspace import Workspace from ocrd.mets_server import MpxReq from ocrd_utils import config, generate_range, REGEX_PREFIX, safe_filename, getLogger, resource_string -from .constants import OCRD_ALL_TOOL_JSON +from .constants import OCRD_ALL_TOOL_JSON, OCRD_ALL_TOOL_JSON_URL from .rabbitmq_utils import OcrdResultMessage @@ -94,8 +94,17 @@ def is_url_responsive(url: str, tries: int = 1, wait_time: int = 3) -> bool: return False -def load_ocrd_all_tool_json(): - return loads(resource_string('ocrd', OCRD_ALL_TOOL_JSON)) +def load_ocrd_all_tool_json(download_if_missing: bool = True): + try: + ocrd_all_tool_json = loads(resource_string('ocrd', OCRD_ALL_TOOL_JSON)) + except Exception as error: + if not download_if_missing: + raise Exception(error) + response = Session_TCP().get(OCRD_ALL_TOOL_JSON_URL, headers={"Accept": "application/json"}) + if not response.status_code == 200: + raise ValueError(f"Failed to download ocrd all tool json from: '{OCRD_ALL_TOOL_JSON_URL}'") + 
ocrd_all_tool_json = response.json() + return ocrd_all_tool_json def post_to_callback_url(logger, callback_url: str, result_message: OcrdResultMessage): From 035b4ea526a1a88a0f75d7ef54f37f2f47c36a56 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Mon, 29 Jul 2024 13:16:22 +0200 Subject: [PATCH 097/111] add: default ocrd-all-tool.json --- src/ocrd/ocrd-all-tool.json | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 src/ocrd/ocrd-all-tool.json diff --git a/src/ocrd/ocrd-all-tool.json b/src/ocrd/ocrd-all-tool.json new file mode 100644 index 0000000000..fee8e7ef62 --- /dev/null +++ b/src/ocrd/ocrd-all-tool.json @@ -0,0 +1,21 @@ +{ + "ocrd-dummy": { + "executable": "ocrd-dummy", + "description": "Bare-bones processor creates PAGE-XML and optionally copies file from input group to output group", + "steps": [ + "preprocessing/optimization" + ], + "categories": [ + "Image preprocessing" + ], + "input_file_grp": "DUMMY_INPUT", + "output_file_grp": "DUMMY_OUTPUT", + "parameters": { + "copy_files": { + "type": "boolean", + "default": false, + "description": "Whether to actually copy files (true) or just create PAGE-XML as a side effect (false)" + } + } + } +} \ No newline at end of file From ccfaf100b1ceb11d96a74b2f23bf45f2e0a9f446 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Mon, 29 Jul 2024 13:42:39 +0200 Subject: [PATCH 098/111] remove downloading tool json --- src/ocrd_network/utils.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/ocrd_network/utils.py b/src/ocrd_network/utils.py index 56d35558ef..babd576956 100644 --- a/src/ocrd_network/utils.py +++ b/src/ocrd_network/utils.py @@ -94,16 +94,11 @@ def is_url_responsive(url: str, tries: int = 1, wait_time: int = 3) -> bool: return False -def load_ocrd_all_tool_json(download_if_missing: bool = True): +def load_ocrd_all_tool_json(): try: ocrd_all_tool_json = loads(resource_string('ocrd', OCRD_ALL_TOOL_JSON)) except Exception as error: - if not download_if_missing: - raise Exception(error) - response = Session_TCP().get(OCRD_ALL_TOOL_JSON_URL, headers={"Accept": "application/json"}) - if not response.status_code == 200: - raise ValueError(f"Failed to download ocrd all tool json from: '{OCRD_ALL_TOOL_JSON_URL}'") - ocrd_all_tool_json = response.json() + raise ValueError(f"Failed to load ocrd all tool json from: '{OCRD_ALL_TOOL_JSON}', {error}") return ocrd_all_tool_json From 1af0cc186ae373beccd320355789064978d22a78 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Mon, 29 Jul 2024 13:47:38 +0200 Subject: [PATCH 099/111] set: paramiko logging to ERROR --- src/ocrd_utils/ocrd_logging.conf | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/ocrd_utils/ocrd_logging.conf b/src/ocrd_utils/ocrd_logging.conf index 93e311a882..bc477d9a06 100644 --- a/src/ocrd_utils/ocrd_logging.conf +++ b/src/ocrd_utils/ocrd_logging.conf @@ -11,7 +11,7 @@ # each logger requires a corresponding configuration section below # [loggers] -keys=root,ocrd,ocrd_network,ocrd_tensorflow,ocrd_shapely_geos,ocrd_PIL,uvicorn,uvicorn_access,uvicorn_error,multipart +keys=root,ocrd,ocrd_network,ocrd_tensorflow,ocrd_shapely_geos,ocrd_PIL,uvicorn,uvicorn_access,uvicorn_error,multipart,paramiko,paramiko_transport # # mandatory handlers section @@ -91,6 +91,19 @@ level=INFO handlers=consoleHandler qualname=PIL +# +# paramiko loggers +# +[logger_paramiko] +level=ERROR +handlers=consoleHandler +qualname=paramiko + +[logger_paramiko_transport] +level=ERROR +handlers=consoleHandler 
+qualname=paramiko.transport + # # uvicorn loggers # From be133ea0c0f6b17d5079681f5665038f10dc4c1b Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Mon, 29 Jul 2024 13:51:45 +0200 Subject: [PATCH 100/111] set: propagate 0, logging config --- src/ocrd_utils/ocrd_logging.conf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/ocrd_utils/ocrd_logging.conf b/src/ocrd_utils/ocrd_logging.conf index bc477d9a06..60925acae8 100644 --- a/src/ocrd_utils/ocrd_logging.conf +++ b/src/ocrd_utils/ocrd_logging.conf @@ -98,11 +98,13 @@ qualname=PIL level=ERROR handlers=consoleHandler qualname=paramiko +propagate=0 [logger_paramiko_transport] level=ERROR handlers=consoleHandler qualname=paramiko.transport +propagate=0 # # uvicorn loggers From 379f3a47313c57f0c3a9730b94ee23fb207f5f9f Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Mon, 29 Jul 2024 15:35:39 +0200 Subject: [PATCH 101/111] fix: supress paramiko warnings --- requirements.txt | 2 +- requirements_test.txt | 1 + src/ocrd_network/runtime_data/connection_clients.py | 8 +++++++- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index feb18104ac..f748a06057 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ atomicwrites >= 1.3.0 beanie~=1.7 click >=7 +cryptography >= 43.0.0 Deprecated == 1.2.0 docker fastapi>=0.78.0 @@ -32,4 +33,3 @@ requests_unixsocket2 shapely uvicorn uvicorn>=0.17.6 - diff --git a/requirements_test.txt b/requirements_test.txt index 0f0e5b97d4..be2ba65bca 100644 --- a/requirements_test.txt +++ b/requirements_test.txt @@ -1,4 +1,5 @@ autopep8 +cryptography >= 43.0.0 pytest >= 4.0.0 generateDS == 2.35.20 pytest-benchmark >= 3.2.3 diff --git a/src/ocrd_network/runtime_data/connection_clients.py b/src/ocrd_network/runtime_data/connection_clients.py index 67002a498f..ab2e48b144 100644 --- a/src/ocrd_network/runtime_data/connection_clients.py +++ b/src/ocrd_network/runtime_data/connection_clients.py @@ -1,7 +1,13 @@ from __future__ import annotations from docker import APIClient, DockerClient from docker.transport import SSHHTTPAdapter -from paramiko import AutoAddPolicy, SSHClient + +# TODO: A workaround to supress the annoying paramiko +# warnings which fail bash lib tests - core #1260 +from warnings import catch_warnings +from cryptography.utils import CryptographyDeprecationWarning +with catch_warnings(action="ignore", category=CryptographyDeprecationWarning): + from paramiko import AutoAddPolicy, SSHClient class CustomDockerClient(DockerClient): From 64d5abb195f6d95743c71059b17f116d2db989e9 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Mon, 29 Jul 2024 15:36:23 +0200 Subject: [PATCH 102/111] set paramiko logging to INFO --- src/ocrd_utils/ocrd_logging.conf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ocrd_utils/ocrd_logging.conf b/src/ocrd_utils/ocrd_logging.conf index 60925acae8..5cf161398e 100644 --- a/src/ocrd_utils/ocrd_logging.conf +++ b/src/ocrd_utils/ocrd_logging.conf @@ -95,13 +95,13 @@ qualname=PIL # paramiko loggers # [logger_paramiko] -level=ERROR +level=INFO handlers=consoleHandler qualname=paramiko propagate=0 [logger_paramiko_transport] -level=ERROR +level=INFO handlers=consoleHandler qualname=paramiko.transport propagate=0 From 52a099e90c1ba759dc88fe96ed41b09e4a9e3f6a Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Mon, 29 Jul 2024 15:49:32 +0200 Subject: [PATCH 103/111] revert, and just use < v43.0.0 --- requirements.txt | 2 +- requirements_test.txt | 2 +- 
src/ocrd_network/runtime_data/connection_clients.py | 8 +------- 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/requirements.txt b/requirements.txt index f748a06057..ed5fd56d59 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ atomicwrites >= 1.3.0 beanie~=1.7 click >=7 -cryptography >= 43.0.0 +cryptography < 43.0.0 Deprecated == 1.2.0 docker fastapi>=0.78.0 diff --git a/requirements_test.txt b/requirements_test.txt index be2ba65bca..d8cef1dae7 100644 --- a/requirements_test.txt +++ b/requirements_test.txt @@ -1,5 +1,5 @@ autopep8 -cryptography >= 43.0.0 +cryptography < 43.0.0 pytest >= 4.0.0 generateDS == 2.35.20 pytest-benchmark >= 3.2.3 diff --git a/src/ocrd_network/runtime_data/connection_clients.py b/src/ocrd_network/runtime_data/connection_clients.py index ab2e48b144..67002a498f 100644 --- a/src/ocrd_network/runtime_data/connection_clients.py +++ b/src/ocrd_network/runtime_data/connection_clients.py @@ -1,13 +1,7 @@ from __future__ import annotations from docker import APIClient, DockerClient from docker.transport import SSHHTTPAdapter - -# TODO: A workaround to supress the annoying paramiko -# warnings which fail bash lib tests - core #1260 -from warnings import catch_warnings -from cryptography.utils import CryptographyDeprecationWarning -with catch_warnings(action="ignore", category=CryptographyDeprecationWarning): - from paramiko import AutoAddPolicy, SSHClient +from paramiko import AutoAddPolicy, SSHClient class CustomDockerClient(DockerClient): From c7e380014d12c36b39b7043246005dcc6d86009c Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 30 Jul 2024 15:02:46 +0200 Subject: [PATCH 104/111] remove OCRD_ALL_TOOL_JSON_URL Co-authored-by: Konstantin Baierer --- src/ocrd_network/constants.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/ocrd_network/constants.py b/src/ocrd_network/constants.py index f3d2de1247..bfa137d9e0 100644 --- a/src/ocrd_network/constants.py +++ b/src/ocrd_network/constants.py @@ -7,7 +7,6 @@ NETWORK_PROTOCOLS = ["http://", "https://"] OCRD_ALL_TOOL_JSON = "ocrd-all-tool.json" -OCRD_ALL_TOOL_JSON_URL = "https://ocr-d.de/js/ocrd-all-tool.json" # Used as a placeholder to lock all pages when no page_id is specified SERVER_ALL_PAGES_PLACEHOLDER = "all_pages" From 3d47640fb7e5a078e1a042640c9966bb437404f3 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 30 Jul 2024 15:08:58 +0200 Subject: [PATCH 105/111] remove: OCRD_ALL_TOOL_JSON_URL import --- src/ocrd_network/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd_network/utils.py b/src/ocrd_network/utils.py index babd576956..a2f563de43 100644 --- a/src/ocrd_network/utils.py +++ b/src/ocrd_network/utils.py @@ -16,7 +16,7 @@ from ocrd.workspace import Workspace from ocrd.mets_server import MpxReq from ocrd_utils import config, generate_range, REGEX_PREFIX, safe_filename, getLogger, resource_string -from .constants import OCRD_ALL_TOOL_JSON, OCRD_ALL_TOOL_JSON_URL +from .constants import OCRD_ALL_TOOL_JSON from .rabbitmq_utils import OcrdResultMessage From 507c1ea858d72dd6626b933528d4de5d8e05d99f Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 1 Aug 2024 13:27:03 +0200 Subject: [PATCH 106/111] :memo: changelog --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index fcd01d11b3..14783f8376 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,10 @@ Versioned according to [Semantic Versioning](http://semver.org/). 
## Unreleased +Changed: + + * ocrd_network: Use `ocrd-all-tool.json` bundled by core instead of download from website, #1257, #1260 + ## [2.67.2] - 2024-07-19 Fixed: From 54913fbc1999c02f126a606f265d93e0f6200779 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Thu, 1 Aug 2024 21:16:48 +0200 Subject: [PATCH 107/111] workspace find: print correct output fields fixes #1202 and a regression that printed a removal notice instead of the requested output fields when undoing downloads --- src/ocrd/cli/workspace.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/ocrd/cli/workspace.py b/src/ocrd/cli/workspace.py index 4c262ec48d..0c70fd3a36 100644 --- a/src/ocrd/cli/workspace.py +++ b/src/ocrd/cli/workspace.py @@ -467,19 +467,18 @@ def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, incl include_fileGrp=include_fileGrp, exclude_fileGrp=exclude_fileGrp, ): - ret_entry = [f.ID if field == 'pageId' else str(getattr(f, field)) or '' for field in output_field] if download and not f.local_filename: workspace.download_file(f) modified_mets = True if wait: time.sleep(wait) if undo_download and f.url and f.local_filename: - ret_entry = [f'Removed local_filename {f.local_filename}'] f.local_filename = None modified_mets = True if not keep_files: ctx.log.debug("rm %s [cwd=%s]", f.local_filename, workspace.directory) unlink(f.local_filename) + ret_entry = [f.ID if field == 'pageId' else str(getattr(f, field)) or '' for field in output_field] ret.append(ret_entry) if modified_mets: workspace.save_mets() From c2e563074868c78d0390c71ccfc697f403b99bcb Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 6 Aug 2024 13:18:15 +0200 Subject: [PATCH 108/111] resolver.download_to_directory: log all if_exists cases on file existing --- src/ocrd/resolver.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/ocrd/resolver.py b/src/ocrd/resolver.py index fa98c82d0e..132aa73df2 100644 --- a/src/ocrd/resolver.py +++ b/src/ocrd/resolver.py @@ -95,12 +95,15 @@ def download_to_directory(self, directory, url, basename=None, if_exists='skip', log.debug("Stop early, src_path and dst_path are the same: '%s' (url: '%s')" % (src_path, url)) return str(ret) - # Respect 'if_exists' arg + # Respect 'if_exists' kwarg if dst_path.exists(): if if_exists == 'skip': + log.debug(f"File already exists but if_exists == {if_exists}, skipping.") return str(ret) - if if_exists == 'raise': - raise FileExistsError(f"File already exists and if_exists == 'raise': {dst_path}") + elif if_exists == 'raise': + raise FileExistsError(f"File already exists and if_exists == '{if_exists}': {dst_path}") + else: + log.debug(f"File already exists but if_exists == {if_exists}, overwriting.") # Create dst_path parent dir dst_path.parent.mkdir(parents=True, exist_ok=True) From 1c8b9db05431d9a52e4763b9efcf4f123e3d65c8 Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 6 Aug 2024 13:51:13 +0200 Subject: [PATCH 109/111] resolver.workspace_from_url: raise FileExistsError if mets.xml exists and not clobber_mets, fix #563 --- src/ocrd/resolver.py | 2 +- tests/test_resolver.py | 17 +++++++++++------ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/src/ocrd/resolver.py b/src/ocrd/resolver.py index 132aa73df2..124d006927 100644 --- a/src/ocrd/resolver.py +++ b/src/ocrd/resolver.py @@ -221,7 +221,7 @@ def workspace_from_url( log.debug("workspace_from_url\nmets_basename='%s'\nmets_url='%s'\nsrc_baseurl='%s'\ndst_dir='%s'", mets_basename, mets_url, 
src_baseurl, dst_dir) - self.download_to_directory(dst_dir, mets_url, basename=mets_basename, if_exists='overwrite' if clobber_mets else 'skip') + self.download_to_directory(dst_dir, mets_url, basename=mets_basename, if_exists='overwrite' if clobber_mets else 'raise') workspace = Workspace(self, dst_dir, mets_basename=mets_basename, baseurl=src_baseurl, mets_server_url=mets_server_url) diff --git a/tests/test_resolver.py b/tests/test_resolver.py index abcf69257b..16dfd03d56 100644 --- a/tests/test_resolver.py +++ b/tests/test_resolver.py @@ -117,6 +117,9 @@ def test_workspace_from_url_kant_with_resources(mock_request, tmp_path): @patch.object(Session, "get") def test_workspace_from_url_kant_with_resources_existing_local(mock_request, tmp_path): + """ + Fail with clobber_mets=False, succeed with clobber_mets=True + """ # arrange url_src = 'https://raw.githubusercontent.com/OCR-D/assets/master/data/kant_aufklaerung_1784/data/mets.xml' @@ -127,12 +130,14 @@ def test_workspace_from_url_kant_with_resources_existing_local(mock_request, tmp dst_mets = Path(dst_dir, 'mets.xml') shutil.copyfile(src_mets, dst_mets) - # act - Resolver().workspace_from_url(url_src, clobber_mets=False, dst_dir=dst_dir) + # fail + with pytest.raises(FileExistsError) as exc: + Resolver().workspace_from_url(url_src, clobber_mets=False, dst_dir=dst_dir) + assert mock_request.call_count == 0 - # assert - # no real request was made, since mets already present - assert mock_request.call_count == 0 + # succeed + Resolver().workspace_from_url(url_src, clobber_mets=True, dst_dir=dst_dir) + assert mock_request.call_count == 1 @patch.object(Session, "get") @@ -229,7 +234,7 @@ def test_workspace_from_nothing_noclobber(tmp_path): ws2 = Resolver().workspace_from_nothing(tmp_path) assert ws2.directory == tmp_path - with pytest.raises(Exception) as exc: + with pytest.raises(FileExistsError) as exc: Resolver().workspace_from_nothing(tmp_path) # assert From ea2e06d5ded2ad5f99f1823ed1dd6776605c657e Mon Sep 17 00:00:00 2001 From: kba Date: Wed, 7 Aug 2024 16:56:42 +0200 Subject: [PATCH 110/111] :memo: changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 14783f8376..d001568686 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ Versioned according to [Semantic Versioning](http://semver.org/). Changed: * ocrd_network: Use `ocrd-all-tool.json` bundled by core instead of download from website, #1257, #1260 + * `ocrd workspace clone`/`Resolver.workspace_from_url`: with `clobber_mets=False`, raise a FileExistsError for existing mets.xml on disk, #563, #1268 ## [2.67.2] - 2024-07-19 From 6041785b36ea603da6da205b28a50e4009011b58 Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 13 Aug 2024 10:25:55 +0200 Subject: [PATCH 111/111] :memo: changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index d001568686..dd816a3545 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ Changed: * ocrd_network: Use `ocrd-all-tool.json` bundled by core instead of download from website, #1257, #1260 * `ocrd workspace clone`/`Resolver.workspace_from_url`: with `clobber_mets=False`, raise a FileExistsError for existing mets.xml on disk, #563, #1268 + * `ocrd workspace find --download`: print the correct, up-to-date field, not `None`, #1202, #1266