From 5908f4786c2d67086b318de320d7531bdbc191f0 Mon Sep 17 00:00:00 2001 From: git-hyagi <45576767+git-hyagi@users.noreply.github.com> Date: Wed, 19 Jun 2024 16:44:30 -0300 Subject: [PATCH] Add includes/excludes fields to filter remotes This commit adds a feature to refine (include only and/or remove unwanted) remote repositories pulled during pull-through caching. closes: #459 --- CHANGES/459.feature | 2 + .../migrations/0040_add_remote_repo_filter.py | 24 ++++++ pulp_container/app/models.py | 5 ++ pulp_container/app/registry_api.py | 36 +++++---- pulp_container/app/serializers.py | 26 +++++- pulp_container/app/tasks/sync_stages.py | 28 +------ pulp_container/app/utils.py | 35 +++++++- .../functional/api/test_pull_through_cache.py | 20 +---- .../api/test_remote_filter_pull_through.py | 80 +++++++++++++++++++ pulp_container/tests/functional/conftest.py | 25 ++++++ .../admin/guides/pull-through-caching.md | 33 ++++++++ 11 files changed, 255 insertions(+), 59 deletions(-) create mode 100644 CHANGES/459.feature create mode 100644 pulp_container/app/migrations/0040_add_remote_repo_filter.py create mode 100644 pulp_container/tests/functional/api/test_remote_filter_pull_through.py diff --git a/CHANGES/459.feature b/CHANGES/459.feature new file mode 100644 index 000000000..3c6260473 --- /dev/null +++ b/CHANGES/459.feature @@ -0,0 +1,2 @@ +Added support for filtering remote repositories in pull-through caching using `includes` and +`excludes` fields. These fields can be set on pull-through caching remote objects. diff --git a/pulp_container/app/migrations/0040_add_remote_repo_filter.py b/pulp_container/app/migrations/0040_add_remote_repo_filter.py new file mode 100644 index 000000000..7e5dea836 --- /dev/null +++ b/pulp_container/app/migrations/0040_add_remote_repo_filter.py @@ -0,0 +1,24 @@ +# Generated by Django 4.2.13 on 2024-06-28 10:34 + +import django.contrib.postgres.fields +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('container', '0039_manifest_data'), + ] + + operations = [ + migrations.AddField( + model_name='containerpullthroughremote', + name='excludes', + field=django.contrib.postgres.fields.ArrayField(base_field=models.TextField(null=True), null=True, size=None), + ), + migrations.AddField( + model_name='containerpullthroughremote', + name='includes', + field=django.contrib.postgres.fields.ArrayField(base_field=models.TextField(null=True), null=True, size=None), + ), + ] diff --git a/pulp_container/app/models.py b/pulp_container/app/models.py index 8e7695c65..76efabb15 100644 --- a/pulp_container/app/models.py +++ b/pulp_container/app/models.py @@ -503,6 +503,11 @@ class ContainerPullThroughRemote(Remote, AutoAddObjPermsMixin): from within a single instance of this remote. """ + TYPE = "pull-through" + + includes = fields.ArrayField(models.TextField(null=True), null=True) + excludes = fields.ArrayField(models.TextField(null=True), null=True) + class Meta: default_related_name = "%(app_label)s_%(model_name)s" permissions = [ diff --git a/pulp_container/app/registry_api.py b/pulp_container/app/registry_api.py index e32338b13..682c7cb2e 100644 --- a/pulp_container/app/registry_api.py +++ b/pulp_container/app/registry_api.py @@ -21,6 +21,7 @@ from django.core.files.base import ContentFile, File from django.db import IntegrityError, transaction from django.db.models import F, Value +from django.forms.models import model_to_dict from django.shortcuts import get_object_or_404 from django.conf import settings @@ -82,6 +83,7 @@ from pulp_container.app.utils import ( determine_media_type, extract_data_from_signature, + filter_resource, has_task_completed, validate_manifest, ) @@ -97,13 +99,15 @@ log = logging.getLogger(__name__) IGNORED_PULL_THROUGH_REMOTE_ATTRIBUTES = [ - "remote_ptr_id", + "remote_ptr", "pulp_type", "pulp_last_updated", "pulp_created", "pulp_id", - "url", "name", + "includes", + "excludes", + "pulp_domain", ] @@ -309,18 +313,29 @@ def get_pull_through_drv(self, path): if not pull_through_cache_distribution: raise RepositoryNotFound(name=path) + upstream_name = path.split(pull_through_cache_distribution.base_path, maxsplit=1)[1].strip( + "/" + ) + pull_through_remote = models.ContainerPullThroughRemote.objects.get( + pk=pull_through_cache_distribution.remote_id + ) + if not filter_resource( + upstream_name, pull_through_remote.includes, pull_through_remote.excludes + ): + raise RepositoryNotFound(name=path) + try: with transaction.atomic(): repository, _ = models.ContainerRepository.objects.get_or_create( name=path, retain_repo_versions=1 ) - remote_data = _get_pull_through_remote_data(pull_through_cache_distribution) - upstream_name = path.split(pull_through_cache_distribution.base_path, maxsplit=1)[1] + remote_data = model_to_dict( + pull_through_remote, exclude=IGNORED_PULL_THROUGH_REMOTE_ATTRIBUTES + ) remote, _ = models.ContainerRemote.objects.get_or_create( name=path, - upstream_name=upstream_name.strip("/"), - url=pull_through_cache_distribution.remote.url, + upstream_name=upstream_name, **remote_data, ) @@ -389,15 +404,6 @@ def create_dr(self, path, request): return distribution, repository -def _get_pull_through_remote_data(root_cache_distribution): - remote_data = models.ContainerPullThroughRemote.objects.filter( - pk=root_cache_distribution.remote_id - ).values()[0] - for attr in IGNORED_PULL_THROUGH_REMOTE_ATTRIBUTES: - remote_data.pop(attr, None) - return remote_data - - class BearerTokenView(APIView): """ Hand out anonymous or authenticated bearer tokens. diff --git a/pulp_container/app/serializers.py b/pulp_container/app/serializers.py index 3cac522bd..d2922bf18 100644 --- a/pulp_container/app/serializers.py +++ b/pulp_container/app/serializers.py @@ -307,9 +307,33 @@ class ContainerPullThroughRemoteSerializer(RemoteSerializer): """ policy = serializers.ChoiceField(choices=[Remote.ON_DEMAND], default=Remote.ON_DEMAND) + includes = serializers.ListField( + child=serializers.CharField(max_length=255), + allow_null=True, + required=False, + help_text=_( + """ + A list of remotes to include during pull-through caching. + Wildcards *, ? are recognized. + 'includes' is evaluated before 'excludes'. + """ + ), + ) + excludes = serializers.ListField( + child=serializers.CharField(max_length=255), + allow_null=True, + required=False, + help_text=_( + """ + A list of remotes to exclude during pull-through caching. + Wildcards *, ? are recognized. + 'excludes' is evaluated after 'includes'. + """ + ), + ) class Meta: - fields = RemoteSerializer.Meta.fields + fields = RemoteSerializer.Meta.fields + ("includes", "excludes") model = models.ContainerPullThroughRemote diff --git a/pulp_container/app/tasks/sync_stages.py b/pulp_container/app/tasks/sync_stages.py index 05530bc87..ef7ec4c9f 100644 --- a/pulp_container/app/tasks/sync_stages.py +++ b/pulp_container/app/tasks/sync_stages.py @@ -1,7 +1,6 @@ import aiohttp import asyncio import base64 -import fnmatch import hashlib import json import logging @@ -34,6 +33,7 @@ determine_media_type, validate_manifest, calculate_digest, + filter_resources, get_content_data, ) @@ -118,7 +118,9 @@ async def run(self): repo_name = self.remote.namespaced_upstream_name tag_list_url = "/v2/{name}/tags/list".format(name=repo_name) tag_list = await self.get_paginated_tag_list(tag_list_url, repo_name) - tag_list = self.filter_tags(tag_list) + tag_list = filter_resources( + tag_list, self.remote.include_tags, self.remote.exclude_tags + ) await pb.aincrement() for tag_name in tag_list: @@ -303,28 +305,6 @@ async def resolve_flush(self): await self.put(signature_dc) self.signature_dcs.clear() - def filter_tags(self, tag_list): - """ - Filter tags by a list of included and excluded tags. - """ - include_tags = self.remote.include_tags - if include_tags: - tag_list = [ - tag - for tag in tag_list - if any(fnmatch.fnmatch(tag, pattern) for pattern in include_tags) - ] - - exclude_tags = self.remote.exclude_tags - if exclude_tags: - tag_list = [ - tag - for tag in tag_list - if not any(fnmatch.fnmatch(tag, pattern) for pattern in exclude_tags) - ] - - return tag_list - async def get_paginated_tag_list(self, rel_link, repo_name): """ Handle registries that have pagination enabled. diff --git a/pulp_container/app/utils.py b/pulp_container/app/utils.py index 1554a2c27..ed038fa0e 100644 --- a/pulp_container/app/utils.py +++ b/pulp_container/app/utils.py @@ -1,5 +1,6 @@ import base64 import hashlib +import fnmatch import re import subprocess import gnupg @@ -11,11 +12,15 @@ from jsonschema import Draft7Validator, validate, ValidationError from django.core.files.storage import default_storage as storage from django.db import IntegrityError +from functools import partial from rest_framework.exceptions import Throttled from pulpcore.plugin.models import Artifact, Task -from pulp_container.constants import MANIFEST_MEDIA_TYPES, MEDIA_TYPE +from pulp_container.constants import ( + MANIFEST_MEDIA_TYPES, + MEDIA_TYPE, +) from pulp_container.app.exceptions import ManifestInvalid from pulp_container.app.json_schemas import ( OCI_INDEX_SCHEMA, @@ -309,3 +314,31 @@ def get_content_data(saved_artifact): raw_data = file.read() content_data = json.loads(raw_data) return content_data, raw_data + + +def include(x, patterns): + return any(fnmatch.fnmatch(x, pattern) for pattern in patterns) + + +def exclude(x, patterns): + return not include(x, patterns) + + +def filter_resource(element, include_patterns, exclude_patterns): + """ + Returns true if element matches {include,exclude}_patterns filters. + """ + if not (include_patterns or exclude_patterns): + return True + return include(element, include_patterns or []) and exclude(element, exclude_patterns or []) + + +def filter_resources(element_list, include_patterns, exclude_patterns): + """ + Returns a list of elements based on filter parameters ({include,exclude}_patterns). + """ + if include_patterns: + element_list = filter(partial(include, patterns=include_patterns), element_list) + if exclude_patterns: + element_list = filter(partial(exclude, patterns=exclude_patterns), element_list) + return list(element_list) diff --git a/pulp_container/tests/functional/api/test_pull_through_cache.py b/pulp_container/tests/functional/api/test_pull_through_cache.py index 6a623d72d..b3bcd2a19 100644 --- a/pulp_container/tests/functional/api/test_pull_through_cache.py +++ b/pulp_container/tests/functional/api/test_pull_through_cache.py @@ -6,29 +6,11 @@ from pulp_container.tests.functional.constants import ( REGISTRY_V2, - REGISTRY_V2_FEED_URL, PULP_HELLO_WORLD_REPO, PULP_FIXTURE_1, ) -@pytest.fixture -def pull_through_distribution( - gen_object_with_cleanup, - container_pull_through_remote_api, - container_pull_through_distribution_api, -): - remote = gen_object_with_cleanup( - container_pull_through_remote_api, - {"name": str(uuid4()), "url": REGISTRY_V2_FEED_URL}, - ) - distribution = gen_object_with_cleanup( - container_pull_through_distribution_api, - {"name": str(uuid4()), "base_path": str(uuid4()), "remote": remote.pulp_href}, - ) - return distribution - - @pytest.fixture def pull_and_verify( add_to_cleanup, @@ -42,6 +24,7 @@ def pull_and_verify( ): def _pull_and_verify(images, pull_through_distribution): tags_to_verify = [] + pull_through_distribution = pull_through_distribution() for version, image_path in enumerate(images, start=1): remote_image_path = f"{REGISTRY_V2}/{image_path}" local_image_path = f"{pull_through_distribution.base_path}/{image_path}" @@ -113,6 +96,7 @@ def test_conflicting_names_and_paths( local_registry, monitor_task, ): + pull_through_distribution = pull_through_distribution() local_image_path = f"{pull_through_distribution.base_path}/{str(uuid4())}" remote = container_remote_factory(name=local_image_path) diff --git a/pulp_container/tests/functional/api/test_remote_filter_pull_through.py b/pulp_container/tests/functional/api/test_remote_filter_pull_through.py new file mode 100644 index 000000000..7c09f8797 --- /dev/null +++ b/pulp_container/tests/functional/api/test_remote_filter_pull_through.py @@ -0,0 +1,80 @@ +import pytest +import subprocess + +from pulp_container.tests.functional.constants import ( + REGISTRY_V2, + PULP_HELLO_WORLD_REPO, + PULP_FIXTURE_1, +) + + +@pytest.fixture +def pull_and_verify( + capfd, + local_registry, + registry_client, +): + def _pull_and_verify(images, pull_through_distribution, includes, excludes, expected): + distr = pull_through_distribution(includes, excludes) + for image_path in images: + remote_image_path = f"{REGISTRY_V2}/{image_path}" + local_image_path = f"{distr.base_path}/{image_path}" + + if image_path not in expected: + with pytest.raises(subprocess.CalledProcessError): + local_registry.pull(local_image_path) + assert "Repository not found" in capfd.readouterr().err + else: + local_registry.pull(local_image_path) + local_image = local_registry.inspect(local_image_path) + registry_client.pull(remote_image_path) + remote_image = registry_client.inspect(remote_image_path) + assert local_image[0]["Id"] == remote_image[0]["Id"] + + return _pull_and_verify + + +@pytest.mark.parametrize( + "images, includes, excludes, expected", + [ + ( + [f"{PULP_FIXTURE_1}:manifest_a", f"{PULP_FIXTURE_1}:manifest_b"], + None, + [], + [f"{PULP_FIXTURE_1}:manifest_a", f"{PULP_FIXTURE_1}:manifest_b"], + ), + ([f"{PULP_FIXTURE_1}:manifest_a", f"{PULP_FIXTURE_1}:manifest_b"], [], ["pulp*"], []), + ( + [f"{PULP_FIXTURE_1}:manifest_a", f"{PULP_FIXTURE_1}:manifest_b"], + [], + ["pulp/test-fixture-1"], + [], + ), + ( + [ + f"{PULP_FIXTURE_1}:manifest_a", + f"{PULP_FIXTURE_1}:manifest_b", + f"{PULP_HELLO_WORLD_REPO}:linux", + ], + ["*hello*"], + ["*fixture*"], + [f"{PULP_HELLO_WORLD_REPO}:linux"], + ), + ( + ["custom_namespace/custom_repo:latest"], + ["*pulp*"], + None, + [], + ), + ], +) +def test_includes_excludes_filter( + images, + includes, + excludes, + expected, + pull_through_distribution, + pull_and_verify, + delete_orphans_pre, +): + pull_and_verify(images, pull_through_distribution, includes, excludes, expected) diff --git a/pulp_container/tests/functional/conftest.py b/pulp_container/tests/functional/conftest.py index 9a56335f7..16f07d1f5 100644 --- a/pulp_container/tests/functional/conftest.py +++ b/pulp_container/tests/functional/conftest.py @@ -428,3 +428,28 @@ def _sync(repo, remote=None): return monitor_task(sync_response.task) return _sync + + +@pytest.fixture +def pull_through_distribution( + gen_object_with_cleanup, + container_pull_through_remote_api, + container_pull_through_distribution_api, +): + def _pull_through_distribution(includes=None, excludes=None): + remote = gen_object_with_cleanup( + container_pull_through_remote_api, + { + "name": str(uuid4()), + "url": REGISTRY_V2_FEED_URL, + "includes": includes, + "excludes": excludes, + }, + ) + distribution = gen_object_with_cleanup( + container_pull_through_distribution_api, + {"name": str(uuid4()), "base_path": str(uuid4()), "remote": remote.pulp_href}, + ) + return distribution + + return _pull_through_distribution diff --git a/staging_docs/admin/guides/pull-through-caching.md b/staging_docs/admin/guides/pull-through-caching.md index 91efde935..c2772d20c 100644 --- a/staging_docs/admin/guides/pull-through-caching.md +++ b/staging_docs/admin/guides/pull-through-caching.md @@ -43,3 +43,36 @@ ensures a more reliable container deployment system in production environments. generate a new repository version that incorporates both the "10" and "11" tags, automatically removing the previous version. Repositories and their content remain manageable through standard Pulp API endpoints. The repositories are read-only and public by default. + + +### Filtering the repositories + +It is possible to use the includes/excludes fields to set a list of upstream repositories that Pulp +will be able to pull from. + +``` +# define a pull-through remote with the includes/excludes fields +REMOTE_HREF=$(http ${BASE_ADDR}/pulp/api/v3/remotes/container/pull-through/ name=docker-cache url=https://registry-1.docker.io includes=["*pulp*"] excludes=["*molecule_debian*"] | jq -r ".pulp_href") + +# create a pull-through distribution linked to the initialized remote +http ${BASE_ADDR}/pulp/api/v3/distributions/container/pull-through/ remote=${REMOTE_HREF} name=docker-cache base_path=docker-cache +``` + +Pulling allowed content: + +``` +podman pull localhost:24817/docker-cache/pulp/test-fixture-1:manifest_a +``` + +Pulling from a repository that includes *molecule_debian* in its name will fail because it is filtered by the *excludes* definition: +``` +podman pull localhost:24817/docker-cache/pulp/molecule_debian11 +Error response from daemon: repository localhost:24817/docker-cache/pulp/molecule_debian11 not found: name unknown: Repository not found. +``` + +Since only repositories with *pulp* in their names are included (`includes=["*pulp*"]`), the following image pull will also fail: + +``` +podman pull localhost:24817/docker-cache/library/hello-world +Error response from daemon: repository localhost:24817/docker-cache/library/hello-world not found: name unknown: Repository not found. +```