From e0baf65fd2dbd50243aec16e4fcc82e14957aa07 Mon Sep 17 00:00:00 2001 From: David Ormsbee Date: Thu, 26 Sep 2024 22:26:44 -0400 Subject: [PATCH 1/2] perf: reduce db calls for ComponentVersion after lookup by UUID When you fetch a single ComponentVersion, you are often going to want to pull data related to its Component, and possibly the LearningPackage it belongs to. This commit adds a select_related call to eliminate the extra roundtrips for this information. --- .../apps/authoring/components/api.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/openedx_learning/apps/authoring/components/api.py b/openedx_learning/apps/authoring/components/api.py index a65eee80..1355a2ff 100644 --- a/openedx_learning/apps/authoring/components/api.py +++ b/openedx_learning/apps/authoring/components/api.py @@ -269,7 +269,15 @@ def get_component_by_uuid(uuid: UUID) -> Component: def get_component_version_by_uuid(uuid: UUID) -> ComponentVersion: - return ComponentVersion.objects.get(publishable_entity_version__uuid=uuid) + return ( + ComponentVersion + .objects + .select_related( + "component", + "component__learning_package", + ) + .get(publishable_entity_version__uuid=uuid) + ) def component_exists_by_key( @@ -510,7 +518,12 @@ def _error_header(error: AssetError) -> dict[str, str]: # Check: Does the ComponentVersion exist? try: - component_version = get_component_version_by_uuid(component_version_uuid) + component_version = ( + ComponentVersion + .objects + .select_related("component", "component__learning_package") + .get(publishable_entity_version__uuid=component_version_uuid) + ) except ComponentVersion.DoesNotExist: # No need to add headers here, because no ComponentVersion was found. logger.error(f"Asset Not Found: No ComponentVersion with UUID {component_version_uuid}") From fd83506fe9c49a728c1d14aa07670b1627a8f852 Mon Sep 17 00:00:00 2001 From: David Ormsbee Date: Thu, 26 Sep 2024 23:06:34 -0400 Subject: [PATCH 2/2] feat!: require OPENEDX_LEARNING setting for Content storage WARNING: This commit will break any file-backed Content entries you currently have. Though you will likely only have them if you've used the very recently created add_assets_to_component command. This commit switches us away from using the default_storage and requires the project to set an OPENEDX_LEARNING settings dictionary with a MEDIA key like: OPENEDX_LEARNING = { 'MEDIA': { "BACKEND": "django.core.files.storage.FileSystemStorage", "OPTIONS": { "location": "/openedx/media-private/openedx-learning", } } } If no such setting is present, Content operations that require file access will raise a django.core.exceptions.ImproperlyConfigured error. In addition to that, Content files will be stored in a "content" subdirectory of the specified media location. This is to allow us more flexibility as we start to use file storage for other things like the import/export process. We need to have a separate space for Learning Core media files because these files should NOT be directly accessible via the browser (as the MEDIA_ROOT is). This is because of access policies and the fact that the filenames will not be meaningful by themselves and must be translated by app logic. For details, please see: * docs/decisions/0015-serving-static-assets.rst * openedx_learning/apps/authoring/contents/models.py * openedx_learning/apps/authoring/components/models.py Hiding these files in a new location also required changes to the Django admin, which are included here. This commit also adds a little extra to the admin to make it easier to map Component assets to actual files on disk. This commit also adds the following helpers to the Content model: * read_file(): return opened Django File object for the Content. * os_path(): get the full OS path to the stored file, if available. * path(): get the logical path for this asset relative to our configured OPENEDX_LEARNING['MEDIA'] storage root. This commit also auto-strips file paths starting with '/'. Allowing absolute paths tripped me up multiple times as I was setting up my test data, and I don't think there's any advantage to allowing inconsistencies (e.g. "/static/foo.png" and "static/foo.png"). We now force the relative form ("static/foo.png"). --- openedx_learning/__init__.py | 2 +- .../apps/authoring/components/admin.py | 42 ++++------ .../apps/authoring/components/api.py | 17 +++- .../apps/authoring/contents/admin.py | 39 ++++++--- .../apps/authoring/contents/models.py | 81 ++++++++++++++----- test_settings.py | 11 +++ .../apps/authoring/components/test_api.py | 20 ++++- .../apps/authoring/components/test_assets.py | 2 +- .../authoring/contents/test_file_storage.py | 79 ++++++++++++++++++ 9 files changed, 232 insertions(+), 61 deletions(-) create mode 100644 tests/openedx_learning/apps/authoring/contents/test_file_storage.py diff --git a/openedx_learning/__init__.py b/openedx_learning/__init__.py index 5bf29b11..7fde125a 100644 --- a/openedx_learning/__init__.py +++ b/openedx_learning/__init__.py @@ -1,4 +1,4 @@ """ Open edX Learning ("Learning Core"). """ -__version__ = "0.13.1" +__version__ = "0.14.0" diff --git a/openedx_learning/apps/authoring/components/admin.py b/openedx_learning/apps/authoring/components/admin.py index f09483e9..edb83921 100644 --- a/openedx_learning/apps/authoring/components/admin.py +++ b/openedx_learning/apps/authoring/components/admin.py @@ -1,6 +1,8 @@ """ Django admin for components models """ +import base64 + from django.contrib import admin from django.template.defaultfilters import filesizeformat from django.urls import reverse @@ -67,19 +69,22 @@ def get_queryset(self, request): ) fields = [ - "format_key", + "key", "format_size", "learner_downloadable", "rendered_data", ] readonly_fields = [ "content", - "format_key", + "key", "format_size", "rendered_data", ] extra = 0 + def has_file(self, cvc_obj): + return cvc_obj.content.has_file + def rendered_data(self, cvc_obj): return content_preview(cvc_obj) @@ -87,15 +92,6 @@ def rendered_data(self, cvc_obj): def format_size(self, cvc_obj): return filesizeformat(cvc_obj.content.size) - @admin.display(description="Key") - def format_key(self, cvc_obj): - return format_html( - '{}', - link_for_cvc(cvc_obj), - # reverse("admin:components_content_change", args=(cvc_obj.content_id,)), - cvc_obj.key, - ) - @admin.register(ComponentVersion) class ComponentVersionAdmin(ReadOnlyModelAdmin): @@ -129,18 +125,6 @@ def get_queryset(self, request): ) -def link_for_cvc(cvc_obj: ComponentVersionContent) -> str: - """ - Get the download URL for the given ComponentVersionContent instance - """ - return "/media_server/component_asset/{}/{}/{}/{}".format( - cvc_obj.content.learning_package.key, - cvc_obj.component_version.component.key, - cvc_obj.component_version.version_num, - cvc_obj.key, - ) - - def format_text_for_admin_display(text: str) -> SafeText: """ Get the HTML to display the given plain text (preserving formatting) @@ -158,9 +142,17 @@ def content_preview(cvc_obj: ComponentVersionContent) -> SafeText: content_obj = cvc_obj.content if content_obj.media_type.type == "image": + # This base64 encoding looks really goofy and is bad for performance, + # but image previews in the admin are extremely useful, and this lets us + # have them without creating a separate view in Learning Core. (Keep in + # mind that these assets are private, so they cannot be accessed via the + # MEDIA_URL like most Django uploaded assets.) + data = content_obj.read_file().read() return format_html( - '', - content_obj.file_url(), + '
{}
', + content_obj.mime_type, + base64.encodebytes(data).decode('utf8'), + content_obj.os_path(), ) return format_text_for_admin_display( diff --git a/openedx_learning/apps/authoring/components/api.py b/openedx_learning/apps/authoring/components/api.py index 1355a2ff..3be562a8 100644 --- a/openedx_learning/apps/authoring/components/api.py +++ b/openedx_learning/apps/authoring/components/api.py @@ -403,7 +403,21 @@ def create_component_version_content( ) -> ComponentVersionContent: """ Add a Content to the given ComponentVersion + + We don't allow keys that would be absolute paths, e.g. ones that start with + '/'. Storing these causes headaches with building relative paths and because + of mismatches with things that expect a leading slash and those that don't. + So for safety and consistency, we strip off leading slashes and emit a + warning when we do. """ + if key.startswith('/'): + logger.warning( + "Absolute paths are not supported: " + f"removed leading '/' from ComponentVersion {component_version_id} " + f"content key: {repr(key)} (content_id: {content_id})" + ) + key = key.lstrip('/') + cvrc, _created = ComponentVersionContent.objects.get_or_create( component_version_id=component_version_id, content_id=content_id, @@ -580,10 +594,9 @@ def _error_header(error: AssetError) -> dict[str, str]: # At this point, we know that there is valid Content that we want to send. # This adds Content-level headers, like the hash/etag and content type. info_headers.update(contents_api.get_content_info_headers(content)) - stored_file_path = content.file_path() # Recompute redirect headers (reminder: this should never be cached). - redirect_headers = contents_api.get_redirect_headers(stored_file_path, public) + redirect_headers = contents_api.get_redirect_headers(content.path, public) logger.info( "Asset redirect (uncached metadata): " f"{component_version_uuid}/{asset_path} -> {redirect_headers}" diff --git a/openedx_learning/apps/authoring/contents/admin.py b/openedx_learning/apps/authoring/contents/admin.py index 036d171b..029ebfd8 100644 --- a/openedx_learning/apps/authoring/contents/admin.py +++ b/openedx_learning/apps/authoring/contents/admin.py @@ -1,6 +1,8 @@ """ Django admin for contents models """ +import base64 + from django.contrib import admin from django.utils.html import format_html @@ -16,7 +18,6 @@ class ContentAdmin(ReadOnlyModelAdmin): """ list_display = [ "hash_digest", - "file_link", "learning_package", "media_type", "size", @@ -29,24 +30,42 @@ class ContentAdmin(ReadOnlyModelAdmin): "media_type", "size", "created", - "file_link", - "text_preview", "has_file", + "path", + "os_path", + "text_preview", + "image_preview", ] list_filter = ("media_type", "learning_package") search_fields = ("hash_digest",) - def file_link(self, content: Content): - if not content.has_file: - return "" + @admin.display(description="OS Path") + def os_path(self, content: Content): + return content.os_path() or "" - return format_html( - 'Download', - content.file_url(), - ) + def path(self, content: Content): + return content.path if content.has_file else "" def text_preview(self, content: Content): + if not content.text: + return "" return format_html( '
\n{}\n
', content.text, ) + + def image_preview(self, content: Content): + """ + Return HTML for an image, if that is the underlying Content. + + Otherwise, just return a blank string. + """ + if content.media_type.type != "image": + return "" + + data = content.read_file().read() + return format_html( + '', + content.mime_type, + base64.encodebytes(data).decode('utf8'), + ) diff --git a/openedx_learning/apps/authoring/contents/models.py b/openedx_learning/apps/authoring/contents/models.py index 7e4ff8cc..214c2d27 100644 --- a/openedx_learning/apps/authoring/contents/models.py +++ b/openedx_learning/apps/authoring/contents/models.py @@ -7,11 +7,13 @@ from functools import cache, cached_property -from django.core.exceptions import ValidationError +from django.conf import settings +from django.core.exceptions import ImproperlyConfigured, ValidationError from django.core.files.base import File -from django.core.files.storage import Storage, default_storage +from django.core.files.storage import Storage from django.core.validators import MaxValueValidator from django.db import models +from django.utils.module_loading import import_string from ....lib.fields import MultiCollationTextField, case_insensitive_char_field, hash_field, manual_date_time_field from ....lib.managers import WithRelationsManager @@ -28,13 +30,25 @@ def get_storage() -> Storage: """ Return the Storage instance for our Content file persistence. - For right now, we're still only storing inline text and not static assets in - production, so just return the default_storage. We're also going through a - transition between Django 3.2 -> 4.2, where storage configuration has moved. + This will first search for an OPENEDX_LEARNING config dictionary and return + a Storage subclass based on that configuration. - Make this work properly as part of adding support for static assets. + If there is no value for the OPENEDX_LEARNING setting, we return the default + MEDIA storage class. TODO: Should we make it just error instead? """ - return default_storage + config_dict = getattr(settings, 'OPENEDX_LEARNING', {}) + + if 'MEDIA' in config_dict: + storage_cls = import_string(config_dict['MEDIA']['BACKEND']) + options = config_dict['MEDIA'].get('OPTIONS', {}) + return storage_cls(**options) + + raise ImproperlyConfigured( + "Cannot access file storage: Missing the OPENEDX_LEARNING['MEDIA'] " + "setting, which should have a storage BACKEND and OPTIONS values for " + "a Storage subclass. These files should be stored in a location that " + "is NOT publicly accessible to browsers (so not in the MEDIA_ROOT)." + ) class MediaType(models.Model): @@ -282,22 +296,53 @@ def mime_type(self) -> str: """ return str(self.media_type) - def file_path(self): + @cached_property + def path(self): + """ + Logical path at which this content is stored (or would be stored). + + This path is relative to OPENEDX_LEARNING['MEDIA'] configured storage + root. This file may not exist because has_file=False, or because we + haven't written the file yet (this is the method we call when trying to + figure out where the file *should* go). + """ + return f"content/{self.learning_package.uuid}/{self.hash_digest}" + + def os_path(self): + """ + The full OS path for the underlying file for this Content. + + This will not be supported by all Storage class types. + + This will return ``None`` if there is no backing file (has_file=False). """ - Path at which this content is stored (or would be stored). + if self.has_file: + return get_storage().path(self.path) + return None - This path is relative to configured storage root. + def read_file(self) -> File: """ - return f"{self.learning_package.uuid}/{self.hash_digest}" + Get a File object that has been open for reading. + + We intentionally don't expose an `open()` call where callers can open + this file in write mode. Writing a Content file should happen at most + once, and the logic is not obvious (see ``write_file``). + + At the end of the day, the caller can close the returned File and reopen + it in whatever mode they want, but we're trying to gently discourage + that kind of usage. + """ + return get_storage().open(self.path, 'rb') def write_file(self, file: File) -> None: """ Write file contents to the file storage backend. - This function does nothing if the file already exists. + This function does nothing if the file already exists. Note that Content + is supposed to be immutable, so this should normally only be called once + for a given Content row. """ storage = get_storage() - file_path = self.file_path() # There are two reasons why a file might already exist even if the the # Content row is new: @@ -314,15 +359,15 @@ def write_file(self, file: File) -> None: # 3. Similar to (2), but only part of the file was written before an # error occurred. This seems unlikely, but possible if the underlying # storage engine writes in chunks. - if storage.exists(file_path) and storage.size(file_path) == file.size: + if storage.exists(self.path) and storage.size(self.path) == file.size: return - storage.save(file_path, file) + storage.save(self.path, file) def file_url(self) -> str: """ This will sometimes be a time-limited signed URL. """ - return content_file_url(self.file_path()) + return get_storage().url(self.path) def clean(self): """ @@ -361,7 +406,3 @@ class Meta: ] verbose_name = "Content" verbose_name_plural = "Contents" - - -def content_file_url(file_path): - return get_storage().url(file_path) diff --git a/test_settings.py b/test_settings.py index db27f354..f5b154f5 100644 --- a/test_settings.py +++ b/test_settings.py @@ -69,3 +69,14 @@ def root(*args): 'DEFAULT_PAGINATION_CLASS': 'edx_rest_framework_extensions.paginators.DefaultPagination', 'PAGE_SIZE': 10, } + +######################## LEARNING CORE SETTINGS ######################## + +OPENEDX_LEARNING = { + 'MEDIA': { + 'BACKEND': 'django.core.files.storage.InMemoryStorage', + 'OPTIONS': { + 'location': MEDIA_ROOT + "_private" + } + } +} diff --git a/tests/openedx_learning/apps/authoring/components/test_api.py b/tests/openedx_learning/apps/authoring/components/test_api.py index 18e5d04d..1fd8f494 100644 --- a/tests/openedx_learning/apps/authoring/components/test_api.py +++ b/tests/openedx_learning/apps/authoring/components/test_api.py @@ -381,7 +381,7 @@ def test_add(self): components_api.create_component_version_content( new_version.pk, new_content.pk, - key="hello.txt", + key="my/path/to/hello.txt", learner_downloadable=False, ) # re-fetch from the database to check to see if we wrote it correctly @@ -390,7 +390,23 @@ def test_add(self): .get(publishable_entity_version__version_num=1) assert ( new_content == - new_version.contents.get(componentversioncontent__key="hello.txt") + new_version.contents.get(componentversioncontent__key="my/path/to/hello.txt") + ) + + # Write the same content again, but to an absolute path (should auto- + # strip) the leading '/'s. + components_api.create_component_version_content( + new_version.pk, + new_content.pk, + key="//nested/path/hello.txt", + learner_downloadable=False, + ) + new_version = components_api.get_component(self.problem.pk) \ + .versions \ + .get(publishable_entity_version__version_num=1) + assert ( + new_content == + new_version.contents.get(componentversioncontent__key="nested/path/hello.txt") ) def test_multiple_versions(self): diff --git a/tests/openedx_learning/apps/authoring/components/test_assets.py b/tests/openedx_learning/apps/authoring/components/test_assets.py index 6d631738..56bfda41 100644 --- a/tests/openedx_learning/apps/authoring/components/test_assets.py +++ b/tests/openedx_learning/apps/authoring/components/test_assets.py @@ -167,7 +167,7 @@ def _assert_html_content_headers(self, response): assert response.status_code == 200 assert response.headers["Etag"] == self.html_asset_content.hash_digest assert response.headers["Content-Type"] == "text/html" - assert response.headers["X-Accel-Redirect"] == self.html_asset_content.file_path() + assert response.headers["X-Accel-Redirect"] == self.html_asset_content.path assert "X-Open-edX-Error" not in response.headers def test_public_asset_response(self): diff --git a/tests/openedx_learning/apps/authoring/contents/test_file_storage.py b/tests/openedx_learning/apps/authoring/contents/test_file_storage.py new file mode 100644 index 00000000..a94b3157 --- /dev/null +++ b/tests/openedx_learning/apps/authoring/contents/test_file_storage.py @@ -0,0 +1,79 @@ +""" +Tests for file-backed Content +""" +from datetime import datetime, timezone + +from django.conf import settings +from django.core.exceptions import ImproperlyConfigured +from django.test import override_settings + +from openedx_learning.apps.authoring.contents import api as contents_api +from openedx_learning.apps.authoring.contents.models import get_storage +from openedx_learning.apps.authoring.publishing import api as publishing_api +from openedx_learning.lib.test_utils import TestCase + + +class ContentFileStorageTestCase(TestCase): + """ + Test the storage of files backing Content. + """ + def setUp(self) -> None: + """ + It's actually important that we use setUp and not setUpTestData here, + because at least one of our tests will clear the get_storage cache, + meaning that subsequent tests will get a new instance of the + InMemoryStorage backend–and consequently wouldn't see any data loaded + by setUpTestData. + + Recreating the test data for every test lets individual tests change the + storage configuration without creating side-effects for other tests. + """ + super().setUp() + learning_package = publishing_api.create_learning_package( + key="ContentFileStorageTestCase-test-key", + title="Content File Storage Test Case Learning Package", + ) + self.html_media_type = contents_api.get_or_create_media_type("text/html") + self.html_content = contents_api.get_or_create_file_content( + learning_package.id, + self.html_media_type.id, + data=b"hello world!", + created=datetime.now(tz=timezone.utc), + ) + + def test_file_path(self): + """ + Test that the file path doesn't change. + + If this test breaks, it means that we've changed the convention for + where we're storing the backing files for Content, which means we'll be + breaking backwards compatibility for everyone. Please be very careful if + you're updating this test. + """ + content = self.html_content + assert content.path == f"content/{content.learning_package.uuid}/{content.hash_digest}" + + storage_root = settings.OPENEDX_LEARNING['MEDIA']['OPTIONS']['location'] + assert content.os_path() == f"{storage_root}/{content.path}" + + def test_read(self): + """Make sure we can read the file data back.""" + assert b"hello world!" == self.html_content.read_file().read() + + @override_settings() + def test_misconfiguration(self): + """ + Require the OPENEDX_LEARNING setting for file operations. + + The main goal of this is that we don't want to store our files in the + default MEDIA_ROOT, because that would make them publicly accessible. + + We set our OPENEDX_LEARNING value in test_settings.py. We're going to + delete this setting in our test and make sure we raise the correct + exception (The @override_settings decorator will set everything back to + normal after the test completes.) + """ + get_storage.cache_clear() + del settings.OPENEDX_LEARNING + with self.assertRaises(ImproperlyConfigured): + self.html_content.read_file()