Merge pull request #58 from openzim/archives-api
Add Archives APIs
rgaudin authored Sep 18, 2023
2 parents 73357a1 + 4779e76 commit 5a2b2c5
Showing 11 changed files with 622 additions and 56 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/Tests.yaml
@@ -5,11 +5,11 @@ on:
paths:
- "backend/**"
push:
branches: [main]
branches: [ main ]
env:
POSTGRES_URI: "postgresql+psycopg://nautilus:nautilus@localhost:5432/nautilus"
S3_URL_WITH_CREDENTIALS: "PLACEHOLDER"
PRIVATE_KEY: "PRIVATE_KEY"
PRIVATE_SALT: "PRIVATE_SALT"

jobs:
run-tests:
3 changes: 3 additions & 0 deletions backend/api/constants.py
@@ -43,6 +43,9 @@ class BackendConf:
cookie_expiration_days = int(os.getenv("COOKIE_EXPIRATION_DAYS", "30"))
project_quota = humanfriendly.parse_size(os.getenv("PROJECT_QUOTA", "100MB"))
chunk_size = humanfriendly.parse_size(os.getenv("CHUNK_SIZE", "2MiB"))
illustration_quota = humanfriendly.parse_size(
os.getenv("ILLUSTRATION_QUOTA", "2MiB")
)
allowed_origins = os.getenv(
"ALLOWED_ORIGINS",
"http://localhost",
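Note: humanfriendly.parse_size() treats "MB" as decimal and "MiB" as binary, so the defaults above resolve to different byte counts than a casual reading might suggest. A quick sketch of what the library returns:

import humanfriendly

humanfriendly.parse_size("100MB")  # 100000000 bytes (decimal) — default project_quota
humanfriendly.parse_size("2MiB")   # 2097152 bytes (binary) — default chunk_size and illustration_quota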
11 changes: 5 additions & 6 deletions backend/api/database/models.py
@@ -134,13 +134,12 @@ class Archive(Base):
)
project_id: Mapped[UUID] = mapped_column(ForeignKey("project.id"), init=False)

filename: Mapped[str]
filesize: Mapped[int]
filesize: Mapped[int | None]
created_on: Mapped[datetime]
requested_on: Mapped[datetime]
download_url: Mapped[str]
collection_json_path: Mapped[str]
requested_on: Mapped[datetime | None]
download_url: Mapped[str | None]
collection_json_path: Mapped[str | None]
status: Mapped[str]
zimfarm_task_id: Mapped[UUID]
zimfarm_task_id: Mapped[UUID | None]
email: Mapped[str | None]
config: Mapped[dict[str, Any]]
3 changes: 2 additions & 1 deletion backend/api/main.py
@@ -7,7 +7,7 @@

from api import __description__, __titile__, __version__
from api.constants import constants, determine_mandatory_environment_variables
from api.routes import files, projects, users, utils
from api.routes import archives, files, projects, users, utils


@asynccontextmanager
@@ -63,6 +63,7 @@ async def landing() -> RedirectResponse:
api.include_router(utils.router)
api.include_router(users.router)
projects.router.include_router(files.router)
projects.router.include_router(archives.router)
api.include_router(projects.router)
app.mount(constants.api_version_prefix, api)
return app
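For readers unfamiliar with nested routers: including archives.router inside projects.router before the latter is registered means the archive paths below ("/{project_id}/archives", …) are served under the same prefix as the other project routes. A minimal, self-contained sketch of the pattern (names and the "/projects" prefix are illustrative, not taken from this repo):

from fastapi import APIRouter, FastAPI

app = FastAPI()
parent = APIRouter(prefix="/projects")
child = APIRouter()

@child.get("/{project_id}/archives")
async def list_archives(project_id: str) -> list:
    return []

parent.include_router(child)  # child routes inherit the parent's prefix
app.include_router(parent)    # final path: /projects/{project_id}/archives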
66 changes: 64 additions & 2 deletions backend/api/routes/__init__.py
@@ -1,13 +1,16 @@
import hashlib
from collections.abc import Iterator
from http import HTTPStatus
from typing import Annotated
from pathlib import Path
from typing import Annotated, BinaryIO
from uuid import UUID

from fastapi import Cookie, Depends, HTTPException, Response
from sqlalchemy import select
from sqlalchemy.orm import Session

from api.constants import constants
from api.database import gen_session
from api.database import gen_session, get_local_fpath_for
from api.database.models import Project, User


@@ -53,3 +56,62 @@ async def validated_project(
if not project:
raise HTTPException(HTTPStatus.NOT_FOUND, f"Project not found: {project_id}")
return project


def calculate_file_size(file: BinaryIO) -> int:
"""Calculate the size of a file chunk by chunk"""
size = 0
for chunk in read_file_in_chunks(file):
size += len(chunk)
return size


def read_file_in_chunks(
reader: BinaryIO, chunk_size=constants.chunk_size
) -> Iterator[bytes]:
"""Read Big file chunk by chunk. Default chunk size is 2k"""
while True:
chunk = reader.read(chunk_size)
if not chunk:
break
yield chunk
reader.seek(0)
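A usage sketch for the two helpers above (the file name and process() are placeholders). Note that read_file_in_chunks() rewinds the reader once it is exhausted, so the same handle can be re-read:

with open("example.bin", "rb") as fh:  # hypothetical file
    size = calculate_file_size(fh)  # streams the whole file, leaves it rewound
    for chunk in read_file_in_chunks(fh, chunk_size=1024):
        process(chunk)  # placeholder; each chunk is at most 1024 bytes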


def save_file(file: BinaryIO, file_name: str, project_id: UUID) -> Path:
"""Saves a binary file to a specific location and returns its path."""
fpath = get_local_fpath_for(file_name, project_id)
if not fpath.is_file():
with open(fpath, "wb") as file_object:
for chunk in read_file_in_chunks(file):
file_object.write(chunk)
return fpath


def generate_file_hash(file: BinaryIO) -> str:
"""Generate sha256 hash of a file, optimized for large files"""
hasher = hashlib.sha256()
for chunk in read_file_in_chunks(file):
hasher.update(chunk)
return hasher.hexdigest()
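Because each helper rewinds the file via read_file_in_chunks(), they compose without manual seek() calls; for example, hashing and then saving the same upload (file name and project_id are placeholders):

with open("upload.bin", "rb") as fh:  # hypothetical file
    digest = generate_file_hash(fh)  # hex-encoded sha256; fh is rewound afterwards
    fpath = save_file(fh, f"{digest}.bin", project_id)  # assuming a valid project_id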


def normalize_filename(filename: str) -> str:
"""filesystem (ext4,apfs,hfs+,ntfs,exfat) and S3 compliant filename"""

normalized = str(filename)

# replace / with __ as it would otherwise act as a path separator
replacements = (("/", "__"),)
for pattern, repl in replacements:
normalized = normalized.replace(pattern, repl)

# other prohibited chars are removed (mostly for Windows context)
removals = ["\\", ":", "*", "?", '"', "<", ">", "|"] + [
chr(idx) for idx in range(1, 32)
]
for char in removals:
normalized = normalized.replace(char, "")  # str.replace returns a new string; assign it back

# ext4/exfat have a 255-byte filename limit (S3 allows 1 KiB)
return normalized.encode("utf-8")[:255].decode("utf-8", errors="ignore")  # ignore a multi-byte char split by the cut
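With the assignment fixes above (str.replace() returns a new string and must be reassigned), the helper behaves as documented:

normalize_filename("a/b:c*d.pdf")  # -> "a__bcd.pdf"
normalize_filename("x" * 300)  # -> truncated to 255 bytes of "x"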
202 changes: 202 additions & 0 deletions backend/api/routes/archives.py
@@ -0,0 +1,202 @@
import base64
import datetime
import io
from enum import Enum
from http import HTTPStatus
from typing import Any
from uuid import UUID

import zimscraperlib.image
from fastapi import APIRouter, Depends, HTTPException, UploadFile
from pydantic import BaseModel, ConfigDict, TypeAdapter
from sqlalchemy import select, update
from sqlalchemy.orm import Session
from zimscraperlib import filesystem

from api.constants import constants
from api.database import gen_session
from api.database.models import Archive, Project
from api.routes import (
calculate_file_size,
normalize_filename,
read_file_in_chunks,
validated_project,
)

router = APIRouter()


class ArchiveStatus(str, Enum):
# it's in the database but not yet requested; can still be modified
PENDING = "PENDING"
# it has been requested on the ZimFarm; cannot be modified by the user,
# awaiting callback from the ZimFarm
REQUESTED = "REQUESTED"
# ZimFarm task succeeded; it now has a download_url and filesize
READY = "READY"
# ZimFarm task failed; cannot be downloaded
FAILED = "FAILED"
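The comments describe a one-way lifecycle. A hypothetical transition table (not part of this PR, inferred from the comments above) could encode it like this:

VALID_TRANSITIONS = {  # hypothetical helper, not in the codebase
    ArchiveStatus.PENDING: {ArchiveStatus.REQUESTED},
    ArchiveStatus.REQUESTED: {ArchiveStatus.READY, ArchiveStatus.FAILED},
    ArchiveStatus.READY: set(),  # terminal
    ArchiveStatus.FAILED: set(),  # terminal
}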


class ArchiveConfig(BaseModel):
title: str | None
description: str | None
name: str | None
publisher: str | None
creator: str | None
languages: list[str] | None
tags: list[str] | None
filename: str


class ArchiveRequest(BaseModel):
email: str | None
config: ArchiveConfig

model_config = ConfigDict(from_attributes=True)


class ArchiveModel(BaseModel):
id: UUID

project_id: UUID

filesize: int | None
created_on: datetime.datetime
download_url: str | None
status: str
email: str | None
config: dict[str, Any]

model_config = ConfigDict(from_attributes=True)


def validated_archive(
archive_id: UUID,
project: Project = Depends(validated_project),
session: Session = Depends(gen_session),
) -> Archive:
"""Depends()-able archive from request, ensuring it exists"""
stmt = select(Archive).filter_by(id=archive_id).filter_by(project_id=project.id)
archive = session.execute(stmt).scalar()
if not archive:
raise HTTPException(HTTPStatus.NOT_FOUND, f"Archive not found: {archive_id}")
return archive


@router.get("/{project_id}/archives", response_model=list[ArchiveModel])
async def get_all_archives(
project: Project = Depends(validated_project),
) -> list[ArchiveModel]:
"""Get all archives of a project"""
return TypeAdapter(list[ArchiveModel]).validate_python(project.archives)


@router.get("/{project_id}/archives/{archive_id}", response_model=ArchiveModel)
async def get_archive(archive: Archive = Depends(validated_archive)) -> ArchiveModel:
"""Get a specific archives of a project"""
return ArchiveModel.model_validate(archive)


@router.patch(
"/{project_id}/archives/{archive_id}",
status_code=HTTPStatus.NO_CONTENT,
)
async def update_archive(
archive_request: ArchiveRequest,
archive: Archive = Depends(validated_archive),
session: Session = Depends(gen_session),
):
"""Update a metadata of a archive"""
config = archive_request.config.model_dump()
config["filename"] = normalize_filename(config["filename"])
stmt = (
update(Archive)
.filter_by(id=archive.id)
.values(
email=archive_request.email,
config=config,  # the copy whose filename was normalized above
)
)
session.execute(stmt)
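For reference, a PATCH body matching ArchiveRequest might look like this (all values illustrative; every ArchiveConfig field must be present, though most are nullable):

payload = {
    "email": None,
    "config": {
        "title": "My archive",
        "description": "Demo collection",
        "name": "my-archive",
        "publisher": "openZIM",
        "creator": "nautilus",
        "languages": ["eng"],
        "tags": [],
        "filename": "my archive/v1.zim",  # "/" becomes "__" via normalize_filename()
    },
}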


def validate_illustration_image(upload_file: UploadFile):
"""
Validates the illustration image to ensure it meets the requirements.
Args:
upload_file (UploadFile): The uploaded illustration image.
Raises:
HTTPException: If the illustration is invalid,
the illustration is empty,
illustration is not a png image.
"""
filename = upload_file.filename

if not filename:
raise HTTPException(
status_code=HTTPStatus.BAD_REQUEST, detail="Filename is invalid."
) # pragma: no cover

size = calculate_file_size(upload_file.file)

if size == 0:
raise HTTPException(status_code=HTTPStatus.BAD_REQUEST, detail="Empty file.")

if size > constants.illustration_quota:
raise HTTPException(
status_code=HTTPStatus.REQUEST_ENTITY_TOO_LARGE,
detail="Illustration is too large.",
)

mimetype = filesystem.get_content_mimetype(upload_file.file.read(2048))

if "image/" not in mimetype:
raise HTTPException(
status_code=HTTPStatus.BAD_REQUEST,
detail="Illustration is not a valid image.",
)

upload_file.file.seek(0)


@router.post(
"/{project_id}/archives/{archive_id}/illustration",
status_code=HTTPStatus.CREATED,
)
async def upload_illustration(
uploaded_illustration: UploadFile,
archive: Archive = Depends(validated_archive),
session: Session = Depends(gen_session),
):
"""Upload an illustration of a archive."""
validate_illustration_image(uploaded_illustration)

src = io.BytesIO()
for chunk in read_file_in_chunks(uploaded_illustration.file):
src.write(chunk)
dst = io.BytesIO()
try:
zimscraperlib.image.convert_image(
src, dst, fmt="PNG" # pyright: ignore [reportGeneralTypeIssues]
)
except Exception as exc:
raise HTTPException(
status_code=HTTPStatus.BAD_REQUEST,
detail="Illustration cannot be converted to PNG",
) from exc

try:
zimscraperlib.image.resize_image(dst, width=48, height=48, method="cover")
except Exception as exc:
raise HTTPException(
status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
detail="Illustration cannot be resized",
) from exc
else:
new_config = archive.config
new_config["illustration"] = base64.b64encode(dst.getvalue()).decode("utf-8")
stmt = update(Archive).filter_by(id=archive.id).values(config=new_config)
session.execute(stmt)
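A hedged end-to-end example of exercising the new endpoint with FastAPI's TestClient — the app factory name, URL prefix, and the two IDs are assumptions or placeholders, and the authentication cookie the routes rely on is omitted:

from fastapi.testclient import TestClient

from api.main import create_app  # assumption: the factory that returns `app` above

client = TestClient(create_app())
with open("illustration.png", "rb") as fh:  # any valid image file
    resp = client.post(
        f"/v1/projects/{project_id}/archives/{archive_id}/illustration",  # prefix assumed; IDs are placeholders
        files={"uploaded_illustration": ("illustration.png", fh, "image/png")},
    )
assert resp.status_code == 201  # HTTPStatus.CREATED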