Merge pull request #58 from openzim/archives-api
Add Archives APIs
rgaudin authored Sep 18, 2023
2 parents 73357a1 + 4779e76 commit 5a2b2c5
Showing 11 changed files with 622 additions and 56 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/Tests.yaml
@@ -5,11 +5,11 @@ on:
paths:
- "backend/**"
push:
branches: [main]
branches: [ main ]
env:
POSTGRES_URI: "postgresql+psycopg://nautilus:nautilus@localhost:5432/nautilus"
S3_URL_WITH_CREDENTIALS: "PLACEHOLDER"
PRIVATE_KEY: "PRIVATE_KEY"
PRIVATE_SALT: "PRIVATE_SALT"

jobs:
run-tests:
3 changes: 3 additions & 0 deletions backend/api/constants.py
@@ -43,6 +43,9 @@ class BackendConf:
cookie_expiration_days = int(os.getenv("COOKIE_EXPIRATION_DAYS", "30"))
project_quota = humanfriendly.parse_size(os.getenv("PROJECT_QUOTA", "100MB"))
chunk_size = humanfriendly.parse_size(os.getenv("CHUNK_SIZE", "2MiB"))
illustration_quota = humanfriendly.parse_size(
os.getenv("ILLUSTRATION_QUOTA", "2MiB")
)
allowed_origins = os.getenv(
"ALLOWED_ORIGINS",
"http://localhost",
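Note: humanfriendly.parse_size() treats "MB" as decimal and "MiB" as binary, so the defaults above resolve to different byte counts than a casual reading might suggest. A quick sketch of what the library returns:

import humanfriendly

humanfriendly.parse_size("100MB")  # 100000000 bytes (decimal) — default project_quota
humanfriendly.parse_size("2MiB")   # 2097152 bytes (binary) — default chunk_size and illustration_quota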
11 changes: 5 additions & 6 deletions backend/api/database/models.py
@@ -134,13 +134,12 @@ class Archive(Base):
)
project_id: Mapped[UUID] = mapped_column(ForeignKey("project.id"), init=False)

filename: Mapped[str]
filesize: Mapped[int]
filesize: Mapped[int | None]
created_on: Mapped[datetime]
requested_on: Mapped[datetime]
download_url: Mapped[str]
collection_json_path: Mapped[str]
requested_on: Mapped[datetime | None]
download_url: Mapped[str | None]
collection_json_path: Mapped[str | None]
status: Mapped[str]
zimfarm_task_id: Mapped[UUID]
zimfarm_task_id: Mapped[UUID | None]
email: Mapped[str | None]
config: Mapped[dict[str, Any]]
3 changes: 2 additions & 1 deletion backend/api/main.py
@@ -7,7 +7,7 @@

from api import __description__, __titile__, __version__
from api.constants import constants, determine_mandatory_environment_variables
from api.routes import files, projects, users, utils
from api.routes import archives, files, projects, users, utils


@asynccontextmanager
@@ -63,6 +63,7 @@ async def landing() -> RedirectResponse:
api.include_router(utils.router)
api.include_router(users.router)
projects.router.include_router(files.router)
projects.router.include_router(archives.router)
api.include_router(projects.router)
app.mount(constants.api_version_prefix, api)
return app
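For readers unfamiliar with nested routers: including archives.router inside projects.router before the latter is registered means the archive paths below ("/{project_id}/archives", …) are served under the same prefix as the other project routes. A minimal, self-contained sketch of the pattern (names and the "/projects" prefix are illustrative, not taken from this repo):

from fastapi import APIRouter, FastAPI

app = FastAPI()
parent = APIRouter(prefix="/projects")
child = APIRouter()

@child.get("/{project_id}/archives")
async def list_archives(project_id: str) -> list:
    return []

parent.include_router(child)  # child routes inherit the parent's prefix
app.include_router(parent)    # final path: /projects/{project_id}/archives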
66 changes: 64 additions & 2 deletions backend/api/routes/__init__.py
@@ -1,13 +1,16 @@
import hashlib
from collections.abc import Iterator
from http import HTTPStatus
from typing import Annotated
from pathlib import Path
from typing import Annotated, BinaryIO
from uuid import UUID

from fastapi import Cookie, Depends, HTTPException, Response
from sqlalchemy import select
from sqlalchemy.orm import Session

from api.constants import constants
from api.database import gen_session
from api.database import gen_session, get_local_fpath_for
from api.database.models import Project, User


@@ -53,3 +56,62 @@ async def validated_project(
if not project:
raise HTTPException(HTTPStatus.NOT_FOUND, f"Project not found: {project_id}")
return project


def calculate_file_size(file: BinaryIO) -> int:
"""Calculate the size of a file chunk by chunk"""
size = 0
for chunk in read_file_in_chunks(file):
size += len(chunk)
return size


def read_file_in_chunks(
reader: BinaryIO, chunk_size=constants.chunk_size
) -> Iterator[bytes]:
"""Read Big file chunk by chunk. Default chunk size is 2k"""
while True:
chunk = reader.read(chunk_size)
if not chunk:
break
yield chunk
reader.seek(0)
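A usage sketch for the two helpers above (the file name and process() are placeholders). Note that read_file_in_chunks() rewinds the reader once it is exhausted, so the same handle can be re-read:

with open("example.bin", "rb") as fh:  # hypothetical file
    size = calculate_file_size(fh)  # streams the whole file, leaves it rewound
    for chunk in read_file_in_chunks(fh, chunk_size=1024):
        process(chunk)  # placeholder; each chunk is at most 1024 bytes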


def save_file(file: BinaryIO, file_name: str, project_id: UUID) -> Path:
"""Saves a binary file to a specific location and returns its path."""
fpath = get_local_fpath_for(file_name, project_id)
if not fpath.is_file():
with open(fpath, "wb") as file_object:
for chunk in read_file_in_chunks(file):
file_object.write(chunk)
return fpath


def generate_file_hash(file: BinaryIO) -> str:
"""Generate sha256 hash of a file, optimized for large files"""
hasher = hashlib.sha256()
for chunk in read_file_in_chunks(file):
hasher.update(chunk)
return hasher.hexdigest()
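Because each helper rewinds the file via read_file_in_chunks(), they compose without manual seek() calls; for example, hashing and then saving the same upload (file name and project_id are placeholders):

with open("upload.bin", "rb") as fh:  # hypothetical file
    digest = generate_file_hash(fh)  # hex-encoded sha256; fh is rewound afterwards
    fpath = save_file(fh, f"{digest}.bin", project_id)  # assuming a valid project_id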


def normalize_filename(filename: str) -> str:
"""filesystem (ext4,apfs,hfs+,ntfs,exfat) and S3 compliant filename"""

normalized = str(filename)

# replace / with __ as it would otherwise act as a path separator
replacements = (("/", "__"),)
for pattern, repl in replacements:
normalized = normalized.replace(pattern, repl)

# other prohibited chars are removed (mostly for Windows context)
removals = ["\\", ":", "*", "?", '"', "<", ">", "|"] + [
chr(idx) for idx in range(1, 32)
]
for char in removals:
normalized = normalized.replace(char, "")  # str.replace returns a new string; assign it back

# ext4/exfat have a 255-byte filename limit (S3 allows 1 KiB)
return normalized.encode("utf-8")[:255].decode("utf-8", errors="ignore")  # ignore a multi-byte char split by the cut
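With the assignment fixes above (str.replace() returns a new string and must be reassigned), the helper behaves as documented:

normalize_filename("a/b:c*d.pdf")  # -> "a__bcd.pdf"
normalize_filename("x" * 300)  # -> truncated to 255 bytes of "x"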
202 changes: 202 additions & 0 deletions backend/api/routes/archives.py
@@ -0,0 +1,202 @@
import base64
import datetime
import io
from enum import Enum
from http import HTTPStatus
from typing import Any
from uuid import UUID

import zimscraperlib.image
from fastapi import APIRouter, Depends, HTTPException, UploadFile
from pydantic import BaseModel, ConfigDict, TypeAdapter
from sqlalchemy import select, update
from sqlalchemy.orm import Session
from zimscraperlib import filesystem

from api.constants import constants
from api.database import gen_session
from api.database.models import Archive, Project
from api.routes import (
calculate_file_size,
normalize_filename,
read_file_in_chunks,
validated_project,
)

router = APIRouter()


class ArchiveStatus(str, Enum):
# it's in the database but not yet requested; can still be modified
PENDING = "PENDING"
# it has been requested on the ZimFarm; cannot be modified by the user,
# awaiting callback from the ZimFarm
REQUESTED = "REQUESTED"
# ZimFarm task succeeded; it now has a download_url and filesize
READY = "READY"
# ZimFarm task failed; cannot be downloaded
FAILED = "FAILED"
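The comments describe a one-way lifecycle. A hypothetical transition table (not part of this PR, inferred from the comments above) could encode it like this:

VALID_TRANSITIONS = {  # hypothetical helper, not in the codebase
    ArchiveStatus.PENDING: {ArchiveStatus.REQUESTED},
    ArchiveStatus.REQUESTED: {ArchiveStatus.READY, ArchiveStatus.FAILED},
    ArchiveStatus.READY: set(),  # terminal
    ArchiveStatus.FAILED: set(),  # terminal
}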


class ArchiveConfig(BaseModel):
title: str | None
description: str | None
name: str | None
publisher: str | None
creator: str | None
languages: list[str] | None
tags: list[str] | None
filename: str


class ArchiveRequest(BaseModel):
email: str | None
config: ArchiveConfig

model_config = ConfigDict(from_attributes=True)


class ArchiveModel(BaseModel):
id: UUID

project_id: UUID

filesize: int | None
created_on: datetime.datetime
download_url: str | None
status: str
email: str | None
config: dict[str, Any]

model_config = ConfigDict(from_attributes=True)


def validated_archive(
archive_id: UUID,
project: Project = Depends(validated_project),
session: Session = Depends(gen_session),
) -> Archive:
"""Depends()-able archive from request, ensuring it exists"""
stmt = select(Archive).filter_by(id=archive_id).filter_by(project_id=project.id)
archive = session.execute(stmt).scalar()
if not archive:
raise HTTPException(HTTPStatus.NOT_FOUND, f"Archive not found: {archive_id}")
return archive


@router.get("/{project_id}/archives", response_model=list[ArchiveModel])
async def get_all_archives(
project: Project = Depends(validated_project),
) -> list[ArchiveModel]:
"""Get all archives of a project"""
return TypeAdapter(list[ArchiveModel]).validate_python(project.archives)


@router.get("/{project_id}/archives/{archive_id}", response_model=ArchiveModel)
async def get_archive(archive: Archive = Depends(validated_archive)) -> ArchiveModel:
"""Get a specific archives of a project"""
return ArchiveModel.model_validate(archive)


@router.patch(
"/{project_id}/archives/{archive_id}",
status_code=HTTPStatus.NO_CONTENT,
)
async def update_archive(
archive_request: ArchiveRequest,
archive: Archive = Depends(validated_archive),
session: Session = Depends(gen_session),
):
"""Update a metadata of a archive"""
config = archive_request.config.model_dump()
config["filename"] = normalize_filename(config["filename"])
stmt = (
update(Archive)
.filter_by(id=archive.id)
.values(
email=archive_request.email,
config=config,  # the copy whose filename was normalized above
)
)
session.execute(stmt)
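For reference, a PATCH body matching ArchiveRequest might look like this (all values illustrative; every ArchiveConfig field must be present, though most are nullable):

payload = {
    "email": None,
    "config": {
        "title": "My archive",
        "description": "Demo collection",
        "name": "my-archive",
        "publisher": "openZIM",
        "creator": "nautilus",
        "languages": ["eng"],
        "tags": [],
        "filename": "my archive/v1.zim",  # "/" becomes "__" via normalize_filename()
    },
}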


def validate_illustration_image(upload_file: UploadFile):
"""
Validates the illustration image to ensure it meets the requirements.
Args:
upload_file (UploadFile): The uploaded illustration image.
Raises:
HTTPException: If the illustration is invalid,
the illustration is empty,
illustration is not a png image.
"""
filename = upload_file.filename

if not filename:
raise HTTPException(
status_code=HTTPStatus.BAD_REQUEST, detail="Filename is invalid."
) # pragma: no cover

size = calculate_file_size(upload_file.file)

if size == 0:
raise HTTPException(status_code=HTTPStatus.BAD_REQUEST, detail="Empty file.")

if size > constants.illustration_quota:
raise HTTPException(
status_code=HTTPStatus.REQUEST_ENTITY_TOO_LARGE,
detail="Illustration is too large.",
)

mimetype = filesystem.get_content_mimetype(upload_file.file.read(2048))

if "image/" not in mimetype:
raise HTTPException(
status_code=HTTPStatus.BAD_REQUEST,
detail="Illustration is not a valid image.",
)

upload_file.file.seek(0)


@router.post(
"/{project_id}/archives/{archive_id}/illustration",
status_code=HTTPStatus.CREATED,
)
async def upload_illustration(
uploaded_illustration: UploadFile,
archive: Archive = Depends(validated_archive),
session: Session = Depends(gen_session),
):
"""Upload an illustration of a archive."""
validate_illustration_image(uploaded_illustration)

src = io.BytesIO()
for chunk in read_file_in_chunks(uploaded_illustration.file):
src.write(chunk)
dst = io.BytesIO()
try:
zimscraperlib.image.convert_image(
src, dst, fmt="PNG" # pyright: ignore [reportGeneralTypeIssues]
)
except Exception as exc:
raise HTTPException(
status_code=HTTPStatus.BAD_REQUEST,
detail="Illustration cannot be converted to PNG",
) from exc

try:
zimscraperlib.image.resize_image(dst, width=48, height=48, method="cover")
except Exception as exc:
raise HTTPException(
status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
detail="Illustration cannot be resized",
) from exc
else:
new_config = archive.config
new_config["illustration"] = base64.b64encode(dst.getvalue()).decode("utf-8")
stmt = update(Archive).filter_by(id=archive.id).values(config=new_config)
session.execute(stmt)
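A hedged end-to-end example of exercising the new endpoint with FastAPI's TestClient — the app factory name, URL prefix, and the two IDs are assumptions or placeholders, and the authentication cookie the routes rely on is omitted:

from fastapi.testclient import TestClient

from api.main import create_app  # assumption: the factory that returns `app` above

client = TestClient(create_app())
with open("illustration.png", "rb") as fh:  # any valid image file
    resp = client.post(
        f"/v1/projects/{project_id}/archives/{archive_id}/illustration",  # prefix assumed; IDs are placeholders
        files={"uploaded_illustration": ("illustration.png", fh, "image/png")},
    )
assert resp.status_code == 201  # HTTPStatus.CREATED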