Skip to content

Commit

Permalink
Huge Refactor
Browse files Browse the repository at this point in the history
- Move DocumentListAPI parsing code to crud.py
- Introduce NormalizedDocument as a TypedDict
- Remove a bunch of old code
- Remove Submodules
- Reformat and clean up Imports
- Clean up the codebase:
  - Move executable files into `bin/` and make those scripts executable from anywhere
  - Create directory `frontend/` for frontends of isisdl (excluding __main__.py)
  • Loading branch information
Emily3403 committed Mar 3, 2024
1 parent d9ea526 commit 9fc3569
Show file tree
Hide file tree
Showing 36 changed files with 288 additions and 346 deletions.
6 changes: 0 additions & 6 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -1,9 +1,3 @@
[submodule "isisdl.wiki"]
path = isisdl.wiki
url = [email protected]:Emily3403/isisdl.wiki.git
[submodule "aur"]
path = aur
url = https://aur.archlinux.org/isisdl.git
[submodule "tldr"]
path = tldr
url = [email protected]:Emily3403/tldr.git
1 change: 0 additions & 1 deletion aur
Submodule aur deleted from aa3cc7
9 changes: 9 additions & 0 deletions bin/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# compile-isisdl
static-compile/*.build
static-compile/*.dist
static-compile/*.onefile-build
static-compile/isisdl-linux.bin
static-compile/isisdl-windows.exe

# upload-PyPI
build/
File renamed without changes.
File renamed without changes.
26 changes: 26 additions & 0 deletions bin/compile-isisdl.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#!/bin/bash

set -e
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"

rm -rf "$SCRIPT_DIR"/static-compile/isisdl.*
rm -rf "$SCRIPT_DIR"/static-compile/venv

python3.11 -m venv "$SCRIPT_DIR"/static-compile/venv
source "$SCRIPT_DIR"/static-compile/venv/bin/activate
pip install "$SCRIPT_DIR"/..

python3 -c 'from isisdl.settings import is_static
assert is_static, "Error: For the static build, is_static must be True"
' || exit 1


pip install zstandard ordered-set nuitka
nuitka3 --standalone --onefile \
--linux-onefile-icon="$SCRIPT_DIR"/static-compile/isisdl_icon.png \
--output-dir="$SCRIPT_DIR"/static-compile \
--output-filename=isisdl-linux.bin \
"$SCRIPT_DIR"/../src/isisdl/__main__.py

echo "new sha256sum is"
sha256sum "$SCRIPT_DIR"/static-compile/isisdl-linux.bin
File renamed without changes
Binary file added bin/static-compile/isisdl_icon.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
12 changes: 12 additions & 0 deletions bin/upload-PyPI.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/bin/bash

set -e
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"

cd "$SCRIPT_DIR"/.. || exit 1

mkdir -p "$SCRIPT_DIR"/dist/
rm "$SCRIPT_DIR"/dist/* 2> /dev/null
python3 -m build --outdir "$SCRIPT_DIR/dist/"

twine upload "$SCRIPT_DIR"/dist/*
7 changes: 0 additions & 7 deletions compile/.gitignore

This file was deleted.

24 changes: 0 additions & 24 deletions compile/compile_isisdl.sh

This file was deleted.

Binary file removed compile/isisdl.png
Binary file not shown.
1 change: 0 additions & 1 deletion isisdl.wiki
Submodule isisdl.wiki deleted from f29ec5
10 changes: 5 additions & 5 deletions src/isisdl/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,12 @@
import asyncio
import sys

import isisdl.compress as compress
import isisdl.frontend.compress as compress
from isisdl.api.crud import authenticate_new_session
from isisdl.api.downloading import download_media_urls, gather_media_urls
from isisdl.api.download import download_media_urls, gather_media_urls
from isisdl.api.endpoints import UserCourseListAPI
from isisdl.backend import sync_database
from isisdl.backend.config import init_wizard, config_wizard
from isisdl.frontend import sync_database
from isisdl.frontend.config import init_wizard, config_wizard
from isisdl.backend.crud import read_config, read_user
from isisdl.backend.request_helper import CourseDownloader
from isisdl.db_conf import init_database, DatabaseSessionMaker
Expand Down Expand Up @@ -53,7 +53,7 @@ async def _new_main() -> None:
return None

downloaded_content = await download_media_urls(db, urls)
# - After downloading everything, run the hardlink resolution, this time based on checksums.
# After downloading everything, run the hardlink resolution, this time based on checksums.

_ = downloaded_content

Expand Down
164 changes: 151 additions & 13 deletions src/isisdl/api/crud.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,19 @@

import re
from base64 import standard_b64decode
from collections import defaultdict
from html import unescape
from typing import Any
from itertools import chain
from typing import Any, Literal, cast, DefaultDict

from aiohttp import ClientSession as InternetSession
from sqlalchemy import select
from sqlalchemy.orm import Session as DatabaseSession

from isisdl.api.models import AuthenticatedSession, Course, MediaURL, MediaType
from isisdl.api.models import AuthenticatedSession, Course, MediaURL, MediaType, NormalizedDocument
from isisdl.backend.models import User, Config
from isisdl.db_conf import add_or_update_objects_to_database
from isisdl.settings import url_finder, isis_ignore, extern_ignore, regex_is_isis_document, regex_is_isis_video
from isisdl.utils import datetime_fromtimestamp_with_None, flat_map
from isisdl.version import __version__

Expand Down Expand Up @@ -73,6 +76,12 @@ async def authenticate_new_session(user: User, config: Config) -> AuthenticatedS
return AuthenticatedSession(session, session_key=session_key, api_token=api_token)


# --- Courses ---

def read_courses(db: DatabaseSession) -> list[Course]:
return list(db.execute(select(Course)).scalars().all())


def parse_courses_from_API(db: DatabaseSession, courses: list[dict[str, Any]], config: Config) -> list[Course] | None:
existing_courses = {it.id: it for it in read_courses(db)}

Expand All @@ -85,11 +94,145 @@ def parse_courses_from_API(db: DatabaseSession, courses: list[dict[str, Any]], c
)


def create_videos_from_API(db: DatabaseSession, videos: list[dict[str, Any]], course_id: int) -> list[MediaURL] | None:
# --- Documents ---

def read_media_urls(db: DatabaseSession) -> dict[int, dict[str, MediaURL]]:
final: DefaultDict[int, dict[str, MediaURL]] = defaultdict(dict)
for it in db.execute(select(MediaURL)).scalars().all():
final[it.course_id][it.url] = it

return dict(final)


def create_documents_from_API(db: DatabaseSession, data: list[NormalizedDocument], existing_documents: dict[str, MediaURL]) -> list[MediaURL] | None:
_data = cast(list[dict[str, Any]], data) # Erase the `NormalizedDocument` signature to make mypy happy

return add_or_update_objects_to_database(
db, existing_documents, _data, MediaURL, lambda doc: doc["url"],
{it: it for it in NormalizedDocument.__annotations__.keys()},
{"time_created": datetime_fromtimestamp_with_None, "time_modified": datetime_fromtimestamp_with_None},
)


def parse_documents_from_API(db: DatabaseSession, course_id: int, documents: list[dict[str, Any]], existing_documents: dict[str, MediaURL]) -> list[MediaURL]:
"""
TODO: Revise this docstring as it is not accurate anymore. Maybe a way for a transaction is possible, but I don't see it.
Note that this function should be called using a `db.begin()` (transaction) for the db parameter as this function will create #Courses commits to the database.
To save trips to the database, one has to pass existing_documents parameter to this function.
"""

api_data = list(
filter(
lambda it: it != {},

flat_map(
lambda it: it.get("contents", [{}]),
flat_map(
lambda it: it.get("modules", [{}]),
documents
)
)
)
)

regex_data = parse_course_page_with_regex(documents, course_id)
data = filter_duplicates_and_normalize_documents(api_data, regex_data, course_id)

return create_documents_from_API(db, data, existing_documents) or []


def parse_course_page_with_regex(documents: list[dict[str, Any]], course_id: int) -> list[dict[str, Any]]:
files = []

for url in url_finder.findall(str(documents)):
if isis_ignore.match(url) is not None or extern_ignore.match(url) is not None:
continue

files.append({"fileurl": url, "course_id": course_id, "relative_path": "", "filename": None, "filesize": None, "timecreated": None, "timemodified": None, "type": "url"})

return files


def filter_duplicates_and_normalize_documents(documents_data: list[dict[str, Any]], regex_data: list[dict[str, Any]], course_id: int) -> list[NormalizedDocument]:
duplicates = defaultdict(list)

for it in chain(documents_data, regex_data):
file = normalize_file(it, course_id)
if file is None:
continue

duplicates[it["fileurl"]].append(file)

return [resolve_duplicates(files) for files in duplicates.values()]


def normalize_file(file: dict[str, Any], course_id: int) -> NormalizedDocument | None:
url = file.get("fileurl")
if url is None:
return None

if url.endswith("?forcedownload=1"):
url = url[:-len("?forcedownload=1")]

if isis_ignore.match(url) is not None or extern_ignore.match(url) is not None:
return None

if regex_is_isis_video.match(url) is not None:
media_type = MediaType.video
elif regex_is_isis_document.match(url) is not None or file.get("type") != "url":
media_type = MediaType.document
else:
media_type = MediaType.extern

return {
"url": url,
"course_id": course_id,
"media_type": media_type,
"relative_path": (file.get("filepath") or "").lstrip("/"),
"name": file.get("filename"),
"size": file.get("filesize"),
"time_created": file.get("timecreated") or file.get("timemodified"),
"time_modified": file.get("timemodified") or file.get("timecreated"),
}


def resolve_duplicates(files: list[NormalizedDocument]) -> NormalizedDocument:
"""
Determinism:
Files are sorted deterministicly by partitioning each attribute into the "Some" and "None" category.
Then, each attribute is sorted based on the "Some" category.
If there are multiple files with different attribute, the first one according to the sort order is chosen.
"""
if len(files) == 1:
return files[0]

def resolve_conflict(attr: Literal["url"] | Literal["course_id"] | Literal["media_type"] | Literal["relative_path"] | Literal["name"] | Literal["size"] | Literal["time_created"] | Literal["time_modified"]) -> Any:
conflicting_attrs = sorted({it for file in files if (it := file[attr]) is not None})
if len(conflicting_attrs) == 0:
return None

return conflicting_attrs[0]

return {
"url": resolve_conflict("url"),
"course_id": resolve_conflict("course_id"),
"media_type": resolve_conflict("media_type"),
"relative_path": resolve_conflict("relative_path"),
"name": resolve_conflict("name"),
"size": resolve_conflict("size"),
"time_created": resolve_conflict("time_created"),
"time_modified": resolve_conflict("time_modified"),
}


# --- Videos ---


def create_videos_from_API(db: DatabaseSession, videos: list[dict[str, Any]], course_id: int, existing_videos: dict[str, MediaURL]) -> list[MediaURL] | None:
# Filter out duplicate videos
videos = list({video["url"]: video for video in videos}.values())

existing_videos = {it.url: it for it in read_media_urls(db) if it.media_type == MediaType.video}
videos = list(map(lambda it: it | {"course_id": course_id, "media_type": MediaType.video, "relative_path": "Videos", "size": None, "time_modified": None}, videos))

return add_or_update_objects_to_database(
Expand All @@ -103,20 +246,15 @@ def parse_videos_from_API(db: DatabaseSession, videos: list[dict[str, Any]], con
if config.dl_download_videos is False:
return []

existing_videos = read_media_urls(db)

# TODO: Make this a single transaction instead of one for each course
return list(
filter(
lambda it: it is not None,
flat_map(
lambda data: create_videos_from_API(db, data.get("videos"), data.get("courseid")) or [],
lambda data: create_videos_from_API(db, data.get("videos"), data.get("courseid"), existing_videos[data["courseid"]]) or [],
map(lambda it: it.get("data", {}), videos)
)
)
)


def read_courses(db: DatabaseSession) -> list[Course]:
return list(db.execute(select(Course)).scalars().all())


def read_media_urls(db: DatabaseSession) -> list[MediaURL]:
return list(db.execute(select(MediaURL)).scalars().all())
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from sqlalchemy.orm import Session as DatabaseSession

from isisdl.api.endpoints import VideoListAPI, DocumentListAPI
from isisdl.api.endpoints import DocumentListAPI, VideoListAPI
from isisdl.api.models import MediaContainer, MediaURL, AuthenticatedSession, Course

__all__ = ["download_media_urls"]
Expand Down
Loading

0 comments on commit 9fc3569

Please sign in to comment.