Merge pull request #373 from openzim/stream_dl

Stream file downloads to avoid exhausting memory
openzim · Aug 12, 2024 · d814c23 · d814c23
2 parents d0d0c6e + efdf780
commit d814c23
Show file tree

Hide file tree

Showing 4 changed files with 37 additions and 16 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -11,6 +11,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Add support for uncompressed tar archive in --warcs (#369)
 
+### Fixed
+
+- Stream file downloads to avoid exhausting memory (#373)
+
 ## [2.1.0] - 2024-08-09
 
 ### Added

diff --git a/src/zimit/constants.py b/src/zimit/constants.py
@@ -0,0 +1,10 @@
+import logging
+
+from zimscraperlib.logging import getLogger
+
+EXIT_CODE_WARC2ZIM_CHECK_FAILED = 2
+EXIT_CODE_CRAWLER_LIMIT_HIT = 11
+NORMAL_WARC2ZIM_EXIT_CODE = 100
+REQUESTS_TIMEOUT = 10  # passed as timeout= to requests.get (seconds)
+
+logger = getLogger(name="zimit", level=logging.INFO)  # imported by zimit.zimit
diff --git a/src/zimit/utils.py b/src/zimit/utils.py
@@ -0,0 +1,14 @@
+from pathlib import Path
+
+import requests
+
+from zimit.constants import REQUESTS_TIMEOUT
+
+
+def download_file(url: str, fpath: Path):
+    """Download file from url to fpath with streaming"""
+    with requests.get(url, timeout=REQUESTS_TIMEOUT, stream=True) as resp:
+        resp.raise_for_status()  # raises before fpath is opened for writing
+        with open(fpath, "wb") as f:
+            # write chunk by chunk instead of buffering the whole body
+            for chunk in resp.iter_content(chunk_size=8192):
+                f.write(chunk)
diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py
@@ -6,7 +6,6 @@
 
 import atexit
 import json
-import logging
 import re
 import shutil
 import signal
@@ -21,19 +20,17 @@
 
 import inotify
 import inotify.adapters
-import requests
 from warc2zim.main import main as warc2zim
-from zimscraperlib.logging import getLogger
 from zimscraperlib.uri import rebuild_uri
 
 from zimit.__about__ import __version__
-
-EXIT_CODE_WARC2ZIM_CHECK_FAILED = 2
-EXIT_CODE_CRAWLER_LIMIT_HIT = 11
-NORMAL_WARC2ZIM_EXIT_CODE = 100
-REQUESTS_TIMEOUT = 10
-
-logger = getLogger(name="zimit", level=logging.INFO)
+from zimit.constants import (
+    EXIT_CODE_CRAWLER_LIMIT_HIT,
+    EXIT_CODE_WARC2ZIM_CHECK_FAILED,
+    NORMAL_WARC2ZIM_EXIT_CODE,
+    logger,
+)
+from zimit.utils import download_file
 
 
 class ProgressFileWatcher:
@@ -457,9 +454,7 @@ def cleanup():
                     f"Downloading browser profile from {custom_behavior} "
                     f"to {behaviors_file.name}"
                 )
-                resp = requests.get(custom_behavior, timeout=REQUESTS_TIMEOUT)
-                resp.raise_for_status()
-                Path(behaviors_file.name).write_bytes(resp.content)
+                download_file(custom_behavior, Path(behaviors_file.name))
             else:
                 logger.info(
                     f"Copying browser profile from {custom_behavior} "
@@ -552,9 +547,7 @@ def cleanup():
             # collisions
             warc_file = Path(filename.name)
             logger.info(f"Downloading WARC(s) from {warc_location} to {warc_file}")
-            resp = requests.get(warc_location, timeout=REQUESTS_TIMEOUT)
-            resp.raise_for_status()
-            warc_file.write_bytes(resp.content)
+            download_file(warc_location, warc_file)
 
             # if it is a plain warc or warc.gz, simply add it to the list
             if suffix in {".warc", ".warc.gz"}: