implemented check_url_archive caching for archive status
mojomonger committed Nov 29, 2023
1 parent 7dc79f3 commit 491117e
Showing 10 changed files with 78 additions and 16 deletions.
2 changes: 1 addition & 1 deletion src/models/api/schema/check_url_schema.py
@@ -14,7 +14,7 @@ class UrlSchema(BaseSchema):

url = String(required=True)
timeout = Int(required=False)
method = String(required=False) # dead: disable
method = String(required=False)
debug = Bool(required=False)
blocks = Bool(required=False)
xml = Bool(required=False)
3 changes: 1 addition & 2 deletions src/models/file_io/__init__.py
@@ -34,7 +34,7 @@ def path_filename(self) -> str:
path_filename = (
f"{config.subdirectory_for_json}{self.subfolder}{self.filename}"
)
app.logger.debug(f"using path: {path_filename}")
app.logger.debug(f"using path: {path_filename} (subfolder: {self.subfolder})")
return path_filename

def write_to_disk(
@@ -77,4 +77,3 @@ def read_from_disk(self) -> None:
# app.logger.debug(f"loaded: {self.statistics_dictionary}")
else:
logger.debug("no json on disk")
app.logger.debug("no json on disk")
11 changes: 11 additions & 0 deletions src/models/file_io/url_archive_file_io.py
@@ -0,0 +1,11 @@
import logging
from typing import Any, Dict, Optional

from src.models.file_io.hash_based import HashBasedFileIo

logger = logging.getLogger(__name__)


class UrlArchiveFileIo(HashBasedFileIo):
data: Optional[Dict[str, Any]] = None
subfolder = "urls/archives/"
1 change: 1 addition & 0 deletions src/models/file_io/url_file_io.py
@@ -8,4 +8,5 @@

class UrlFileIo(HashBasedFileIo):
data: Optional[Dict[str, Any]] = None
flavor: str = ""
subfolder = "urls/"
12 changes: 10 additions & 2 deletions src/models/identifiers_checking/url.py
@@ -46,6 +46,7 @@ class Url(WikipediaUrl):

# iari test - deprecated, for now (2023.11.08)
status_code: int = 0
status_code_method: str = ""

# iabot status
testdeadlink_status_code: int = 0
@@ -76,10 +77,17 @@ class Url(WikipediaUrl):
# def __check_soft404__(self):
# raise NotImplementedError()

def check(self):
def check(self, method):
from src import app

if self.url:
self.extract()
# self.__check_url__() # omit native IARI checking for now - just ise IABot's
# self.__check_url__() # deprecated - omit native IARI checking - just using IABot's testdeadlink for now

self.status_code_method = method
app.logger.debug(f"checking url with method {method}")

# TODO we must respect the "method" parameter here when checking URL status
self.__check_url_with_testdeadlink_api__()
# self.__check_url_archive_with_iabot_api__()
self.__detect_language__()
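
The TODO above notes that the new method argument is recorded but not yet acted on. A hedged sketch of what honouring it could look like inside check(); the "iari" string and the dispatch itself are assumptions, while the helpers called are the ones appearing in this diff.

```python
# Sketch of the method body only, not the committed implementation.
def check(self, method):
    if self.url:
        self.extract()
        self.status_code_method = method
        if method == "iari":                            # hypothetical method value
            self.__check_url__()                        # native IARI check, currently deprecated
        else:
            self.__check_url_with_testdeadlink_api__()  # IABot's testdeadlink, the current default
        self.__detect_language__()
```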
3 changes: 3 additions & 0 deletions src/models/wikimedia/wikipedia/reference/extractor.py
@@ -229,6 +229,9 @@ def __extract_sections__(self) -> None:
levels=[2],
include_headings=True,
)

# TODO: make this code better by special casing no section and making faux section, and putting through same loop

if not sections:
app.logger.debug("No level 2 sections detected, creating root section")
# console.print(self.wikicode)
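
One possible reading of the TODO in this hunk, sketched below: wrap the no-heading case in a faux root section so it goes through the same loop as real level-2 sections. The get_sections call mirrors the one shown above; the per-section helper name is an assumption.

```python
# Illustrative only; the loop body and helper name are assumptions.
sections = self.wikicode.get_sections(levels=[2], include_headings=True)
if not sections:
    # Treat the whole page as a single faux root section so the
    # no-heading case is handled exactly like any other section.
    sections = [self.wikicode]
for section in sections:
    self.__extract_references_from_section__(section)  # hypothetical per-section handler
```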
5 changes: 5 additions & 0 deletions src/models/wikimedia/wikipedia/reference/generic.py
@@ -221,22 +221,27 @@ def __extract_external_wikicoded_links_from_the_reference__(self) -> None:
def __extract_reference_urls__(self) -> None:
"""We support both URLs in templates and outside aka bare URLs"""
urls_list = []

if not self.template_urls:
self.__extract_template_urls__()
if self.template_urls:
urls_list.extend(self.template_urls)

if not self.bare_urls:
self.__extract_bare_urls_outside_templates__()
if self.bare_urls:
urls_list.extend(self.bare_urls)

if not self.wikicoded_links:
self.__extract_external_wikicoded_links_from_the_reference__()
if self.wikicoded_links:
urls_list.extend(self.wikicoded_links)

# if not self.comment_urls:
# self.__extract_urls_from_comments__()
# urls_list.extend(self.comment_urls)
# We set it to avoid duplicates

self.reference_urls = list(set(urls_list))

def __extract_unique_first_level_domains__(self) -> None:
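
A small aside on the final dedup step in this hunk: list(set(...)) drops duplicates but makes no ordering guarantee. A quick comparison with an order-preserving alternative, in case the extraction order of reference_urls ever matters:

```python
urls = ["https://a.example", "https://b.example", "https://a.example"]
print(list(set(urls)))            # deduplicated, arbitrary order
print(list(dict.fromkeys(urls)))  # deduplicated, original order preserved
```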
26 changes: 19 additions & 7 deletions src/views/check_url.py
@@ -38,6 +38,11 @@ def __url_hash_id__(self) -> str:
the raw upper cased URL supplied by the user"""
if not self.job:
raise MissingInformationError()

from src import app

# app.logger.debug(f"check_url::__url_hash_id: method variable is: {method}")

return hashlib.md5(f"{self.job.unquoted_url.upper()}".encode()).hexdigest()[:8]

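The hash id above is simply the first eight hex characters of the MD5 of the upper-cased, unquoted URL. A standalone worked example; the sample URL is arbitrary.

```python
import hashlib

url = "https://example.com/page"  # arbitrary sample URL
hash_id = hashlib.md5(url.upper().encode()).hexdigest()[:8]
print(hash_id)  # stable 8-character cache key for this URL
```
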
def get(self):
@@ -51,12 +56,14 @@ def get(self):
return self.__return_from_cache_or_analyze_and_return__()

def __setup_io__(self):
self.io = UrlFileIo(hash_based_id=self.__url_hash_id__)
self.io = UrlFileIo(hash_based_id=self.__url_hash_id__, flavor=self.job.method)

def __return_from_cache_or_analyze_and_return__(self):
from src import app

app.logger.debug("__handle_valid_job__; running")
app.logger.debug(
f"check_url::__return_from_cache_or_analyze_and_return__: method is {self.job.method}"
)

if not self.job.refresh:
self.__setup_and_read_from_cache__()
@@ -71,21 +78,24 @@ def __return_fresh_data__(self):
from src import app

url_string = self.job.unquoted_url
app.logger.info(f"Got {url_string}")
app.logger.info(f"__return_fresh_data__: Got {url_string}")
url = Url(url=url_string, timeout=self.job.timeout)

url.check()
url.check(self.job.method)

data = url.get_dict

timestamp = datetime.timestamp(datetime.utcnow())
data["timestamp"] = int(timestamp)
isodate = datetime.isoformat(datetime.utcnow())
data["isodate"] = str(isodate)
url_hash_id = self.__url_hash_id__

data["timestamp"] = int(timestamp)
data["isodate"] = str(isodate)
data["id"] = url_hash_id

data_without_text = deepcopy(data)
del data_without_text["text"]

self.__write_to_cache__(data_without_text=data_without_text)
if self.job.refresh:
self.__print_log_message_about_refresh__()
@@ -101,6 +111,8 @@ def __write_to_cache__(self, data_without_text):
# We skip writes during testing
if not self.job.testing:
write = UrlFileIo(
data=data_without_text, hash_based_id=data_without_text["id"]
data=data_without_text,
hash_based_id=data_without_text["id"],
flavor=self.job.method,
)
write.write_to_disk()
29 changes: 26 additions & 3 deletions src/views/check_url_archive.py
@@ -8,7 +8,7 @@
from src.models.api.job.check_url_archive_job import UrlArchiveJob
from src.models.api.schema.check_url_archive_schema import UrlArchiveSchema
from src.models.exceptions import MissingInformationError
from src.models.file_io.url_file_io import UrlFileIo
from src.models.file_io.url_archive_file_io import UrlArchiveFileIo
from src.models.identifiers_checking.url_archive import UrlArchive
from src.views.statistics.write_view import StatisticsWriteView

@@ -51,7 +51,7 @@ def get(self):
return self.__return_from_cache_or_analyze_and_return__()

def __setup_io__(self):
self.io = UrlFileIo(hash_based_id=self.__url_hash_id__)
self.io = UrlArchiveFileIo(hash_based_id=self.__url_hash_id__)

def __return_from_cache_or_analyze_and_return__(self):
from src import app
@@ -60,7 +60,16 @@ def __return_from_cache_or_analyze_and_return__(self):

# always return fresh data for url_archive, for now...
# in essence, the iabot database really IS the cache...
return self.__return_fresh_data__()
### return self.__return_fresh_data__()

if not self.job.refresh:
self.__setup_and_read_from_cache__()
if self.io.data:
return self.io.data, 200
else: # no cached data found - pull from live data (and save)
return self.__return_fresh_data__()
else:
return self.__return_fresh_data__()

def __return_fresh_data__(self):
from src import app
@@ -80,4 +89,18 @@ def __return_fresh_data__(self):
data["isodate"] = str(isodate)
data["id"] = url_hash_id

self.__write_to_cache__(data=data)

if self.job.refresh:
self.__print_log_message_about_refresh__()
data["refreshed_now"] = True
else:
data["refreshed_now"] = False

return data, 200

def __write_to_cache__(self, data):
# We skip writes during testing
if not self.job.testing:
write = UrlArchiveFileIo(data=data, hash_based_id=data["id"])
write.write_to_disk()
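
Putting this file's pieces together, the new cache behaviour reads roughly as below. This is a condensed paraphrase, not the real view class: the hash id is assumed to mirror the one in check_url.py, fetch_fresh_archive_status stands in for UrlArchive, and read_from_disk populating .data is inferred from the __setup_and_read_from_cache__ usage above.

```python
import hashlib
from src.models.file_io.url_archive_file_io import UrlArchiveFileIo


def fetch_fresh_archive_status(url: str) -> dict:
    """Hypothetical stand-in for UrlArchive(url=url).check() + get_dict."""
    return {"url": url}


def handle_archive_request(unquoted_url: str, refresh: bool, testing: bool):
    hash_id = hashlib.md5(unquoted_url.upper().encode()).hexdigest()[:8]
    io = UrlArchiveFileIo(hash_based_id=hash_id)
    if not refresh:
        io.read_from_disk()        # cache now lives under urls/archives/
        if io.data:
            return io.data, 200    # cache hit: skip the IABot round trip
    data = fetch_fresh_archive_status(unquoted_url)
    data["id"] = hash_id
    data["refreshed_now"] = refresh
    if not testing:                # writes are skipped during testing
        UrlArchiveFileIo(data=data, hash_based_id=hash_id).write_to_disk()
    return data, 200
```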
2 changes: 1 addition & 1 deletion src/views/statistics/__init__.py
@@ -62,7 +62,7 @@ def __print_log_message_about_refresh__(self):
from src import app

if self.job.refresh:
app.logger.info("got force refresh from patron")
app.logger.info("force refresh from patron request")

# def __write_to_disk__(self):
# raise NotImplementedError()
