From 491117e1ee2facb3afda535dc5300598889d5bcf Mon Sep 17 00:00:00 2001
From: mojomonger
Date: Wed, 29 Nov 2023 02:29:09 -0800
Subject: [PATCH] implemented check_url_archive caching for archive status

---
 src/models/api/schema/check_url_schema.py    |  2 +-
 src/models/file_io/__init__.py               |  3 +-
 src/models/file_io/url_archive_file_io.py    | 11 +++++++
 src/models/file_io/url_file_io.py            |  1 +
 src/models/identifiers_checking/url.py       | 12 ++++++--
 .../wikipedia/reference/extractor.py         |  3 ++
 .../wikimedia/wikipedia/reference/generic.py |  5 ++++
 src/views/check_url.py                       | 26 ++++++++++++-----
 src/views/check_url_archive.py               | 29 +++++++++++++++++--
 src/views/statistics/__init__.py             |  2 +-
 10 files changed, 78 insertions(+), 16 deletions(-)
 create mode 100644 src/models/file_io/url_archive_file_io.py

diff --git a/src/models/api/schema/check_url_schema.py b/src/models/api/schema/check_url_schema.py
index 9382c1c6..c5cc81e9 100644
--- a/src/models/api/schema/check_url_schema.py
+++ b/src/models/api/schema/check_url_schema.py
@@ -14,7 +14,7 @@ class UrlSchema(BaseSchema):
 
     url = String(required=True)
     timeout = Int(required=False)
-    method = String(required=False)  # dead: disable
+    method = String(required=False)
     debug = Bool(required=False)
     blocks = Bool(required=False)
     xml = Bool(required=False)
diff --git a/src/models/file_io/__init__.py b/src/models/file_io/__init__.py
index cbf56058..1a13b35a 100644
--- a/src/models/file_io/__init__.py
+++ b/src/models/file_io/__init__.py
@@ -34,7 +34,7 @@ def path_filename(self) -> str:
         path_filename = (
             f"{config.subdirectory_for_json}{self.subfolder}{self.filename}"
         )
-        app.logger.debug(f"using path: {path_filename}")
+        app.logger.debug(f"using path: {path_filename} (subfolder: {self.subfolder})")
         return path_filename
 
     def write_to_disk(
@@ -77,4 +77,3 @@ def read_from_disk(self) -> None:
             # app.logger.debug(f"loaded: {self.statistics_dictionary}")
         else:
             logger.debug("no json on disk")
-            app.logger.debug("no json on disk")
diff --git a/src/models/file_io/url_archive_file_io.py b/src/models/file_io/url_archive_file_io.py
new file mode 100644
index 00000000..bff5260a
--- /dev/null
+++ b/src/models/file_io/url_archive_file_io.py
@@ -0,0 +1,11 @@
+import logging
+from typing import Any, Dict, Optional
+
+from src.models.file_io.hash_based import HashBasedFileIo
+
+logger = logging.getLogger(__name__)
+
+
+class UrlArchiveFileIo(HashBasedFileIo):
+    data: Optional[Dict[str, Any]] = None
+    subfolder = "urls/archives/"
diff --git a/src/models/file_io/url_file_io.py b/src/models/file_io/url_file_io.py
index b2e02263..af424c77 100644
--- a/src/models/file_io/url_file_io.py
+++ b/src/models/file_io/url_file_io.py
@@ -8,4 +8,5 @@
 
 class UrlFileIo(HashBasedFileIo):
     data: Optional[Dict[str, Any]] = None
+    flavor: str = ""
     subfolder = "urls/"
diff --git a/src/models/identifiers_checking/url.py b/src/models/identifiers_checking/url.py
index dd2de9d8..30b90464 100644
--- a/src/models/identifiers_checking/url.py
+++ b/src/models/identifiers_checking/url.py
@@ -46,6 +46,7 @@ class Url(WikipediaUrl):
 
     # iari test - deprecated, for now (2023.11.08)
     status_code: int = 0
+    status_code_method: str = ""
 
     # iabot status
     testdeadlink_status_code: int = 0
@@ -76,10 +77,17 @@ class Url(WikipediaUrl):
     # def __check_soft404__(self):
     #     raise NotImplementedError()
 
-    def check(self):
+    def check(self, method):
+        from src import app
+
         if self.url:
             self.extract()
-            # self.__check_url__()  # omit native IARI checking for now - just ise IABot's
+            # self.__check_url__()  # deprecated - omit native IARI checking - just using IABot's testdeadlink for now
+
+            self.status_code_method = method
+            app.logger.debug(f"checking url with method {method}")
+
+            # TODO: we must respect the "method" parameter here when checking URL status
             self.__check_url_with_testdeadlink_api__()
             # self.__check_url_archive_with_iabot_api__()
             self.__detect_language__()
diff --git a/src/models/wikimedia/wikipedia/reference/extractor.py b/src/models/wikimedia/wikipedia/reference/extractor.py
index 6209df44..9a066302 100644
--- a/src/models/wikimedia/wikipedia/reference/extractor.py
+++ b/src/models/wikimedia/wikipedia/reference/extractor.py
@@ -229,6 +229,9 @@ def __extract_sections__(self) -> None:
             levels=[2],
             include_headings=True,
         )
+
+        # TODO: improve this by special-casing "no sections" into a faux section and running it through the same loop
+
         if not sections:
             app.logger.debug("No level 2 sections detected, creating root section")
             # console.print(self.wikicode)
diff --git a/src/models/wikimedia/wikipedia/reference/generic.py b/src/models/wikimedia/wikipedia/reference/generic.py
index 151dc75f..1a231b27 100644
--- a/src/models/wikimedia/wikipedia/reference/generic.py
+++ b/src/models/wikimedia/wikipedia/reference/generic.py
@@ -221,22 +221,27 @@ def __extract_external_wikicoded_links_from_the_reference__(self) -> None:
     def __extract_reference_urls__(self) -> None:
         """We support both URLs in templates and outside aka bare URLs"""
         urls_list = []
+
         if not self.template_urls:
             self.__extract_template_urls__()
         if self.template_urls:
             urls_list.extend(self.template_urls)
+
         if not self.bare_urls:
             self.__extract_bare_urls_outside_templates__()
         if self.bare_urls:
             urls_list.extend(self.bare_urls)
+
         if not self.wikicoded_links:
             self.__extract_external_wikicoded_links_from_the_reference__()
         if self.wikicoded_links:
             urls_list.extend(self.wikicoded_links)
+
         # if not self.comment_urls:
         #     self.__extract_urls_from_comments__()
         #     urls_list.extend(self.comment_urls)
         # We set it to avoid duplicates
+
         self.reference_urls = list(set(urls_list))
 
     def __extract_unique_first_level_domains__(self) -> None:
diff --git a/src/views/check_url.py b/src/views/check_url.py
index ab6e78e4..c1ee4df5 100644
--- a/src/views/check_url.py
+++ b/src/views/check_url.py
@@ -38,6 +38,11 @@ def __url_hash_id__(self) -> str:
         the raw upper cased URL supplied by the user"""
         if not self.job:
             raise MissingInformationError()
+
+        from src import app
+
+        # app.logger.debug(f"check_url::__url_hash_id: method variable is: {method}")
+
         return hashlib.md5(f"{self.job.unquoted_url.upper()}".encode()).hexdigest()[:8]
 
     def get(self):
@@ -51,12 +56,14 @@ def get(self):
         return self.__return_from_cache_or_analyze_and_return__()
 
     def __setup_io__(self):
-        self.io = UrlFileIo(hash_based_id=self.__url_hash_id__)
+        self.io = UrlFileIo(hash_based_id=self.__url_hash_id__, flavor=self.job.method)
 
     def __return_from_cache_or_analyze_and_return__(self):
         from src import app
 
-        app.logger.debug("__handle_valid_job__; running")
+        app.logger.debug(
+            f"check_url::__return_from_cache_or_analyze_and_return__: method is {self.job.method}"
+        )
 
         if not self.job.refresh:
             self.__setup_and_read_from_cache__()
@@ -71,21 +78,24 @@ def __return_fresh_data__(self):
         from src import app
 
         url_string = self.job.unquoted_url
-        app.logger.info(f"Got {url_string}")
+        app.logger.info(f"__return_fresh_data__: Got {url_string}")
 
         url = Url(url=url_string, timeout=self.job.timeout)
-        url.check()
+        url.check(self.job.method)
         data = url.get_dict
 
         timestamp = datetime.timestamp(datetime.utcnow())
-        data["timestamp"] = int(timestamp)
         isodate = datetime.isoformat(datetime.utcnow())
-        data["isodate"] = str(isodate)
         url_hash_id = self.__url_hash_id__
+
+        data["timestamp"] = int(timestamp)
+        data["isodate"] = str(isodate)
         data["id"] = url_hash_id
+
         data_without_text = deepcopy(data)
         del data_without_text["text"]
+
         self.__write_to_cache__(data_without_text=data_without_text)
 
         if self.job.refresh:
             self.__print_log_message_about_refresh__()
@@ -101,6 +111,8 @@ def __write_to_cache__(self, data_without_text):
         # We skip writes during testing
         if not self.job.testing:
             write = UrlFileIo(
-                data=data_without_text, hash_based_id=data_without_text["id"]
+                data=data_without_text,
+                hash_based_id=data_without_text["id"],
+                flavor=self.job.method,
             )
             write.write_to_disk()
diff --git a/src/views/check_url_archive.py b/src/views/check_url_archive.py
index 2ff53936..b7f30a51 100644
--- a/src/views/check_url_archive.py
+++ b/src/views/check_url_archive.py
@@ -8,7 +8,7 @@
 from src.models.api.job.check_url_archive_job import UrlArchiveJob
 from src.models.api.schema.check_url_archive_schema import UrlArchiveSchema
 from src.models.exceptions import MissingInformationError
-from src.models.file_io.url_file_io import UrlFileIo
+from src.models.file_io.url_archive_file_io import UrlArchiveFileIo
 from src.models.identifiers_checking.url_archive import UrlArchive
 from src.views.statistics.write_view import StatisticsWriteView
 
@@ -51,7 +51,7 @@ def get(self):
         return self.__return_from_cache_or_analyze_and_return__()
 
     def __setup_io__(self):
-        self.io = UrlFileIo(hash_based_id=self.__url_hash_id__)
+        self.io = UrlArchiveFileIo(hash_based_id=self.__url_hash_id__)
 
     def __return_from_cache_or_analyze_and_return__(self):
         from src import app
@@ -60,7 +60,16 @@ def __return_from_cache_or_analyze_and_return__(self):
 
         # always return fresh data for url_archive, for now...
         # in essence, the iabot database really IS the cache...
-        return self.__return_fresh_data__()
+        ### return self.__return_fresh_data__()
+
+        if not self.job.refresh:
+            self.__setup_and_read_from_cache__()
+            if self.io.data:
+                return self.io.data, 200
+            else:  # no cached data found - pull from live data (and save)
+                return self.__return_fresh_data__()
+        else:
+            return self.__return_fresh_data__()
 
     def __return_fresh_data__(self):
         from src import app
@@ -80,4 +89,18 @@ def __return_fresh_data__(self):
         data["isodate"] = str(isodate)
         data["id"] = url_hash_id
 
+        self.__write_to_cache__(data=data)
+
+        if self.job.refresh:
+            self.__print_log_message_about_refresh__()
+            data["refreshed_now"] = True
+        else:
+            data["refreshed_now"] = False
+
         return data, 200
+
+    def __write_to_cache__(self, data):
+        # We skip writes during testing
+        if not self.job.testing:
+            write = UrlArchiveFileIo(data=data, hash_based_id=data["id"])
+            write.write_to_disk()
diff --git a/src/views/statistics/__init__.py b/src/views/statistics/__init__.py
index 7bf57144..a6cfe43e 100644
--- a/src/views/statistics/__init__.py
+++ b/src/views/statistics/__init__.py
@@ -62,7 +62,7 @@ def __print_log_message_about_refresh__(self):
         from src import app
 
         if self.job.refresh:
-            app.logger.info("got force refresh from patron")
+            app.logger.info("force refresh from patron request")
 
     # def __write_to_disk__(self):
     #     raise NotImplementedError()