implemented check_url_archive caching for archive status
mojomonger committed Nov 29, 2023
1 parent 7dc79f3 commit 491117e
Showing 10 changed files with 78 additions and 16 deletions.
2 changes: 1 addition & 1 deletion src/models/api/schema/check_url_schema.py
@@ -14,7 +14,7 @@ class UrlSchema(BaseSchema):

url = String(required=True)
timeout = Int(required=False)
method = String(required=False) # dead: disable
method = String(required=False)
debug = Bool(required=False)
blocks = Bool(required=False)
xml = Bool(required=False)
3 changes: 1 addition & 2 deletions src/models/file_io/__init__.py
@@ -34,7 +34,7 @@ def path_filename(self) -> str:
path_filename = (
f"{config.subdirectory_for_json}{self.subfolder}{self.filename}"
)
app.logger.debug(f"using path: {path_filename}")
app.logger.debug(f"using path: {path_filename} (subfolder: {self.subfolder})")
return path_filename

def write_to_disk(
@@ -77,4 +77,3 @@ def read_from_disk(self) -> None:
# app.logger.debug(f"loaded: {self.statistics_dictionary}")
else:
logger.debug("no json on disk")
app.logger.debug("no json on disk")
11 changes: 11 additions & 0 deletions src/models/file_io/url_archive_file_io.py
@@ -0,0 +1,11 @@
import logging
from typing import Any, Dict, Optional

from src.models.file_io.hash_based import HashBasedFileIo

logger = logging.getLogger(__name__)


class UrlArchiveFileIo(HashBasedFileIo):
data: Optional[Dict[str, Any]] = None
subfolder = "urls/archives/"
1 change: 1 addition & 0 deletions src/models/file_io/url_file_io.py
@@ -8,4 +8,5 @@

class UrlFileIo(HashBasedFileIo):
data: Optional[Dict[str, Any]] = None
flavor: str = ""
subfolder = "urls/"
12 changes: 10 additions & 2 deletions src/models/identifiers_checking/url.py
@@ -46,6 +46,7 @@ class Url(WikipediaUrl):

# iari test - deprecated, for now (2023.11.08)
status_code: int = 0
status_code_method: str = ""

# iabot status
testdeadlink_status_code: int = 0
@@ -76,10 +77,17 @@ class Url(WikipediaUrl):
# def __check_soft404__(self):
# raise NotImplementedError()

def check(self):
def check(self, method):
from src import app

if self.url:
self.extract()
# self.__check_url__() # omit native IARI checking for now - just ise IABot's
# self.__check_url__() # deprecated - omit native IARI checking - just using IABot's testdeadlink for now

self.status_code_method = method
app.logger.debug(f"checking url with method {method}")

# TODO we must respect the "method" parameter here when checking URL status
self.__check_url_with_testdeadlink_api__()
# self.__check_url_archive_with_iabot_api__()
self.__detect_language__()
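
The TODO above notes that the new method argument is recorded but not yet acted on. A hedged sketch of what honouring it could look like inside check(); the "iari" string and the dispatch itself are assumptions, while the helpers called are the ones appearing in this diff.

```python
# Sketch of the method body only, not the committed implementation.
def check(self, method):
    if self.url:
        self.extract()
        self.status_code_method = method
        if method == "iari":                            # hypothetical method value
            self.__check_url__()                        # native IARI check, currently deprecated
        else:
            self.__check_url_with_testdeadlink_api__()  # IABot's testdeadlink, the current default
        self.__detect_language__()
```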
3 changes: 3 additions & 0 deletions src/models/wikimedia/wikipedia/reference/extractor.py
@@ -229,6 +229,9 @@ def __extract_sections__(self) -> None:
levels=[2],
include_headings=True,
)

# TODO: make this code better by special casing no section and making faux section, and putting through same loop

if not sections:
app.logger.debug("No level 2 sections detected, creating root section")
# console.print(self.wikicode)
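
One possible reading of the TODO in this hunk, sketched below: wrap the no-heading case in a faux root section so it goes through the same loop as real level-2 sections. The get_sections call mirrors the one shown above; the per-section helper name is an assumption.

```python
# Illustrative only; the loop body and helper name are assumptions.
sections = self.wikicode.get_sections(levels=[2], include_headings=True)
if not sections:
    # Treat the whole page as a single faux root section so the
    # no-heading case is handled exactly like any other section.
    sections = [self.wikicode]
for section in sections:
    self.__extract_references_from_section__(section)  # hypothetical per-section handler
```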
5 changes: 5 additions & 0 deletions src/models/wikimedia/wikipedia/reference/generic.py
@@ -221,22 +221,27 @@ def __extract_external_wikicoded_links_from_the_reference__(self) -> None:
def __extract_reference_urls__(self) -> None:
"""We support both URLs in templates and outside aka bare URLs"""
urls_list = []

if not self.template_urls:
self.__extract_template_urls__()
if self.template_urls:
urls_list.extend(self.template_urls)

if not self.bare_urls:
self.__extract_bare_urls_outside_templates__()
if self.bare_urls:
urls_list.extend(self.bare_urls)

if not self.wikicoded_links:
self.__extract_external_wikicoded_links_from_the_reference__()
if self.wikicoded_links:
urls_list.extend(self.wikicoded_links)

# if not self.comment_urls:
# self.__extract_urls_from_comments__()
# urls_list.extend(self.comment_urls)
# We set it to avoid duplicates

self.reference_urls = list(set(urls_list))

def __extract_unique_first_level_domains__(self) -> None:
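
A small aside on the final dedup step in this hunk: list(set(...)) drops duplicates but makes no ordering guarantee. A quick comparison with an order-preserving alternative, in case the extraction order of reference_urls ever matters:

```python
urls = ["https://a.example", "https://b.example", "https://a.example"]
print(list(set(urls)))            # deduplicated, arbitrary order
print(list(dict.fromkeys(urls)))  # deduplicated, original order preserved
```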
26 changes: 19 additions & 7 deletions src/views/check_url.py
@@ -38,6 +38,11 @@ def __url_hash_id__(self) -> str:
the raw upper cased URL supplied by the user"""
if not self.job:
raise MissingInformationError()

from src import app

# app.logger.debug(f"check_url::__url_hash_id: method variable is: {method}")

return hashlib.md5(f"{self.job.unquoted_url.upper()}".encode()).hexdigest()[:8]

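The hash id above is simply the first eight hex characters of the MD5 of the upper-cased, unquoted URL. A standalone worked example; the sample URL is arbitrary.

```python
import hashlib

url = "https://example.com/page"  # arbitrary sample URL
hash_id = hashlib.md5(url.upper().encode()).hexdigest()[:8]
print(hash_id)  # stable 8-character cache key for this URL
```
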
def get(self):
@@ -51,12 +56,14 @@ def get(self):
return self.__return_from_cache_or_analyze_and_return__()

def __setup_io__(self):
self.io = UrlFileIo(hash_based_id=self.__url_hash_id__)
self.io = UrlFileIo(hash_based_id=self.__url_hash_id__, flavor=self.job.method)

def __return_from_cache_or_analyze_and_return__(self):
from src import app

app.logger.debug("__handle_valid_job__; running")
app.logger.debug(
f"check_url::__return_from_cache_or_analyze_and_return__: method is {self.job.method}"
)

if not self.job.refresh:
self.__setup_and_read_from_cache__()
@@ -71,21 +78,24 @@ def __return_fresh_data__(self):
from src import app

url_string = self.job.unquoted_url
app.logger.info(f"Got {url_string}")
app.logger.info(f"__return_fresh_data__: Got {url_string}")
url = Url(url=url_string, timeout=self.job.timeout)

url.check()
url.check(self.job.method)

data = url.get_dict

timestamp = datetime.timestamp(datetime.utcnow())
data["timestamp"] = int(timestamp)
isodate = datetime.isoformat(datetime.utcnow())
data["isodate"] = str(isodate)
url_hash_id = self.__url_hash_id__

data["timestamp"] = int(timestamp)
data["isodate"] = str(isodate)
data["id"] = url_hash_id

data_without_text = deepcopy(data)
del data_without_text["text"]

self.__write_to_cache__(data_without_text=data_without_text)
if self.job.refresh:
self.__print_log_message_about_refresh__()
@@ -101,6 +111,8 @@ def __write_to_cache__(self, data_without_text):
# We skip writes during testing
if not self.job.testing:
write = UrlFileIo(
data=data_without_text, hash_based_id=data_without_text["id"]
data=data_without_text,
hash_based_id=data_without_text["id"],
flavor=self.job.method,
)
write.write_to_disk()
29 changes: 26 additions & 3 deletions src/views/check_url_archive.py
@@ -8,7 +8,7 @@
from src.models.api.job.check_url_archive_job import UrlArchiveJob
from src.models.api.schema.check_url_archive_schema import UrlArchiveSchema
from src.models.exceptions import MissingInformationError
from src.models.file_io.url_file_io import UrlFileIo
from src.models.file_io.url_archive_file_io import UrlArchiveFileIo
from src.models.identifiers_checking.url_archive import UrlArchive
from src.views.statistics.write_view import StatisticsWriteView

@@ -51,7 +51,7 @@ def get(self):
return self.__return_from_cache_or_analyze_and_return__()

def __setup_io__(self):
self.io = UrlFileIo(hash_based_id=self.__url_hash_id__)
self.io = UrlArchiveFileIo(hash_based_id=self.__url_hash_id__)

def __return_from_cache_or_analyze_and_return__(self):
from src import app
@@ -60,7 +60,16 @@ def __return_from_cache_or_analyze_and_return__(self):

# always return fresh data for url_archive, for now...
# in essence, the iabot database really IS the cache...
return self.__return_fresh_data__()
### return self.__return_fresh_data__()

if not self.job.refresh:
self.__setup_and_read_from_cache__()
if self.io.data:
return self.io.data, 200
else: # no cached data found - pull from live data (and save)
return self.__return_fresh_data__()
else:
return self.__return_fresh_data__()

def __return_fresh_data__(self):
from src import app
@@ -80,4 +89,18 @@ def __return_fresh_data__(self):
data["isodate"] = str(isodate)
data["id"] = url_hash_id

self.__write_to_cache__(data=data)

if self.job.refresh:
self.__print_log_message_about_refresh__()
data["refreshed_now"] = True
else:
data["refreshed_now"] = False

return data, 200

def __write_to_cache__(self, data):
# We skip writes during testing
if not self.job.testing:
write = UrlArchiveFileIo(data=data, hash_based_id=data["id"])
write.write_to_disk()
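
Putting this file's pieces together, the new cache behaviour reads roughly as below. This is a condensed paraphrase, not the real view class: the hash id is assumed to mirror the one in check_url.py, fetch_fresh_archive_status stands in for UrlArchive, and read_from_disk populating .data is inferred from the __setup_and_read_from_cache__ usage above.

```python
import hashlib
from src.models.file_io.url_archive_file_io import UrlArchiveFileIo


def fetch_fresh_archive_status(url: str) -> dict:
    """Hypothetical stand-in for UrlArchive(url=url).check() + get_dict."""
    return {"url": url}


def handle_archive_request(unquoted_url: str, refresh: bool, testing: bool):
    hash_id = hashlib.md5(unquoted_url.upper().encode()).hexdigest()[:8]
    io = UrlArchiveFileIo(hash_based_id=hash_id)
    if not refresh:
        io.read_from_disk()        # cache now lives under urls/archives/
        if io.data:
            return io.data, 200    # cache hit: skip the IABot round trip
    data = fetch_fresh_archive_status(unquoted_url)
    data["id"] = hash_id
    data["refreshed_now"] = refresh
    if not testing:                # writes are skipped during testing
        UrlArchiveFileIo(data=data, hash_based_id=hash_id).write_to_disk()
    return data, 200
```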
2 changes: 1 addition & 1 deletion src/views/statistics/__init__.py
@@ -62,7 +62,7 @@ def __print_log_message_about_refresh__(self):
from src import app

if self.job.refresh:
app.logger.info("got force refresh from patron")
app.logger.info("force refresh from patron request")

# def __write_to_disk__(self):
# raise NotImplementedError()
