Merge pull request #917 from internetarchive/check_url_archive
Functionality is sufficient to merge with the main branch.
The check-url-archive endpoint was added.
mojomonger authored Dec 3, 2023
2 parents 1bcd7ba + 10d9335 commit abafbad
Showing 30 changed files with 686 additions and 430 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -12,4 +12,5 @@ __pycache__/
.coverage
.idea/
venv/
_notes/
_notes/
.env
4 changes: 2 additions & 2 deletions Dockerfile
@@ -15,7 +15,7 @@ RUN poetry install -v --no-interaction --no-ansi
COPY . ./

# Setup all the needed directories
RUN mkdir -p /tmp/wikicitations-api json/articles json/references json/dois json/urls json/xhtmls json/pdfs
RUN mkdir -p /tmp/wikicitations-api json/articles json/references json/dois json/urls json/urls/archives json/xhtmls json/pdfs

#CMD ["./debug_app.py"]
CMD ["gunicorn","-w", "4", "--bind", ":5000", "--timeout", "1500", "wsgi:app"]
CMD ["gunicorn","-w", "9", "--bind", ":5000", "--timeout", "1500", "wsgi:app"]
3 changes: 1 addition & 2 deletions pyproject.toml
@@ -1,7 +1,6 @@
[tool.poetry]
name = "Internet Archive Reference Inventory (IARI)"
#name = "IARI"
version = "4.1.4"
version = "4.2.0"
description = "API capable of fetching, extracting, transforming and storing reference information from Wikipedia articles, websites and PDFs as structured data."
authors = ["Dennis Priskorn <[email protected]>"]
license = "GPLv3+"
2 changes: 2 additions & 0 deletions src/__init__.py
@@ -15,6 +15,7 @@
import config
from src.views.check_doi import CheckDoi
from src.views.check_url import CheckUrl
from src.views.check_url_archive import CheckUrlArchive
from src.views.check_urls import CheckUrls
from src.views.statistics.all import All
from src.views.statistics.article import Article
@@ -37,6 +38,7 @@
api.add_resource(Version, "/version")
api.add_resource(CheckUrls, "/check-urls")
api.add_resource(CheckUrl, "/check-url")
api.add_resource(CheckUrlArchive, "/check-url-archive")
api.add_resource(CheckDoi, "/check-doi")
api.add_resource(Article, "/statistics/article")
api.add_resource(All, "/statistics/all")
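
For illustration (not part of this commit), the new route can be exercised like any other IARI GET endpoint. This is a minimal sketch assuming a local instance on port 5000 (the port bound in the Dockerfile CMD) and only the required url parameter defined by UrlArchiveSchema; the shape of the JSON response is not shown in this diff.

# Sketch: query the new check-url-archive endpoint of a local IARI instance.
import requests

response = requests.get(
    "http://localhost:5000/check-url-archive",
    params={"url": "https://www.example.com/"},
    timeout=30,
)
print(response.status_code)
print(response.json())  # response fields depend on the IABot lookup, not shown here
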
22 changes: 22 additions & 0 deletions src/helpers/get_version.py
@@ -0,0 +1,22 @@
# get_version.py


def get_poetry_version(file_path):
    with open(file_path) as toml_file:
        content = toml_file.read()

    poetry_start = content.find("[tool.poetry]")
    if poetry_start == -1:
        return None  # The [tool.poetry] section is not found

    version_start = content.find("version", poetry_start)
    if version_start == -1:
        return None  # The 'version' property is not found in [tool.poetry]

    version_end = content.find("\n", version_start)
    version_line = content[version_start:version_end].strip()

    # Assuming version is in the format 'version = "x.y.z"'
    version = version_line.split("=")[1].strip().strip('"')

    return version
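
As a usage sketch (not part of the commit), the helper can be pointed at the repository's own pyproject.toml; with the version bump in this pull request it should return "4.2.0".

# Hypothetical usage of the new helper; the path is relative to the repository root.
from src.helpers.get_version import get_poetry_version

version = get_poetry_version("pyproject.toml")
print(version)  # expected "4.2.0" after this pull request's version bump
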
10 changes: 5 additions & 5 deletions src/models/api/job/article_job.py
@@ -74,17 +74,17 @@ def __urldecode_url__(self):

    def __extract_url__(self):
        """This was generated with help of chatgpt using this prompt:
        I want a python re regex that extracts "en" "wikipedia.or"
        I want a python re regex that extracts "en" "wikipedia.org"
        and "Test" from http://en.wikipedia.org/wiki/Test
        """
        from src import app

        app.logger.debug("extract_url: running")
        if self.url:
            self.__urldecode_url__()
            pattern = r"https?://(\w+)\.(\w+\.\w+)/wiki/(.+)"
            wiki_url_pattern = r"https?://(\w+)\.(\w+\.\w+)/wiki/(.+)"

            matches = re.match(pattern, self.url)
            matches = re.match(wiki_url_pattern, self.url)
            if matches:
                groups = matches.groups()
                self.lang = groups[0]
@@ -104,14 +104,14 @@ def __valid_regex__(self) -> bool:
        Words separated by spaces are allowed.
        _ is not allowed anywhere"""
        underscore_pattern = re.compile(r"^[^_]*$")
        horizontal_line_regex = r"^(\s*[^\s]+\s*)+(\s*\|\s*[^\s]+\s*)*$"
        pipe_delimiter_pattern = r"^(\s*[^\s]+\s*)+(\s*\|\s*[^\s]+\s*)*$"
        if " | " in self.regex:
            return False
        if "||" in self.regex:
            return False
        if not re.fullmatch(underscore_pattern, self.regex):
            return False
        if re.fullmatch(horizontal_line_regex, self.regex):
        if re.fullmatch(pipe_delimiter_pattern, self.regex):
            # print('The string is formatted correctly.')
            return True
        else:
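
To illustrate the renamed wiki_url_pattern (a sketch, not code from the commit), the regex splits a Wikipedia article URL into language, domain and title, matching the example given in the docstring.

# Illustrative check of wiki_url_pattern; mirrors the docstring example.
import re

wiki_url_pattern = r"https?://(\w+)\.(\w+\.\w+)/wiki/(.+)"
matches = re.match(wiki_url_pattern, "http://en.wikipedia.org/wiki/Test")
if matches:
    lang, domain, title = matches.groups()
    print(lang, domain, title)  # en wikipedia.org Test
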
12 changes: 12 additions & 0 deletions src/models/api/job/check_url_archive_job.py
@@ -0,0 +1,12 @@
from urllib.parse import unquote

from src.models.api.job import Job


class UrlArchiveJob(Job):
    url: str

    @property
    def unquoted_url(self):
        """Decoded url"""
        return unquote(self.url)
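
A small sketch of how the property behaves (assuming the Job base class is a pydantic model that accepts url as a keyword, as the other job classes do):

# Hypothetical instantiation; unquoted_url simply percent-decodes the stored url.
from src.models.api.job.check_url_archive_job import UrlArchiveJob

job = UrlArchiveJob(url="https://en.wikipedia.org/wiki/Caf%C3%A9")
print(job.unquoted_url)  # https://en.wikipedia.org/wiki/Café
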
2 changes: 2 additions & 0 deletions src/models/api/job/check_url_job.py
@@ -6,6 +6,8 @@
class UrlJob(Job):
    url: str
    timeout: int = 2  # We default to 2 seconds
    method: str = "iabot"  # default to iabot check method # TODO get methods from global structure

    debug: bool = False
    blocks: bool = False
    html: bool = False
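
A minimal sketch of the new default (assuming the Job base class is a pydantic model):

# Hypothetical: method defaults to "iabot" unless the patron overrides it.
from src.models.api.job.check_url_job import UrlJob

job = UrlJob(url="https://www.example.com/", timeout=5)
print(job.method)  # "iabot"
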
26 changes: 26 additions & 0 deletions src/models/api/schema/check_url_archive_schema.py
@@ -0,0 +1,26 @@
import logging

from marshmallow import post_load
from marshmallow.fields import Bool, Int, String

from src.models.api.job.check_url_archive_job import UrlArchiveJob
from src.models.api.schema.refresh import BaseSchema

logger = logging.getLogger(__name__)


class UrlArchiveSchema(BaseSchema):
"""This validates the patron input in the get request"""

url = String(required=True)

# noinspection PyUnusedLocal
@post_load
# **kwargs is needed here despite what the validator claims
def return_object(self, data, **kwargs) -> UrlArchiveJob: # type: ignore # dead: disable
"""Return job object"""
from src import app

app.logger.debug("return_object: running")
job = UrlArchiveJob(**data)
return job
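
Because of the @post_load hook, loading validated patron input returns a job object rather than a plain dict. This is a sketch under the assumption that BaseSchema behaves like a regular marshmallow schema and adds no further required fields.

# Hypothetical use of the schema outside Flask; load() returns a UrlArchiveJob
# because return_object is decorated with @post_load.
from src.models.api.schema.check_url_archive_schema import UrlArchiveSchema

schema = UrlArchiveSchema()
job = schema.load({"url": "https://www.example.com/"})
print(type(job).__name__)  # UrlArchiveJob
print(job.url)
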
1 change: 1 addition & 0 deletions src/models/api/schema/check_url_schema.py
@@ -14,6 +14,7 @@ class UrlSchema(BaseSchema):

    url = String(required=True)
    timeout = Int(required=False)
    method = String(required=False)
    debug = Bool(required=False)
    blocks = Bool(required=False)
    xml = Bool(required=False)
28 changes: 17 additions & 11 deletions src/models/api/statistic/article.py
@@ -1,4 +1,4 @@
from typing import Any, Dict, List
from typing import Any, Dict, List, Optional

from pydantic import BaseModel, Extra

@@ -14,21 +14,27 @@ class ArticleStatistics(BaseModel):
    wari_id: str = ""
    lang: str = "en"  # language code according to Wikimedia
    page_id: int = 0  # page id of the Wikipedia in question
    dehydrated_references: List[str] = []
    references: List[str] = []
    reference_statistics: Dict[str, int] = {}
    served_from_cache: bool = False
    revision_id: int = 0
    revision_isodate: str = ""
    revision_timestamp: int = 0
    site: str = WikimediaDomain.wikipedia.value  # wikimedia site in question
    title: str = ""
    ores_score: Any = {}

    served_from_cache: bool = False
    timestamp: int = 0  # timestamp at beginning of analysis
    isodate: str = ""  # isodate (human readable) at beginning of analysis
    timing: int = 0  # time to analyze in seconds
    title: str = ""
    fld_counts: Dict[str, int] = {}

    references: List[str] = []
    reference_statistics: Dict[str, int] = {}
    dehydrated_references: List[str] = []

    cite_refs_count: int = 0
    cite_refs: Optional[List] = []

    urls: List[str] = []
    ores_score: Any = {}
    revision_id: int = 0
    revision_isodate: str = ""
    revision_timestamp: int = 0
    fld_counts: Dict[str, int] = {}

    class Config:  # dead: disable
        extra = Extra.forbid  # dead: disable
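
Since the model keeps extra = Extra.forbid, any field not declared above (including the new cite_refs_count and cite_refs) is rejected at construction time. A small sketch, assuming pydantic v1 behaviour:

# Hypothetical: unknown keys raise a ValidationError because extra fields are forbidden.
from pydantic import ValidationError
from src.models.api.statistic.article import ArticleStatistics

stats = ArticleStatistics(cite_refs_count=3)  # declared field, accepted
try:
    ArticleStatistics(unknown_field="x")
except ValidationError as error:
    print(error)
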
21 changes: 12 additions & 9 deletions src/models/api/statistic/reference.py
@@ -8,18 +8,21 @@ class ReferenceStatistic(BaseModel):
    the patron wants from the reference endpoint"""

    id: str = ""
    template_names: List[str]
    wikitext: str
    type: str  # # [general|footnote]
    ref_index: int = 0
    name: str = ""
    type: str  # [general|footnote]
    footnote_subtype: str  # [named|content]
    # identifiers: Dict[str, Any]  # {dois: [1234,12345], isbns: [1234]}
    flds: List[str] = []  # non-unique first level domain strings
    urls: List[str] = []  # non-unique url strings
    templates: List[Dict[str, Any]]
    titles: List[str] = []
    section: str = ""

    titles: List[str] = []
    template_names: List[str]
    templates: List[Dict[str, Any]]
    urls: List[str] = []  # non-unique url strings
    url_objects: List[Dict[str, Any]]
    name: str = ""
    flds: List[str] = []  # non-unique first level domain strings

    wikitext: str
    # identifiers: Dict[str, Any]  # {dois: [1234,12345], isbns: [1234]}

    class Config:  # dead: disable
        extra = "forbid"  # dead: disable
3 changes: 1 addition & 2 deletions src/models/file_io/__init__.py
@@ -34,7 +34,7 @@ def path_filename(self) -> str:
        path_filename = (
            f"{config.subdirectory_for_json}{self.subfolder}{self.filename}"
        )
        app.logger.debug(f"using path: {path_filename}")
        app.logger.debug(f"using path: {path_filename} (subfolder: {self.subfolder})")
        return path_filename

    def write_to_disk(
@@ -77,4 +77,3 @@ def read_from_disk(self) -> None:
            # app.logger.debug(f"loaded: {self.statistics_dictionary}")
        else:
            logger.debug("no json on disk")
            app.logger.debug("no json on disk")
11 changes: 11 additions & 0 deletions src/models/file_io/url_archive_file_io.py
@@ -0,0 +1,11 @@
import logging
from typing import Any, Dict, Optional

from src.models.file_io.hash_based import HashBasedFileIo

logger = logging.getLogger(__name__)


class UrlArchiveFileIo(HashBasedFileIo):
    data: Optional[Dict[str, Any]] = None
    subfolder = "urls/archives/"
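
Taken together with the path_filename property shown above, the new subfolder means archive results land beside the existing url cache. A sketch for illustration only, assuming config.subdirectory_for_json is "json/" (consistent with the json/urls/archives directory added to the Dockerfile) and a hypothetical hash-based filename:

# Illustration of how the subfolder feeds into path_filename.
subdirectory_for_json = "json/"      # assumed value of config.subdirectory_for_json
subfolder = "urls/archives/"
filename = "0a1b2c3d4e5f.json"       # hypothetical hash-based filename
path_filename = f"{subdirectory_for_json}{subfolder}{filename}"
print(path_filename)  # json/urls/archives/0a1b2c3d4e5f.json
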
1 change: 1 addition & 0 deletions src/models/file_io/url_file_io.py
@@ -8,4 +8,5 @@

class UrlFileIo(HashBasedFileIo):
    data: Optional[Dict[str, Any]] = None
    flavor: str = ""
    subfolder = "urls/"
75 changes: 61 additions & 14 deletions src/models/identifiers_checking/url.py
@@ -1,5 +1,6 @@
import logging
import os
import urllib.parse
from typing import Any, Dict, Optional

import requests
@@ -34,9 +35,7 @@

class Url(WikipediaUrl):
"""
This handles checking a URL
Our patrons want to know if this URL is likely to lead to the content that is referenced.
This handles checking a URL for it's http status
We define a malformed URL as any URL that the reader cannot easily
click and successfully get the contents of in a normal web browser session
@@ -45,32 +44,52 @@ class Url(WikipediaUrl):
    and do not offer turning them off for now.
    """

    request_error: bool = False
    request_error_details: str = ""
    dns_record_found: bool = False
    dns_no_answer: bool = False
    dns_error: bool = False
    # soft404_probability: float = 0.0  # not implemented yet
    # iari test - deprecated, for now (2023.11.08)
    status_code: int = 0
    status_code_method: str = ""

    # iabot status
    testdeadlink_status_code: int = 0
    testdeadlink_error_details: str = ""
    timeout: int = 2
    dns_error_details: str = ""
    response_headers: Optional[Dict] = None

    # IABot Archive information (from internal iabot database)
    # iabot_results: Optional[Dict] = None

    text: str = ""
    response_headers: Optional[Dict] = None

    detected_language: str = ""
    detected_language_error: bool = False
    detected_language_error_details: str = ""

    request_error: bool = False
    request_error_details: str = ""
    timeout: int = 2

    dns_record_found: bool = False
    dns_no_answer: bool = False
    dns_error: bool = False
    dns_error_details: str = ""

    # soft404_probability: float = 0.0  # not implemented yet

    # @property
    # def __check_soft404__(self):
    #     raise NotImplementedError()

    def check(self):
    def check(self, method):
        from src import app

        if self.url:
            self.extract()
            self.__check_url__()
            # self.__check_url__()  # deprecated - omit native IARI checking - just using IABot's testdeadlink for now

            self.status_code_method = method
            app.logger.debug(f"checking url with method {method}")

            # TODO me must respect "method" parameter here to check URL status
            self.__check_url_with_testdeadlink_api__()
            # self.__check_url_archive_with_iabot_api__()
            self.__detect_language__()

    def __get_dns_record__(self) -> None:
@@ -113,6 +132,7 @@ def __check_with_https_verify__(self):
                allow_redirects=True,
            )
            self.status_code = r.status_code

            logger.debug(self.url + "\tStatus: " + str(r.status_code))
            self.response_headers = dict(r.headers)
            if r.status_code == 200:
@@ -292,3 +312,30 @@ def __check_url_with_testdeadlink_api__(self):
                        result
                    ]
                    break

    # def __check_url_archive_with_iabot_api__(self):
    #     """
    #     This fetches the status code, archive, and other information from the
    #     searchurldata API of IABot
    #     """
    #
    #     modified_url = urllib.parse.quote(self.url)  # url encode the url
    #
    #     headers = {
    #         "Content-Type": "application/x-www-form-urlencoded",
    #         "User-Agent": "http://en.wikipedia.org/wiki/User:GreenC via iabget.awk",
    #     }
    #     data = f"&action=searchurldata&urls={modified_url}"
    #
    #     response = requests.post(
    #         "https://iabot.wmcloud.org/api.php?wiki=enwiki",
    #         headers=headers,
    #         data=data,
    #     )
    #
    #     # if status code is 200, the request was successful
    #     if response.status_code == 200:
    #         data = response.json()
    #         print(data)
    #         # TODO handle return data or errors
    #         self.iabot_results = data
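
Finally, a sketch of how the reworked check() is now driven (not part of the commit). It assumes WikipediaUrl is a pydantic model that accepts url as a keyword; method is required by the new signature and "iabot" is the default carried by UrlJob. Note that check() performs real network requests against the testdeadlink API.

# Hypothetical driver code; the exact fields populated depend on the
# testdeadlink response, which this diff does not show.
from src.models.identifiers_checking.url import Url

url = Url(url="https://www.example.com/")
url.check(method="iabot")
print(url.status_code_method)        # "iabot"
print(url.testdeadlink_status_code)  # filled in by __check_url_with_testdeadlink_api__
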