
Commit

Merge branch 'multifetch' into main
mojomonger committed Aug 26, 2024
2 parents 936bde9 + 393b39f commit b2ea1a9
Showing 21 changed files with 1,013 additions and 115 deletions.
13 changes: 8 additions & 5 deletions src/__init__.py
@@ -17,7 +17,7 @@
import config
from src.models.exceptions import MissingInformationError, WikipediaApiFetchError

# old stuff...
# legacy endpoints stuff...
from src.views.check_doi import CheckDoi
from src.views.check_url import CheckUrl
from src.views.check_url_archive import CheckUrlArchive
@@ -36,6 +36,8 @@
from src.views.v2.article_cache_view_v2 import ArticleCacheV2
# new stuff jun 2024
from src.views.v2.editref_v2 import EditRefV2
# new stuff jul 2024
from src.views.v2.fetchrefs_v2 import FetchRefsV2

logging.basicConfig(level=config.loglevel)
logger = logging.getLogger(__name__)
@@ -50,18 +52,19 @@ def add_cors_headers(response):
response.headers["Access-Control-Allow-Headers"] = "Content-Type"
return response

# Register CORS function as an after_request handler
app.after_request(add_cors_headers)


# let's see if we can distinguish which server we are on
server_name = os.getenv('FLASK_SERVER_NAME', 'Unknown Server')

# Register the function as an after_request handler
app.after_request(add_cors_headers)

# We use a prefix here to enable us to stabilize the api over time
# and bump the version when making breaking changes
api = Api(app, prefix="/v2")
api = Api(app, prefix="/v2") # NB TODO This pseudo-versioning should be addressed

# link the API views to respective endpoint urls
api.add_resource(FetchRefsV2, "/fetchrefs")
api.add_resource(EditRefV2, "/editref")

api.add_resource(ArticleV2, "/article")
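For reference, a minimal sketch of calling the newly mounted /v2/fetchrefs route follows. Only the route and the which_wiki/pages/wikitext parameters come from this commit; the host, port, HTTP method, and response shape are assumptions.

import requests

# Hypothetical request to the new endpoint; host/port and the JSON response
# shape are assumed, only the route and parameter names appear in this diff.
resp = requests.get(
    "http://localhost:5000/v2/fetchrefs",
    params={
        "which_wiki": "enwiki",
        # pipe-separated titles; the schema's @pre_load hook splits this into a list
        "pages": "Easter Island|Moon landing",
    },
    timeout=30,
)
print(resp.status_code)
print(resp.json())
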
8 changes: 4 additions & 4 deletions src/models/api/handlers/pdf.py
@@ -15,7 +15,7 @@
# type: ignore
from requests import ReadTimeout

from config import link_extraction_regex
from config import regex_url_link_extraction
from src.models.api.handlers import BaseHandler
from src.models.api.job.check_url_job import UrlJob
from src.models.api.link.pdf_link import PdfLink
@@ -214,7 +214,7 @@ def __extract_links_from_original_text__(self) -> None:
# We remove the linebreaks to avoid clipping of URLs, see https://github.com/internetarchive/iari/issues/766
# provided by chatgpt:
urls = re.findall(
link_extraction_regex,
regex_url_link_extraction,
self.text_pages[index],
)
# cleaned_urls = self.__clean_urls__(urls=urls)
@@ -233,7 +233,7 @@ def __extract_links_from_text_without_linebreaks__(self) -> None:
# We remove the linebreaks to avoid clipping of URLs, see https://github.com/internetarchive/iari/issues/766
# provided by chatgpt:
urls = re.findall(
link_extraction_regex,
regex_url_link_extraction,
self.text_pages_without_linebreaks[index],
)
# cleaned_urls = self.__clean_urls__(urls=urls)
@@ -253,7 +253,7 @@ def __extract_links_from_text_without_spaces__(self) -> None:
# provided by chatgpt:
if self.text_pages_without_spaces:
urls = re.findall(
link_extraction_regex,
regex_url_link_extraction,
self.text_pages_without_spaces[index],
)
# cleaned_urls = self.__clean_urls__(urls=urls)
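The renamed regex_url_link_extraction pattern is only referenced in this file; its definition lives in config and is not part of this commit. The sketch below uses a stand-in pattern purely to illustrate the re.findall usage.

import re

# Stand-in pattern; the real one is defined as config.regex_url_link_extraction
# and is not shown in this diff.
regex_url_link_extraction = r"https?://[^\s)>\]]+"

page_text = "See https://example.org/report.pdf and http://archive.org/details/x"
urls = re.findall(regex_url_link_extraction, page_text)
print(urls)  # ['https://example.org/report.pdf', 'http://archive.org/details/x']
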
8 changes: 6 additions & 2 deletions src/models/api/job/article_job.py
@@ -15,13 +15,17 @@ class ArticleJob(Job):
lang: str = "en"
domain: WikimediaDomain = WikimediaDomain.wikipedia
title: str = ""
revision: int = 0 # this is named just as in the MediaWiki API

page_id: int = 0
refresh: bool = False
url: str = ""

sections: str = "" # string describing which sections to parse
revision: int = 0 # this is named just as in the MediaWiki API

refresh: bool = False
dehydrate: bool = True


@property
def wari_id(self) -> str:
if not self.lang:
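A hypothetical construction of an ArticleJob with the reordered and newly added fields is sketched below; it assumes the Job base class accepts keyword arguments (pydantic-style), and the title and sections values are purely illustrative.

from src.models.api.job.article_job import ArticleJob

job = ArticleJob(
    lang="en",
    title="Easter Island",    # illustrative title
    revision=1234567,         # named just as in the MediaWiki API
    sections="references",    # string describing which sections to parse (illustrative)
    refresh=False,
    dehydrate=True,
)
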
53 changes: 53 additions & 0 deletions src/models/v2/job/fetchrefs_job_v2.py
@@ -0,0 +1,53 @@
import re
from urllib.parse import quote, unquote
from src.models.wikimedia.enums import WikimediaDomain
from src import MissingInformationError
from src.models.v2.job import JobV2
from typing import List


class FetchRefsJobV2(JobV2):
"""job that supports FetchRefsV2 endpoint"""

# using marshmallow to describe parameters

which_wiki: str = ""
pages: List[str] = []
wikitext: str = ""

wiki_domain: WikimediaDomain = WikimediaDomain.wikipedia
wiki_lang: str = ""

wiki_id: str = ""
wiki_page_title: str = ""
wiki_revision: str = ""

# @property
# def quoted_title(self):
# if not self.wiki_page_title:
# raise MissingInformationError("self.wiki_page_title is empty")
# return quote(self.wiki_page_title, safe="")


def validate_fields(self):
"""
parameter checking is done here...
at least one of "pages" or "wikitext" must be defined
"""

from src import app

# app.logger.error('fetchrefs validate_fields: Fake Error')
# raise MissingInformationError(
# f'fetchrefs validate_fields: Fake Error'
# )

if not self.wikitext:
if not self.pages:
raise MissingInformationError(
f"pages or wikitext parameter must be specified"
)
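
The validation contract can be exercised directly. The sketch below assumes FetchRefsJobV2 accepts keyword arguments for its declared fields (pydantic-style); that detail is not shown in this diff.

from src import MissingInformationError
from src.models.v2.job.fetchrefs_job_v2 import FetchRefsJobV2

job = FetchRefsJobV2(which_wiki="enwiki", pages=["Easter Island"])
job.validate_fields()  # passes: pages is non-empty

empty_job = FetchRefsJobV2(which_wiki="enwiki")
try:
    empty_job.validate_fields()
except MissingInformationError as err:
    print(err)  # pages or wikitext parameter must be specified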



54 changes: 54 additions & 0 deletions src/models/v2/schema/fetchrefs_schema_v2.py
@@ -0,0 +1,54 @@
from marshmallow import fields, pre_load, post_load

from src.models.v2.job.fetchrefs_job_v2 import FetchRefsJobV2
from src.models.v2.schema import BaseSchemaV2


class FetchRefsSchemaV2(BaseSchemaV2):
# Defines expected parameters for endpoint
# - default parameters are defined in BaseSchemaV2

which_wiki = fields.Str(default="enwiki")
pages = fields.List(fields.String(), required=False) # either pages or wikitext must be defined
wikitext = fields.Str(required=False) # if provided, overrides pages array

@pre_load
# NB: pre_load is a marshmallow directive;
def process_input(self, data, **kwargs):
"""
transform a pipe-separated pages string into a list
"""
from src import app
app.logger.debug(f"==> FetchRefsSchemaV2::(@pre_load)process_input: data:{data}")

request_method = self.context.get('request_method', None)
# if request_method:
# print(f"Request method received: {request_method}")

app.logger.debug(f"==> FetchRefsSchemaV2::(@pre_load)process_input: request_method:{request_method}")


mutable_data = dict(data) # Convert ImmutableMultiDict to a mutable dict
if 'pages' in mutable_data and isinstance(mutable_data['pages'], str):
mutable_data['pages'] = mutable_data['pages'].split('|')
return mutable_data

# noinspection PyUnusedLocal
@post_load
# NB: post_load is a marshmallow directive;
# this function is run after loading request args
# it basically pulls the request object value into a Job object
#
# **kwargs is needed here despite what the validator claims
def return_job_object(self, data, **kwargs) -> FetchRefsJobV2: # type: ignore # dead: disable
"""Return Job object"""
from src import app
app.logger.debug("==> FetchRefsSchemaV2::@post_load:return_job_object")
app.logger.debug(f"return_job_object data: {data}")

job = FetchRefsJobV2(**data)
job.validate_fields()

# NB here is where we can modify job field values before returning if we want

return job
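
The pipe-splitting behaviour of the @pre_load hook above can be illustrated outside Flask and marshmallow; the helper name and the sample page titles below are made up, but the logic mirrors process_input.

def split_pages(data: dict) -> dict:
    # mirror of the @pre_load hook: copy to a mutable dict, then split a
    # pipe-separated "pages" string into a list of titles
    mutable = dict(data)
    if "pages" in mutable and isinstance(mutable["pages"], str):
        mutable["pages"] = mutable["pages"].split("|")
    return mutable

print(split_pages({"which_wiki": "enwiki", "pages": "Easter Island|Moon landing"}))
# {'which_wiki': 'enwiki', 'pages': ['Easter Island', 'Moon landing']}
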
63 changes: 48 additions & 15 deletions src/models/v2/wikimedia/wikipedia/article_v2.py
@@ -1,8 +1,6 @@
import logging
import re
import pprint
# import urllib
from urllib.parse import quote, unquote
from urllib.parse import unquote

from datetime import datetime
from typing import Any, Dict, List, Optional
@@ -69,6 +67,7 @@ class WikipediaArticleV2(IariBaseModel):

error_items: List[Any] = []

# required pydantic class
class Config: # dead: disable
arbitrary_types_allowed = True # dead: disable
extra = "forbid" # dead: disable
@@ -129,7 +128,7 @@ def fetch_and_parse(self):
"""
from src import app

app.logger.debug("ArticleV2::fetch_and_parse")
app.logger.debug("==> ArticleV2::fetch_and_parse")
app.logger.info("Fetching article data and parsing")

if not self.wikitext:
@@ -138,28 +137,60 @@ def fetch_and_parse(self):
self.__fetch_wikitext__()

if self.is_redirect:
logger.debug(
logger.error(
"Skipped extraction and parsing because the article is a redirect"
)
raise WikipediaApiFetchError("wiki article is a redirect")
raise WikipediaApiFetchError("Wiki article is a redirect")
# TODO Might want to change this from raising an exception,
# but we do want to stop further processing,
# so we need some way of indicating that to the caller

if not self.found_in_wikipedia:
logger.debug(
"Skipped extraction and parsing because the article was not found"
logger.error(
"Skipped extraction and parsing because the article was not found in wiki"
)
raise WikipediaApiFetchError("wiki article not found in wiki")
raise WikipediaApiFetchError(f"Article {self.job.quoted_title} not found in wiki")

if not self.wikitext:
raise WikipediaApiFetchError("wikitext is empty")


# wikitext extraction

app.logger.debug("==> ArticleV2::fetch_and_parse: extracting from wikitext")

# elif not self.is_redirect and self.found_in_wikipedia:
if not self.is_redirect and self.found_in_wikipedia:

if not self.wikitext:
raise MissingInformationError("WikipediaReferenceExtractorV2::fetch_and_parse: self.wikitext is empty")

self.extractor = WikipediaReferenceExtractorV2(
wikitext=self.wikitext,
html_source=self.html_markup,
job=self.job,
)

app.logger.debug("==> ArticleV2::fetch_and_parse: extracting all refs")
self.extractor.extract_all_references()

app.logger.debug("==> ArticleV2::fetch_and_parse: fetching ores scores")
self.__get_ores_scores__()
# self.__generate_hash__()


app.logger.debug("==> ArticleV2::fetch_and_parse: extracting from html")

# html extraction
if not self.html_markup:
self.__fetch_html__()

# extract references from html point-of-view
self.__extract_footnote_references__()
self.__extract_section_references__()
self.__extract_urls_from_references__()

self.__get_ores_scores__() # fills ores_quality_prediction and ores_details
# self.__get_ores_scores__() # fills ores_quality_prediction and ores_details

def __extract_urls_from_references__(self):
# traverse references, adding urls to self.urlDict,
@@ -196,8 +227,8 @@ def __extract_footnote_references__(self):
regex_extract_ref_name = r"#cite_note-(.*?)-\d+$"

soup = BeautifulSoup(self.html_markup, "html.parser")
# for link in soup.find_all("a"):
# print(link.get("href"))
# for link in soup.find_all("a"):
# print(link.get("href"))


references_wrapper = soup.find("div", class_="mw-references-wrap")
@@ -247,7 +278,7 @@ def __extract_footnote_references__(self):
if span_ref:
# span_ref contains citation markup and possible template data

app.logger.debug(f"Checking <link> data...")
# ### app.logger.debug(f"Checking <link> data...")

# fetch "template" data from link[data-mw] attribute
link_refs = span_ref.find_all("link")
@@ -293,7 +324,9 @@ def __extract_footnote_references__(self):
# TODO What is held in these elements, specifically? is it books?
span_refs = span_ref.find_all("span", class_="Z3988")
for span_ref in span_refs:
app.logger.debug(f"found span.Z3988...")

# app.logger.debug(f"found span.Z3988...")

span_data = span_ref.get("title")
if span_data:
span_template = self.__parse_span_template__(span_data)
@@ -489,7 +522,7 @@ def __parse_span_template__(self, span_data) -> Dict[str, Any] or None:

span_list = span_data.split("&")

app.logger.debug(f"SPAN DATA (parsed):")
# app.logger.debug(f"SPAN DATA (parsed):")

span_template = []
# print this string out
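The footnote-name pattern used in __extract_footnote_references__ (regex_extract_ref_name) can be sanity-checked in isolation; the sample href below is illustrative, not taken from a real article.

import re

regex_extract_ref_name = r"#cite_note-(.*?)-\d+$"

href = "#cite_note-Smith2020-3"
match = re.search(regex_extract_ref_name, href)
print(match.group(1) if match else None)  # Smith2020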

0 comments on commit b2ea1a9
