diff --git a/src/__init__.py b/src/__init__.py
index 6422102f..4be9c923 100644
--- a/src/__init__.py
+++ b/src/__init__.py
@@ -17,7 +17,7 @@
import config
from src.models.exceptions import MissingInformationError, WikipediaApiFetchError
-# old stuff...
+# legacy endpoints stuff...
from src.views.check_doi import CheckDoi
from src.views.check_url import CheckUrl
from src.views.check_url_archive import CheckUrlArchive
@@ -36,6 +36,8 @@
from src.views.v2.article_cache_view_v2 import ArticleCacheV2
# new stuff jun 2024
from src.views.v2.editref_v2 import EditRefV2
+# new stuff jul 2024
+from src.views.v2.fetchrefs_v2 import FetchRefsV2
logging.basicConfig(level=config.loglevel)
logger = logging.getLogger(__name__)
@@ -50,18 +52,19 @@ def add_cors_headers(response):
response.headers["Access-Control-Allow-Headers"] = "Content-Type"
return response
+# Register CORS function as an after_request handler
+app.after_request(add_cors_headers)
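+# NB: Flask runs add_cors_headers on every response, so the
+# Access-Control-Allow-* headers above are attached to all endpoints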
+
# let's see if we can distinguish which server we are on
server_name = os.getenv('FLASK_SERVER_NAME', 'Unknown Server')
-# Register the function as an after_request handler
-app.after_request(add_cors_headers)
-
# We use a prefix here to enable us to stabilize the api over time
# and bump the version when making breaking changes
-api = Api(app, prefix="/v2")
+api = Api(app, prefix="/v2") # NB TODO This pseudo-versioning should be addressed
# link the API views to respective endpoint urls
+api.add_resource(FetchRefsV2, "/fetchrefs")
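+# illustrative request (assuming the defaults in FetchRefsSchemaV2):
+#   GET /v2/fetchrefs?pages=Easter_Island|Falafel&which_wiki=enwiki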
api.add_resource(EditRefV2, "/editref")
api.add_resource(ArticleV2, "/article")
diff --git a/src/models/api/handlers/pdf.py b/src/models/api/handlers/pdf.py
index ab3cf951..6096c2ed 100644
--- a/src/models/api/handlers/pdf.py
+++ b/src/models/api/handlers/pdf.py
@@ -15,7 +15,7 @@
# type: ignore
from requests import ReadTimeout
-from config import link_extraction_regex
+from config import regex_url_link_extraction
from src.models.api.handlers import BaseHandler
from src.models.api.job.check_url_job import UrlJob
from src.models.api.link.pdf_link import PdfLink
@@ -214,7 +214,7 @@ def __extract_links_from_original_text__(self) -> None:
# We remove the linebreaks to avoid clipping of URLs, see https://github.com/internetarchive/iari/issues/766
# provided by chatgpt:
urls = re.findall(
- link_extraction_regex,
+ regex_url_link_extraction,
self.text_pages[index],
)
# cleaned_urls = self.__clean_urls__(urls=urls)
@@ -233,7 +233,7 @@ def __extract_links_from_text_without_linebreaks__(self) -> None:
# We remove the linebreaks to avoid clipping of URLs, see https://github.com/internetarchive/iari/issues/766
# provided by chatgpt:
urls = re.findall(
- link_extraction_regex,
+ regex_url_link_extraction,
self.text_pages_without_linebreaks[index],
)
# cleaned_urls = self.__clean_urls__(urls=urls)
@@ -253,7 +253,7 @@ def __extract_links_from_text_without_spaces__(self) -> None:
# provided by chatgpt:
if self.text_pages_without_spaces:
urls = re.findall(
- link_extraction_regex,
+ regex_url_link_extraction,
self.text_pages_without_spaces[index],
)
# cleaned_urls = self.__clean_urls__(urls=urls)
diff --git a/src/models/api/job/article_job.py b/src/models/api/job/article_job.py
index c43f631d..6bd8571c 100644
--- a/src/models/api/job/article_job.py
+++ b/src/models/api/job/article_job.py
@@ -15,13 +15,17 @@ class ArticleJob(Job):
lang: str = "en"
domain: WikimediaDomain = WikimediaDomain.wikipedia
title: str = ""
+ revision: int = 0 # this is named just as in the MediaWiki API
+
page_id: int = 0
- refresh: bool = False
url: str = ""
+
sections: str = "" # string describing which sections to parse
- revision: int = 0 # this is named just as in the MediaWiki API
+
+ refresh: bool = False
dehydrate: bool = True
+
@property
def wari_id(self) -> str:
if not self.lang:
diff --git a/src/models/v2/job/fetchrefs_job_v2.py b/src/models/v2/job/fetchrefs_job_v2.py
new file mode 100644
index 00000000..1d8d86fa
--- /dev/null
+++ b/src/models/v2/job/fetchrefs_job_v2.py
@@ -0,0 +1,53 @@
+import re
+from urllib.parse import quote, unquote
+from src.models.wikimedia.enums import WikimediaDomain
+from src import MissingInformationError
+from src.models.v2.job import JobV2
+from typing import List
+
+
+class FetchRefsJobV2(JobV2):
+ """job that supports FetchRefsV2 endpoint"""
+
+ # using marshmallow to describe parameters
+
+ which_wiki: str = ""
+ pages: List[str] = []
+ wikitext: str = ""
+
+ wiki_domain: WikimediaDomain = WikimediaDomain.wikipedia
+ wiki_lang: str = ""
+
+ wiki_id: str = ""
+ wiki_page_title: str = ""
+ wiki_revision: str = ""
+
+ # @property
+ # def quoted_title(self):
+ # if not self.wiki_page_title:
+ # raise MissingInformationError("self.wiki_page_title is empty")
+ # return quote(self.wiki_page_title, safe="")
+
+
+ def validate_fields(self):
+ """
+ parameter checking done here...
+
+        at least one of "pages" or "wikitext" must be defined
+ """
+
+ from src import app
+
+ # app.logger.error('fetchrefs validate_fields: Fake Error')
+ # raise MissingInformationError(
+ # f'fetchrefs validate_fields: Fake Error'
+ # )
+
+        if not self.wikitext and not self.pages:
+            raise MissingInformationError(
+                "Either the pages or the wikitext parameter must be specified"
+            )
+
+
+
diff --git a/src/models/v2/schema/fetchrefs_schema_v2.py b/src/models/v2/schema/fetchrefs_schema_v2.py
new file mode 100644
index 00000000..5dce5b4a
--- /dev/null
+++ b/src/models/v2/schema/fetchrefs_schema_v2.py
@@ -0,0 +1,54 @@
+from marshmallow import fields, pre_load, post_load
+
+from src.models.v2.job.fetchrefs_job_v2 import FetchRefsJobV2
+from src.models.v2.schema import BaseSchemaV2
+
+
+class FetchRefsSchemaV2(BaseSchemaV2):
+ # Defines expected parameters for endpoint
+ # - default parameters are defined in BaseSchemaV2
+
+ which_wiki = fields.Str(default="enwiki")
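+    # NB: depending on the marshmallow version in use, an input-side default
+    # may require load_default= (or missing=) rather than default=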
+ pages = fields.List(fields.String(), required=False) # either pages or wikitext must be defined
+ wikitext = fields.Str(required=False) # if provided, overrides pages array
+
+ @pre_load
+ # NB: pre_load is a marshmallow directive;
+ def process_input(self, data, **kwargs):
+ """
+        transform the pipe-separated "pages" parameter into a List
+ """
+ from src import app
+ app.logger.debug(f"==> FetchRefsSchemaV2::(@pre_load)process_input: data:{data}")
+
+ request_method = self.context.get('request_method', None)
+ # if request_method:
+ # print(f"Request method received: {request_method}")
+
+ app.logger.debug(f"==> FetchRefsSchemaV2::(@pre_load)process_input: request_method:{request_method}")
+
+
+ mutable_data = dict(data) # Convert ImmutableMultiDict to a mutable dict
+ if 'pages' in mutable_data and isinstance(mutable_data['pages'], str):
+ mutable_data['pages'] = mutable_data['pages'].split('|')
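+            # illustrative transformation: "Easter_Island|Falafel" -> ["Easter_Island", "Falafel"]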
+ return mutable_data
+
+ # noinspection PyUnusedLocal
+ @post_load
+ # NB: post_load is a marshmallow directive;
+ # this function is run after loading request args
+ # it basically pulls the request object value into a Job object
+ #
+ # **kwargs is needed here despite what the validator claims
+ def return_job_object(self, data, **kwargs) -> FetchRefsJobV2: # type: ignore # dead: disable
+ """Return Job object"""
+ from src import app
+ app.logger.debug("==> FetchRefsSchemaV2::@post_load:return_job_object")
+ app.logger.debug(f"return_job_object data: {data}")
+
+ job = FetchRefsJobV2(**data)
+ job.validate_fields()
+
+ # NB here is where we can modify job field values before returning if we want
+
+ return job
diff --git a/src/models/v2/wikimedia/wikipedia/article_v2.py b/src/models/v2/wikimedia/wikipedia/article_v2.py
index e2daef84..11813200 100644
--- a/src/models/v2/wikimedia/wikipedia/article_v2.py
+++ b/src/models/v2/wikimedia/wikipedia/article_v2.py
@@ -1,8 +1,6 @@
import logging
import re
-import pprint
-# import urllib
-from urllib.parse import quote, unquote
+from urllib.parse import unquote
from datetime import datetime
from typing import Any, Dict, List, Optional
@@ -69,6 +67,7 @@ class WikipediaArticleV2(IariBaseModel):
error_items: List[Any] = []
+    # pydantic model configuration class
class Config: # dead: disable
arbitrary_types_allowed = True # dead: disable
extra = "forbid" # dead: disable
@@ -129,7 +128,7 @@ def fetch_and_parse(self):
"""
from src import app
- app.logger.debug("ArticleV2::fetch_and_parse")
+ app.logger.debug("==> ArticleV2::fetch_and_parse")
app.logger.info("Fetching article data and parsing")
if not self.wikitext:
@@ -138,28 +137,60 @@ def fetch_and_parse(self):
self.__fetch_wikitext__()
if self.is_redirect:
- logger.debug(
+ logger.error(
"Skipped extraction and parsing because the article is a redirect"
)
- raise WikipediaApiFetchError("wiki article is a redirect")
+ raise WikipediaApiFetchError("Wiki article is a redirect")
+ # TODO Might want to change this from raising exception,
+ # but we do want to stop further processing,
+ # so need to have some way of indicating that to caller
if not self.found_in_wikipedia:
- logger.debug(
- "Skipped extraction and parsing because the article was not found"
+ logger.error(
+ "Skipped extraction and parsing because the article was not found in wiki"
)
- raise WikipediaApiFetchError("wiki article not found in wiki")
+ raise WikipediaApiFetchError(f"Article {self.job.quoted_title} not found in wiki")
if not self.wikitext:
raise WikipediaApiFetchError("wikitext is empty")
+
+ # wikitext extraction
+
+ app.logger.debug("==> ArticleV2::fetch_and_parse: extracting from wikitext")
+
+ # elif not self.is_redirect and self.found_in_wikipedia:
+ if not self.is_redirect and self.found_in_wikipedia:
+
+ if not self.wikitext:
+ raise MissingInformationError("WikipediaReferenceExtractorV2::fetch_and_parse: self.wikitext is empty")
+
+ self.extractor = WikipediaReferenceExtractorV2(
+ wikitext=self.wikitext,
+ html_source=self.html_markup,
+ job=self.job,
+ )
+
+ app.logger.debug("==> ArticleV2::fetch_and_parse: extracting all refs")
+ self.extractor.extract_all_references()
+
+ app.logger.debug("==> ArticleV2::fetch_and_parse: fetching ores scores")
+ self.__get_ores_scores__()
+ # self.__generate_hash__()
+
+
+ app.logger.debug("==> ArticleV2::fetch_and_parse: extracting from html")
+
+ # html extraction
if not self.html_markup:
self.__fetch_html__()
+ # extract references from html point-of-view
self.__extract_footnote_references__()
self.__extract_section_references__()
self.__extract_urls_from_references__()
- self.__get_ores_scores__() # fills ores_quality_prediction and ores_details
+ # self.__get_ores_scores__() # fills ores_quality_prediction and ores_details
def __extract_urls_from_references__(self):
# traverse references, adding urls to self.urlDict,
@@ -196,8 +227,8 @@ def __extract_footnote_references__(self):
regex_extract_ref_name = r"#cite_note-(.*?)-\d+$"
soup = BeautifulSoup(self.html_markup, "html.parser")
- # for link in soup.find_all("a"):
- # print(link.get("href"))
+ # for link in soup.find_all("a"):
+ # print(link.get("href"))
references_wrapper = soup.find("div", class_="mw-references-wrap")
@@ -247,7 +278,7 @@ def __extract_footnote_references__(self):
if span_ref:
# span_ref contains citation markup and possible template data
- app.logger.debug(f"Checking data...")
+ # ### app.logger.debug(f"Checking data...")
# fetch "template" data from link[data-mw] attribute
link_refs = span_ref.find_all("link")
@@ -293,7 +324,9 @@ def __extract_footnote_references__(self):
# TODO What is held in these elements, specifically? is it books?
span_refs = span_ref.find_all("span", class_="Z3988")
for span_ref in span_refs:
- app.logger.debug(f"found span.Z3988...")
+
+ # app.logger.debug(f"found span.Z3988...")
+
span_data = span_ref.get("title")
if span_data:
span_template = self.__parse_span_template__(span_data)
@@ -489,7 +522,7 @@ def __parse_span_template__(self, span_data) -> Dict[str, Any] or None:
span_list = span_data.split("&")
- app.logger.debug(f"SPAN DATA (parsed):")
+ # app.logger.debug(f"SPAN DATA (parsed):")
span_template = []
# print this string out
diff --git a/src/models/v2/wikimedia/wikipedia/reference/__init__.py b/src/models/v2/wikimedia/wikipedia/reference/__init__.py
index 14cb37b8..323910c5 100644
--- a/src/models/v2/wikimedia/wikipedia/reference/__init__.py
+++ b/src/models/v2/wikimedia/wikipedia/reference/__init__.py
@@ -7,7 +7,7 @@
from mwparserfromhell.nodes import Tag # type: ignore
from mwparserfromhell.wikicode import Wikicode # type: ignore
-from config import link_extraction_regex
+from config import regex_url_link_extraction
from src.models.base.job import JobBaseModel
from src.models.exceptions import MissingInformationError
from src.models.v2.wikimedia.wikipedia.reference.template import WikipediaTemplateV2
@@ -162,7 +162,7 @@ def is_footnote_reference(self):
return True
@property
- def get_wikicode_as_string(self):
+ def wikicode_as_string(self):
return str(self.wikicode)
@property
@@ -253,23 +253,26 @@ def __extract_unique_first_level_domains__(self) -> None:
"""This aggregates all first level domains from the urls found in the urls"""
from src import app
- app.logger.debug("__extract_first_level_domains__: running")
+ app.logger.debug("==> __extract_unique_first_level_domains__")
+
if not self.reference_urls:
- app.logger.info("no reference_urls found so we skip extraction")
+ app.logger.debug("no reference_urls found so we skip extraction")
+
else:
- logger.debug("found at least one url")
+ app.logger.debug("found at least one url")
first_level_domains = set()
for url in self.reference_urls:
- logger.debug("working on url")
if url.first_level_domain:
- app.logger.debug(f"found fld: {url.first_level_domain}")
+ # app.logger.debug(f"found fld: {url.first_level_domain}")
first_level_domains.add(url.first_level_domain)
else:
- app.logger.warning(f"no fld found for: {url.url}")
+ # app.logger.warning(f"no fld found for: {url.url}")
+ pass
+
# Return unique domains to avoid confusion
# https://github.com/internetarchive/iari/issues/834
self.unique_first_level_domains = list(first_level_domains)
- app.logger.debug(f"found unique flds: {self.unique_first_level_domains}")
+ # app.logger.debug(f"found unique flds: {self.unique_first_level_domains}")
# @property
# def plain_text_in_reference(self) -> bool:
@@ -292,7 +295,7 @@ def __find_bare_urls_outside_templates__(self) -> List[str]:
stripped_wikicode = str(self.wikicode.strip_code())
logger.debug(stripped_wikicode)
return re.findall(
- link_extraction_regex,
+ regex_url_link_extraction,
stripped_wikicode,
)
else:
@@ -317,7 +320,7 @@ def __extract_templates_and_parameters__(self) -> None:
from src import app
app.logger.debug(
- "__extract_templates_and_parameters_from_raw_reference__: running"
+ "==> __extract_templates_and_parameters_from_raw_reference__"
)
self.__extract_raw_templates__()
self.__extract_and_clean_template_parameters__()
@@ -326,9 +329,10 @@ def __extract_templates_and_parameters__(self) -> None:
def __extract_raw_templates__(self) -> None:
"""Extract the templates from self.wikicode"""
from src import app
+ app.logger.debug("==> __extract_raw_templates__")
self.templates = []
- app.logger.debug("__extract_raw_templates__: running")
+
if not self.wikicode:
raise MissingInformationError("self.wikicode was None")
if isinstance(self.wikicode, str):
@@ -338,7 +342,7 @@ def __extract_raw_templates__(self) -> None:
if self.is_footnote_reference and (
"" not in wikicode_string or ">" in wikicode_string
):
- logger.info(f"Skipping named reference with no content {wikicode_string}")
+ logger.info(f"Skipping empty named reference {wikicode_string}")
self.is_named_reused_reference = True
else:
logger.debug(f"Extracting templates from: {self.wikicode}")
@@ -357,7 +361,7 @@ def __extract_raw_templates__(self) -> None:
for raw_template in raw_templates:
count += 1
self.templates.append(
- WikipediaTemplate(
+ WikipediaTemplateV2(
raw_template=raw_template, language_code=self.language_code
)
)
@@ -368,7 +372,7 @@ def __extract_and_clean_template_parameters__(self) -> None:
"""We extract all templates"""
from src import app
- app.logger.debug("__extract_and_clean_template_parameters__: running")
+ app.logger.debug("==> __extract_and_clean_template_parameters__")
if self.templates:
[
template.extract_and_prepare_parameter_and_flds()
@@ -379,7 +383,7 @@ def extract_and_check(self) -> None:
"""Helper method"""
from src import app
- app.logger.debug("extract_and_check: running")
+ app.logger.debug("==> extract_and_check")
self.__parse_xhtml__()
self.__extract_xhtml_comments__()
self.__extract_templates_and_parameters__()
diff --git a/src/models/v2/wikimedia/wikipedia/reference/extractor_v2.py b/src/models/v2/wikimedia/wikipedia/reference/extractor_v2.py
index 7e9d4ce6..bc9736f6 100644
--- a/src/models/v2/wikimedia/wikipedia/reference/extractor_v2.py
+++ b/src/models/v2/wikimedia/wikipedia/reference/extractor_v2.py
@@ -34,7 +34,7 @@ class WikipediaReferenceExtractorV2(WariBaseModel):
wikitext: str
wikicode: Wikicode = None # wiki object tree parsed from wikitext
- html_source: str = "" # used to extract citeref reference data
+ html_source: Optional[str] = "" # used to extract citeref reference data
references: Optional[List[WikipediaReferenceV2]] = None
# cite_page_refs: Optional[List] = []
@@ -197,7 +197,7 @@ def extract_all_references(self):
"""Extract all references from self.wikitext"""
from src import app
- app.logger.debug("extract_all_references: running")
+ app.logger.debug("==> WikipediaReferenceExtractorV2:: extract_all_references")
if not self.job:
raise MissingInformationError("no job")
self.__parse_wikitext__()
diff --git a/src/models/v2/wikimedia/wikipedia/reference/reference_lite_v2.py b/src/models/v2/wikimedia/wikipedia/reference/reference_lite_v2.py
new file mode 100644
index 00000000..adade837
--- /dev/null
+++ b/src/models/v2/wikimedia/wikipedia/reference/reference_lite_v2.py
@@ -0,0 +1,273 @@
+import hashlib
+import logging
+import re
+from typing import Any, Dict, List, Optional, Union
+
+from bs4 import BeautifulSoup, Comment
+from mwparserfromhell.nodes import Tag # type: ignore
+from mwparserfromhell.wikicode import Wikicode # type: ignore
+
+from config import regex_url_link_extraction
+from src.models.base.job import JobBaseModel
+from src.models.exceptions import MissingInformationError
+from src.models.v2.wikimedia.wikipedia.reference.template import WikipediaTemplateV2
+from src.models.v2.wikimedia.wikipedia.url_v2 import WikipediaUrlV2
+from src.models.wikimedia.wikipedia.reference.enums import (
+ FootnoteSubtype,
+ ReferenceType,
+)
+
+logger = logging.getLogger(__name__)
+
+
+# We use marshmallow here because pydantic did not seem to support optional alias fields.
+# https://github.com/samuelcolvin/pydantic/discussions/3855
+
+
+class WikipediaReferenceLiteV2(JobBaseModel):
+ """
+ models a reference on a Wikipedia page
+ See class WikipediaReferenceV2(JobBaseModel) for what this was based on
+
+    This is very simple for now. We just have a name (of the reference, if present) and the wikitext.
+
+    We validate with pydantic when creating this object.
+ """
+
+ wikicode: Union[Tag, Wikicode] # output from mwparserfromhell
+
+ name: str = ""
+ wikitext: str = ""
+
+ # This is for pydantic
+ class Config: # dead: disable
+ arbitrary_types_allowed = True # dead: disable
+
+ @property
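+    # NB: get_name reads self.soup, which is only populated by __parse_xhtml__()
+    # via extract_and_check(); callers are expected to run that first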
+ def get_name(self) -> str:
+ if not self.soup:
+ raise MissingInformationError()
+        # Find the <ref> tag
+ ref_tag = self.soup.find("ref")
+ if ref_tag:
+ # Extract the value of the 'name' attribute
+ name = str(ref_tag.get("name")) # type: ignore # see https://github.com/python/typeshed/issues/8356
+ if name.endswith("\\"):
+ # Cut off the trailing backward slash
+ name = name[:-1]
+ if name.endswith("/"):
+ # Cut off the trailing forward slash
+ name = name[:-1]
+ if name == "None" or name is None:
+ return ""
+ else:
+ return name
+ else:
+ return ""
+
+
+ @property
+ def wikicode_as_string(self):
+ return str(self.wikicode)
+
+ def __parse_xhtml__(self):
+ self.soup = BeautifulSoup(str(self.wikicode), "lxml")
+
+ def __extract_template_urls__(self) -> None:
+ self.template_urls = []
+ urls = []
+ if self.templates:
+ for template in self.templates:
+ if template.urls:
+ urls.extend(template.urls)
+ self.template_urls = urls
+
+ def __extract_bare_urls_outside_templates__(self) -> None:
+ """This is a slightly more sophisticated and slower search for bare URLs using a regex"""
+ self.bare_urls = []
+ urls = []
+ for url in self.__find_bare_urls_outside_templates__():
+ url_object = WikipediaUrlV2(url=url)
+ url_object.extract()
+ urls.append(url_object)
+ self.bare_urls = urls
+
+ # def __extract_external_wikicoded_links_from_the_reference__(self) -> None:
+ # """
+ # Uses mwparserfromhell's ifilter_external_links function (via wikicode.ifilter_external_links)
+ # returns iterator of external links found in the wikicode, like [google.com Google]
+ # """
+ # self.wikicoded_links = []
+ # urls = set()
+ #
+ # # Check if self.wikicode is an instance of Wikicode
+ # if isinstance(self.wikicode, Wikicode):
+ # # Get external links directly from self.wikicode
+ # links = self.wikicode.ifilter_external_links()
+ # else:
+ # # Get external links from the contents of self.wikicode
+ # links = self.wikicode.contents.ifilter_external_links()
+ #
+ # for url in links:
+ # # url: ExternalLink
+ # # we throw away the title here
+ # url = WikipediaUrlV2(url=str(url.url))
+ # url.extract()
+ # urls.add(url)
+ #
+ # self.wikicoded_links = list(urls)
+
+ # def __extract_reference_urls__(self) -> None:
+ # """We support both URLs in templates and outside aka bare URLs"""
+ # urls_list = []
+ #
+ # if not self.template_urls:
+ # self.__extract_template_urls__()
+ # if self.template_urls:
+ # urls_list.extend(self.template_urls)
+ #
+ # if not self.bare_urls:
+ # self.__extract_bare_urls_outside_templates__()
+ # if self.bare_urls:
+ # urls_list.extend(self.bare_urls)
+ #
+ # if not self.wikicoded_links:
+ # self.__extract_external_wikicoded_links_from_the_reference__()
+ # if self.wikicoded_links:
+ # urls_list.extend(self.wikicoded_links)
+ #
+ # # if not self.comment_urls:
+ # # self.__extract_urls_from_comments__()
+ # # urls_list.extend(self.comment_urls)
+ # # We set it to avoid duplicates
+ #
+ # self.reference_urls = list(set(urls_list))
+ #
+ # def __extract_unique_first_level_domains__(self) -> None:
+ # """This aggregates all first level domains from the urls found in the urls"""
+ # from src import app
+ #
+ # app.logger.debug("__extract_first_level_domains__: running")
+ # if not self.reference_urls:
+ # app.logger.info("no reference_urls found so we skip extraction")
+ # else:
+ # logger.debug("found at least one url")
+ # first_level_domains = set()
+ # for url in self.reference_urls:
+ # logger.debug("working on url")
+ # if url.first_level_domain:
+ # app.logger.debug(f"found fld: {url.first_level_domain}")
+ # first_level_domains.add(url.first_level_domain)
+ # else:
+ # app.logger.warning(f"no fld found for: {url.url}")
+ # # Return unique domains to avoid confusion
+ # # https://github.com/internetarchive/iari/issues/834
+ # self.unique_first_level_domains = list(first_level_domains)
+ # app.logger.debug(f"found unique flds: {self.unique_first_level_domains}")
+
+
+ # def __find_bare_urls_outside_templates__(self) -> List[str]:
+    #     """Return bare urls from the stripped wikitext (templates are stripped away)"""
+ # if isinstance(self.wikicode, Wikicode):
+ # stripped_wikicode = str(self.wikicode.strip_code())
+ # logger.debug(stripped_wikicode)
+ # return re.findall(
+ # regex_url_link_extraction,
+ # stripped_wikicode,
+ # )
+ # else:
+ # return []
+
+ #
+ # def __extract_templates_and_parameters__(self) -> None:
+ # """Helper method"""
+ # from src import app
+ #
+ # app.logger.debug(
+ # "__extract_templates_and_parameters_from_raw_reference__: running"
+ # )
+ # self.__extract_raw_templates__()
+ # self.__extract_and_clean_template_parameters__()
+ # self.extraction_done = True
+
+ # def __extract_raw_templates__(self) -> None:
+ # """Extract the templates from self.wikicode"""
+ # from src import app
+ #
+ # self.templates = []
+ # app.logger.debug("__extract_raw_templates__: running")
+ # if not self.wikicode:
+ # raise MissingInformationError("self.wikicode was None")
+ # if isinstance(self.wikicode, str):
+ # raise MissingInformationError("self.wikicode was str")
+    #         # Skip named references with no content (e.g. self-closing <ref name=... />)
+ # wikicode_string = str(self.wikicode)
+ # if self.is_footnote_reference and (
+    #             "</ref>" not in wikicode_string or "></ref>" in wikicode_string
+ # ):
+ # logger.info(f"Skipping named reference with no content {wikicode_string}")
+ # self.is_named_reused_reference = True
+ # else:
+ # logger.debug(f"Extracting templates from: {self.wikicode}")
+ # if isinstance(self.wikicode, Tag):
+ # # contents is needed here to get a Wikicode object
+ # raw_templates = self.wikicode.contents.ifilter_templates(
+ # matches=lambda x: not x.name.lstrip().startswith("#"),
+ # recursive=True,
+ # )
+ # else:
+ # raw_templates = self.wikicode.ifilter_templates(
+ # matches=lambda x: not x.name.lstrip().startswith("#"),
+ # recursive=True,
+ # )
+ # count = 0
+ # for raw_template in raw_templates:
+ # count += 1
+ # self.templates.append(
+ # WikipediaTemplate(
+ # raw_template=raw_template, language_code=self.language_code
+ # )
+ # )
+ # if count == 0:
+ # logger.debug("Found no templates")
+
+ # def __extract_and_clean_template_parameters__(self) -> None:
+ # """We extract all templates"""
+ # from src import app
+ #
+ # app.logger.debug("__extract_and_clean_template_parameters__: running")
+ # if self.templates:
+ # [
+ # template.extract_and_prepare_parameter_and_flds()
+ # for template in self.templates
+ # ]
+
+ def extract_and_check(self) -> None:
+ """Helper method"""
+ from src import app
+
+ app.logger.debug("extract_and_check: running")
+ self.__parse_xhtml__()
+ # self.__extract_xhtml_comments__()
+ # self.__extract_templates_and_parameters__()
+ # self.__extract_reference_urls__()
+ # self.__extract_unique_first_level_domains__()
+ # self.__generate_reference_id__()
+
+ # def extract_and_check(self) -> None:
+ # """Helper method"""
+ # from src import app
+ #
+ # app.logger.debug("extract_and_check: running")
+ # self.__parse_xhtml__()
+ # self.__extract_xhtml_comments__()
+ # self.__extract_templates_and_parameters__()
+ # self.__extract_reference_urls__()
+ # self.__extract_unique_first_level_domains__()
+ # self.__generate_reference_id__()
+
+ def __generate_reference_id__(self) -> None:
+ """This generates an 8-char long id based on the md5 hash of
+ the raw wikitext for this reference"""
+ self.reference_id = hashlib.md5(f"{self.wikicode}".encode()).hexdigest()[:8]
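+    # Minimal usage sketch (illustrative; assumes the caller already has a parsed
+    # <ref> Tag or Wikicode object from mwparserfromhell):
+    #
+    #   ref = WikipediaReferenceLiteV2(wikicode=ref_tag)
+    #   ref.extract_and_check()             # populates self.soup
+    #   name = ref.get_name                 # "" when the ref has no name attribute
+    #   wikitext = ref.wikicode_as_string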
+
diff --git a/src/models/v2/wikimedia/wikipedia/section_v2.py b/src/models/v2/wikimedia/wikipedia/section_v2.py
index 967ab3b1..e28e6dd0 100644
--- a/src/models/v2/wikimedia/wikipedia/section_v2.py
+++ b/src/models/v2/wikimedia/wikipedia/section_v2.py
@@ -68,42 +68,48 @@ def __extract_name_from_line__(line):
def __extract_all_general_references__(self):
from src import app
- app.logger.debug("__extract_all_general_references__: running")
- if self.is_general_reference_section:
- app.logger.info("Regex match on section name")
- # Discard the header line
- lines = self.wikitext.split("\n")
- lines_without_heading = lines[1:]
- logger.debug(
- f"Extracting {len(lines_without_heading)} lines form section {lines[0]}"
- )
- for line in lines_without_heading:
- logger.info(f"Working on line: {line}")
- # Guard against empty line
- # logger.debug("Parsing line")
- # We discard all lines not starting with a star to avoid all
- # categories and other templates not containing any references
- if line and self.star_found_at_line_start(line=line):
- parsed_line = mwparserfromhell.parse(line)
- logger.debug("Appending line with star to references")
- # We don't know what the line contains besides a start
- # but we assume it is a reference
- reference = WikipediaReference(
- wikicode=parsed_line,
- # wikibase=self.wikibase,
- testing=self.testing,
- language_code=self.language_code,
- is_general_reference=True,
- section=self.name,
- )
- reference.extract_and_check()
- self.references.append(reference)
+ app.logger.debug("==> WikipediaSectionV2::__extract_all_general_references__")
+
+ # bail if this section is not a "general reference" section
+        # I'm not sure we need to filter here, since we want to process all sections,
+        # so I'm removing this check for now
+ ### if not self.is_general_reference_section:
+ ### return
+
+ app.logger.info(f"processing section {self.name}")
+
+ # Discard the header line
+ lines = self.wikitext.split("\n")
+ lines_without_heading = lines[1:]
+ logger.debug(
+            f"Extracting {len(lines_without_heading)} lines from section {lines[0]}"
+ )
+ for line in lines_without_heading:
+ # Guard against empty line
+
+ # logger.info(f"Working on line: {line}")
+ # Discard all lines not starting with a star to avoid categories and other templates
+ # not containing any references
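+            # e.g. a line like "* {{cite book |title=...}}" would be treated as a general reference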
+ if line and self.star_found_at_line_start(line=line):
+ parsed_line = mwparserfromhell.parse(line)
+ logger.debug("Appending line with star to references")
+                # We don't know what the line contains besides a star,
+                # but we assume it is a reference
+ reference = WikipediaReferenceV2(
+ wikicode=parsed_line,
+ testing=self.testing,
+ language_code=self.language_code,
+ is_general_reference=True,
+ section=self.name,
+ )
+ reference.extract_and_check()
+ self.references.append(reference)
def __extract_all_footnote_references__(self):
"""This extracts all [...] from self.wikicode"""
from src import app
- app.logger.debug("__extract_all_footnote_references__: running")
+ app.logger.debug("==> __extract_all_footnote_references__")
# Thanks to https://github.com/JJMC89,
# see https://github.com/earwig/mwparserfromhell/discussions/295#discussioncomment-4392452
@@ -124,7 +130,7 @@ def __extract_all_footnote_references__(self):
app.logger.debug(f"extracting ref# {base_ref_counter}")
# app.logger.debug(f"### ### ###")
- reference = WikipediaReference(
+ reference = WikipediaReferenceV2(
wikicode=ref,
# wikibase=self.wikibase,
testing=self.testing,
@@ -147,14 +153,14 @@ def extract(self):
def __populate_wikitext__(self):
from src import app
-
app.logger.debug("__populate_wikitext__: running")
+
if self.wikicode and not self.wikitext:
self.wikitext = str(self.wikicode)
def __parse_wikitext__(self):
from src import app
-
app.logger.debug("__parse_wikitext__: running")
+
if self.wikitext and not self.wikicode:
self.wikicode = mwparserfromhell.parse(self.wikitext)
diff --git a/src/models/v2/wikimedia/wikipedia/url_v2.py b/src/models/v2/wikimedia/wikipedia/url_v2.py
index e710eed0..3a1ef2a4 100644
--- a/src/models/v2/wikimedia/wikipedia/url_v2.py
+++ b/src/models/v2/wikimedia/wikipedia/url_v2.py
@@ -38,7 +38,6 @@ class WikipediaUrlV2(BaseModel):
@property
def __is_wayback_machine_url__(self):
- logger.debug("is_wayback_machine_url: running")
return bool("//web.archive.org" in self.url)
@property
@@ -58,7 +57,7 @@ def __lt__(self, other):
return self.url < other.url
def __parse_extract_and_validate__(self) -> None:
- logger.debug("__parse_extract_and_validate__: running")
+ logger.debug("==> __parse_extract_and_validate__")
if self.__is_wayback_machine_url__:
self.__parse_wayback_machine_url__()
self.__parse_and_extract_url__()
@@ -68,7 +67,7 @@ def __parse_extract_and_validate__(self) -> None:
def __extract_first_level_domain__(self) -> None:
from src import app
- app.logger.debug("__extract_first_level_domain__: Running")
+ app.logger.debug("==> __extract_first_level_domain__")
try:
self.__get_fld__()
except (TldBadUrl, TldDomainNotFound):
@@ -97,7 +96,8 @@ def __check_scheme__(self):
self.malformed_url = True
self.malformed_url_details = MalformedUrlError.UNRECOGNIZED_SCHEME
else:
- logger.debug(f"Found valid urlscheme: {self.scheme}")
+ # logger.debug(f"Found valid urlscheme: {self.scheme}")
+ pass
def __extract_tld__(self):
if not self.netloc:
@@ -147,19 +147,19 @@ def __parse_wayback_machine_url__(self):
# )
def __get_fld__(self):
- logger.debug("__get_fld__: running")
+ # logger.debug("==> __get_fld__")
if self.archived_url:
- logger.debug(f"Trying to get FLD from {self.archived_url}")
+ # logger.debug(f"Trying to get FLD from {self.archived_url}")
fld = get_fld(self.archived_url)
else:
- logger.debug(f"Trying to get FLD from {self.url}")
+ # logger.debug(f"Trying to get FLD from {self.url}")
fld = get_fld(self.url)
- logger.debug(f"Found FLD: {fld}")
+ # logger.debug(f"Found FLD: {fld}")
self.first_level_domain = fld
def extract(self):
from src import app
- app.logger.debug("extract: running")
+ # app.logger.debug("==> extract")
self.__parse_extract_and_validate__()
self.__extract_first_level_domain__()
diff --git a/src/models/v2/wikimedia/wikipedia/wiki_page_v2.py b/src/models/v2/wikimedia/wikipedia/wiki_page_v2.py
new file mode 100644
index 00000000..419c5223
--- /dev/null
+++ b/src/models/v2/wikimedia/wikipedia/wiki_page_v2.py
@@ -0,0 +1,301 @@
+# object for wiki article
+# pass in analyzer
+# - extracts refs?
+# Ref Analyzer will just grab refs for this purpose
+# - can reuse ref parsing existing that takes wikitext
+# WikiRefAnalyzer - specific analyzer to this task
+#
+# wiki_page.analyzer.refs ???
+#
+# wiki_page.refs
+# - analyzer sets those
+# OR
+# append analyzer to wiki_page - ???
+#
+#
+# if not self.page_analyzer:
+# self.page_analyzer = WikipediaAnalyzerV2(job=self.job)
+#
+# self.io.data = self.page_analyzer.get_article_data()
+#
+# # if article not found, return error as such
+# if not self.page_analyzer.article_found:
+# return AnalyzerReturnValues.NOT_FOUND.value, 404
+#
+# # if article is a redirect, return error as such
+# if self.page_analyzer.is_redirect:
+# app.logger.debug("found redirect")
+# return AnalyzerReturnValues.IS_REDIRECT.value, 400
+#
+# app.logger.debug("ArticleV2:: processed article, saving...")
+
+from datetime import datetime
+from typing import Any, Tuple, Dict, List, Optional
+import traceback
+
+import mwparserfromhell
+from mwparserfromhell.wikicode import Wikicode
+
+from src.models.base import WariBaseModel
+from src.models.exceptions import MissingInformationError, WikipediaApiFetchError
+
+from flask_restful import Resource, abort # type: ignore
+
+from src.models.api.job.article_job import ArticleJob
+# from src.models.api.schema.article_schema import ArticleSchema
+from src.models.exceptions import MissingInformationError
+
+from src.models.file_io.article_file_io import ArticleFileIo
+from src.models.file_io.references import ReferencesFileIo
+from src.models.v2.wikimedia.wikipedia.reference.reference_lite_v2 import WikipediaReferenceLiteV2
+
+from src.models.wikimedia.enums import AnalyzerReturnValues, WikimediaDomain
+from src.models.wikimedia.wikipedia.analyzer import WikipediaAnalyzer
+
+# from src.views.statistics.write_view import StatisticsWriteView
+
+
+class WikiArticleV2(WariBaseModel): ## NB NOT based on StatisticsView or StatisticsWriteView like others
+ """
+ we really should have a base class of something like "resource with references"
+    or something that indicates this is a referenceable object
+
+ class for wiki article
+ it contains space for references
+ An analyzer consumes this article object and extracts references from it
+ """
+
+ job: ArticleJob
+ wikitext: str
+ wikicode: Wikicode = None # wiki object tree parsed from wikitext
+
+ page_url: str = ""
+ page_title: str = ""
+ page_lang: str = ""
+
+ references: Optional[List[WikipediaReferenceLiteV2]] = None
+
+ def parse_references(self):
+ self.wikicode = mwparserfromhell.parse(self.wikitext)
+
+
+ def __extract_sections__(self) -> None:
+ """This uses the sections regex supplied by the patron via the API
+ and populates the sections attribute with a list of MediawikiSection objects
+
+ We only consider level 2 sections beginning with =="""
+ from src import app
+
+ self.sections = []
+ app.logger.debug("__extract_sections__: running")
+ if not self.wikicode:
+ self.__parse_wikitext__()
+
+ # all_sections: List[Wikicode] = self.wikicode.get_sections(
+ # # levels=[2],
+ # include_headings=True,
+ # )
+
+ # section_counter = 0
+ # for section in all_sections:
+ # section_counter += 1
+ # app.logger.info(f"All Section #{section_counter}")
+ #
+ # self.section_list.append({"id": section_counter, "name": "???"})
+ #
+ # for node in section.filter_headings():
+ # header_text = node.title.strip()
+ # header_level = node.level
+ # # app.logger.info(f"Section id: {section_counter}, Header: {header_text}, Level: {header_level}")
+ # app.logger.info(f"Section #: {section_counter} header: {node}")
+
+ sections: List[Wikicode] = self.wikicode.get_sections(
+ levels=[2],
+ include_headings=True,
+ )
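+        # e.g. (illustrative): for wikitext "== History ==\nsome text", get_sections(levels=[2])
+        # returns one Wikicode chunk containing that heading and its body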
+
+ '''
+        loop through all sections, keeping a counter;
+        when a level 2 section is hit,
+            create a mw_section object and
+            set the counter as its section_id
+ '''
+
+        # TODO: make this code better by special-casing the no-section case into a faux root section and running it through the same loop
+
+ section_counter = 0
+ section_list = []
+
+ if not sections:
+ app.logger.debug("No level 2 sections detected, creating root section")
+ # console.print(self.wikicode)
+ # exit()
+ mw_section = MediawikiSection(
+ # We add the whole article to the root section
+ wikicode=self.wikicode,
+ section_id=section_counter,
+
+ job=self.job,
+
+ testing=self.testing,
+ language_code=self.language_code,
+ )
+ mw_section.extract()
+ self.sections.append(mw_section)
+
+ else:
+ app.logger.info(f"Processing section number {section_counter}")
+
+ # append root section as first section in section list
+ self.__extract_root_section__()
+
+ # append each section to section list
+ for section in sections:
+
+ section_counter += 1
+
+ app.logger.info(f"Section: {section}")
+
+ mw_section = MediawikiSection(
+ wikicode=section,
+ section_id=section_counter,
+
+ job=self.job,
+
+ testing=self.testing,
+ language_code=self.language_code,
+ )
+
+ mw_section.extract() # pull all refs from section
+ self.sections.append(mw_section)
+
+ section_list.append({"name": "section name", "counter": section_counter})
+
+
+ app.logger.debug(f"Number of sections found: {len(self.sections)}")
+
+ self.section_info.update({"count": len(self.sections), "list": section_list})
+ # self.section_info["count"] = len(self.sections)
+ # self.section_info["list"] = section_list
+
+
+ # def __handle_article_request__(self):
+ # from src import app
+ #
+ # app.logger.info("==> WikiArticleV2::__handle_article_request__: fetching article data and saving to cache")
+ #
+ # self.__setup_wikipedia_analyzer__()
+ # return self.__analyze_and_write_and_return__()
+ #
+ # def __analyze_and_write_and_return__(self) -> Tuple[Any, int]:
+ # """Analyze, calculate the time, write statistics to disk and return it
+ # If we did not get statistics, return a meaningful error to the patron"""
+ # from src import app
+ #
+ # app.logger.info("==> __analyze_and_write_and_return__")
+ #
+ # if not self.wikipedia_page_analyzer:
+ # raise MissingInformationError("self.wikipedia_page_analyzer was None")
+ #
+ # self.__get_statistics__() # populate self.io.data with analysis results
+ # self.__setup_io__()
+ # self.io.data = self.wikipedia_page_analyzer.get_statistics()
+ #
+ # if self.wikipedia_page_analyzer.found: # found === True means article was successfully processed
+ # app.logger.debug("valid article found and processed")
+ #
+ # if self.wikipedia_page_analyzer.is_redirect:
+ # app.logger.debug("found redirect")
+ # return AnalyzerReturnValues.IS_REDIRECT.value, 400
+ #
+ # else:
+ # app.logger.debug("adding time information and returning the statistics")
+ # self.__update_statistics_with_time_information__()
+ # # we got a json response
+ # # according to https://stackoverflow.com/questions/13081532/return-json-response-from-flask-view
+ # # flask calls jsonify automatically
+    #             self.__write_to_disk__()  # writes self.io.data to disk
+ # if not self.io:
+ # raise MissingInformationError()
+ # if self.io.data:
+ # self.io.data["served_from_cache"] = False # append return data
+ # return self.io.data, 200
+ # else:
+ # raise MissingInformationError()
+ # else:
+ # return AnalyzerReturnValues.NOT_FOUND.value, 404
+ #
+ # def __get_statistics__(self):
+ # """
+ # get the results from wikipedia_page_analyzer.get_statistics and save to self.io.data
+ # """
+ # from src import app
+ #
+ # app.logger.debug("==> __get_statistics__")
+ #
+ # if not self.wikipedia_page_analyzer:
+ # raise MissingInformationError("self.wikipedia_page_analyzer was None")
+ #
+ # # https://realpython.com/python-timer/
+ # self.__setup_io__()
+ # self.io.data = self.wikipedia_page_analyzer.get_statistics()
+ #
+ # def __update_statistics_with_time_information__(self):
+ # """Update the dictionary before returning it"""
+ # if self.io.data:
+ # timestamp = datetime.timestamp(datetime.utcnow())
+ # self.io.data["timestamp"] = int(timestamp)
+ # isodate = datetime.isoformat(datetime.utcnow())
+ # self.io.data["isodate"] = str(isodate)
+ # else:
+ # raise ValueError("not a dict")
+ #
+ # def __return_meaningful_error__(self):
+ # from src import app
+ #
+ # app.logger.error("==> __return_meaningful_error__")
+ # if self.job.title == "":
+ # return "Title was missing", 400
+ # if self.job.domain != "wikipedia":
+ # return "Only 'wikipedia' site is supported", 400
+ #
+ # def __setup_wikipedia_analyzer__(self):
+ # if not self.wikipedia_page_analyzer:
+ # from src import app
+ #
+ # app.logger.info(f"Setup analyzer for {self.job.title}...")
+ #
+ # # wikipedia_page_analyzer is declared in the StatisticsView class (views/statistics/__init.py)
+    #         # NB This is wrong! It should be declared here in the Article class.
+ # # we fix this in the v2/ArticleV2 code, but not here, since it "works".
+    #         # this is the only place it is called, so it makes no sense to
+    #         # declare it in a base class shared by other objects that do not
+    #         # use the analysis feature...!
+ # self.wikipedia_page_analyzer = WikipediaAnalyzer(job=self.job)
+ #
+ # def __setup_io__(self):
+ # self.io = ArticleFileIo(job=self.job)
+ #
+ # def __write_to_disk__(self):
+ # """Write both article json and all reference json files"""
+ # from src import app
+ #
+ # app.logger.debug("__write_to_disk__: running")
+ # if not self.job.testing:
+ # self.__write_article_to_disk__()
+ # self.__write_references_to_disk__()
+ #
+ # def __write_article_to_disk__(self):
+ # article_io = ArticleFileIo(
+ # job=self.job,
+ # data=self.io.data,
+ # wari_id=self.job.wari_id,
+ # )
+ # article_io.write_to_disk()
+ #
+ # def __write_references_to_disk__(self):
+ # references_file_io = ReferencesFileIo(
+ # references=self.wikipedia_page_analyzer.reference_statistics
+ # )
+ # references_file_io.write_references_to_disk()
diff --git a/src/models/wikimedia/enums.py b/src/models/wikimedia/enums.py
index 4b90468e..e8c64373 100644
--- a/src/models/wikimedia/enums.py
+++ b/src/models/wikimedia/enums.py
@@ -59,3 +59,10 @@ class WikimediaDomain(Enum):
class AnalyzerReturnValues(Enum):
IS_REDIRECT = "No statistic available because this is a redirect."
NOT_FOUND = "Article title not found."
+
+
+class RequestMethods(Enum):
+ # Http request types
+ get = "get"
+ post = "post"
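+    # used by StatisticsViewV2.__validate_and_get_job__ and the v2 views to
+    # distinguish GET from POST requests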
+
diff --git a/src/models/wikimedia/wikipedia/analyzer.py b/src/models/wikimedia/wikipedia/analyzer.py
index 5f595cad..61ac8b54 100644
--- a/src/models/wikimedia/wikipedia/analyzer.py
+++ b/src/models/wikimedia/wikipedia/analyzer.py
@@ -31,6 +31,7 @@ class WikipediaAnalyzer(WariBaseModel):
job: Optional[ArticleJob] = None
article: Optional[WikipediaArticle] = None
+
article_statistics: Optional[ArticleStatistics] = None # includes cite_refs property
reference_statistics: Optional[List[Dict[str, Any]]] = None
@@ -221,7 +222,7 @@ def __gather_reference_statistics__(self):
flds=reference.unique_first_level_domains
if reference.unique_first_level_domains
else [],
- wikitext=reference.get_wikicode_as_string,
+ wikitext=reference.wikicode_as_string,
section=reference.section,
section_id=reference.section_id,
template_names=reference.template_names,
@@ -290,7 +291,7 @@ def __get_article_data_for_response__(self):
# refs = section.references
for ref in section.references:
- new_ref ={"wikitext": ref.get_wikicode_as_string}
+ new_ref ={"wikitext": ref.wikicode_as_string}
new_refs.append(new_ref)
sections.append(new_section)
diff --git a/src/models/wikimedia/wikipedia/article.py b/src/models/wikimedia/wikipedia/article.py
index 063302ce..28a552e4 100644
--- a/src/models/wikimedia/wikipedia/article.py
+++ b/src/models/wikimedia/wikipedia/article.py
@@ -31,6 +31,8 @@ class WikipediaArticle(WariBaseModel):
because of
https://github.com/internetarchive/wcdimportbot/issues/261"""
+ job: ArticleJob
+
md5hash: Optional[str]
page_id: int = 0
wdqid: str = ""
@@ -46,8 +48,6 @@ class WikipediaArticle(WariBaseModel):
# extractor: Optional[Any] = None
# TODO: FIXFIX
- job: ArticleJob
-
ores_quality_prediction: str = ""
ores_details: Optional[Dict] = None
@@ -95,8 +95,7 @@ def url(self):
def fetch_and_extract_and_parse(self):
from src import app
- app.logger.debug("==> fetch_and_extract_and_parse")
- app.logger.info("Extracting templates and parsing references")
+ app.logger.debug("==> WikipediaArticle::fetch_and_extract_and_parse")
if not self.wikitext:
# fetch page data from Wikipedia if we don't already have wikitext
@@ -106,6 +105,7 @@ def fetch_and_extract_and_parse(self):
logger.debug(
"Skipped extraction and parsing because the article is a redirect"
)
+
elif not self.found_in_wikipedia:
logger.debug(
"Skipped extraction and parsing because the article was not found"
@@ -143,7 +143,8 @@ def __fetch_page_data__(self) -> None:
and date from the MediaWiki REST v1 API if needed"""
from src import app
- app.logger.debug("__fetch_page_data__: Running")
+ app.logger.debug("==> __fetch_page_data__: Running")
+
self.__check_if_title_is_empty__()
if not self.wikitext:
if self.revision_id:
@@ -173,7 +174,7 @@ def __get_title_from_wikidata__(self):
def __check_if_title_is_empty__(self):
if not self.job.title:
- raise MissingInformationError("self.job.title was empty string")
+ raise MissingInformationError("WikipediaArticle: self.job.title is empty")
def __get_ores_scores__(self):
self.ores_details = {}
diff --git a/src/models/wikimedia/wikipedia/reference/generic.py b/src/models/wikimedia/wikipedia/reference/generic.py
index 4195a11c..bfa385fb 100644
--- a/src/models/wikimedia/wikipedia/reference/generic.py
+++ b/src/models/wikimedia/wikipedia/reference/generic.py
@@ -7,7 +7,7 @@
from mwparserfromhell.nodes import Tag # type: ignore
from mwparserfromhell.wikicode import Wikicode # type: ignore
-from config import link_extraction_regex
+from config import regex_url_link_extraction
from src.models.base.job import JobBaseModel
from src.models.exceptions import MissingInformationError
from src.models.wikimedia.wikipedia.reference.enums import (
@@ -168,7 +168,7 @@ def is_footnote_reference(self):
return not self.is_general_reference
@property
- def get_wikicode_as_string(self):
+ def wikicode_as_string(self):
return str(self.wikicode)
@property
@@ -298,7 +298,7 @@ def __find_bare_urls_outside_templates__(self) -> List[str]:
stripped_wikicode = str(self.wikicode.strip_code())
logger.debug(stripped_wikicode)
return re.findall(
- link_extraction_regex,
+ regex_url_link_extraction,
stripped_wikicode,
)
else:
diff --git a/src/models/wikimedia/wikipedia/url.py b/src/models/wikimedia/wikipedia/url.py
index 5ccb27ca..aaa85c9a 100644
--- a/src/models/wikimedia/wikipedia/url.py
+++ b/src/models/wikimedia/wikipedia/url.py
@@ -67,7 +67,7 @@ def __parse_extract_and_validate__(self) -> None:
def __extract_first_level_domain__(self) -> None:
from src import app
- app.logger.debug("__extract_first_level_domain__: Running")
+ app.logger.debug("==> __extract_first_level_domain__")
try:
self.__get_fld__()
except (TldBadUrl, TldDomainNotFound):
diff --git a/src/views/check_doi.py b/src/views/check_doi.py
index ddaace38..0e9dd5f5 100644
--- a/src/views/check_doi.py
+++ b/src/views/check_doi.py
@@ -33,7 +33,7 @@ def get(self):
Every branch in this method has to return a tuple (Any,response_code)"""
from src import app
- app.logger.debug("get: running")
+ app.logger.debug("==> CheckDoi::get")
self.__validate_and_get_job__()
if self.job:
return self.__return_from_cache_or_analyze_and_return__()
diff --git a/src/views/v2/fetchrefs_v2.py b/src/views/v2/fetchrefs_v2.py
new file mode 100644
index 00000000..90f3ffe4
--- /dev/null
+++ b/src/views/v2/fetchrefs_v2.py
@@ -0,0 +1,144 @@
+# from flask_restful import Resource, abort # type: ignore
+# from marshmallow import Schema
+from datetime import datetime
+from typing import Any, Optional, Tuple, List, Dict
+import traceback
+
+from dateutil.parser import isoparse
+
+import config
+import requests
+
+from src.models.exceptions import MissingInformationError, WikipediaApiFetchError
+from src.models.v2.job.article_job_v2 import ArticleJobV2
+
+from src.models.v2.schema.fetchrefs_schema_v2 import FetchRefsSchemaV2
+from src.models.v2.job.fetchrefs_job_v2 import FetchRefsJobV2
+# from src.models.v2.wikimedia.wikipedia.wiki_page_v2 import WikiArticleV2
+
+from src.models.api.job.article_job import ArticleJob
+from src.models.v2.wikimedia.wikipedia.article_v2 import WikipediaArticleV2
+from src.models.wikimedia.wikipedia.article import WikipediaArticle
+
+from src.views.v2.statistics import StatisticsViewV2
+from src.models.wikimedia.enums import RequestMethods
+
+
+class FetchRefsV2(StatisticsViewV2):
+
+ """
+    takes an array of page specifiers and
+    returns data for all citations on each page.
+
+ """
+
+ schema = FetchRefsSchemaV2() # Defines expected parameters; Overrides StatisticsViewV2's "schema" property
+ job: FetchRefsJobV2 # Holds usable variables, seeded from schema. Overrides StatisticsViewV2's "job"
+
+ pages: List[Dict[str, Any]] = [] # contents parsed from pipe-delimited "pages" URL parameter
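+    # illustrative response shape, derived from __get_page_data__ below:
+    #   {"pages": [{"page_title": "...", "which_wiki": "enwiki",
+    #               "refs": [{"name": "...", "wikitext": "..."}]}]}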
+
+ def get(self):
+ """
+ flask GET entrypoint for returning fetchrefs results
+ must return a tuple: (Any,response_code)
+ """
+ from src import app
+        app.logger.debug("==> FetchRefsV2::get")
+
+ return self.__process_request__(method=RequestMethods.get)
+ # return {"errors": [
+ # {"error": "GET method not supported for this endpoint"}
+ # ]}
+
+ def post(self):
+ """
+ flask POST entrypoint for returning fetchrefs results
+ must return a tuple: (Any,response_code)
+ """
+ from src import app
+        app.logger.debug("==> FetchRefsV2::post")
+
+ # return self.__process_data__(method="post")
+ return self.__process_request__(method=RequestMethods.post)
+
+
+ def __process_request__(self, method=RequestMethods.post): # default to POST
+ from src import app
+ app.logger.debug(f"==> FetchRefsV2::__process_request__, method = {method}")
+
+ try:
+ self.__validate_and_get_job__(method) # inherited from StatisticsViewV2
+
+ self.pages = []
+
+ # process pages, get refs, sets self.pages data
+ for page in self.job.pages:
+ page_results = self.__get_page_data__(page)
+ # append page ref data to pages result
+ self.pages.append(page_results)
+
+ # and return results
+ return {"pages": self.pages}
+
+
+ except MissingInformationError as e:
+ traceback.print_exc()
+ return {"error": f"Missing Information Error: {str(e)}"}, 500
+
+ except Exception as e:
+ traceback.print_exc()
+ return {"error": f"General Error: {str(e)}"}, 500
+
+ def __get_page_data__(self, page_title):
+ """
+        page_title is a bare page title; a fully resolved url, such as
+        https://en.wikipedia.org/wiki/Easter_Island, is constructed from it below
+ """
+
+ try:
+ # process page
+
+ url_template = "https://{lang}.{wiki_domain}/wiki/{page_title}" # TODO make this a global
+ page_url = url_template.format(page_title=page_title, lang="en", wiki_domain="wikipedia.org")
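+            # NB: lang and wiki_domain are hard-coded to en.wikipedia.org here for now;
+            # self.job.which_wiki is not yet consulted when building the url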
+
+ article_job = ArticleJobV2(url=page_url)
+ article_job.__extract_url__()
+
+ # get article object corresponding to page
+ # page = WikiArticleV2(job=article_job)
+
+ page = WikipediaArticleV2(job=article_job)
+ page.fetch_and_parse()
+
+ # loop thru references
+ page_refs = []
+ if page.extractor:
+ for ref in page.extractor.references:
+ page_refs.append({
+ "name": ref.get_name,
+ "wikitext": ref.wikicode_as_string
+ })
+
+ except WikipediaApiFetchError as e:
+ return {
+ "page_title": page_title,
+ "which_wiki": self.job.which_wiki,
+ "error": f"Page data error: {str(e)}"
+ }
+
+ except Exception as e:
+ traceback.print_exc()
+ return {
+ "page_title": page_title,
+ "which_wiki": self.job.which_wiki,
+ "error": f"General error: {str(e)}"
+ }
+
+ return {
+ "page_title": page_title,
+ "which_wiki": self.job.which_wiki,
+
+ "refs": page_refs,
+ }
+
+
+
diff --git a/src/views/v2/statistics/__init__.py b/src/views/v2/statistics/__init__.py
index 0ad66511..76861922 100644
--- a/src/views/v2/statistics/__init__.py
+++ b/src/views/v2/statistics/__init__.py
@@ -1,5 +1,5 @@
from datetime import datetime
-from typing import Optional
+from typing import Optional, Any
from flask import request
from flask_restful import Resource, abort # type: ignore
@@ -10,6 +10,7 @@
from src.models.api.job import Job
from src.models.exceptions import MissingInformationError
from src.models.file_io import FileIo
+from src.models.wikimedia.enums import RequestMethods
class StatisticsViewV2(Resource):
@@ -28,6 +29,7 @@ class StatisticsViewV2(Resource):
job: Optional[Job] # loads parameters via schema.load
io: Optional[FileIo] = None # derived class must implement __setup_io__
+ request_args: Any = {}
time_of_analysis: Optional[datetime] = None
def __setup_io__(self):
@@ -42,39 +44,51 @@ def __read_from_cache__(self):
if self.io:
self.io.read_from_disk()
- def __validate_and_get_job__(self, method="get"):
+ def __validate_and_get_job__(self, method=RequestMethods.get):
"""
Validates request params, whether from GET or POST, and,
if successful, pulls param values into job's properties
"""
from src import app
- app.logger.debug(f"==> StatisticsViewV2::__validate_and_get_job__({method})")
+ app.logger.debug(f"==> StatisticsViewV2::__validate_and_get_job__(method = {method})")
- # use args if GET, form if POST
- request_args = request.args if (method == "get") else request.form
+ self.schema.context['request_method'] = request.method
+        app.logger.debug(f"==> StatisticsViewV2::__validate_and_get_job__: request.method = {request.method}")
- self.__validate__(request_args)
- self.__parse_into_job__(request_args)
+ # self.request_method = method
+ # use request.args if GET, request.form if POST
+ # self.request_args = request.args if (method == RequestMethods.get) else request.form
+ self.request_args = request.args if (request.method == "GET") else request.form
- def __validate__(self, request_args):
+ app.logger.debug(f"==> StatisticsViewV2::__validate_and_get_job__: request_args: {self.request_args}")
+
+ # self.__validate__(request_args)
+ # self.__parse_into_job__(request_args)
+ self.__validate__()
+ self.__parse_into_job__()
+
+ def __validate__(self):
from src import app
- app.logger.debug(f"==> StatisticsViewV2::__validate__({request_args})")
+ app.logger.debug(f"==> StatisticsViewV2::__validate__({self.request_args})")
- errors = self.schema.validate(request_args)
+ errors = self.schema.validate(self.request_args)
if errors:
app.logger.debug(f"Validation errors: {errors}")
raise MissingInformationError(errors)
- def __parse_into_job__(self, request_args):
+ # def __parse_into_job__(self, request_args):
+ def __parse_into_job__(self):
from src import app
- app.logger.debug(f"==> StatisticsViewV2::__parse_into_job__({request_args})")
+ app.logger.debug(f"==> StatisticsViewV2::__parse_into_job__({self.request_args})")
if not self.schema:
raise MissingInformationError("No schema set for StatisticsViewV2")
- self.job = self.schema.load(request_args)
+ self.schema.context['request_method'] = request.method
+
+ self.job = self.schema.load(self.request_args)
# returns a job object, populated with field values mapped from request_args
if not self.job:
diff --git a/tests/wikipedia/reference/test_english_wikipedia_page_reference.py b/tests/wikipedia/reference/test_english_wikipedia_page_reference.py
index ec73d55c..3535721d 100644
--- a/tests/wikipedia/reference/test_english_wikipedia_page_reference.py
+++ b/tests/wikipedia/reference/test_english_wikipedia_page_reference.py
@@ -882,7 +882,7 @@ def test_get_wikicode_as_string_empty(self):
for ref in refs:
print(ref)
raw_reference_object = WikipediaReference(tag=ref, testing=True)
- assert raw_reference_object.get_wikicode_as_string == ref
+ assert raw_reference_object.wikicode_as_string == ref
def test_get_wikicode_as_string_nonempty(self):
wikitext = (
@@ -894,7 +894,7 @@ def test_get_wikicode_as_string_nonempty(self):
refs = wikicode.filter_tags(matches=lambda tag: tag.lower() == "ref")
for ref in refs:
raw_reference_object = WikipediaReference(tag=ref, testing=True)
- assert raw_reference_object.get_wikicode_as_string == ref
+ assert raw_reference_object.wikicode_as_string == ref
def test_is_footnote_reference(self):
ref = "{{citeq|Q1}}"