diff --git a/src/__init__.py b/src/__init__.py index 6422102f..4be9c923 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -17,7 +17,7 @@ import config from src.models.exceptions import MissingInformationError, WikipediaApiFetchError -# old stuff... +# legacy endpoints stuff... from src.views.check_doi import CheckDoi from src.views.check_url import CheckUrl from src.views.check_url_archive import CheckUrlArchive @@ -36,6 +36,8 @@ from src.views.v2.article_cache_view_v2 import ArticleCacheV2 # new stuff jun 2024 from src.views.v2.editref_v2 import EditRefV2 +# new stuff jul 2024 +from src.views.v2.fetchrefs_v2 import FetchRefsV2 logging.basicConfig(level=config.loglevel) logger = logging.getLogger(__name__) @@ -50,18 +52,19 @@ def add_cors_headers(response): response.headers["Access-Control-Allow-Headers"] = "Content-Type" return response +# Register CORS function as an after_request handler +app.after_request(add_cors_headers) + # let's see if we can distinguish which server we are on server_name = os.getenv('FLASK_SERVER_NAME', 'Unknown Server') -# Register the function as an after_request handler -app.after_request(add_cors_headers) - # We use a prefix here to enable us to stabilize the api over time # and bump the version when making breaking changes -api = Api(app, prefix="/v2") +api = Api(app, prefix="/v2") # NB TODO This pseudo-versioning should be addressed # link the API views to respective endpoint urls +api.add_resource(FetchRefsV2, "/fetchrefs") api.add_resource(EditRefV2, "/editref") api.add_resource(ArticleV2, "/article") diff --git a/src/models/api/handlers/pdf.py b/src/models/api/handlers/pdf.py index ab3cf951..6096c2ed 100644 --- a/src/models/api/handlers/pdf.py +++ b/src/models/api/handlers/pdf.py @@ -15,7 +15,7 @@ # type: ignore from requests import ReadTimeout -from config import link_extraction_regex +from config import regex_url_link_extraction from src.models.api.handlers import BaseHandler from src.models.api.job.check_url_job import UrlJob from src.models.api.link.pdf_link import PdfLink @@ -214,7 +214,7 @@ def __extract_links_from_original_text__(self) -> None: # We remove the linebreaks to avoid clipping of URLs, see https://github.com/internetarchive/iari/issues/766 # provided by chatgpt: urls = re.findall( - link_extraction_regex, + regex_url_link_extraction, self.text_pages[index], ) # cleaned_urls = self.__clean_urls__(urls=urls) @@ -233,7 +233,7 @@ def __extract_links_from_text_without_linebreaks__(self) -> None: # We remove the linebreaks to avoid clipping of URLs, see https://github.com/internetarchive/iari/issues/766 # provided by chatgpt: urls = re.findall( - link_extraction_regex, + regex_url_link_extraction, self.text_pages_without_linebreaks[index], ) # cleaned_urls = self.__clean_urls__(urls=urls) @@ -253,7 +253,7 @@ def __extract_links_from_text_without_spaces__(self) -> None: # provided by chatgpt: if self.text_pages_without_spaces: urls = re.findall( - link_extraction_regex, + regex_url_link_extraction, self.text_pages_without_spaces[index], ) # cleaned_urls = self.__clean_urls__(urls=urls) diff --git a/src/models/api/job/article_job.py b/src/models/api/job/article_job.py index c43f631d..6bd8571c 100644 --- a/src/models/api/job/article_job.py +++ b/src/models/api/job/article_job.py @@ -15,13 +15,17 @@ class ArticleJob(Job): lang: str = "en" domain: WikimediaDomain = WikimediaDomain.wikipedia title: str = "" + revision: int = 0 # this is named just as in the MediaWiki API + page_id: int = 0 - refresh: bool = False url: str = "" + sections: str = 
"" # string describing which sections to parse - revision: int = 0 # this is named just as in the MediaWiki API + + refresh: bool = False dehydrate: bool = True + @property def wari_id(self) -> str: if not self.lang: diff --git a/src/models/v2/job/fetchrefs_job_v2.py b/src/models/v2/job/fetchrefs_job_v2.py new file mode 100644 index 00000000..1d8d86fa --- /dev/null +++ b/src/models/v2/job/fetchrefs_job_v2.py @@ -0,0 +1,53 @@ +import re +from urllib.parse import quote, unquote +from src.models.wikimedia.enums import WikimediaDomain +from src import MissingInformationError +from src.models.v2.job import JobV2 +from typing import List + + +class FetchRefsJobV2(JobV2): + """job that supports FetchRefsV2 endpoint""" + + # using marshmallow to describe parameters + + which_wiki: str = "" + pages: List[str] = [] + wikitext: str = "" + + wiki_domain: WikimediaDomain = WikimediaDomain.wikipedia + wiki_lang: str = "" + + wiki_id: str = "" + wiki_page_title: str = "" + wiki_revision: str = "" + + # @property + # def quoted_title(self): + # if not self.wiki_page_title: + # raise MissingInformationError("self.wiki_page_title is empty") + # return quote(self.wiki_page_title, safe="") + + + def validate_fields(self): + """ + parameter checking done here... + + must have at "pages" or "wikitext" defined + """ + + from src import app + + # app.logger.error('fetchrefs validate_fields: Fake Error') + # raise MissingInformationError( + # f'fetchrefs validate_fields: Fake Error' + # ) + + if not self.wikitext: + if not self.pages: + raise MissingInformationError( + f"pages or wikitext parameter must be specified" + ) + + + diff --git a/src/models/v2/schema/fetchrefs_schema_v2.py b/src/models/v2/schema/fetchrefs_schema_v2.py new file mode 100644 index 00000000..5dce5b4a --- /dev/null +++ b/src/models/v2/schema/fetchrefs_schema_v2.py @@ -0,0 +1,54 @@ +from marshmallow import fields, pre_load, post_load + +from src.models.v2.job.fetchrefs_job_v2 import FetchRefsJobV2 +from src.models.v2.schema import BaseSchemaV2 + + +class FetchRefsSchemaV2(BaseSchemaV2): + # Defines expected parameters for endpoint + # - default parameters are defined in BaseSchemaV2 + + which_wiki = fields.Str(default="enwiki") + pages = fields.List(fields.String(), required=False) # either pages or wikitext must be defined + wikitext = fields.Str(required=False) # if provided, overrides pages array + + @pre_load + # NB: pre_load is a marshmallow directive; + def process_input(self, data, **kwargs): + """ + transform comma separated pages into a List + """ + from src import app + app.logger.debug(f"==> FetchRefsSchemaV2::(@pre_load)process_input: data:{data}") + + request_method = self.context.get('request_method', None) + # if request_method: + # print(f"Request method received: {request_method}") + + app.logger.debug(f"==> FetchRefsSchemaV2::(@pre_load)process_input: request_method:{request_method}") + + + mutable_data = dict(data) # Convert ImmutableMultiDict to a mutable dict + if 'pages' in mutable_data and isinstance(mutable_data['pages'], str): + mutable_data['pages'] = mutable_data['pages'].split('|') + return mutable_data + + # noinspection PyUnusedLocal + @post_load + # NB: post_load is a marshmallow directive; + # this function is run after loading request args + # it basically pulls the request object value into a Job object + # + # **kwargs is needed here despite what the validator claims + def return_job_object(self, data, **kwargs) -> FetchRefsJobV2: # type: ignore # dead: disable + """Return Job object""" + from src import app 
+ app.logger.debug("==> FetchRefsSchemaV2::@post_load:return_job_object") + app.logger.debug(f"return_job_object data: {data}") + + job = FetchRefsJobV2(**data) + job.validate_fields() + + # NB here is where we can modify job field values before returning if we want + + return job diff --git a/src/models/v2/wikimedia/wikipedia/article_v2.py b/src/models/v2/wikimedia/wikipedia/article_v2.py index e2daef84..11813200 100644 --- a/src/models/v2/wikimedia/wikipedia/article_v2.py +++ b/src/models/v2/wikimedia/wikipedia/article_v2.py @@ -1,8 +1,6 @@ import logging import re -import pprint -# import urllib -from urllib.parse import quote, unquote +from urllib.parse import unquote from datetime import datetime from typing import Any, Dict, List, Optional @@ -69,6 +67,7 @@ class WikipediaArticleV2(IariBaseModel): error_items: List[Any] = [] + # required pydantic class class Config: # dead: disable arbitrary_types_allowed = True # dead: disable extra = "forbid" # dead: disable @@ -129,7 +128,7 @@ def fetch_and_parse(self): """ from src import app - app.logger.debug("ArticleV2::fetch_and_parse") + app.logger.debug("==> ArticleV2::fetch_and_parse") app.logger.info("Fetching article data and parsing") if not self.wikitext: @@ -138,28 +137,60 @@ def fetch_and_parse(self): self.__fetch_wikitext__() if self.is_redirect: - logger.debug( + logger.error( "Skipped extraction and parsing because the article is a redirect" ) - raise WikipediaApiFetchError("wiki article is a redirect") + raise WikipediaApiFetchError("Wiki article is a redirect") + # TODO Might want to change this from raising exception, + # but we do want to stop further processing, + # so need to have some way of indicating that to caller if not self.found_in_wikipedia: - logger.debug( - "Skipped extraction and parsing because the article was not found" + logger.error( + "Skipped extraction and parsing because the article was not found in wiki" ) - raise WikipediaApiFetchError("wiki article not found in wiki") + raise WikipediaApiFetchError(f"Article {self.job.quoted_title} not found in wiki") if not self.wikitext: raise WikipediaApiFetchError("wikitext is empty") + + # wikitext extraction + + app.logger.debug("==> ArticleV2::fetch_and_parse: extracting from wikitext") + + # elif not self.is_redirect and self.found_in_wikipedia: + if not self.is_redirect and self.found_in_wikipedia: + + if not self.wikitext: + raise MissingInformationError("WikipediaReferenceExtractorV2::fetch_and_parse: self.wikitext is empty") + + self.extractor = WikipediaReferenceExtractorV2( + wikitext=self.wikitext, + html_source=self.html_markup, + job=self.job, + ) + + app.logger.debug("==> ArticleV2::fetch_and_parse: extracting all refs") + self.extractor.extract_all_references() + + app.logger.debug("==> ArticleV2::fetch_and_parse: fetching ores scores") + self.__get_ores_scores__() + # self.__generate_hash__() + + + app.logger.debug("==> ArticleV2::fetch_and_parse: extracting from html") + + # html extraction if not self.html_markup: self.__fetch_html__() + # extract references from html point-of-view self.__extract_footnote_references__() self.__extract_section_references__() self.__extract_urls_from_references__() - self.__get_ores_scores__() # fills ores_quality_prediction and ores_details + # self.__get_ores_scores__() # fills ores_quality_prediction and ores_details def __extract_urls_from_references__(self): # traverse references, adding urls to self.urlDict, @@ -196,8 +227,8 @@ def __extract_footnote_references__(self): regex_extract_ref_name = 
r"#cite_note-(.*?)-\d+$" soup = BeautifulSoup(self.html_markup, "html.parser") - # for link in soup.find_all("a"): - # print(link.get("href")) + # for link in soup.find_all("a"): + # print(link.get("href")) references_wrapper = soup.find("div", class_="mw-references-wrap") @@ -247,7 +278,7 @@ def __extract_footnote_references__(self): if span_ref: # span_ref contains citation markup and possible template data - app.logger.debug(f"Checking data...") + # ### app.logger.debug(f"Checking data...") # fetch "template" data from link[data-mw] attribute link_refs = span_ref.find_all("link") @@ -293,7 +324,9 @@ def __extract_footnote_references__(self): # TODO What is held in these elements, specifically? is it books? span_refs = span_ref.find_all("span", class_="Z3988") for span_ref in span_refs: - app.logger.debug(f"found span.Z3988...") + + # app.logger.debug(f"found span.Z3988...") + span_data = span_ref.get("title") if span_data: span_template = self.__parse_span_template__(span_data) @@ -489,7 +522,7 @@ def __parse_span_template__(self, span_data) -> Dict[str, Any] or None: span_list = span_data.split("&") - app.logger.debug(f"SPAN DATA (parsed):") + # app.logger.debug(f"SPAN DATA (parsed):") span_template = [] # print this string out diff --git a/src/models/v2/wikimedia/wikipedia/reference/__init__.py b/src/models/v2/wikimedia/wikipedia/reference/__init__.py index 14cb37b8..323910c5 100644 --- a/src/models/v2/wikimedia/wikipedia/reference/__init__.py +++ b/src/models/v2/wikimedia/wikipedia/reference/__init__.py @@ -7,7 +7,7 @@ from mwparserfromhell.nodes import Tag # type: ignore from mwparserfromhell.wikicode import Wikicode # type: ignore -from config import link_extraction_regex +from config import regex_url_link_extraction from src.models.base.job import JobBaseModel from src.models.exceptions import MissingInformationError from src.models.v2.wikimedia.wikipedia.reference.template import WikipediaTemplateV2 @@ -162,7 +162,7 @@ def is_footnote_reference(self): return True @property - def get_wikicode_as_string(self): + def wikicode_as_string(self): return str(self.wikicode) @property @@ -253,23 +253,26 @@ def __extract_unique_first_level_domains__(self) -> None: """This aggregates all first level domains from the urls found in the urls""" from src import app - app.logger.debug("__extract_first_level_domains__: running") + app.logger.debug("==> __extract_unique_first_level_domains__") + if not self.reference_urls: - app.logger.info("no reference_urls found so we skip extraction") + app.logger.debug("no reference_urls found so we skip extraction") + else: - logger.debug("found at least one url") + app.logger.debug("found at least one url") first_level_domains = set() for url in self.reference_urls: - logger.debug("working on url") if url.first_level_domain: - app.logger.debug(f"found fld: {url.first_level_domain}") + # app.logger.debug(f"found fld: {url.first_level_domain}") first_level_domains.add(url.first_level_domain) else: - app.logger.warning(f"no fld found for: {url.url}") + # app.logger.warning(f"no fld found for: {url.url}") + pass + # Return unique domains to avoid confusion # https://github.com/internetarchive/iari/issues/834 self.unique_first_level_domains = list(first_level_domains) - app.logger.debug(f"found unique flds: {self.unique_first_level_domains}") + # app.logger.debug(f"found unique flds: {self.unique_first_level_domains}") # @property # def plain_text_in_reference(self) -> bool: @@ -292,7 +295,7 @@ def __find_bare_urls_outside_templates__(self) -> List[str]: 
stripped_wikicode = str(self.wikicode.strip_code()) logger.debug(stripped_wikicode) return re.findall( - link_extraction_regex, + regex_url_link_extraction, stripped_wikicode, ) else: @@ -317,7 +320,7 @@ def __extract_templates_and_parameters__(self) -> None: from src import app app.logger.debug( - "__extract_templates_and_parameters_from_raw_reference__: running" + "==> __extract_templates_and_parameters_from_raw_reference__" ) self.__extract_raw_templates__() self.__extract_and_clean_template_parameters__() @@ -326,9 +329,10 @@ def __extract_templates_and_parameters__(self) -> None: def __extract_raw_templates__(self) -> None: """Extract the templates from self.wikicode""" from src import app + app.logger.debug("==> __extract_raw_templates__") self.templates = [] - app.logger.debug("__extract_raw_templates__: running") + if not self.wikicode: raise MissingInformationError("self.wikicode was None") if isinstance(self.wikicode, str): @@ -338,7 +342,7 @@ def __extract_raw_templates__(self) -> None: if self.is_footnote_reference and ( "" not in wikicode_string or ">" in wikicode_string ): - logger.info(f"Skipping named reference with no content {wikicode_string}") + logger.info(f"Skipping empty named reference {wikicode_string}") self.is_named_reused_reference = True else: logger.debug(f"Extracting templates from: {self.wikicode}") @@ -357,7 +361,7 @@ def __extract_raw_templates__(self) -> None: for raw_template in raw_templates: count += 1 self.templates.append( - WikipediaTemplate( + WikipediaTemplateV2( raw_template=raw_template, language_code=self.language_code ) ) @@ -368,7 +372,7 @@ def __extract_and_clean_template_parameters__(self) -> None: """We extract all templates""" from src import app - app.logger.debug("__extract_and_clean_template_parameters__: running") + app.logger.debug("==> __extract_and_clean_template_parameters__") if self.templates: [ template.extract_and_prepare_parameter_and_flds() @@ -379,7 +383,7 @@ def extract_and_check(self) -> None: """Helper method""" from src import app - app.logger.debug("extract_and_check: running") + app.logger.debug("==> extract_and_check") self.__parse_xhtml__() self.__extract_xhtml_comments__() self.__extract_templates_and_parameters__() diff --git a/src/models/v2/wikimedia/wikipedia/reference/extractor_v2.py b/src/models/v2/wikimedia/wikipedia/reference/extractor_v2.py index 7e9d4ce6..bc9736f6 100644 --- a/src/models/v2/wikimedia/wikipedia/reference/extractor_v2.py +++ b/src/models/v2/wikimedia/wikipedia/reference/extractor_v2.py @@ -34,7 +34,7 @@ class WikipediaReferenceExtractorV2(WariBaseModel): wikitext: str wikicode: Wikicode = None # wiki object tree parsed from wikitext - html_source: str = "" # used to extract citeref reference data + html_source: Optional[str] = "" # used to extract citeref reference data references: Optional[List[WikipediaReferenceV2]] = None # cite_page_refs: Optional[List] = [] @@ -197,7 +197,7 @@ def extract_all_references(self): """Extract all references from self.wikitext""" from src import app - app.logger.debug("extract_all_references: running") + app.logger.debug("==> WikipediaReferenceExtractorV2:: extract_all_references") if not self.job: raise MissingInformationError("no job") self.__parse_wikitext__() diff --git a/src/models/v2/wikimedia/wikipedia/reference/reference_lite_v2.py b/src/models/v2/wikimedia/wikipedia/reference/reference_lite_v2.py new file mode 100644 index 00000000..adade837 --- /dev/null +++ b/src/models/v2/wikimedia/wikipedia/reference/reference_lite_v2.py @@ -0,0 +1,273 @@ +import 
hashlib +import logging +import re +from typing import Any, Dict, List, Optional, Union + +from bs4 import BeautifulSoup, Comment +from mwparserfromhell.nodes import Tag # type: ignore +from mwparserfromhell.wikicode import Wikicode # type: ignore + +from config import regex_url_link_extraction +from src.models.base.job import JobBaseModel +from src.models.exceptions import MissingInformationError +from src.models.v2.wikimedia.wikipedia.reference.template import WikipediaTemplateV2 +from src.models.v2.wikimedia.wikipedia.url_v2 import WikipediaUrlV2 +from src.models.wikimedia.wikipedia.reference.enums import ( + FootnoteSubtype, + ReferenceType, +) + +logger = logging.getLogger(__name__) + + +# We use marshmallow here because pydantic did not seem to support optional alias fields. +# https://github.com/samuelcolvin/pydantic/discussions/3855 + + +class WikipediaReferenceLiteV2(JobBaseModel): + """ + models a reference on a Wikipedia page + See class WikipediaReferenceV2(JobBaseModel) for what this was based on + + This is very simple for now. we just have a name (of the reference, if there) and wikitext + + we validate with pydantic when creating this object + """ + + wikicode: Union[Tag, Wikicode] # output from mwparserfromhell + + name: str = "" + wikitext: str = "" + + # This is for pydantic + class Config: # dead: disable + arbitrary_types_allowed = True # dead: disable + + @property + def get_name(self) -> str: + if not self.soup: + raise MissingInformationError() + # Find the tag + ref_tag = self.soup.find("ref") + if ref_tag: + # Extract the value of the 'name' attribute + name = str(ref_tag.get("name")) # type: ignore # see https://github.com/python/typeshed/issues/8356 + if name.endswith("\\"): + # Cut off the trailing backward slash + name = name[:-1] + if name.endswith("/"): + # Cut off the trailing forward slash + name = name[:-1] + if name == "None" or name is None: + return "" + else: + return name + else: + return "" + + + @property + def wikicode_as_string(self): + return str(self.wikicode) + + def __parse_xhtml__(self): + self.soup = BeautifulSoup(str(self.wikicode), "lxml") + + def __extract_template_urls__(self) -> None: + self.template_urls = [] + urls = [] + if self.templates: + for template in self.templates: + if template.urls: + urls.extend(template.urls) + self.template_urls = urls + + def __extract_bare_urls_outside_templates__(self) -> None: + """This is a slightly more sophisticated and slower search for bare URLs using a regex""" + self.bare_urls = [] + urls = [] + for url in self.__find_bare_urls_outside_templates__(): + url_object = WikipediaUrlV2(url=url) + url_object.extract() + urls.append(url_object) + self.bare_urls = urls + + # def __extract_external_wikicoded_links_from_the_reference__(self) -> None: + # """ + # Uses mwparserfromhell's ifilter_external_links function (via wikicode.ifilter_external_links) + # returns iterator of external links found in the wikicode, like [google.com Google] + # """ + # self.wikicoded_links = [] + # urls = set() + # + # # Check if self.wikicode is an instance of Wikicode + # if isinstance(self.wikicode, Wikicode): + # # Get external links directly from self.wikicode + # links = self.wikicode.ifilter_external_links() + # else: + # # Get external links from the contents of self.wikicode + # links = self.wikicode.contents.ifilter_external_links() + # + # for url in links: + # # url: ExternalLink + # # we throw away the title here + # url = WikipediaUrlV2(url=str(url.url)) + # url.extract() + # urls.add(url) + # + # 
self.wikicoded_links = list(urls) + + # def __extract_reference_urls__(self) -> None: + # """We support both URLs in templates and outside aka bare URLs""" + # urls_list = [] + # + # if not self.template_urls: + # self.__extract_template_urls__() + # if self.template_urls: + # urls_list.extend(self.template_urls) + # + # if not self.bare_urls: + # self.__extract_bare_urls_outside_templates__() + # if self.bare_urls: + # urls_list.extend(self.bare_urls) + # + # if not self.wikicoded_links: + # self.__extract_external_wikicoded_links_from_the_reference__() + # if self.wikicoded_links: + # urls_list.extend(self.wikicoded_links) + # + # # if not self.comment_urls: + # # self.__extract_urls_from_comments__() + # # urls_list.extend(self.comment_urls) + # # We set it to avoid duplicates + # + # self.reference_urls = list(set(urls_list)) + # + # def __extract_unique_first_level_domains__(self) -> None: + # """This aggregates all first level domains from the urls found in the urls""" + # from src import app + # + # app.logger.debug("__extract_first_level_domains__: running") + # if not self.reference_urls: + # app.logger.info("no reference_urls found so we skip extraction") + # else: + # logger.debug("found at least one url") + # first_level_domains = set() + # for url in self.reference_urls: + # logger.debug("working on url") + # if url.first_level_domain: + # app.logger.debug(f"found fld: {url.first_level_domain}") + # first_level_domains.add(url.first_level_domain) + # else: + # app.logger.warning(f"no fld found for: {url.url}") + # # Return unique domains to avoid confusion + # # https://github.com/internetarchive/iari/issues/834 + # self.unique_first_level_domains = list(first_level_domains) + # app.logger.debug(f"found unique flds: {self.unique_first_level_domains}") + + + # def __find_bare_urls_outside_templates__(self) -> List[str]: + # """Return bare urls from the the stripped wikitext (templates are stripped away)""" + # if isinstance(self.wikicode, Wikicode): + # stripped_wikicode = str(self.wikicode.strip_code()) + # logger.debug(stripped_wikicode) + # return re.findall( + # regex_url_link_extraction, + # stripped_wikicode, + # ) + # else: + # return [] + + # + # def __extract_templates_and_parameters__(self) -> None: + # """Helper method""" + # from src import app + # + # app.logger.debug( + # "__extract_templates_and_parameters_from_raw_reference__: running" + # ) + # self.__extract_raw_templates__() + # self.__extract_and_clean_template_parameters__() + # self.extraction_done = True + + # def __extract_raw_templates__(self) -> None: + # """Extract the templates from self.wikicode""" + # from src import app + # + # self.templates = [] + # app.logger.debug("__extract_raw_templates__: running") + # if not self.wikicode: + # raise MissingInformationError("self.wikicode was None") + # if isinstance(self.wikicode, str): + # raise MissingInformationError("self.wikicode was str") + # # Skip named references like "" + # wikicode_string = str(self.wikicode) + # if self.is_footnote_reference and ( + # "" not in wikicode_string or ">" in wikicode_string + # ): + # logger.info(f"Skipping named reference with no content {wikicode_string}") + # self.is_named_reused_reference = True + # else: + # logger.debug(f"Extracting templates from: {self.wikicode}") + # if isinstance(self.wikicode, Tag): + # # contents is needed here to get a Wikicode object + # raw_templates = self.wikicode.contents.ifilter_templates( + # matches=lambda x: not x.name.lstrip().startswith("#"), + # recursive=True, + # ) + # 
else: + # raw_templates = self.wikicode.ifilter_templates( + # matches=lambda x: not x.name.lstrip().startswith("#"), + # recursive=True, + # ) + # count = 0 + # for raw_template in raw_templates: + # count += 1 + # self.templates.append( + # WikipediaTemplate( + # raw_template=raw_template, language_code=self.language_code + # ) + # ) + # if count == 0: + # logger.debug("Found no templates") + + # def __extract_and_clean_template_parameters__(self) -> None: + # """We extract all templates""" + # from src import app + # + # app.logger.debug("__extract_and_clean_template_parameters__: running") + # if self.templates: + # [ + # template.extract_and_prepare_parameter_and_flds() + # for template in self.templates + # ] + + def extract_and_check(self) -> None: + """Helper method""" + from src import app + + app.logger.debug("extract_and_check: running") + self.__parse_xhtml__() + # self.__extract_xhtml_comments__() + # self.__extract_templates_and_parameters__() + # self.__extract_reference_urls__() + # self.__extract_unique_first_level_domains__() + # self.__generate_reference_id__() + + # def extract_and_check(self) -> None: + # """Helper method""" + # from src import app + # + # app.logger.debug("extract_and_check: running") + # self.__parse_xhtml__() + # self.__extract_xhtml_comments__() + # self.__extract_templates_and_parameters__() + # self.__extract_reference_urls__() + # self.__extract_unique_first_level_domains__() + # self.__generate_reference_id__() + + def __generate_reference_id__(self) -> None: + """This generates an 8-char long id based on the md5 hash of + the raw wikitext for this reference""" + self.reference_id = hashlib.md5(f"{self.wikicode}".encode()).hexdigest()[:8] + diff --git a/src/models/v2/wikimedia/wikipedia/section_v2.py b/src/models/v2/wikimedia/wikipedia/section_v2.py index 967ab3b1..e28e6dd0 100644 --- a/src/models/v2/wikimedia/wikipedia/section_v2.py +++ b/src/models/v2/wikimedia/wikipedia/section_v2.py @@ -68,42 +68,48 @@ def __extract_name_from_line__(line): def __extract_all_general_references__(self): from src import app - app.logger.debug("__extract_all_general_references__: running") - if self.is_general_reference_section: - app.logger.info("Regex match on section name") - # Discard the header line - lines = self.wikitext.split("\n") - lines_without_heading = lines[1:] - logger.debug( - f"Extracting {len(lines_without_heading)} lines form section {lines[0]}" - ) - for line in lines_without_heading: - logger.info(f"Working on line: {line}") - # Guard against empty line - # logger.debug("Parsing line") - # We discard all lines not starting with a star to avoid all - # categories and other templates not containing any references - if line and self.star_found_at_line_start(line=line): - parsed_line = mwparserfromhell.parse(line) - logger.debug("Appending line with star to references") - # We don't know what the line contains besides a start - # but we assume it is a reference - reference = WikipediaReference( - wikicode=parsed_line, - # wikibase=self.wikibase, - testing=self.testing, - language_code=self.language_code, - is_general_reference=True, - section=self.name, - ) - reference.extract_and_check() - self.references.append(reference) + app.logger.debug("==> WikipediaSectionV2::__extract_all_general_references__") + + # bail if this section is not a "general reference" section + # i'm not sure we need to filter this here, as we want to do all sections, i believe + # so, i'm deleting this for now + ### if not self.is_general_reference_section: + ### return 
+ + app.logger.info(f"processing section {self.name}") + + # Discard the header line + lines = self.wikitext.split("\n") + lines_without_heading = lines[1:] + logger.debug( + f"Extracting {len(lines_without_heading)} lines from section {lines[0]}" + ) + for line in lines_without_heading: + # Guard against empty line + + # logger.info(f"Working on line: {line}") + # Discard all lines not starting with a star to avoid categories and other templates + # not containing any references + if line and self.star_found_at_line_start(line=line): + parsed_line = mwparserfromhell.parse(line) + logger.debug("Appending line with star to references") + # We don't know what the line contains besides a star + # but we assume it is a reference + reference = WikipediaReferenceV2( + wikicode=parsed_line, + testing=self.testing, + language_code=self.language_code, + is_general_reference=True, + section=self.name, + ) + reference.extract_and_check() + self.references.append(reference) def __extract_all_footnote_references__(self): """This extracts all ... from self.wikicode""" from src import app - app.logger.debug("__extract_all_footnote_references__: running") + app.logger.debug("==> __extract_all_footnote_references__") # Thanks to https://github.com/JJMC89, # see https://github.com/earwig/mwparserfromhell/discussions/295#discussioncomment-4392452 @@ -124,7 +130,7 @@ def __extract_all_footnote_references__(self): app.logger.debug(f"extracting ref# {base_ref_counter}") # app.logger.debug(f"### ### ###") - reference = WikipediaReference( + reference = WikipediaReferenceV2( wikicode=ref, # wikibase=self.wikibase, testing=self.testing, @@ -147,14 +153,14 @@ def extract(self): def __populate_wikitext__(self): from src import app - app.logger.debug("__populate_wikitext__: running") + if self.wikicode and not self.wikitext: self.wikitext = str(self.wikicode) def __parse_wikitext__(self): from src import app - app.logger.debug("__parse_wikitext__: running") + if self.wikitext and not self.wikicode: self.wikicode = mwparserfromhell.parse(self.wikitext) diff --git a/src/models/v2/wikimedia/wikipedia/url_v2.py b/src/models/v2/wikimedia/wikipedia/url_v2.py index e710eed0..3a1ef2a4 100644 --- a/src/models/v2/wikimedia/wikipedia/url_v2.py +++ b/src/models/v2/wikimedia/wikipedia/url_v2.py @@ -38,7 +38,6 @@ class WikipediaUrlV2(BaseModel): @property def __is_wayback_machine_url__(self): - logger.debug("is_wayback_machine_url: running") return bool("//web.archive.org" in self.url) @property @@ -58,7 +57,7 @@ def __lt__(self, other): return self.url < other.url def __parse_extract_and_validate__(self) -> None: - logger.debug("__parse_extract_and_validate__: running") + logger.debug("==> __parse_extract_and_validate__") if self.__is_wayback_machine_url__: self.__parse_wayback_machine_url__() self.__parse_and_extract_url__() @@ -68,7 +67,7 @@ def __parse_extract_and_validate__(self) -> None: def __extract_first_level_domain__(self) -> None: from src import app - app.logger.debug("__extract_first_level_domain__: Running") + app.logger.debug("==> __extract_first_level_domain__") try: self.__get_fld__() except (TldBadUrl, TldDomainNotFound): @@ -97,7 +96,8 @@ def __check_scheme__(self): self.malformed_url = True self.malformed_url_details = MalformedUrlError.UNRECOGNIZED_SCHEME else: - logger.debug(f"Found valid urlscheme: {self.scheme}") + # logger.debug(f"Found valid urlscheme: {self.scheme}") + pass def __extract_tld__(self): if not self.netloc: @@ -147,19 +147,19 @@ def __parse_wayback_machine_url__(self): # ) def 
__get_fld__(self): - logger.debug("__get_fld__: running") + # logger.debug("==> __get_fld__") if self.archived_url: - logger.debug(f"Trying to get FLD from {self.archived_url}") + # logger.debug(f"Trying to get FLD from {self.archived_url}") fld = get_fld(self.archived_url) else: - logger.debug(f"Trying to get FLD from {self.url}") + # logger.debug(f"Trying to get FLD from {self.url}") fld = get_fld(self.url) - logger.debug(f"Found FLD: {fld}") + # logger.debug(f"Found FLD: {fld}") self.first_level_domain = fld def extract(self): from src import app - app.logger.debug("extract: running") + # app.logger.debug("==> extract") self.__parse_extract_and_validate__() self.__extract_first_level_domain__() diff --git a/src/models/v2/wikimedia/wikipedia/wiki_page_v2.py b/src/models/v2/wikimedia/wikipedia/wiki_page_v2.py new file mode 100644 index 00000000..419c5223 --- /dev/null +++ b/src/models/v2/wikimedia/wikipedia/wiki_page_v2.py @@ -0,0 +1,301 @@ +# objhect for wiki article +# pass in analyzer +# - extracts refs? +# Ref Analyzer will just grab refs for this purpose +# - can reuse ref parsing existing that takes wikitext +# WikiRefAnalyzer - specific analyzer to this task +# +# wiki_page.analyzer.refs ??? +# +# wiki_page.refs +# - analyzer sets those +# OR +# append analyzer to wiki_page - ??? +# +# +# if not self.page_analyzer: +# self.page_analyzer = WikipediaAnalyzerV2(job=self.job) +# +# self.io.data = self.page_analyzer.get_article_data() +# +# # if article not found, return error as such +# if not self.page_analyzer.article_found: +# return AnalyzerReturnValues.NOT_FOUND.value, 404 +# +# # if article is a redirect, return error as such +# if self.page_analyzer.is_redirect: +# app.logger.debug("found redirect") +# return AnalyzerReturnValues.IS_REDIRECT.value, 400 +# +# app.logger.debug("ArticleV2:: processed article, saving...") + +from datetime import datetime +from typing import Any, Tuple, Dict, List, Optional +import traceback + +import mwparserfromhell +from mwparserfromhell.wikicode import Wikicode + +from src.models.base import WariBaseModel +from src.models.exceptions import MissingInformationError, WikipediaApiFetchError + +from flask_restful import Resource, abort # type: ignore + +from src.models.api.job.article_job import ArticleJob +# from src.models.api.schema.article_schema import ArticleSchema +from src.models.exceptions import MissingInformationError + +from src.models.file_io.article_file_io import ArticleFileIo +from src.models.file_io.references import ReferencesFileIo +from src.models.v2.wikimedia.wikipedia.reference.reference_lite_v2 import WikipediaReferenceLiteV2 + +from src.models.wikimedia.enums import AnalyzerReturnValues, WikimediaDomain +from src.models.wikimedia.wikipedia.analyzer import WikipediaAnalyzer + +# from src.views.statistics.write_view import StatisticsWriteView + + +class WikiArticleV2(WariBaseModel): ## NB NOT based on StatisticsView or StatisticsWriteView like others + """ + we really should have a base class of something like "resource with references" + or something that indicates this is a referencable object + + class for wiki article + it contains space for references + An analyzer consumes this article object and extracts references from it + """ + + job: ArticleJob + wikitext: str + wikicode: Wikicode = None # wiki object tree parsed from wikitext + + page_url: str = "" + page_title: str = "" + page_lang: str = "" + + references: Optional[List[WikipediaReferenceLiteV2]] = None + + def parse_references(self): + self.wikicode = 
mwparserfromhell.parse(self.wikitext) + + + def __extract_sections__(self) -> None: + """This uses the sections regex supplied by the patron via the API + and populates the sections attribute with a list of MediawikiSection objects + + We only consider level 2 sections beginning with ==""" + from src import app + + self.sections = [] + app.logger.debug("__extract_sections__: running") + if not self.wikicode: + self.__parse_wikitext__() + + # all_sections: List[Wikicode] = self.wikicode.get_sections( + # # levels=[2], + # include_headings=True, + # ) + + # section_counter = 0 + # for section in all_sections: + # section_counter += 1 + # app.logger.info(f"All Section #{section_counter}") + # + # self.section_list.append({"id": section_counter, "name": "???"}) + # + # for node in section.filter_headings(): + # header_text = node.title.strip() + # header_level = node.level + # # app.logger.info(f"Section id: {section_counter}, Header: {header_text}, Level: {header_level}") + # app.logger.info(f"Section #: {section_counter} header: {node}") + + sections: List[Wikicode] = self.wikicode.get_sections( + levels=[2], + include_headings=True, + ) + + ''' + loop thru all sections + keeping counter + when level 2 hit, + create a mw_section object + set counter as section_id + ''' + + # TODO: make this code better by special casing no section and making faux section, and putting through same loop + + section_counter = 0 + section_list = [] + + if not sections: + app.logger.debug("No level 2 sections detected, creating root section") + # console.print(self.wikicode) + # exit() + mw_section = MediawikiSection( + # We add the whole article to the root section + wikicode=self.wikicode, + section_id=section_counter, + + job=self.job, + + testing=self.testing, + language_code=self.language_code, + ) + mw_section.extract() + self.sections.append(mw_section) + + else: + app.logger.info(f"Processing section number {section_counter}") + + # append root section as first section in section list + self.__extract_root_section__() + + # append each section to section list + for section in sections: + + section_counter += 1 + + app.logger.info(f"Section: {section}") + + mw_section = MediawikiSection( + wikicode=section, + section_id=section_counter, + + job=self.job, + + testing=self.testing, + language_code=self.language_code, + ) + + mw_section.extract() # pull all refs from section + self.sections.append(mw_section) + + section_list.append({"name": "section name", "counter": section_counter}) + + + app.logger.debug(f"Number of sections found: {len(self.sections)}") + + self.section_info.update({"count": len(self.sections), "list": section_list}) + # self.section_info["count"] = len(self.sections) + # self.section_info["list"] = section_list + + + # def __handle_article_request__(self): + # from src import app + # + # app.logger.info("==> WikiArticleV2::__handle_article_request__: fetching article data and saving to cache") + # + # self.__setup_wikipedia_analyzer__() + # return self.__analyze_and_write_and_return__() + # + # def __analyze_and_write_and_return__(self) -> Tuple[Any, int]: + # """Analyze, calculate the time, write statistics to disk and return it + # If we did not get statistics, return a meaningful error to the patron""" + # from src import app + # + # app.logger.info("==> __analyze_and_write_and_return__") + # + # if not self.wikipedia_page_analyzer: + # raise MissingInformationError("self.wikipedia_page_analyzer was None") + # + # self.__get_statistics__() # populate self.io.data with analysis 
results + # self.__setup_io__() + # self.io.data = self.wikipedia_page_analyzer.get_statistics() + # + # if self.wikipedia_page_analyzer.found: # found === True means article was successfully processed + # app.logger.debug("valid article found and processed") + # + # if self.wikipedia_page_analyzer.is_redirect: + # app.logger.debug("found redirect") + # return AnalyzerReturnValues.IS_REDIRECT.value, 400 + # + # else: + # app.logger.debug("adding time information and returning the statistics") + # self.__update_statistics_with_time_information__() + # # we got a json response + # # according to https://stackoverflow.com/questions/13081532/return-json-response-from-flask-view + # # flask calls jsonify automatically + # self.__write_to_disk__() # writes self.io.dtata to disk + # if not self.io: + # raise MissingInformationError() + # if self.io.data: + # self.io.data["served_from_cache"] = False # append return data + # return self.io.data, 200 + # else: + # raise MissingInformationError() + # else: + # return AnalyzerReturnValues.NOT_FOUND.value, 404 + # + # def __get_statistics__(self): + # """ + # get the results from wikipedia_page_analyzer.get_statistics and save to self.io.data + # """ + # from src import app + # + # app.logger.debug("==> __get_statistics__") + # + # if not self.wikipedia_page_analyzer: + # raise MissingInformationError("self.wikipedia_page_analyzer was None") + # + # # https://realpython.com/python-timer/ + # self.__setup_io__() + # self.io.data = self.wikipedia_page_analyzer.get_statistics() + # + # def __update_statistics_with_time_information__(self): + # """Update the dictionary before returning it""" + # if self.io.data: + # timestamp = datetime.timestamp(datetime.utcnow()) + # self.io.data["timestamp"] = int(timestamp) + # isodate = datetime.isoformat(datetime.utcnow()) + # self.io.data["isodate"] = str(isodate) + # else: + # raise ValueError("not a dict") + # + # def __return_meaningful_error__(self): + # from src import app + # + # app.logger.error("==> __return_meaningful_error__") + # if self.job.title == "": + # return "Title was missing", 400 + # if self.job.domain != "wikipedia": + # return "Only 'wikipedia' site is supported", 400 + # + # def __setup_wikipedia_analyzer__(self): + # if not self.wikipedia_page_analyzer: + # from src import app + # + # app.logger.info(f"Setup analyzer for {self.job.title}...") + # + # # wikipedia_page_analyzer is declared in the StatisticsView class (views/statistics/__init.py) + # # NB This wrong! It should be declared here in the Article class. + # # we fix this in the v2/ArticleV2 code, but not here, since it "works". + # # this is the only place it is called, so it makes no sense to + # # declare it in a base class that other objects that do not use + # # the analysis feature...! 
+ # self.wikipedia_page_analyzer = WikipediaAnalyzer(job=self.job) + # + # def __setup_io__(self): + # self.io = ArticleFileIo(job=self.job) + # + # def __write_to_disk__(self): + # """Write both article json and all reference json files""" + # from src import app + # + # app.logger.debug("__write_to_disk__: running") + # if not self.job.testing: + # self.__write_article_to_disk__() + # self.__write_references_to_disk__() + # + # def __write_article_to_disk__(self): + # article_io = ArticleFileIo( + # job=self.job, + # data=self.io.data, + # wari_id=self.job.wari_id, + # ) + # article_io.write_to_disk() + # + # def __write_references_to_disk__(self): + # references_file_io = ReferencesFileIo( + # references=self.wikipedia_page_analyzer.reference_statistics + # ) + # references_file_io.write_references_to_disk() diff --git a/src/models/wikimedia/enums.py b/src/models/wikimedia/enums.py index 4b90468e..e8c64373 100644 --- a/src/models/wikimedia/enums.py +++ b/src/models/wikimedia/enums.py @@ -59,3 +59,10 @@ class WikimediaDomain(Enum): class AnalyzerReturnValues(Enum): IS_REDIRECT = "No statistic available because this is a redirect." NOT_FOUND = "Article title not found." + + +class RequestMethods(Enum): + # Http request types + get = "get" + post = "post" + diff --git a/src/models/wikimedia/wikipedia/analyzer.py b/src/models/wikimedia/wikipedia/analyzer.py index 5f595cad..61ac8b54 100644 --- a/src/models/wikimedia/wikipedia/analyzer.py +++ b/src/models/wikimedia/wikipedia/analyzer.py @@ -31,6 +31,7 @@ class WikipediaAnalyzer(WariBaseModel): job: Optional[ArticleJob] = None article: Optional[WikipediaArticle] = None + article_statistics: Optional[ArticleStatistics] = None # includes cite_refs property reference_statistics: Optional[List[Dict[str, Any]]] = None @@ -221,7 +222,7 @@ def __gather_reference_statistics__(self): flds=reference.unique_first_level_domains if reference.unique_first_level_domains else [], - wikitext=reference.get_wikicode_as_string, + wikitext=reference.wikicode_as_string, section=reference.section, section_id=reference.section_id, template_names=reference.template_names, @@ -290,7 +291,7 @@ def __get_article_data_for_response__(self): # refs = section.references for ref in section.references: - new_ref ={"wikitext": ref.get_wikicode_as_string} + new_ref ={"wikitext": ref.wikicode_as_string} new_refs.append(new_ref) sections.append(new_section) diff --git a/src/models/wikimedia/wikipedia/article.py b/src/models/wikimedia/wikipedia/article.py index 063302ce..28a552e4 100644 --- a/src/models/wikimedia/wikipedia/article.py +++ b/src/models/wikimedia/wikipedia/article.py @@ -31,6 +31,8 @@ class WikipediaArticle(WariBaseModel): because of https://github.com/internetarchive/wcdimportbot/issues/261""" + job: ArticleJob + md5hash: Optional[str] page_id: int = 0 wdqid: str = "" @@ -46,8 +48,6 @@ class WikipediaArticle(WariBaseModel): # extractor: Optional[Any] = None # TODO: FIXFIX - job: ArticleJob - ores_quality_prediction: str = "" ores_details: Optional[Dict] = None @@ -95,8 +95,7 @@ def url(self): def fetch_and_extract_and_parse(self): from src import app - app.logger.debug("==> fetch_and_extract_and_parse") - app.logger.info("Extracting templates and parsing references") + app.logger.debug("==> WikipediaArticle::fetch_and_extract_and_parse") if not self.wikitext: # fetch page data from Wikipedia if we don't already have wikitext @@ -106,6 +105,7 @@ def fetch_and_extract_and_parse(self): logger.debug( "Skipped extraction and parsing because the article is a redirect" ) + 
elif not self.found_in_wikipedia: logger.debug( "Skipped extraction and parsing because the article was not found" @@ -143,7 +143,8 @@ def __fetch_page_data__(self) -> None: and date from the MediaWiki REST v1 API if needed""" from src import app - app.logger.debug("__fetch_page_data__: Running") + app.logger.debug("==> __fetch_page_data__: Running") + self.__check_if_title_is_empty__() if not self.wikitext: if self.revision_id: @@ -173,7 +174,7 @@ def __get_title_from_wikidata__(self): def __check_if_title_is_empty__(self): if not self.job.title: - raise MissingInformationError("self.job.title was empty string") + raise MissingInformationError("WikipediaArticle: self.job.title is empty") def __get_ores_scores__(self): self.ores_details = {} diff --git a/src/models/wikimedia/wikipedia/reference/generic.py b/src/models/wikimedia/wikipedia/reference/generic.py index 4195a11c..bfa385fb 100644 --- a/src/models/wikimedia/wikipedia/reference/generic.py +++ b/src/models/wikimedia/wikipedia/reference/generic.py @@ -7,7 +7,7 @@ from mwparserfromhell.nodes import Tag # type: ignore from mwparserfromhell.wikicode import Wikicode # type: ignore -from config import link_extraction_regex +from config import regex_url_link_extraction from src.models.base.job import JobBaseModel from src.models.exceptions import MissingInformationError from src.models.wikimedia.wikipedia.reference.enums import ( @@ -168,7 +168,7 @@ def is_footnote_reference(self): return not self.is_general_reference @property - def get_wikicode_as_string(self): + def wikicode_as_string(self): return str(self.wikicode) @property @@ -298,7 +298,7 @@ def __find_bare_urls_outside_templates__(self) -> List[str]: stripped_wikicode = str(self.wikicode.strip_code()) logger.debug(stripped_wikicode) return re.findall( - link_extraction_regex, + regex_url_link_extraction, stripped_wikicode, ) else: diff --git a/src/models/wikimedia/wikipedia/url.py b/src/models/wikimedia/wikipedia/url.py index 5ccb27ca..aaa85c9a 100644 --- a/src/models/wikimedia/wikipedia/url.py +++ b/src/models/wikimedia/wikipedia/url.py @@ -67,7 +67,7 @@ def __parse_extract_and_validate__(self) -> None: def __extract_first_level_domain__(self) -> None: from src import app - app.logger.debug("__extract_first_level_domain__: Running") + app.logger.debug("==> __extract_first_level_domain__") try: self.__get_fld__() except (TldBadUrl, TldDomainNotFound): diff --git a/src/views/check_doi.py b/src/views/check_doi.py index ddaace38..0e9dd5f5 100644 --- a/src/views/check_doi.py +++ b/src/views/check_doi.py @@ -33,7 +33,7 @@ def get(self): Every branch in this method has to return a tuple (Any,response_code)""" from src import app - app.logger.debug("get: running") + app.logger.debug("==> CheckDoi::get") self.__validate_and_get_job__() if self.job: return self.__return_from_cache_or_analyze_and_return__() diff --git a/src/views/v2/fetchrefs_v2.py b/src/views/v2/fetchrefs_v2.py new file mode 100644 index 00000000..90f3ffe4 --- /dev/null +++ b/src/views/v2/fetchrefs_v2.py @@ -0,0 +1,144 @@ +# from flask_restful import Resource, abort # type: ignore +# from marshmallow import Schema +from datetime import datetime +from typing import Any, Optional, Tuple, List, Dict +import traceback + +from dateutil.parser import isoparse + +import config +import requests + +from src.models.exceptions import MissingInformationError, WikipediaApiFetchError +from src.models.v2.job.article_job_v2 import ArticleJobV2 + +from src.models.v2.schema.fetchrefs_schema_v2 import FetchRefsSchemaV2 +from 
src.models.v2.job.fetchrefs_job_v2 import FetchRefsJobV2 +# from src.models.v2.wikimedia.wikipedia.wiki_page_v2 import WikiArticleV2 + +from src.models.api.job.article_job import ArticleJob +from src.models.v2.wikimedia.wikipedia.article_v2 import WikipediaArticleV2 +from src.models.wikimedia.wikipedia.article import WikipediaArticle + +from src.views.v2.statistics import StatisticsViewV2 +from src.models.wikimedia.enums import RequestMethods + + +class FetchRefsV2(StatisticsViewV2): + + """ + takes an array of page specifiers, and + returns data for all citations for each page. + + """ + + schema = FetchRefsSchemaV2() # Defines expected parameters; Overrides StatisticsViewV2's "schema" property + job: FetchRefsJobV2 # Holds usable variables, seeded from schema. Overrides StatisticsViewV2's "job" + + pages: List[Dict[str, Any]] = [] # contents parsed from pipe-delimited "pages" URL parameter + + def get(self): + """ + flask GET entrypoint for returning fetchrefs results + must return a tuple: (Any,response_code) + """ + from src import app + app.logger.debug(f"==> FetchRefsV2::get") + + return self.__process_request__(method=RequestMethods.get) + # return {"errors": [ + # {"error": "GET method not supported for this endpoint"} + # ]} + + def post(self): + """ + flask POST entrypoint for returning fetchrefs results + must return a tuple: (Any,response_code) + """ + from src import app + app.logger.debug(f"==> FetchRefsV2::post") + + # return self.__process_data__(method="post") + return self.__process_request__(method=RequestMethods.post) + + + def __process_request__(self, method=RequestMethods.post): # default to POST + from src import app + app.logger.debug(f"==> FetchRefsV2::__process_request__, method = {method}") + + try: + self.__validate_and_get_job__(method) # inherited from StatisticsViewV2 + + self.pages = [] + + # process pages, get refs, sets self.pages data + for page in self.job.pages: + page_results = self.__get_page_data__(page) + # append page ref data to pages result + self.pages.append(page_results) + + # and return results + return {"pages": self.pages} + + + except MissingInformationError as e: + traceback.print_exc() + return {"error": f"Missing Information Error: {str(e)}"}, 500 + + except Exception as e: + traceback.print_exc() + return {"error": f"General Error: {str(e)}"}, 500 + + def __get_page_data__(self, page_title): + """ + page_title is a wiki page title, such as "Easter_Island"; it is expanded into the full article url (e.g. https://en.wikipedia.org/wiki/Easter_Island) before fetching + """ + + try: + # process page + + url_template = "https://{lang}.{wiki_domain}/wiki/{page_title}" # TODO make this a global + page_url = url_template.format(page_title=page_title, lang="en", wiki_domain="wikipedia.org") + + article_job = ArticleJobV2(url=page_url) + article_job.__extract_url__() + + # get article object corresponding to page + # page = WikiArticleV2(job=article_job) + + page = WikipediaArticleV2(job=article_job) + page.fetch_and_parse() + + # loop thru references + page_refs = [] + if page.extractor: + for ref in page.extractor.references: + page_refs.append({ + "name": ref.get_name, + "wikitext": ref.wikicode_as_string + }) + + except WikipediaApiFetchError as e: + return { + "page_title": page_title, + "which_wiki": self.job.which_wiki, + "error": f"Page data error: {str(e)}" + } + + except Exception as e: + traceback.print_exc() + return { + "page_title": page_title, + "which_wiki": self.job.which_wiki, + "error": f"General error: {str(e)}" + } + + return { + "page_title": page_title, + "which_wiki": self.job.which_wiki, + +
"refs": page_refs, + } + + + diff --git a/src/views/v2/statistics/__init__.py b/src/views/v2/statistics/__init__.py index 0ad66511..76861922 100644 --- a/src/views/v2/statistics/__init__.py +++ b/src/views/v2/statistics/__init__.py @@ -1,5 +1,5 @@ from datetime import datetime -from typing import Optional +from typing import Optional, Any from flask import request from flask_restful import Resource, abort # type: ignore @@ -10,6 +10,7 @@ from src.models.api.job import Job from src.models.exceptions import MissingInformationError from src.models.file_io import FileIo +from src.models.wikimedia.enums import RequestMethods class StatisticsViewV2(Resource): @@ -28,6 +29,7 @@ class StatisticsViewV2(Resource): job: Optional[Job] # loads parameters via schema.load io: Optional[FileIo] = None # derived class must implement __setup_io__ + request_args: Any = {} time_of_analysis: Optional[datetime] = None def __setup_io__(self): @@ -42,39 +44,51 @@ def __read_from_cache__(self): if self.io: self.io.read_from_disk() - def __validate_and_get_job__(self, method="get"): + def __validate_and_get_job__(self, method=RequestMethods.get): """ Validates request params, whether from GET or POST, and, if successful, pulls param values into job's properties """ from src import app - app.logger.debug(f"==> StatisticsViewV2::__validate_and_get_job__({method})") + app.logger.debug(f"==> StatisticsViewV2::__validate_and_get_job__(method = {method})") - # use args if GET, form if POST - request_args = request.args if (method == "get") else request.form + self.schema.context['request_method'] = request.method + app.logger.debug(f"==> StatisticsViewV2::__validate_and_get_job__: request.method = {request.method})") - self.__validate__(request_args) - self.__parse_into_job__(request_args) + # self.request_method = method + # use request.args if GET, request.form if POST + # self.request_args = request.args if (method == RequestMethods.get) else request.form + self.request_args = request.args if (request.method == "GET") else request.form - def __validate__(self, request_args): + app.logger.debug(f"==> StatisticsViewV2::__validate_and_get_job__: request_args: {self.request_args}") + + # self.__validate__(request_args) + # self.__parse_into_job__(request_args) + self.__validate__() + self.__parse_into_job__() + + def __validate__(self): from src import app - app.logger.debug(f"==> StatisticsViewV2::__validate__({request_args})") + app.logger.debug(f"==> StatisticsViewV2::__validate__({self.request_args})") - errors = self.schema.validate(request_args) + errors = self.schema.validate(self.request_args) if errors: app.logger.debug(f"Validation errors: {errors}") raise MissingInformationError(errors) - def __parse_into_job__(self, request_args): + # def __parse_into_job__(self, request_args): + def __parse_into_job__(self): from src import app - app.logger.debug(f"==> StatisticsViewV2::__parse_into_job__({request_args})") + app.logger.debug(f"==> StatisticsViewV2::__parse_into_job__({self.request_args})") if not self.schema: raise MissingInformationError("No schema set for StatisticsViewV2") - self.job = self.schema.load(request_args) + self.schema.context['request_method'] = request.method + + self.job = self.schema.load(self.request_args) # returns a job object, populated with field values mapped from request_args if not self.job: diff --git a/tests/wikipedia/reference/test_english_wikipedia_page_reference.py b/tests/wikipedia/reference/test_english_wikipedia_page_reference.py index ec73d55c..3535721d 100644 --- 
a/tests/wikipedia/reference/test_english_wikipedia_page_reference.py +++ b/tests/wikipedia/reference/test_english_wikipedia_page_reference.py @@ -882,7 +882,7 @@ def test_get_wikicode_as_string_empty(self): for ref in refs: print(ref) raw_reference_object = WikipediaReference(tag=ref, testing=True) - assert raw_reference_object.get_wikicode_as_string == ref + assert raw_reference_object.wikicode_as_string == ref def test_get_wikicode_as_string_nonempty(self): wikitext = ( @@ -894,7 +894,7 @@ def test_get_wikicode_as_string_nonempty(self): refs = wikicode.filter_tags(matches=lambda tag: tag.lower() == "ref") for ref in refs: raw_reference_object = WikipediaReference(tag=ref, testing=True) - assert raw_reference_object.get_wikicode_as_string == ref + assert raw_reference_object.wikicode_as_string == ref def test_is_footnote_reference(self): ref = "{{citeq|Q1}}"
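Usage sketch for the new /v2/fetchrefs endpoint registered in src/__init__.py and implemented in src/views/v2/fetchrefs_v2.py. This is a minimal, hypothetical client illustration, not part of the patch: the base URL and the second page title are assumptions, and it presumes BaseSchemaV2 adds no further required parameters. The endpoint path, the pipe-delimited "pages" parameter, "which_wiki", and the response shape ({"pages": [...]} with per-page "refs" or "error") follow FetchRefsSchemaV2 and FetchRefsV2 above.

import requests

# Assumption: an IARI instance running locally; adjust the base URL for your deployment.
IARI_BASE = "http://localhost:5000"

# "pages" is pipe-delimited; FetchRefsSchemaV2.process_input splits it on "|".
response = requests.get(
    f"{IARI_BASE}/v2/fetchrefs",
    params={
        "which_wiki": "enwiki",
        "pages": "Easter_Island|Coffee",  # "Easter_Island" mirrors the docstring example; "Coffee" is arbitrary
    },
    timeout=60,
)
response.raise_for_status()
payload = response.json()

# Each entry in payload["pages"] carries either a "refs" list or an "error" string,
# along with the echoed "page_title" and "which_wiki" values.
for page in payload["pages"]:
    if "error" in page:
        print(f'{page["page_title"]}: {page["error"]}')
        continue
    for ref in page["refs"]:
        print(page["page_title"], ref["name"], ref["wikitext"][:80])

A POST with the same fields sent as form data takes the identical __process_request__ path, since StatisticsViewV2.__validate_and_get_job__ now reads request.form when request.method is "POST".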