
Commit

Merge branch 'multifetch' into main
mojomonger committed Aug 26, 2024
2 parents 936bde9 + 393b39f commit b2ea1a9
Showing 21 changed files with 1,013 additions and 115 deletions.
13 changes: 8 additions & 5 deletions src/__init__.py
@@ -17,7 +17,7 @@
import config
from src.models.exceptions import MissingInformationError, WikipediaApiFetchError

# old stuff...
# legacy endpoints stuff...
from src.views.check_doi import CheckDoi
from src.views.check_url import CheckUrl
from src.views.check_url_archive import CheckUrlArchive
@@ -36,6 +36,8 @@
from src.views.v2.article_cache_view_v2 import ArticleCacheV2
# new stuff jun 2024
from src.views.v2.editref_v2 import EditRefV2
# new stuff jul 2024
from src.views.v2.fetchrefs_v2 import FetchRefsV2

logging.basicConfig(level=config.loglevel)
logger = logging.getLogger(__name__)
@@ -50,18 +52,19 @@ def add_cors_headers(response):
response.headers["Access-Control-Allow-Headers"] = "Content-Type"
return response

# Register CORS function as an after_request handler
app.after_request(add_cors_headers)


# let's see if we can distinguish which server we are on
server_name = os.getenv('FLASK_SERVER_NAME', 'Unknown Server')

# Register the function as an after_request handler
app.after_request(add_cors_headers)

# We use a prefix here to enable us to stabilize the api over time
# and bump the version when making breaking changes
api = Api(app, prefix="/v2")
api = Api(app, prefix="/v2") # NB TODO This pseudo-versioning should be addressed

# link the API views to respective endpoint urls
api.add_resource(FetchRefsV2, "/fetchrefs")
api.add_resource(EditRefV2, "/editref")

api.add_resource(ArticleV2, "/article")
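For reference, a minimal sketch of calling the newly mounted /v2/fetchrefs route follows. Only the route and the which_wiki/pages/wikitext parameters come from this commit; the host, port, HTTP method, and response shape are assumptions.

import requests

# Hypothetical request to the new endpoint; host/port and the JSON response
# shape are assumed, only the route and parameter names appear in this diff.
resp = requests.get(
    "http://localhost:5000/v2/fetchrefs",
    params={
        "which_wiki": "enwiki",
        # pipe-separated titles; the schema's @pre_load hook splits this into a list
        "pages": "Easter Island|Moon landing",
    },
    timeout=30,
)
print(resp.status_code)
print(resp.json())
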
8 changes: 4 additions & 4 deletions src/models/api/handlers/pdf.py
@@ -15,7 +15,7 @@
# type: ignore
from requests import ReadTimeout

from config import link_extraction_regex
from config import regex_url_link_extraction
from src.models.api.handlers import BaseHandler
from src.models.api.job.check_url_job import UrlJob
from src.models.api.link.pdf_link import PdfLink
@@ -214,7 +214,7 @@ def __extract_links_from_original_text__(self) -> None:
# We remove the linebreaks to avoid clipping of URLs, see https://github.com/internetarchive/iari/issues/766
# provided by chatgpt:
urls = re.findall(
link_extraction_regex,
regex_url_link_extraction,
self.text_pages[index],
)
# cleaned_urls = self.__clean_urls__(urls=urls)
@@ -233,7 +233,7 @@ def __extract_links_from_text_without_linebreaks__(self) -> None:
# We remove the linebreaks to avoid clipping of URLs, see https://github.com/internetarchive/iari/issues/766
# provided by chatgpt:
urls = re.findall(
link_extraction_regex,
regex_url_link_extraction,
self.text_pages_without_linebreaks[index],
)
# cleaned_urls = self.__clean_urls__(urls=urls)
@@ -253,7 +253,7 @@ def __extract_links_from_text_without_spaces__(self) -> None:
# provided by chatgpt:
if self.text_pages_without_spaces:
urls = re.findall(
link_extraction_regex,
regex_url_link_extraction,
self.text_pages_without_spaces[index],
)
# cleaned_urls = self.__clean_urls__(urls=urls)
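The renamed regex_url_link_extraction pattern is only referenced in this file; its definition lives in config and is not part of this commit. The sketch below uses a stand-in pattern purely to illustrate the re.findall usage.

import re

# Stand-in pattern; the real one is defined as config.regex_url_link_extraction
# and is not shown in this diff.
regex_url_link_extraction = r"https?://[^\s)>\]]+"

page_text = "See https://example.org/report.pdf and http://archive.org/details/x"
urls = re.findall(regex_url_link_extraction, page_text)
print(urls)  # ['https://example.org/report.pdf', 'http://archive.org/details/x']
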
8 changes: 6 additions & 2 deletions src/models/api/job/article_job.py
@@ -15,13 +15,17 @@ class ArticleJob(Job):
lang: str = "en"
domain: WikimediaDomain = WikimediaDomain.wikipedia
title: str = ""
revision: int = 0 # this is named just as in the MediaWiki API

page_id: int = 0
refresh: bool = False
url: str = ""

sections: str = "" # string describing which sections to parse
revision: int = 0 # this is named just as in the MediaWiki API

refresh: bool = False
dehydrate: bool = True


@property
def wari_id(self) -> str:
if not self.lang:
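A hypothetical construction of an ArticleJob with the reordered and newly added fields is sketched below; it assumes the Job base class accepts keyword arguments (pydantic-style), and the title and sections values are purely illustrative.

from src.models.api.job.article_job import ArticleJob

job = ArticleJob(
    lang="en",
    title="Easter Island",    # illustrative title
    revision=1234567,         # named just as in the MediaWiki API
    sections="references",    # string describing which sections to parse (illustrative)
    refresh=False,
    dehydrate=True,
)
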
53 changes: 53 additions & 0 deletions src/models/v2/job/fetchrefs_job_v2.py
@@ -0,0 +1,53 @@
import re
from urllib.parse import quote, unquote
from src.models.wikimedia.enums import WikimediaDomain
from src import MissingInformationError
from src.models.v2.job import JobV2
from typing import List


class FetchRefsJobV2(JobV2):
"""job that supports FetchRefsV2 endpoint"""

# using marshmallow to describe parameters

which_wiki: str = ""
pages: List[str] = []
wikitext: str = ""

wiki_domain: WikimediaDomain = WikimediaDomain.wikipedia
wiki_lang: str = ""

wiki_id: str = ""
wiki_page_title: str = ""
wiki_revision: str = ""

# @property
# def quoted_title(self):
# if not self.wiki_page_title:
# raise MissingInformationError("self.wiki_page_title is empty")
# return quote(self.wiki_page_title, safe="")


def validate_fields(self):
"""
parameter checking is done here...
at least one of "pages" or "wikitext" must be defined
"""

from src import app

# app.logger.error('fetchrefs validate_fields: Fake Error')
# raise MissingInformationError(
# f'fetchrefs validate_fields: Fake Error'
# )

if not self.wikitext:
if not self.pages:
raise MissingInformationError(
f"pages or wikitext parameter must be specified"
)
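
The validation contract can be exercised directly. The sketch below assumes FetchRefsJobV2 accepts keyword arguments for its declared fields (pydantic-style); that detail is not shown in this diff.

from src import MissingInformationError
from src.models.v2.job.fetchrefs_job_v2 import FetchRefsJobV2

job = FetchRefsJobV2(which_wiki="enwiki", pages=["Easter Island"])
job.validate_fields()  # passes: pages is non-empty

empty_job = FetchRefsJobV2(which_wiki="enwiki")
try:
    empty_job.validate_fields()
except MissingInformationError as err:
    print(err)  # pages or wikitext parameter must be specified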



54 changes: 54 additions & 0 deletions src/models/v2/schema/fetchrefs_schema_v2.py
@@ -0,0 +1,54 @@
from marshmallow import fields, pre_load, post_load

from src.models.v2.job.fetchrefs_job_v2 import FetchRefsJobV2
from src.models.v2.schema import BaseSchemaV2


class FetchRefsSchemaV2(BaseSchemaV2):
# Defines expected parameters for endpoint
# - default parameters are defined in BaseSchemaV2

which_wiki = fields.Str(default="enwiki")
pages = fields.List(fields.String(), required=False) # either pages or wikitext must be defined
wikitext = fields.Str(required=False) # if provided, overrides pages array

@pre_load
# NB: pre_load is a marshmallow directive;
def process_input(self, data, **kwargs):
"""
transform a pipe-separated pages string into a list
"""
from src import app
app.logger.debug(f"==> FetchRefsSchemaV2::(@pre_load)process_input: data:{data}")

request_method = self.context.get('request_method', None)
# if request_method:
# print(f"Request method received: {request_method}")

app.logger.debug(f"==> FetchRefsSchemaV2::(@pre_load)process_input: request_method:{request_method}")


mutable_data = dict(data) # Convert ImmutableMultiDict to a mutable dict
if 'pages' in mutable_data and isinstance(mutable_data['pages'], str):
mutable_data['pages'] = mutable_data['pages'].split('|')
return mutable_data

# noinspection PyUnusedLocal
@post_load
# NB: post_load is a marshmallow directive;
# this function is run after loading request args
# it basically pulls the request object value into a Job object
#
# **kwargs is needed here despite what the validator claims
def return_job_object(self, data, **kwargs) -> FetchRefsJobV2: # type: ignore # dead: disable
"""Return Job object"""
from src import app
app.logger.debug("==> FetchRefsSchemaV2::@post_load:return_job_object")
app.logger.debug(f"return_job_object data: {data}")

job = FetchRefsJobV2(**data)
job.validate_fields()

# NB here is where we can modify job field values before returning if we want

return job
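
The pipe-splitting behaviour of the @pre_load hook above can be illustrated outside Flask and marshmallow; the helper name and the sample page titles below are made up, but the logic mirrors process_input.

def split_pages(data: dict) -> dict:
    # mirror of the @pre_load hook: copy to a mutable dict, then split a
    # pipe-separated "pages" string into a list of titles
    mutable = dict(data)
    if "pages" in mutable and isinstance(mutable["pages"], str):
        mutable["pages"] = mutable["pages"].split("|")
    return mutable

print(split_pages({"which_wiki": "enwiki", "pages": "Easter Island|Moon landing"}))
# {'which_wiki': 'enwiki', 'pages': ['Easter Island', 'Moon landing']}
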
63 changes: 48 additions & 15 deletions src/models/v2/wikimedia/wikipedia/article_v2.py
@@ -1,8 +1,6 @@
import logging
import re
import pprint
# import urllib
from urllib.parse import quote, unquote
from urllib.parse import unquote

from datetime import datetime
from typing import Any, Dict, List, Optional
@@ -69,6 +67,7 @@ class WikipediaArticleV2(IariBaseModel):

error_items: List[Any] = []

# required pydantic class
class Config: # dead: disable
arbitrary_types_allowed = True # dead: disable
extra = "forbid" # dead: disable
@@ -129,7 +128,7 @@ def fetch_and_parse(self):
"""
from src import app

app.logger.debug("ArticleV2::fetch_and_parse")
app.logger.debug("==> ArticleV2::fetch_and_parse")
app.logger.info("Fetching article data and parsing")

if not self.wikitext:
@@ -138,28 +137,60 @@ def fetch_and_parse(self):
self.__fetch_wikitext__()

if self.is_redirect:
logger.debug(
logger.error(
"Skipped extraction and parsing because the article is a redirect"
)
raise WikipediaApiFetchError("wiki article is a redirect")
raise WikipediaApiFetchError("Wiki article is a redirect")
# TODO Might want to change this from raising an exception,
# but we do want to stop further processing,
# so we need some way of indicating that to the caller

if not self.found_in_wikipedia:
logger.debug(
"Skipped extraction and parsing because the article was not found"
logger.error(
"Skipped extraction and parsing because the article was not found in wiki"
)
raise WikipediaApiFetchError("wiki article not found in wiki")
raise WikipediaApiFetchError(f"Article {self.job.quoted_title} not found in wiki")

if not self.wikitext:
raise WikipediaApiFetchError("wikitext is empty")


# wikitext extraction

app.logger.debug("==> ArticleV2::fetch_and_parse: extracting from wikitext")

# elif not self.is_redirect and self.found_in_wikipedia:
if not self.is_redirect and self.found_in_wikipedia:

if not self.wikitext:
raise MissingInformationError("WikipediaReferenceExtractorV2::fetch_and_parse: self.wikitext is empty")

self.extractor = WikipediaReferenceExtractorV2(
wikitext=self.wikitext,
html_source=self.html_markup,
job=self.job,
)

app.logger.debug("==> ArticleV2::fetch_and_parse: extracting all refs")
self.extractor.extract_all_references()

app.logger.debug("==> ArticleV2::fetch_and_parse: fetching ores scores")
self.__get_ores_scores__()
# self.__generate_hash__()


app.logger.debug("==> ArticleV2::fetch_and_parse: extracting from html")

# html extraction
if not self.html_markup:
self.__fetch_html__()

# extract references from html point-of-view
self.__extract_footnote_references__()
self.__extract_section_references__()
self.__extract_urls_from_references__()

self.__get_ores_scores__() # fills ores_quality_prediction and ores_details
# self.__get_ores_scores__() # fills ores_quality_prediction and ores_details

def __extract_urls_from_references__(self):
# traverse references, adding urls to self.urlDict,
@@ -196,8 +227,8 @@ def __extract_footnote_references__(self):
regex_extract_ref_name = r"#cite_note-(.*?)-\d+$"

soup = BeautifulSoup(self.html_markup, "html.parser")
# for link in soup.find_all("a"):
# print(link.get("href"))
# for link in soup.find_all("a"):
# print(link.get("href"))


references_wrapper = soup.find("div", class_="mw-references-wrap")
@@ -247,7 +278,7 @@ def __extract_footnote_references__(self):
if span_ref:
# span_ref contains citation markup and possible template data

app.logger.debug(f"Checking <link> data...")
# ### app.logger.debug(f"Checking <link> data...")

# fetch "template" data from link[data-mw] attribute
link_refs = span_ref.find_all("link")
@@ -293,7 +324,9 @@ def __extract_footnote_references__(self):
# TODO What is held in these elements, specifically? is it books?
span_refs = span_ref.find_all("span", class_="Z3988")
for span_ref in span_refs:
app.logger.debug(f"found span.Z3988...")

# app.logger.debug(f"found span.Z3988...")

span_data = span_ref.get("title")
if span_data:
span_template = self.__parse_span_template__(span_data)
@@ -489,7 +522,7 @@ def __parse_span_template__(self, span_data) -> Dict[str, Any] or None:

span_list = span_data.split("&")

app.logger.debug(f"SPAN DATA (parsed):")
# app.logger.debug(f"SPAN DATA (parsed):")

span_template = []
# print this string out
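The footnote-name pattern used in __extract_footnote_references__ (regex_extract_ref_name) can be sanity-checked in isolation; the sample href below is illustrative, not taken from a real article.

import re

regex_extract_ref_name = r"#cite_note-(.*?)-\d+$"

href = "#cite_note-Smith2020-3"
match = re.search(regex_extract_ref_name, href)
print(match.group(1) if match else None)  # Smith2020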

0 comments on commit b2ea1a9
