Skip to content

Commit

Permalink
moved extract_cite_refs to parse_utils
Browse files Browse the repository at this point in the history
  • Loading branch information
mojomonger committed May 21, 2024
1 parent 23f3310 commit 10ac81b
Show file tree
Hide file tree
Showing 5 changed files with 54 additions and 97 deletions.
Empty file added iarilib/__init__.py
Empty file.
49 changes: 49 additions & 0 deletions iarilib/parse_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# parse_utils.py
from bs4 import BeautifulSoup


def extract_cite_refs(html):
    """Extract citation reference data from rendered MediaWiki article HTML.

    Looks for the references section (``div.mw-references-wrap`` containing
    ``ol.references``) and returns one dict per ``<li>`` reference entry:

        {
            "id":        <li "id" attribute, or None>,
            "raw_data":  <"data-mw" attribute of the first <link> inside
                          span.mw-reference-text, or None>,
            "page_refs": [{"href": ..., "id": ...}, ...],
        }

    ``page_refs`` collects the in-page linkback anchors (those wrapping a
    ``span.mw-linkback-text``).  Returns an empty list when no references
    section is present in *html*.
    """
    soup = BeautifulSoup(html, "html.parser")

    refs = []

    ref_wrapper = soup.find("div", class_="mw-references-wrap")
    if not ref_wrapper:
        return refs

    references_list = ref_wrapper.find("ol", class_="references")
    if not references_list:
        # Guard: a wrapper div without an <ol class="references"> child
        # previously crashed with AttributeError on None.find_all().
        return refs

    for ref in references_list.find_all("li"):
        # Anchors wrapping span.mw-linkback-text are the citeref linkbacks.
        page_refs = [
            {"href": link.get("href"), "id": link.get("id")}
            for link in ref.find_all("a")
            if link.find("span", class_="mw-linkback-text")
        ]

        # The citation's structured template payload, when present, lives in
        # the data-mw attribute of a <link> inside span.mw-reference-text.
        raw_data = None
        ref_text_span = ref.find("span", class_="mw-reference-text")
        if ref_text_span:
            link_data = ref_text_span.find("link")
            if link_data:
                raw_data = link_data.get("data-mw")

        refs.append(
            {
                "id": ref.get("id"),
                "raw_data": raw_data,
                "page_refs": page_refs,
            }
        )

    return refs
1 change: 0 additions & 1 deletion lib/WikidataIntegrator
Submodule WikidataIntegrator deleted from 5db457
49 changes: 2 additions & 47 deletions src/models/v2/wikimedia/wikipedia/reference/extractor_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import mwparserfromhell # type: ignore
from bs4 import BeautifulSoup
from mwparserfromhell.wikicode import Wikicode # type: ignore
from iarilib.parse_utils import extract_cite_refs

from src.models.base import WariBaseModel # TODO change to IariBaseModel
from src.models.exceptions import MissingInformationError
Expand Down Expand Up @@ -257,52 +258,6 @@ def __parse_wikitext__(self):
if not self.wikicode:
self.wikicode = mwparserfromhell.parse(self.wikitext)

    def __extract_cite_refs__(self):
        """Parse ``self.html_source`` and populate ``self.cite_page_refs``.

        Scans the rendered article HTML for the references section
        (``div.mw-references-wrap`` > ``ol.references``) and stores one dict
        per ``<li>`` entry with keys ``id``, ``raw_data`` and ``page_refs``.
        Stores an empty list when no references section is found.
        """

        soup = BeautifulSoup(self.html_source, "html.parser")
        # for link in soup.find_all("a"):
        # print(link.get("href"))

        references_wrapper = soup.find("div", class_="mw-references-wrap")

        refs = []

        if references_wrapper:
            # NOTE(review): if the wrapper exists but has no ol.references
            # child, references_list is None and find_all below raises
            # AttributeError — confirm the input HTML always has the <ol>.
            references_list = references_wrapper.find("ol", class_="references")
            ref_counter = 0
            for ref in references_list.find_all("li"):

                # Counter is maintained but its output key is commented out.
                ref_counter += 1

                page_refs = []
                for link in ref.find_all("a"):
                    # span.mw-linkback-text children should have a citeref link
                    if link.find("span", class_="mw-linkback-text"):
                        page_refs.append(
                            {
                                "href": link.get("href"),
                                "id": link.get("id"),
                            }
                        )

                # Structured citation payload: data-mw attribute of the first
                # <link> inside span.mw-reference-text, if both are present.
                span_link = ref.find("span", class_="mw-reference-text")
                raw_data = None
                if span_link:
                    link_data = span_link.find("link")
                    if link_data:
                        raw_data = link_data.get("data-mw")

                refs.append(
                    {
                        "id": ref.get("id"),
                        # "ref_index": ref_counter,
                        "raw_data": raw_data,
                        "page_refs": page_refs,
                    }
                )

        self.cite_page_refs = refs

def __parse_html_source__(self):
"""
Parses html to extract cite reference data from references section
Expand All @@ -315,7 +270,7 @@ def __parse_html_source__(self):
# return css_class is None # and len(css_class) == 6

if self.html_source:
self.__extract_cite_refs__()
self.cite_page_refs = extract_cite_refs(self.html_source)

@property
def reference_ids(self) -> List[str]:
Expand Down
52 changes: 3 additions & 49 deletions src/models/wikimedia/wikipedia/reference/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@
from typing import Any, Dict, List, Optional

import mwparserfromhell # type: ignore
from bs4 import BeautifulSoup
from mwparserfromhell.wikicode import Wikicode # type: ignore
# from bs4 import BeautifulSoup
from iarilib.parse_utils import extract_cite_refs

from src.models.api.job.article_job import ArticleJob
from src.models.base import WariBaseModel
Expand Down Expand Up @@ -265,51 +266,7 @@ def __parse_wikitext__(self):
if not self.wikicode:
self.wikicode = mwparserfromhell.parse(self.wikitext)

    def __extract_cite_refs__(self):
        """Parse ``self.html_source`` and populate ``self.cite_page_refs``.

        Scans the rendered article HTML for the references section
        (``div.mw-references-wrap`` > ``ol.references``) and stores one dict
        per ``<li>`` entry with keys ``id``, ``raw_data`` and ``page_refs``.
        Stores an empty list when no references section is found.
        """

        soup = BeautifulSoup(self.html_source, "html.parser")
        # for link in soup.find_all("a"):
        # print(link.get("href"))

        references_wrapper = soup.find("div", class_="mw-references-wrap")

        refs = []

        if references_wrapper:
            # NOTE(review): if the wrapper exists but has no ol.references
            # child, references_list is None and find_all below raises
            # AttributeError — confirm the input HTML always has the <ol>.
            references_list = references_wrapper.find("ol", class_="references")
            ref_counter = 0
            for ref in references_list.find_all("li"):

                # Counter is maintained but its output key is commented out.
                ref_counter += 1

                page_refs = []
                for link in ref.find_all("a"):
                    # span.mw-linkback-text children should have a citeref link
                    if link.find("span", class_="mw-linkback-text"):
                        page_refs.append(
                            {
                                "href": link.get("href"),
                                "id": link.get("id"),
                            }
                        )

                # Structured citation payload: data-mw attribute of the first
                # <link> inside span.mw-reference-text, if both are present.
                span_link = ref.find("span", class_="mw-reference-text")
                raw_data = None
                if span_link:
                    link_data = span_link.find("link")
                    if link_data:
                        raw_data = link_data.get("data-mw")

                refs.append(
                    {
                        "id": ref.get("id"),
                        # "ref_index": ref_counter,
                        "raw_data": raw_data,
                        "page_refs": page_refs,
                    }
                )

        self.cite_page_refs = refs

def __parse_html_source__(self):
"""
Expand All @@ -319,11 +276,8 @@ def __parse_html_source__(self):

app.logger.debug("__parse_html_source__: running")

# def is_citeref_link(css_class):
# return css_class is None # and len(css_class) == 6

if self.html_source:
self.__extract_cite_refs__()
self.cite_page_refs = extract_cite_refs(self.html_source)

@property
def reference_ids(self) -> List[str]:
Expand Down

0 comments on commit 10ac81b

Please sign in to comment.