Merge pull request #917 from internetarchive/check_url_archive
Functionality is sufficient to merge with the main branch.
The check-url-archive endpoint was added.
mojomonger authored Dec 3, 2023
2 parents 1bcd7ba + 10d9335 commit abafbad
Showing 30 changed files with 686 additions and 430 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -12,4 +12,5 @@ __pycache__/
.coverage
.idea/
venv/
_notes/
_notes/
.env
4 changes: 2 additions & 2 deletions Dockerfile
@@ -15,7 +15,7 @@ RUN poetry install -v --no-interaction --no-ansi
COPY . ./

# Setup all the needed directories
RUN mkdir -p /tmp/wikicitations-api json/articles json/references json/dois json/urls json/xhtmls json/pdfs
RUN mkdir -p /tmp/wikicitations-api json/articles json/references json/dois json/urls json/urls/archives json/xhtmls json/pdfs

#CMD ["./debug_app.py"]
CMD ["gunicorn","-w", "4", "--bind", ":5000", "--timeout", "1500", "wsgi:app"]
CMD ["gunicorn","-w", "9", "--bind", ":5000", "--timeout", "1500", "wsgi:app"]
3 changes: 1 addition & 2 deletions pyproject.toml
@@ -1,7 +1,6 @@
[tool.poetry]
name = "Internet Archive Reference Inventory (IARI)"
#name = "IARI"
version = "4.1.4"
version = "4.2.0"
description = "API capable of fetching, extracting, transforming and storing reference information from Wikipedia articles, websites and PDFs as structured data."
authors = ["Dennis Priskorn <[email protected]>"]
license = "GPLv3+"
2 changes: 2 additions & 0 deletions src/__init__.py
@@ -15,6 +15,7 @@
import config
from src.views.check_doi import CheckDoi
from src.views.check_url import CheckUrl
from src.views.check_url_archive import CheckUrlArchive
from src.views.check_urls import CheckUrls
from src.views.statistics.all import All
from src.views.statistics.article import Article
@@ -37,6 +38,7 @@
api.add_resource(Version, "/version")
api.add_resource(CheckUrls, "/check-urls")
api.add_resource(CheckUrl, "/check-url")
api.add_resource(CheckUrlArchive, "/check-url-archive")
api.add_resource(CheckDoi, "/check-doi")
api.add_resource(Article, "/statistics/article")
api.add_resource(All, "/statistics/all")
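
For illustration (not part of this commit), the new route can be exercised like any other IARI GET endpoint. This is a minimal sketch assuming a local instance on port 5000 (the port bound in the Dockerfile CMD) and only the required url parameter defined by UrlArchiveSchema; the shape of the JSON response is not shown in this diff.

# Sketch: query the new check-url-archive endpoint of a local IARI instance.
import requests

response = requests.get(
    "http://localhost:5000/check-url-archive",
    params={"url": "https://www.example.com/"},
    timeout=30,
)
print(response.status_code)
print(response.json())  # response fields depend on the IABot lookup, not shown here
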
22 changes: 22 additions & 0 deletions src/helpers/get_version.py
@@ -0,0 +1,22 @@
# get_version.py


def get_poetry_version(file_path):
    with open(file_path) as toml_file:
        content = toml_file.read()

    poetry_start = content.find("[tool.poetry]")
    if poetry_start == -1:
        return None  # The [tool.poetry] section is not found

    version_start = content.find("version", poetry_start)
    if version_start == -1:
        return None  # The 'version' property is not found in [tool.poetry]

    version_end = content.find("\n", version_start)
    version_line = content[version_start:version_end].strip()

    # Assuming version is in the format 'version = "x.y.z"'
    version = version_line.split("=")[1].strip().strip('"')

    return version
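
As a usage sketch (not part of the commit), the helper can be pointed at the repository's own pyproject.toml; with the version bump in this pull request it should return "4.2.0".

# Hypothetical usage of the new helper; the path is relative to the repository root.
from src.helpers.get_version import get_poetry_version

version = get_poetry_version("pyproject.toml")
print(version)  # expected "4.2.0" after this pull request's version bump
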
10 changes: 5 additions & 5 deletions src/models/api/job/article_job.py
@@ -74,17 +74,17 @@ def __urldecode_url__(self):

    def __extract_url__(self):
        """This was generated with help of chatgpt using this prompt:
        I want a python re regex that extracts "en" "wikipedia.or"
        I want a python re regex that extracts "en" "wikipedia.org"
        and "Test" from http://en.wikipedia.org/wiki/Test
        """
        from src import app

        app.logger.debug("extract_url: running")
        if self.url:
            self.__urldecode_url__()
            pattern = r"https?://(\w+)\.(\w+\.\w+)/wiki/(.+)"
            wiki_url_pattern = r"https?://(\w+)\.(\w+\.\w+)/wiki/(.+)"

            matches = re.match(pattern, self.url)
            matches = re.match(wiki_url_pattern, self.url)
            if matches:
                groups = matches.groups()
                self.lang = groups[0]
@@ -104,14 +104,14 @@ def __valid_regex__(self) -> bool:
        Words separated by spaces are allowed.
        _ is not allowed anywhere"""
        underscore_pattern = re.compile(r"^[^_]*$")
        horizontal_line_regex = r"^(\s*[^\s]+\s*)+(\s*\|\s*[^\s]+\s*)*$"
        pipe_delimiter_pattern = r"^(\s*[^\s]+\s*)+(\s*\|\s*[^\s]+\s*)*$"
        if " | " in self.regex:
            return False
        if "||" in self.regex:
            return False
        if not re.fullmatch(underscore_pattern, self.regex):
            return False
        if re.fullmatch(horizontal_line_regex, self.regex):
        if re.fullmatch(pipe_delimiter_pattern, self.regex):
            # print('The string is formatted correctly.')
            return True
        else:
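
To illustrate the renamed wiki_url_pattern (a sketch, not code from the commit), the regex splits a Wikipedia article URL into language, domain and title, matching the example given in the docstring.

# Illustrative check of wiki_url_pattern; mirrors the docstring example.
import re

wiki_url_pattern = r"https?://(\w+)\.(\w+\.\w+)/wiki/(.+)"
matches = re.match(wiki_url_pattern, "http://en.wikipedia.org/wiki/Test")
if matches:
    lang, domain, title = matches.groups()
    print(lang, domain, title)  # en wikipedia.org Test
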
12 changes: 12 additions & 0 deletions src/models/api/job/check_url_archive_job.py
@@ -0,0 +1,12 @@
from urllib.parse import unquote

from src.models.api.job import Job


class UrlArchiveJob(Job):
    url: str

    @property
    def unquoted_url(self):
        """Decoded url"""
        return unquote(self.url)
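
A small sketch of how the property behaves (assuming the Job base class is a pydantic model that accepts url as a keyword, as the other job classes do):

# Hypothetical instantiation; unquoted_url simply percent-decodes the stored url.
from src.models.api.job.check_url_archive_job import UrlArchiveJob

job = UrlArchiveJob(url="https://en.wikipedia.org/wiki/Caf%C3%A9")
print(job.unquoted_url)  # https://en.wikipedia.org/wiki/Café
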
2 changes: 2 additions & 0 deletions src/models/api/job/check_url_job.py
@@ -6,6 +6,8 @@
class UrlJob(Job):
    url: str
    timeout: int = 2  # We default to 2 seconds
    method: str = "iabot"  # default to iabot check method # TODO get methods from global structure

    debug: bool = False
    blocks: bool = False
    html: bool = False
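
A minimal sketch of the new default (assuming the Job base class is a pydantic model):

# Hypothetical: method defaults to "iabot" unless the patron overrides it.
from src.models.api.job.check_url_job import UrlJob

job = UrlJob(url="https://www.example.com/", timeout=5)
print(job.method)  # "iabot"
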
26 changes: 26 additions & 0 deletions src/models/api/schema/check_url_archive_schema.py
@@ -0,0 +1,26 @@
import logging

from marshmallow import post_load
from marshmallow.fields import Bool, Int, String

from src.models.api.job.check_url_archive_job import UrlArchiveJob
from src.models.api.schema.refresh import BaseSchema

logger = logging.getLogger(__name__)


class UrlArchiveSchema(BaseSchema):
"""This validates the patron input in the get request"""

url = String(required=True)

# noinspection PyUnusedLocal
@post_load
# **kwargs is needed here despite what the validator claims
def return_object(self, data, **kwargs) -> UrlArchiveJob: # type: ignore # dead: disable
"""Return job object"""
from src import app

app.logger.debug("return_object: running")
job = UrlArchiveJob(**data)
return job
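
Because of the @post_load hook, loading validated patron input returns a job object rather than a plain dict. This is a sketch under the assumption that BaseSchema behaves like a regular marshmallow schema and adds no further required fields.

# Hypothetical use of the schema outside Flask; load() returns a UrlArchiveJob
# because return_object is decorated with @post_load.
from src.models.api.schema.check_url_archive_schema import UrlArchiveSchema

schema = UrlArchiveSchema()
job = schema.load({"url": "https://www.example.com/"})
print(type(job).__name__)  # UrlArchiveJob
print(job.url)
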
1 change: 1 addition & 0 deletions src/models/api/schema/check_url_schema.py
@@ -14,6 +14,7 @@ class UrlSchema(BaseSchema):

    url = String(required=True)
    timeout = Int(required=False)
    method = String(required=False)
    debug = Bool(required=False)
    blocks = Bool(required=False)
    xml = Bool(required=False)
28 changes: 17 additions & 11 deletions src/models/api/statistic/article.py
@@ -1,4 +1,4 @@
from typing import Any, Dict, List
from typing import Any, Dict, List, Optional

from pydantic import BaseModel, Extra

@@ -14,21 +14,27 @@ class ArticleStatistics(BaseModel):
    wari_id: str = ""
    lang: str = "en"  # language code according to Wikimedia
    page_id: int = 0  # page id of the Wikipedia in question
    dehydrated_references: List[str] = []
    references: List[str] = []
    reference_statistics: Dict[str, int] = {}
    served_from_cache: bool = False
    revision_id: int = 0
    revision_isodate: str = ""
    revision_timestamp: int = 0
    site: str = WikimediaDomain.wikipedia.value  # wikimedia site in question
    title: str = ""
    ores_score: Any = {}

    served_from_cache: bool = False
    timestamp: int = 0  # timestamp at beginning of analysis
    isodate: str = ""  # isodate (human readable) at beginning of analysis
    timing: int = 0  # time to analyze in seconds
    title: str = ""
    fld_counts: Dict[str, int] = {}

    references: List[str] = []
    reference_statistics: Dict[str, int] = {}
    dehydrated_references: List[str] = []

    cite_refs_count: int = 0
    cite_refs: Optional[List] = []

    urls: List[str] = []
    ores_score: Any = {}
    revision_id: int = 0
    revision_isodate: str = ""
    revision_timestamp: int = 0
    fld_counts: Dict[str, int] = {}

    class Config:  # dead: disable
        extra = Extra.forbid  # dead: disable
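
Since the model keeps extra = Extra.forbid, any field not declared above (including the new cite_refs_count and cite_refs) is rejected at construction time. A small sketch, assuming pydantic v1 behaviour:

# Hypothetical: unknown keys raise a ValidationError because extra fields are forbidden.
from pydantic import ValidationError
from src.models.api.statistic.article import ArticleStatistics

stats = ArticleStatistics(cite_refs_count=3)  # declared field, accepted
try:
    ArticleStatistics(unknown_field="x")
except ValidationError as error:
    print(error)
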
21 changes: 12 additions & 9 deletions src/models/api/statistic/reference.py
@@ -8,18 +8,21 @@ class ReferenceStatistic(BaseModel):
    the patron wants from the reference endpoint"""

    id: str = ""
    template_names: List[str]
    wikitext: str
    type: str  # # [general|footnote]
    ref_index: int = 0
    name: str = ""
    type: str  # [general|footnote]
    footnote_subtype: str  # [named|content]
    # identifiers: Dict[str, Any]  # {dois: [1234,12345], isbns: [1234]}
    flds: List[str] = []  # non-unique first level domain strings
    urls: List[str] = []  # non-unique url strings
    templates: List[Dict[str, Any]]
    titles: List[str] = []
    section: str = ""

    titles: List[str] = []
    template_names: List[str]
    templates: List[Dict[str, Any]]
    urls: List[str] = []  # non-unique url strings
    url_objects: List[Dict[str, Any]]
    name: str = ""
    flds: List[str] = []  # non-unique first level domain strings

    wikitext: str
    # identifiers: Dict[str, Any]  # {dois: [1234,12345], isbns: [1234]}

    class Config:  # dead: disable
        extra = "forbid"  # dead: disable
3 changes: 1 addition & 2 deletions src/models/file_io/__init__.py
@@ -34,7 +34,7 @@ def path_filename(self) -> str:
        path_filename = (
            f"{config.subdirectory_for_json}{self.subfolder}{self.filename}"
        )
        app.logger.debug(f"using path: {path_filename}")
        app.logger.debug(f"using path: {path_filename} (subfolder: {self.subfolder})")
        return path_filename

    def write_to_disk(
@@ -77,4 +77,3 @@ def read_from_disk(self) -> None:
            # app.logger.debug(f"loaded: {self.statistics_dictionary}")
        else:
            logger.debug("no json on disk")
            app.logger.debug("no json on disk")
11 changes: 11 additions & 0 deletions src/models/file_io/url_archive_file_io.py
@@ -0,0 +1,11 @@
import logging
from typing import Any, Dict, Optional

from src.models.file_io.hash_based import HashBasedFileIo

logger = logging.getLogger(__name__)


class UrlArchiveFileIo(HashBasedFileIo):
    data: Optional[Dict[str, Any]] = None
    subfolder = "urls/archives/"
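
Taken together with the path_filename property shown above, the new subfolder means archive results land beside the existing url cache. A sketch for illustration only, assuming config.subdirectory_for_json is "json/" (consistent with the json/urls/archives directory added to the Dockerfile) and a hypothetical hash-based filename:

# Illustration of how the subfolder feeds into path_filename.
subdirectory_for_json = "json/"      # assumed value of config.subdirectory_for_json
subfolder = "urls/archives/"
filename = "0a1b2c3d4e5f.json"       # hypothetical hash-based filename
path_filename = f"{subdirectory_for_json}{subfolder}{filename}"
print(path_filename)  # json/urls/archives/0a1b2c3d4e5f.json
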
1 change: 1 addition & 0 deletions src/models/file_io/url_file_io.py
@@ -8,4 +8,5 @@

class UrlFileIo(HashBasedFileIo):
    data: Optional[Dict[str, Any]] = None
    flavor: str = ""
    subfolder = "urls/"
75 changes: 61 additions & 14 deletions src/models/identifiers_checking/url.py
@@ -1,5 +1,6 @@
import logging
import os
import urllib.parse
from typing import Any, Dict, Optional

import requests
@@ -34,9 +35,7 @@

class Url(WikipediaUrl):
"""
This handles checking a URL
Our patrons want to know if this URL is likely to lead to the content that is referenced.
This handles checking a URL for it's http status
We define a malformed URL as any URL that the reader cannot easily
click and successfully get the contents of in a normal web browser session
@@ -45,32 +44,52 @@ class Url(WikipediaUrl):
    and do not offer turning them off for now.
    """

    request_error: bool = False
    request_error_details: str = ""
    dns_record_found: bool = False
    dns_no_answer: bool = False
    dns_error: bool = False
    # soft404_probability: float = 0.0  # not implemented yet
    # iari test - deprecated, for now (2023.11.08)
    status_code: int = 0
    status_code_method: str = ""

    # iabot status
    testdeadlink_status_code: int = 0
    testdeadlink_error_details: str = ""
    timeout: int = 2
    dns_error_details: str = ""
    response_headers: Optional[Dict] = None

    # IABot Archive information (from internal iabot database)
    # iabot_results: Optional[Dict] = None

    text: str = ""
    response_headers: Optional[Dict] = None

    detected_language: str = ""
    detected_language_error: bool = False
    detected_language_error_details: str = ""

    request_error: bool = False
    request_error_details: str = ""
    timeout: int = 2

    dns_record_found: bool = False
    dns_no_answer: bool = False
    dns_error: bool = False
    dns_error_details: str = ""

    # soft404_probability: float = 0.0  # not implemented yet

    # @property
    # def __check_soft404__(self):
    #     raise NotImplementedError()

    def check(self):
    def check(self, method):
        from src import app

        if self.url:
            self.extract()
            self.__check_url__()
            # self.__check_url__()  # deprecated - omit native IARI checking - just using IABot's testdeadlink for now

            self.status_code_method = method
            app.logger.debug(f"checking url with method {method}")

            # TODO me must respect "method" parameter here to check URL status
            self.__check_url_with_testdeadlink_api__()
            # self.__check_url_archive_with_iabot_api__()
            self.__detect_language__()

    def __get_dns_record__(self) -> None:
@@ -113,6 +132,7 @@ def __check_with_https_verify__(self):
                allow_redirects=True,
            )
            self.status_code = r.status_code

            logger.debug(self.url + "\tStatus: " + str(r.status_code))
            self.response_headers = dict(r.headers)
            if r.status_code == 200:
@@ -292,3 +312,30 @@ def __check_url_with_testdeadlink_api__(self):
                        result
                    ]
                    break

    # def __check_url_archive_with_iabot_api__(self):
    #     """
    #     This fetches the status code, archive, and other information from the
    #     searchurldata API of IABot
    #     """
    #
    #     modified_url = urllib.parse.quote(self.url)  # url encode the url
    #
    #     headers = {
    #         "Content-Type": "application/x-www-form-urlencoded",
    #         "User-Agent": "http://en.wikipedia.org/wiki/User:GreenC via iabget.awk",
    #     }
    #     data = f"&action=searchurldata&urls={modified_url}"
    #
    #     response = requests.post(
    #         "https://iabot.wmcloud.org/api.php?wiki=enwiki",
    #         headers=headers,
    #         data=data,
    #     )
    #
    #     # if status code is 200, the request was successful
    #     if response.status_code == 200:
    #         data = response.json()
    #         print(data)
    #         # TODO handle return data or errors
    #         self.iabot_results = data
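
Finally, a sketch of how the reworked check() is now driven (not part of the commit). It assumes WikipediaUrl is a pydantic model that accepts url as a keyword; method is required by the new signature and "iabot" is the default carried by UrlJob. Note that check() performs real network requests against the testdeadlink API.

# Hypothetical driver code; the exact fields populated depend on the
# testdeadlink response, which this diff does not show.
from src.models.identifiers_checking.url import Url

url = Url(url="https://www.example.com/")
url.check(method="iabot")
print(url.status_code_method)        # "iabot"
print(url.testdeadlink_status_code)  # filled in by __check_url_with_testdeadlink_api__
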