Skip to content

Commit

Permalink
Write tests for page_range
Browse files Browse the repository at this point in the history
  • Loading branch information
SamEdwardes committed Oct 17, 2023
1 parent ca475d2 commit 2a2e0b3
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 1 deletion.
2 changes: 1 addition & 1 deletion spacypdfreader/spacypdfreader.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ def pdf_reader(
>>> from spacypdfreader.parsers import pytesseract
>>>
>>> nlp = spacy.load("en_core_web_sm")
>>> doc = pdf_reader("tests/data/test_pdf_01.pdf", nlp, pytesseract.parser, n_processes=4, page_range=(1, 2)
>>> doc = pdf_reader("tests/data/test_pdf_01.pdf", nlp, pytesseract.parser, n_processes=4, page_range=(1, 2))
"""
# For backwards compatibility, if someone passes in PdfMinerParser or
# PyTesseractParser replace with the correct function
Expand Down
51 changes: 51 additions & 0 deletions tests/test_page_range.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import pytest
import spacy

from spacypdfreader.parsers import pdfminer
from spacypdfreader.parsers import pytesseract
from spacypdfreader.spacypdfreader import pdf_reader


def pdf_assertions(doc: spacy.tokens.Doc):
    """Assert that ``doc`` carries the metadata expected for pages 2-3.

    Shared by the page-range tests in this module. Checks the
    ``page_number`` extension on the first and last tokens, and the
    ``page_range``, ``first_page``, ``last_page`` and ``pdf_file_name``
    extensions on the doc itself.
    """
    expected_first = 2
    expected_last = 3

    # Token level: the doc must start on the first requested page and end
    # on the last one.
    assert doc[0]._.page_number == expected_first
    assert doc[-1]._.page_number == expected_last

    # Doc level: the stored range and its two endpoints must agree.
    assert doc._.page_range == (expected_first, expected_last)
    assert doc._.first_page == expected_first
    assert doc._.last_page == expected_last
    assert doc._.pdf_file_name == "tests/data/test_pdf_01.pdf"


def test_page_range_pdfminer_single():
    """pdfminer parser, no ``n_processes`` passed: read pages 2-3 only."""
    pipeline = spacy.load("en_core_web_sm")
    parsed = pdf_reader(
        "tests/data/test_pdf_01.pdf",
        pipeline,
        pdfminer.parser,
        page_range=(2, 3),
    )
    pdf_assertions(parsed)


def test_page_range_pdfminer_multi():
    """pdfminer parser with ``n_processes=2``: read pages 2-3 only."""
    pipeline = spacy.load("en_core_web_sm")
    parsed = pdf_reader(
        "tests/data/test_pdf_01.pdf",
        pipeline,
        pdfminer.parser,
        page_range=(2, 3),
        n_processes=2,
    )
    pdf_assertions(parsed)


def test_page_range_pytesseract_single():
    """pytesseract parser, no ``n_processes`` passed: read pages 2-3 only."""
    pipeline = spacy.load("en_core_web_sm")
    parsed = pdf_reader(
        "tests/data/test_pdf_01.pdf",
        pipeline,
        pytesseract.parser,
        page_range=(2, 3),
    )
    pdf_assertions(parsed)


def test_page_range_pytesseract_multi():
    """pytesseract parser with ``n_processes=2``: read pages 2-3 only."""
    pipeline = spacy.load("en_core_web_sm")
    parsed = pdf_reader(
        "tests/data/test_pdf_01.pdf",
        pipeline,
        pytesseract.parser,
        page_range=(2, 3),
        n_processes=2,
    )
    pdf_assertions(parsed)


def test_page_range_logic():
    """``pdf_reader`` must raise ``ValueError`` for unusable page ranges.

    Each call sits in its own ``pytest.raises`` block so that every range
    is checked independently. The return value is deliberately not bound:
    the call is expected to raise before producing a doc.
    """
    nlp = spacy.load("en_core_web_sm")

    # NOTE(review): presumably beyond the last page of the test PDF --
    # confirm against the fixture's page count.
    with pytest.raises(ValueError):
        pdf_reader(
            "tests/data/test_pdf_01.pdf",
            nlp,
            pytesseract.parser,
            page_range=(10, 20),
            n_processes=2,
        )

    # Negative start page.
    with pytest.raises(ValueError):
        pdf_reader(
            "tests/data/test_pdf_01.pdf",
            nlp,
            pytesseract.parser,
            page_range=(-1, 2),
            n_processes=2,
        )

    # Start page greater than end page.
    with pytest.raises(ValueError):
        pdf_reader(
            "tests/data/test_pdf_01.pdf",
            nlp,
            pytesseract.parser,
            page_range=(3, 1),
            n_processes=2,
        )
7 changes: 7 additions & 0 deletions tests/test_pdfminer.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import pytest
import spacy

from spacypdfreader.parsers import pdfminer
Expand Down Expand Up @@ -59,3 +60,9 @@ def test_pdfminer_multi_same_as_single():
)
doc_single = pdf_reader("tests/data/test_pdf_01.pdf", nlp, pdfminer.parser)
assert doc_multi.text == doc_single.text


def test_pdfminer_rejects_n_pages():
    """``pdf_reader`` with the pdfminer parser must reject ``page_numbers``.

    Passing ``page_numbers=[1, 2]`` is expected to raise ``ValueError``.
    The return value is not bound because the call should never return.

    NOTE(review): the test name says "n_pages" but the keyword exercised
    here is ``page_numbers`` -- confirm which argument is meant.
    """
    nlp = spacy.load("en_core_web_sm")
    with pytest.raises(ValueError):
        pdf_reader(
            "tests/data/test_pdf_01.pdf",
            nlp,
            pdfminer.parser,
            page_numbers=[1, 2],
        )

0 comments on commit 2a2e0b3

Please sign in to comment.