-
Notifications
You must be signed in to change notification settings - Fork 2
/
core.py
109 lines (92 loc) · 4.62 KB
/
core.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
"""Core class with default toolchain for the PDF to AAS conversion."""
import logging
from .dictionary import ECLASS, Dictionary
from .extractor import Extractor, PropertyLLMSearch
from .generator import AASSubmodelTechnicalData, Generator
from .preprocessor import PDFium, Preprocessor
logger = logging.getLogger(__name__)
class PDF2AAS:
    """Convert PDF documents into Asset Administration Shell (AAS) submodels.

    The class wires together four exchangeable components (preprocessor,
    dictionary, extractor and generator) and runs them in sequence in
    `convert`.

    Attributes:
        preprocessor (Preprocessor): A preprocessing object to handle PDF files.
            Defaults to PDFium.
        dictionary (Dictionary): A dictionary object for term mapping.
            Defaults to ECLASS in current release.
        extractor (Extractor): An extractor object to pull relevant information
            from the preprocessed PDF. Defaults to PropertyLLMSearch with
            the "gpt-4o-mini" model.
        generator (Generator): A generator object to create AAS submodels.
            Defaults to AASSubmodelTechnicalData.
        batch_size (int): The number of properties that are extracted in one
            batch. 0 (default) extracts all properties in one. 1 extracts each
            property on its own.

    """

    def __init__(
        self,
        preprocessor: Preprocessor | None = None,
        dictionary: Dictionary | None = None,
        extractor: Extractor | None = None,
        generator: Generator | None = None,
        batch_size: int = 0,
    ) -> None:
        """Initialize the PDF2AAS toolchain with optional custom components.

        Every component that is left as None is replaced by a freshly
        constructed default, so default objects are never shared between
        instances.

        Args:
            preprocessor (Preprocessor, optional): A preprocessing object to
                handle PDF files. Defaults to PDFium.
            dictionary (Dictionary, optional): A dictionary object for term
                mapping. Defaults to ECLASS.
            extractor (Extractor, optional): An extractor object to pull
                relevant information from the preprocessed PDF. Defaults to
                PropertyLLMSearch with the "gpt-4o-mini" model.
            generator (Generator, optional): A generator object to create AAS
                submodels. Defaults to AASSubmodelTechnicalData.
            batch_size (int, optional): The number of properties that are
                extracted in one batch. 0 (default) extracts all properties
                in one. 1 extracts each property on its own.

        """
        self.preprocessor = PDFium() if preprocessor is None else preprocessor
        self.dictionary = ECLASS() if dictionary is None else dictionary
        self.extractor = PropertyLLMSearch("gpt-4o-mini") if extractor is None else extractor
        self.generator = AASSubmodelTechnicalData() if generator is None else generator
        self.batch_size = batch_size

    def convert(
        self,
        pdf_filepath: str,
        classification: str,
        output_filepath: str | None = None,
    ) -> None:
        """Convert a PDF document into an AAS submodel.

        Uses the configured preprocessor, dictionary, extractor to
        extract or search for the given properties of the `classification`.
        Dumps the result using the configured generator to the given
        'output_filepath' if provided.

        The extractor is called once with all property definitions when
        `batch_size` is 0 or negative, once per definition when it is 1, and
        once per slice of `batch_size` definitions otherwise.

        Args:
            pdf_filepath (str): The file path to the input PDF document.
            classification (str): The classification term for mapping
                properties, e.g. "27274001" when using ECLASS.
            output_filepath (str, optional): The file path to save the generated
                AAS submodel or configured generator output. Nothing is
                dumped when it is None.

        """
        preprocessed_datasheet = self.preprocessor.convert(pdf_filepath)
        property_definitions = self.dictionary.get_class_properties(classification)

        if self.batch_size <= 0:
            # Non-positive batch size: hand over all definitions in one call.
            properties = self.extractor.extract(preprocessed_datasheet, property_definitions)
        elif self.batch_size == 1:
            # NOTE(review): each definition is passed on its own (not wrapped
            # in a list) and the results are collected without flattening,
            # unlike the batched branch below which uses `extend`. Confirm
            # what `extract` returns for a single definition.
            properties = [
                self.extractor.extract(preprocessed_datasheet, d) for d in property_definitions
            ]
        else:
            properties = []
            # Step through the definitions in slices of `batch_size`; the last
            # slice may be shorter.
            for i in range(0, len(property_definitions), self.batch_size):
                properties.extend(
                    self.extractor.extract(
                        preprocessed_datasheet,
                        property_definitions[i : i + self.batch_size],
                    ),
                )

        # Reset the generator before adding the results of this conversion.
        self.generator.reset()
        # The classification is only added for the technical data submodel
        # generator; other generators receive the properties only.
        if isinstance(self.generator, AASSubmodelTechnicalData):
            self.generator.add_classification(self.dictionary, classification)
        self.generator.add_properties(properties)

        if output_filepath is not None:
            self.generator.dump(filepath=output_filepath)
            logger.info("Generated result in: %s", output_filepath)