-
Notifications
You must be signed in to change notification settings - Fork 1.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: Add store_full_path to converters (2/3) #8573
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,6 +2,8 @@ | |
# | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
import os | ||
import warnings | ||
from pathlib import Path | ||
from typing import Any, Dict, List, Optional, Union | ||
|
||
|
@@ -38,19 +40,23 @@ class MarkdownToDocument: | |
``` | ||
""" | ||
|
||
def __init__(self, table_to_single_line: bool = False, progress_bar: bool = True): | ||
def __init__(self, table_to_single_line: bool = False, progress_bar: bool = True, store_full_path: bool = True): | ||
""" | ||
Create a MarkdownToDocument component. | ||
|
||
:param table_to_single_line: | ||
If True converts table contents into a single line. | ||
:param progress_bar: | ||
If True shows a progress bar when running. | ||
:param store_full_path: | ||
If True, the full path of the file is stored in the metadata of the document. | ||
If False, only the file name is stored. | ||
""" | ||
markdown_conversion_imports.check() | ||
|
||
self.table_to_single_line = table_to_single_line | ||
self.progress_bar = progress_bar | ||
self.store_full_path = store_full_path | ||
|
||
@component.output_types(documents=List[Document]) | ||
def run( | ||
|
@@ -105,6 +111,19 @@ def run( | |
continue | ||
|
||
merged_metadata = {**bytestream.meta, **metadata} | ||
|
||
warnings.warn( | ||
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. " | ||
"In the 2.9.0 release, the default value for `store_full_path` will change to False, " | ||
"storing only file names to improve privacy.", | ||
DeprecationWarning, | ||
) | ||
|
||
if not self.store_full_path and "file_path" in bytestream.meta: | ||
file_path = bytestream.meta.get("file_path") | ||
if file_path: # Ensure the value is not None for pylint | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. something as above |
||
merged_metadata["file_path"] = os.path.basename(file_path) | ||
|
||
document = Document(content=text, meta=merged_metadata) | ||
documents.append(document) | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,6 +3,8 @@ | |
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
import io | ||
import os | ||
import warnings | ||
from pathlib import Path | ||
from typing import Any, Dict, List, Optional, Union | ||
|
||
|
@@ -46,6 +48,7 @@ def __init__( # pylint: disable=too-many-positional-arguments | |
boxes_flow: Optional[float] = 0.5, | ||
detect_vertical: bool = True, | ||
all_texts: bool = False, | ||
store_full_path: bool = True, | ||
) -> None: | ||
""" | ||
Create a PDFMinerToDocument component. | ||
|
@@ -78,6 +81,9 @@ def __init__( # pylint: disable=too-many-positional-arguments | |
This parameter determines whether vertical text should be considered during layout analysis. | ||
:param all_texts: | ||
If layout analysis should be performed on text in figures. | ||
:param store_full_path: | ||
If True, the full path of the file is stored in the metadata of the document. | ||
If False, only the file name is stored. | ||
""" | ||
|
||
pdfminer_import.check() | ||
|
@@ -91,6 +97,7 @@ def __init__( # pylint: disable=too-many-positional-arguments | |
detect_vertical=detect_vertical, | ||
all_texts=all_texts, | ||
) | ||
self.store_full_path = store_full_path | ||
|
||
def _converter(self, extractor) -> Document: | ||
""" | ||
|
@@ -165,6 +172,17 @@ def run( | |
) | ||
|
||
merged_metadata = {**bytestream.meta, **metadata} | ||
warnings.warn( | ||
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. " | ||
"In the 2.9.0 release, the default value for `store_full_path` will change to False, " | ||
"storing only file names to improve privacy.", | ||
DeprecationWarning, | ||
) | ||
|
||
if not self.store_full_path and "file_path" in bytestream.meta: | ||
file_path = bytestream.meta.get("file_path") | ||
if file_path: # Ensure the value is not None for pylint | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. something here |
||
merged_metadata["file_path"] = os.path.basename(file_path) | ||
document.meta = merged_metadata | ||
documents.append(document) | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,6 +3,8 @@ | |
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
import io | ||
import os | ||
import warnings | ||
from pathlib import Path | ||
from typing import Any, Dict, List, Optional, Union | ||
|
||
|
@@ -35,11 +37,16 @@ class PPTXToDocument: | |
``` | ||
""" | ||
|
||
def __init__(self): | ||
def __init__(self, store_full_path: bool = True): | ||
""" | ||
Create an PPTXToDocument component. | ||
|
||
:param store_full_path: | ||
If True, the full path of the file is stored in the metadata of the document. | ||
If False, only the file name is stored. | ||
""" | ||
pptx_import.check() | ||
self.store_full_path = store_full_path | ||
|
||
def _convert(self, file_content: io.BytesIO) -> str: | ||
""" | ||
|
@@ -97,6 +104,17 @@ def run( | |
continue | ||
|
||
merged_metadata = {**bytestream.meta, **metadata} | ||
warnings.warn( | ||
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. " | ||
"In the 2.9.0 release, the default value for `store_full_path` will change to False, " | ||
"storing only file names to improve privacy.", | ||
DeprecationWarning, | ||
) | ||
|
||
if not self.store_full_path and "file_path" in bytestream.meta: | ||
file_path = bytestream.meta.get("file_path") | ||
if file_path: # Ensure the value is not None for pylint | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. something here |
||
merged_metadata["file_path"] = os.path.basename(file_path) | ||
documents.append(Document(content=text, meta=merged_metadata)) | ||
|
||
return {"documents": documents} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,6 +3,8 @@ | |
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
import io | ||
import os | ||
import warnings | ||
from html.parser import HTMLParser | ||
from pathlib import Path | ||
from typing import Any, Dict, List, Optional, Union | ||
|
@@ -73,15 +75,19 @@ class TikaDocumentConverter: | |
``` | ||
""" | ||
|
||
def __init__(self, tika_url: str = "http://localhost:9998/tika"): | ||
def __init__(self, tika_url: str = "http://localhost:9998/tika", store_full_path: bool = True): | ||
""" | ||
Create a TikaDocumentConverter component. | ||
|
||
:param tika_url: | ||
Tika server URL. | ||
:param store_full_path: | ||
If True, the full path of the file is stored in the metadata of the document. | ||
If False, only the file name is stored. | ||
""" | ||
tika_import.check() | ||
self.tika_url = tika_url | ||
self.store_full_path = store_full_path | ||
|
||
@component.output_types(documents=List[Document]) | ||
def run( | ||
|
@@ -133,6 +139,18 @@ def run( | |
continue | ||
|
||
merged_metadata = {**bytestream.meta, **metadata} | ||
warnings.warn( | ||
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. " | ||
"In the 2.9.0 release, the default value for `store_full_path` will change to False, " | ||
"storing only file names to improve privacy.", | ||
DeprecationWarning, | ||
) | ||
|
||
if not self.store_full_path and "file_path" in bytestream.meta: | ||
file_path = bytestream.meta.get("file_path") | ||
if file_path: # Ensure the value is not None for pylint | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. something here |
||
merged_metadata["file_path"] = os.path.basename(file_path) | ||
|
||
document = Document(content=text, meta=merged_metadata) | ||
documents.append(document) | ||
return {"documents": documents} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,6 +2,8 @@ | |
# | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
import os | ||
import warnings | ||
from pathlib import Path | ||
from typing import Any, Dict, List, Optional, Union | ||
|
||
|
@@ -34,16 +36,20 @@ class TextFileToDocument: | |
``` | ||
""" | ||
|
||
def __init__(self, encoding: str = "utf-8"): | ||
def __init__(self, encoding: str = "utf-8", store_full_path: bool = True): | ||
""" | ||
Creates a TextFileToDocument component. | ||
|
||
:param encoding: | ||
The encoding of the text files to convert. | ||
If the encoding is specified in the metadata of a source ByteStream, | ||
it overrides this value. | ||
:param store_full_path: | ||
If True, the full path of the file is stored in the metadata of the document. | ||
If False, only the file name is stored. | ||
""" | ||
self.encoding = encoding | ||
self.store_full_path = store_full_path | ||
|
||
@component.output_types(documents=List[Document]) | ||
def run( | ||
|
@@ -87,6 +93,17 @@ def run( | |
continue | ||
|
||
merged_metadata = {**bytestream.meta, **metadata} | ||
warnings.warn( | ||
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. " | ||
"In the 2.9.0 release, the default value for `store_full_path` will change to False, " | ||
"storing only file names to improve privacy.", | ||
DeprecationWarning, | ||
) | ||
|
||
if not self.store_full_path and "file_path" in bytestream.meta: | ||
file_path = bytestream.meta.get("file_path") | ||
if file_path: # Ensure the value is not None for pylint | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. idem |
||
merged_metadata["file_path"] = os.path.basename(file_path) | ||
document = Document(content=text, meta=merged_metadata) | ||
documents.append(document) | ||
|
||
|
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
@@ -0,0 +1,8 @@ | ||||||
--- | ||||||
features: | ||||||
- | | ||||||
Added a new `store_full_path` parameter to the `__init__` methods of `JSONConverter`, `MarkdownToDocument`, `PDFMinerToDocument`, `PPTXToDocument`, `TikaDocumentConverter` and `TextFileToDocument`. The default value is `True`, which stores the full file path in the metadata of the output documents. When set to `False`, only the file name is stored. | ||||||
|
||||||
deprecations: | ||||||
- | | ||||||
The default value of the `store_full_path` parameter will be changed to `False` in Haystack 2.9.0 to enhance privacy. | ||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
you can do this with the walrus operator, merging the `if` and the assignment in one go