Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add store_full_path to converters (2/3) #8573

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 23 additions & 1 deletion haystack/components/converters/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
# SPDX-License-Identifier: Apache-2.0

import json
import os
import warnings
from pathlib import Path
from typing import Any, Dict, List, Literal, Optional, Set, Tuple, Union

Expand Down Expand Up @@ -93,6 +95,7 @@ def __init__(
jq_schema: Optional[str] = None,
content_key: Optional[str] = None,
extra_meta_fields: Optional[Union[Set[str], Literal["*"]]] = None,
store_full_path: bool = True,
):
"""
Creates a JSONConverter component.
Expand Down Expand Up @@ -129,6 +132,9 @@ def __init__(
:param extra_meta_fields:
An optional set of meta keys to extract from the content.
If `jq_schema` is specified, all keys will be extracted from that object.
:param store_full_path:
If True, the full path of the file is stored in the metadata of the document.
If False, only the file name is stored.
"""
self._compiled_filter = None
if jq_schema:
Expand All @@ -138,6 +144,7 @@ def __init__(
self._jq_schema = jq_schema
self._content_key = content_key
self._meta_fields = extra_meta_fields
self._store_full_path = store_full_path

if self._compiled_filter is None and self._content_key is None:
msg = "No `jq_schema` nor `content_key` specified. Set either or both to extract data."
Expand All @@ -151,7 +158,11 @@ def to_dict(self) -> Dict[str, Any]:
Dictionary with serialized data.
"""
return default_to_dict(
self, jq_schema=self._jq_schema, content_key=self._content_key, extra_meta_fields=self._meta_fields
self,
jq_schema=self._jq_schema,
content_key=self._content_key,
extra_meta_fields=self._meta_fields,
store_full_path=self._store_full_path,
)

@classmethod
Expand Down Expand Up @@ -269,8 +280,19 @@ def run(

data = self._get_content_and_meta(bytestream)

warnings.warn(
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
"In the 2.9.0 release, the default value for `store_full_path` will change to False, "
"storing only file names to improve privacy.",
DeprecationWarning,
)
for text, extra_meta in data:
merged_metadata = {**bytestream.meta, **metadata, **extra_meta}

if not self._store_full_path and "file_path" in bytestream.meta:
file_path = bytestream.meta.get("file_path")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you can do this with the walrus operator, merging if the assignment in one go

                    if file_path := bytestream.meta.get("file_path"):
                        merged_metadata["file_path"] = os.path.basename(file_path)

if file_path: # Ensure the value is not None for pylint
merged_metadata["file_path"] = os.path.basename(file_path)
document = Document(content=text, meta=merged_metadata)
documents.append(document)

Expand Down
21 changes: 20 additions & 1 deletion haystack/components/converters/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
#
# SPDX-License-Identifier: Apache-2.0

import os
import warnings
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

Expand Down Expand Up @@ -38,19 +40,23 @@ class MarkdownToDocument:
```
"""

def __init__(self, table_to_single_line: bool = False, progress_bar: bool = True):
def __init__(self, table_to_single_line: bool = False, progress_bar: bool = True, store_full_path: bool = True):
"""
Create a MarkdownToDocument component.

:param table_to_single_line:
If True converts table contents into a single line.
:param progress_bar:
If True shows a progress bar when running.
:param store_full_path:
If True, the full path of the file is stored in the metadata of the document.
If False, only the file name is stored.
"""
markdown_conversion_imports.check()

self.table_to_single_line = table_to_single_line
self.progress_bar = progress_bar
self.store_full_path = store_full_path

@component.output_types(documents=List[Document])
def run(
Expand Down Expand Up @@ -105,6 +111,19 @@ def run(
continue

merged_metadata = {**bytestream.meta, **metadata}

warnings.warn(
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
"In the 2.9.0 release, the default value for `store_full_path` will change to False, "
"storing only file names to improve privacy.",
DeprecationWarning,
)

if not self.store_full_path and "file_path" in bytestream.meta:
file_path = bytestream.meta.get("file_path")
if file_path: # Ensure the value is not None for pylint
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

something as above

merged_metadata["file_path"] = os.path.basename(file_path)

document = Document(content=text, meta=merged_metadata)
documents.append(document)

Expand Down
18 changes: 18 additions & 0 deletions haystack/components/converters/pdfminer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
# SPDX-License-Identifier: Apache-2.0

import io
import os
import warnings
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

Expand Down Expand Up @@ -46,6 +48,7 @@ def __init__( # pylint: disable=too-many-positional-arguments
boxes_flow: Optional[float] = 0.5,
detect_vertical: bool = True,
all_texts: bool = False,
store_full_path: bool = True,
) -> None:
"""
Create a PDFMinerToDocument component.
Expand Down Expand Up @@ -78,6 +81,9 @@ def __init__( # pylint: disable=too-many-positional-arguments
This parameter determines whether vertical text should be considered during layout analysis.
:param all_texts:
If layout analysis should be performed on text in figures.
:param store_full_path:
If True, the full path of the file is stored in the metadata of the document.
If False, only the file name is stored.
"""

pdfminer_import.check()
Expand All @@ -91,6 +97,7 @@ def __init__( # pylint: disable=too-many-positional-arguments
detect_vertical=detect_vertical,
all_texts=all_texts,
)
self.store_full_path = store_full_path

def _converter(self, extractor) -> Document:
"""
Expand Down Expand Up @@ -165,6 +172,17 @@ def run(
)

merged_metadata = {**bytestream.meta, **metadata}
warnings.warn(
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
"In the 2.9.0 release, the default value for `store_full_path` will change to False, "
"storing only file names to improve privacy.",
DeprecationWarning,
)

if not self.store_full_path and "file_path" in bytestream.meta:
file_path = bytestream.meta.get("file_path")
if file_path: # Ensure the value is not None for pylint
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

something here

merged_metadata["file_path"] = os.path.basename(file_path)
document.meta = merged_metadata
documents.append(document)

Expand Down
20 changes: 19 additions & 1 deletion haystack/components/converters/pptx.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
# SPDX-License-Identifier: Apache-2.0

import io
import os
import warnings
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

Expand Down Expand Up @@ -35,11 +37,16 @@ class PPTXToDocument:
```
"""

def __init__(self):
def __init__(self, store_full_path: bool = True):
"""
Create an PPTXToDocument component.

:param store_full_path:
If True, the full path of the file is stored in the metadata of the document.
If False, only the file name is stored.
"""
pptx_import.check()
self.store_full_path = store_full_path

def _convert(self, file_content: io.BytesIO) -> str:
"""
Expand Down Expand Up @@ -97,6 +104,17 @@ def run(
continue

merged_metadata = {**bytestream.meta, **metadata}
warnings.warn(
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
"In the 2.9.0 release, the default value for `store_full_path` will change to False, "
"storing only file names to improve privacy.",
DeprecationWarning,
)

if not self.store_full_path and "file_path" in bytestream.meta:
file_path = bytestream.meta.get("file_path")
if file_path: # Ensure the value is not None for pylint
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

something here

merged_metadata["file_path"] = os.path.basename(file_path)
documents.append(Document(content=text, meta=merged_metadata))

return {"documents": documents}
20 changes: 19 additions & 1 deletion haystack/components/converters/tika.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
# SPDX-License-Identifier: Apache-2.0

import io
import os
import warnings
from html.parser import HTMLParser
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
Expand Down Expand Up @@ -73,15 +75,19 @@ class TikaDocumentConverter:
```
"""

def __init__(self, tika_url: str = "http://localhost:9998/tika"):
def __init__(self, tika_url: str = "http://localhost:9998/tika", store_full_path: bool = True):
"""
Create a TikaDocumentConverter component.

:param tika_url:
Tika server URL.
:param store_full_path:
If True, the full path of the file is stored in the metadata of the document.
If False, only the file name is stored.
"""
tika_import.check()
self.tika_url = tika_url
self.store_full_path = store_full_path

@component.output_types(documents=List[Document])
def run(
Expand Down Expand Up @@ -133,6 +139,18 @@ def run(
continue

merged_metadata = {**bytestream.meta, **metadata}
warnings.warn(
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
"In the 2.9.0 release, the default value for `store_full_path` will change to False, "
"storing only file names to improve privacy.",
DeprecationWarning,
)

if not self.store_full_path and "file_path" in bytestream.meta:
file_path = bytestream.meta.get("file_path")
if file_path: # Ensure the value is not None for pylint
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

something here

merged_metadata["file_path"] = os.path.basename(file_path)

document = Document(content=text, meta=merged_metadata)
documents.append(document)
return {"documents": documents}
19 changes: 18 additions & 1 deletion haystack/components/converters/txt.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
#
# SPDX-License-Identifier: Apache-2.0

import os
import warnings
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

Expand Down Expand Up @@ -34,16 +36,20 @@ class TextFileToDocument:
```
"""

def __init__(self, encoding: str = "utf-8"):
def __init__(self, encoding: str = "utf-8", store_full_path: bool = True):
"""
Creates a TextFileToDocument component.

:param encoding:
The encoding of the text files to convert.
If the encoding is specified in the metadata of a source ByteStream,
it overrides this value.
:param store_full_path:
If True, the full path of the file is stored in the metadata of the document.
If False, only the file name is stored.
"""
self.encoding = encoding
self.store_full_path = store_full_path

@component.output_types(documents=List[Document])
def run(
Expand Down Expand Up @@ -87,6 +93,17 @@ def run(
continue

merged_metadata = {**bytestream.meta, **metadata}
warnings.warn(
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
"In the 2.9.0 release, the default value for `store_full_path` will change to False, "
"storing only file names to improve privacy.",
DeprecationWarning,
)

if not self.store_full_path and "file_path" in bytestream.meta:
file_path = bytestream.meta.get("file_path")
if file_path: # Ensure the value is not None for pylint
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

idem

merged_metadata["file_path"] = os.path.basename(file_path)
document = Document(content=text, meta=merged_metadata)
documents.append(document)

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
---
features:
- |
Added a new `store_full_path` parameter to the `__init__` methods of `JSONConverter`, `MarkdownToDocument`, `PDFMinerToDocument`, `PPTXToDocument`, `TikaDocumentConverter` and `TextFileToDocument`. The default value is `True`, which stores full file path in the metadata of the output documents. When set to `False`, only the file name is stored.

deprecations:
- |
The default value of the `store_full_path` parameter will be changed to `False` in Haysatck 2.9.0 to enhance privacy.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
The default value of the `store_full_path` parameter will be changed to `False` in Haysatck 2.9.0 to enhance privacy.
The default value of the `store_full_path` parameter will change to `False` in Haysatck 2.9.0 to enhance privacy.

Loading
Loading