From 71d3e3aa2de9fad48f4e1b5f6787cccb7e1102aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9F?= Date: Fri, 5 Jan 2024 12:11:21 +0100 Subject: [PATCH] directory connector: add some todos --- backend/danswer/connectors/directory/connector.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/backend/danswer/connectors/directory/connector.py b/backend/danswer/connectors/directory/connector.py index 098270d7706..08452f9a0a0 100644 --- a/backend/danswer/connectors/directory/connector.py +++ b/backend/danswer/connectors/directory/connector.py @@ -39,11 +39,13 @@ def _open_files_at_location_recursive( if file.is_dir(follow_symlinks=False): yield from _open_files_at_location_recursive(base_path, rel_file_path) elif not file.is_symlink(): + # FIXME: Move this check out of this generator to avoid stat()ing already-processed files if file.stat().st_size > MAX_FILE_SIZE: logger.warning(f"Skipping file '{abs_file_path}' as it is too large") continue extension = get_file_ext(file.name) if extension == ".txt": + # FIXME: Move this open() out of this generator to avoid opening already-processed files with open(abs_file_path, "r", encoding = "utf8") as fd: yield str(rel_file_path), fd else: @@ -97,6 +99,7 @@ def load_from_state(self) -> GenerateDocumentsOutput: for file_name, file in files: file_path = os.path.join(file_location, file_name) + # TODO: Check if the file has been modified since it was last processed if file_path in state: logger.debug(f"Skipping file '{file_path}' as it has already been processed") continue