From 8742e851c12893e99a1b787f53efcd6938d37167 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9F?= Date: Wed, 27 Sep 2023 16:02:51 +0200 Subject: [PATCH] add directory connector --- backend/danswer/configs/app_configs.py | 1 + backend/danswer/configs/constants.py | 1 + .../danswer/connectors/directory/connector.py | 86 +++++++++++ backend/danswer/connectors/factory.py | 2 + .../app/admin/connectors/directory/page.tsx | 134 ++++++++++++++++++ web/src/app/admin/layout.tsx | 10 ++ web/src/components/icons/icons.tsx | 8 ++ web/src/components/search/Filters.tsx | 1 + web/src/components/source.tsx | 7 + web/src/lib/types.ts | 12 +- 10 files changed, 261 insertions(+), 1 deletion(-) create mode 100644 backend/danswer/connectors/directory/connector.py create mode 100644 web/src/app/admin/connectors/directory/page.tsx diff --git a/backend/danswer/configs/app_configs.py b/backend/danswer/configs/app_configs.py index 055c2367ea6..1d66af6e0d2 100644 --- a/backend/danswer/configs/app_configs.py +++ b/backend/danswer/configs/app_configs.py @@ -122,6 +122,7 @@ FILE_CONNECTOR_TMP_STORAGE_PATH = os.environ.get( "FILE_CONNECTOR_TMP_STORAGE_PATH", "/home/file_connector_storage" ) +DIRECTORY_CONNECTOR_PATH = os.environ.get("DIRECTORY_CONNECTOR_PATH", "/data") # TODO these should be available for frontend configuration, via advanced options expandable WEB_CONNECTOR_IGNORED_CLASSES = os.environ.get( "WEB_CONNECTOR_IGNORED_CLASSES", "sidebar,footer" diff --git a/backend/danswer/configs/constants.py b/backend/danswer/configs/constants.py index 3dde92e5dfd..1c6175db040 100644 --- a/backend/danswer/configs/constants.py +++ b/backend/danswer/configs/constants.py @@ -53,6 +53,7 @@ class DocumentSource(str, Enum): JIRA = "jira" PRODUCTBOARD = "productboard" FILE = "file" + DIRECTORY = "directory" NOTION = "notion" ZULIP = "zulip" LINEAR = "linear" diff --git a/backend/danswer/connectors/directory/connector.py b/backend/danswer/connectors/directory/connector.py new file mode 100644 
index 00000000000..8a6ff440d22
--- /dev/null
+++ b/backend/danswer/connectors/directory/connector.py
@@ -0,0 +1,116 @@
+import json
+import os
+from collections.abc import Generator
+from pathlib import Path
+from typing import Any
+from typing import IO
+
+from danswer.configs.app_configs import INDEX_BATCH_SIZE
+from danswer.configs.app_configs import DIRECTORY_CONNECTOR_PATH
+from danswer.configs.constants import DocumentSource
+from danswer.connectors.file.utils import get_file_ext
+from danswer.connectors.interfaces import GenerateDocumentsOutput
+from danswer.connectors.interfaces import LoadConnector
+from danswer.connectors.models import Document
+from danswer.connectors.models import Section
+from danswer.utils.logger import setup_logger
+
+
+logger = setup_logger()
+
+_METADATA_FLAG = "#DANSWER_METADATA="
+
+
+def _open_files_at_location_recursive(
+    base_path: str | Path,
+    file_path: str | Path,
+) -> Generator[tuple[str, IO[Any]], Any, None]:
+    """Recursively yield every ``.txt`` file below ``base_path / file_path``.
+
+    Each item is ``(path relative to base_path, open text handle)``. The handle
+    is closed when the generator is advanced, so read it before asking for the
+    next item. Files with any other extension are skipped with a warning.
+    """
+    # Sorted so the indexing order does not depend on the file system.
+    for entry in sorted(os.listdir(os.path.join(base_path, file_path))):
+        rel_file_path = os.path.join(file_path, entry)
+        abs_file_path = os.path.join(base_path, rel_file_path)
+        if os.path.isdir(abs_file_path):
+            yield from _open_files_at_location_recursive(base_path, rel_file_path)
+            continue
+
+        extension = get_file_ext(abs_file_path)
+        if extension != ".txt":
+            logger.warning(
+                f"Skipping file '{abs_file_path}' with extension '{extension}'"
+            )
+            continue
+
+        with open(abs_file_path, "r", encoding="utf8") as opened_file:
+            yield str(rel_file_path), opened_file
+
+
+def _process_file(file_name: str, file: IO[Any]) -> list[Document]:
+    """Turn one opened file into a single-element list of ``Document``.
+
+    If the first line starts with ``#DANSWER_METADATA=``, the rest of that line
+    is parsed as JSON and only its ``link`` value is used (as the section link);
+    that line is left out of the indexed text.
+    """
+    metadata: dict[str, Any] = {}
+    content_lines: list[str] = []
+    for ind, line in enumerate(file):
+        # Binary handles yield bytes; normalise everything to str.
+        if isinstance(line, bytes):
+            line = line.decode("utf-8")
+        line = str(line)
+
+        if ind == 0 and line.startswith(_METADATA_FLAG):
+            metadata = json.loads(line.replace(_METADATA_FLAG, "", 1).strip())
+        else:
+            content_lines.append(line)
+
+    return [
+        Document(
+            id=file_name,
+            sections=[
+                Section(
+                    link=metadata.get("link", ""),
+                    text="".join(content_lines),
+                )
+            ],
+            # Must be DIRECTORY so results match the source this connector is
+            # registered under and the "directory" search filter.
+            source=DocumentSource.DIRECTORY,
+            semantic_identifier=file_name,
+            metadata={},
+        )
+    ]
+
+
+class LocalDirectoryConnector(LoadConnector):
+    """Indexes the ``.txt`` files found below ``DIRECTORY_CONNECTOR_PATH``."""
+
+    def __init__(self) -> None:
+        self.file_locations = [Path(DIRECTORY_CONNECTOR_PATH)]
+        self.batch_size = INDEX_BATCH_SIZE
+
+    def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
+        """No credentials are needed to read the local directory."""
+        return None
+
+    def load_from_state(self) -> GenerateDocumentsOutput:
+        """Yield documents in batches of ``self.batch_size`` plus a final remainder."""
+        documents: list[Document] = []
+        for file_location in self.file_locations:
+            files = _open_files_at_location_recursive(file_location, "")
+
+            for file_name, file in files:
+                documents.extend(_process_file(file_name, file))
+
+                if len(documents) >= self.batch_size:
+                    yield documents
+                    documents = []
+
+        if documents:
+            yield documents
diff --git a/backend/danswer/connectors/factory.py b/backend/danswer/connectors/factory.py
index 6de50524a0d..43a20f6c888 100644
--- a/backend/danswer/connectors/factory.py
+++ b/backend/danswer/connectors/factory.py
@@ -6,6 +6,7 @@
 from danswer.connectors.confluence.connector import ConfluenceConnector
 from danswer.connectors.danswer_jira.connector import JiraConnector
 from danswer.connectors.file.connector import LocalFileConnector
+from danswer.connectors.directory.connector import LocalDirectoryConnector
 from danswer.connectors.github.connector import GithubConnector
 from danswer.connectors.google_drive.connector import GoogleDriveConnector
 from danswer.connectors.guru.connector import GuruConnector
@@ -35,6 +36,7 @@ def identify_connector_class(
     connector_map = {
         DocumentSource.WEB: WebConnector,
         DocumentSource.FILE: LocalFileConnector,
+        DocumentSource.DIRECTORY: LocalDirectoryConnector,
         DocumentSource.SLACK: {
             InputType.LOAD_STATE: SlackLoadConnector,
             InputType.POLL: SlackPollConnector,
diff --git a/web/src/app/admin/connectors/directory/page.tsx b/web/src/app/admin/connectors/directory/page.tsx
new file mode 100644
index 00000000000..7706674185f
--- /dev/null
+++ 
b/web/src/app/admin/connectors/directory/page.tsx @@ -0,0 +1,134 @@ +"use client"; + +import * as Yup from "yup"; +import { DirectoryIcon, TrashIcon } from "@/components/icons/icons"; +import { TextFormField } from "@/components/admin/connectors/Field"; +import { HealthCheckBanner } from "@/components/health/healthcheck"; +import { CredentialForm } from "@/components/admin/connectors/CredentialForm"; +import { +// DirectoryCredentialJson, + DirectoryConfig, + ConnectorIndexingStatus, +} from "@/lib/types"; +import useSWR, { useSWRConfig } from "swr"; +import { fetcher } from "@/lib/fetcher"; +import { LoadingAnimation } from "@/components/Loading"; +import { adminDeleteCredential, linkCredential } from "@/lib/credential"; +import { ConnectorForm } from "@/components/admin/connectors/ConnectorForm"; +import { ConnectorsTable } from "@/components/admin/connectors/table/ConnectorsTable"; +import { usePopup } from "@/components/admin/connectors/Popup"; +import { usePublicCredentials } from "@/lib/hooks"; + +const Main = () => { + const { popup, setPopup } = usePopup(); + + const { mutate } = useSWRConfig(); + const { + data: connectorIndexingStatuses, + isLoading: isConnectorIndexingStatusesLoading, + error: isConnectorIndexingStatusesError, + } = useSWR[]>( + "/api/manage/admin/connector/indexing-status", + fetcher + ); +// const { +// data: credentialsData, +// isLoading: isCredentialsLoading, +// error: isCredentialsError, +// refreshCredentials, +// } = usePublicCredentials(); + + if (!connectorIndexingStatuses && isConnectorIndexingStatusesLoading) { + return ; + } + + if (isConnectorIndexingStatusesError || !connectorIndexingStatuses) { + return
Failed to load connectors
; + } + + const directoryConnectorIndexingStatuses: ConnectorIndexingStatus< + DirectoryConfig, + // DirectoryCredentialJson + {} + >[] = connectorIndexingStatuses.filter( + (connectorIndexingStatus) => + connectorIndexingStatus.connector.source === "directory" + ); + + return ( + <> + {popup} + + {directoryConnectorIndexingStatuses.length > 0 && ( + <> +

+ Directory indexing status +

+
+ + connectorIndexingStatuses={directoryConnectorIndexingStatuses} + // liveCredential={directoryCredential} + // getCredential={(credential) => { + // return ( + //
+ //

{credential.credential_json.bookstack_api_token_id}

+ //
+ // ); + // }} + // onCredentialLink={async (connectorId) => { + // if (directoryCredential) { + // await linkCredential(connectorId, directoryCredential.id); + // mutate("/api/manage/admin/connector/indexing-status"); + // } + // }} + onUpdate={() => + mutate("/api/manage/admin/connector/indexing-status") + } + /> +
+ + )} + + {directoryConnectorIndexingStatuses.length === 0 && ( + <> +
+

Create Connection

+

+ Press connect below to start directory indexing. +

+ + nameBuilder={(values) => `DirectoryConnector`} + source="directory" + inputType="load_state" // or "poll" + formBody={<>} + validationSchema={Yup.object().shape({})} + initialValues={{}} + refreshFreq={1 * 60} // 1 minute + onSubmit={async (isSuccess, responseJson) => { + if (isSuccess && responseJson) { + await linkCredential(responseJson.id, 0); + mutate("/api/manage/admin/connector/indexing-status"); + } + }} + /> +
+ + )} + + ); +}; + +export default function Page() { + return ( +
+
+ +
+
+ +

Directory

+
+
+
+ ); +} diff --git a/web/src/app/admin/layout.tsx b/web/src/app/admin/layout.tsx index 7c9edba2d30..f9ce2404030 100644 --- a/web/src/app/admin/layout.tsx +++ b/web/src/app/admin/layout.tsx @@ -11,6 +11,7 @@ import { ConfluenceIcon, GuruIcon, FileIcon, + DirectoryIcon, JiraIcon, SlabIcon, NotionIcon, @@ -190,6 +191,15 @@ export default async function AdminLayout({ ), link: "/admin/connectors/file", }, + { + name: ( +
+ +
Directory
+
+ ), + link: "/admin/connectors/directory", + }, ], }, { diff --git a/web/src/components/icons/icons.tsx b/web/src/components/icons/icons.tsx index f9a250e4de0..b0e5a30ddd9 100644 --- a/web/src/components/icons/icons.tsx +++ b/web/src/components/icons/icons.tsx @@ -20,6 +20,7 @@ import { FiChevronsUp, FiEdit, FiFile, + FiFolder, FiGlobe, FiThumbsDown, FiThumbsUp, @@ -117,6 +118,13 @@ export const FileIcon = ({ return ; }; +export const DirectoryIcon = ({ + size = 16, + className = defaultTailwindCSS, +}: IconProps) => { + return ; +}; + export const InfoIcon = ({ size = 16, className = defaultTailwindCSS, diff --git a/web/src/components/search/Filters.tsx b/web/src/components/search/Filters.tsx index 485548b7b15..c2dc16870cf 100644 --- a/web/src/components/search/Filters.tsx +++ b/web/src/components/search/Filters.tsx @@ -16,6 +16,7 @@ const sources: Source[] = [ { displayName: "Web", internalName: "web" }, { displayName: "Guru", internalName: "guru" }, { displayName: "File", internalName: "file" }, + { displayName: "Directory", internalName: "directory" }, { displayName: "Notion", internalName: "notion" }, { displayName: "Zulip", internalName: "zulip" }, { displayName: "Linear", internalName: "linear" }, diff --git a/web/src/components/source.tsx b/web/src/components/source.tsx index 683c6c77fac..9d5810e0465 100644 --- a/web/src/components/source.tsx +++ b/web/src/components/source.tsx @@ -3,6 +3,7 @@ import { BookstackIcon, ConfluenceIcon, FileIcon, + DirectoryIcon, GithubIcon, GlobeIcon, GoogleDriveIcon, @@ -36,6 +37,12 @@ export const getSourceMetadata = (sourceType: ValidSources): SourceMetadata => { displayName: "File", adminPageLink: "/admin/connectors/file", }; + case "directory": + return { + icon: DirectoryIcon, + displayName: "Directory", + adminPageLink: "/admin/connectors/directory", + }; case "slack": return { icon: SlackIcon, diff --git a/web/src/lib/types.ts b/web/src/lib/types.ts index a4035747d6f..e34cc9b03c8 100644 --- a/web/src/lib/types.ts +++ 
b/web/src/lib/types.ts @@ -21,7 +21,8 @@ export type ValidSources = | "guru" | "zulip" | "linear" - | "file"; + | "file" + | "directory"; export type ValidInputTypes = "load_state" | "poll" | "event"; export type ValidStatuses = | "success" @@ -98,6 +99,9 @@ export interface FileConfig { file_locations: string[]; } +export interface DirectoryConfig { +} + export interface ZulipConfig { realm_name: string; realm_url: string; @@ -201,6 +205,12 @@ export interface LinearCredentialJson { linear_api_key: string; } +// export interface DirectoryCredentialJson { +// bookstack_base_url: string; +// bookstack_api_token_id: string; +// bookstack_api_token_secret: string; +// } + // DELETION export interface DeletionAttemptSnapshot {