From 08879e2d687ed42997c4c0b8d060c77281d4d71c Mon Sep 17 00:00:00 2001 From: Subash-Mohan Date: Fri, 6 Dec 2024 20:28:44 +0530 Subject: [PATCH 1/3] Add SlimConnector support for Zendesk --- .../danswer/connectors/zendesk/connector.py | 43 ++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/backend/danswer/connectors/zendesk/connector.py b/backend/danswer/connectors/zendesk/connector.py index 170da788a76..e0ffde7d688 100644 --- a/backend/danswer/connectors/zendesk/connector.py +++ b/backend/danswer/connectors/zendesk/connector.py @@ -10,17 +10,21 @@ time_str_to_utc, ) from danswer.connectors.interfaces import GenerateDocumentsOutput +from danswer.connectors.interfaces import GenerateSlimDocumentOutput from danswer.connectors.interfaces import LoadConnector from danswer.connectors.interfaces import PollConnector from danswer.connectors.interfaces import SecondsSinceUnixEpoch +from danswer.connectors.interfaces import SlimConnector from danswer.connectors.models import BasicExpertInfo from danswer.connectors.models import Document from danswer.connectors.models import Section +from danswer.connectors.models import SlimDocument from danswer.file_processing.html_utils import parse_html_page_basic from danswer.utils.retry_wrapper import retry_builder MAX_PAGE_SIZE = 30 # Zendesk API maximum +_SLIM_BATCH_SIZE = 1000 class ZendeskCredentialsNotSetUpError(PermissionError): @@ -265,7 +269,7 @@ def _ticket_to_document( ) -class ZendeskConnector(LoadConnector, PollConnector): +class ZendeskConnector(LoadConnector, PollConnector, SlimConnector): def __init__( self, batch_size: int = INDEX_BATCH_SIZE, @@ -390,6 +394,43 @@ def _poll_tickets( if doc_batch: yield doc_batch + def retrieve_all_slim_documents( + self, + start: SecondsSinceUnixEpoch | None = None, + end: SecondsSinceUnixEpoch | None = None, + ) -> GenerateSlimDocumentOutput: + slim_doc_batch: list[SlimDocument] = [] + if self.content_type == "articles": + articles = _get_articles( + self.client, start_time=int(start) if start else None + ) + for article in articles: + slim_doc_batch.append( + SlimDocument( + id=f"article:{article['id']}", + ) + ) + if len(slim_doc_batch) >= _SLIM_BATCH_SIZE: + yield slim_doc_batch + slim_doc_batch = [] + elif self.content_type == "tickets": + tickets = _get_tickets( + self.client, start_time=int(start) if start else None + ) + for ticket in tickets: + slim_doc_batch.append( + SlimDocument( + id=f"zendesk_ticket_{ticket['id']}", + ) + ) + if len(slim_doc_batch) >= _SLIM_BATCH_SIZE: + yield slim_doc_batch + slim_doc_batch = [] + else: + raise ValueError(f"Unsupported content_type: {self.content_type}") + if slim_doc_batch: + yield slim_doc_batch + if __name__ == "__main__": import os From 2d146d8528cb950ccee7b72f9fa57e3842797aae Mon Sep 17 00:00:00 2001 From: Subash-Mohan Date: Sat, 7 Dec 2024 21:15:22 +0530 Subject: [PATCH 2/3] ZenDesk format changes --- .../workflows/pr-python-connector-tests.yml | 5 + .../zendesk/test_zendesk_connector.py | 94 +++++++++++++++++++ .../connectors/zendesk/test_zendesk_data.json | 11 +++ 3 files changed, 110 insertions(+) create mode 100644 backend/tests/daily/connectors/zendesk/test_zendesk_connector.py create mode 100644 backend/tests/daily/connectors/zendesk/test_zendesk_data.json diff --git a/.github/workflows/pr-python-connector-tests.yml b/.github/workflows/pr-python-connector-tests.yml index e8720adaf2f..92b21d9ac03 100644 --- a/.github/workflows/pr-python-connector-tests.yml +++ b/.github/workflows/pr-python-connector-tests.yml @@ -26,6 +26,11 @@ env: GOOGLE_GMAIL_OAUTH_CREDENTIALS_JSON_STR: ${{ secrets.GOOGLE_GMAIL_OAUTH_CREDENTIALS_JSON_STR }} # Slab SLAB_BOT_TOKEN: ${{ secrets.SLAB_BOT_TOKEN }} + #ZenDesk + ZENDESK_SUBDOMAIN: ${{ secrets.ZENDESK_SUBDOMAIN }} + ZENDESK_EMAIL: ${{ secrets.ZENDESK_EMAIL }} + ZENDESK_TOKEN: ${{ secrets.ZENDESK_TOKEN }} + jobs: connectors-check: diff --git a/backend/tests/daily/connectors/zendesk/test_zendesk_connector.py b/backend/tests/daily/connectors/zendesk/test_zendesk_connector.py new file mode 100644 index 00000000000..5a3ebd3338b --- /dev/null +++ b/backend/tests/daily/connectors/zendesk/test_zendesk_connector.py @@ -0,0 +1,94 @@ +import json +import os +import time +from pathlib import Path + +import pytest + +from danswer.configs.constants import DocumentSource +from danswer.connectors.models import Document +from danswer.connectors.zendesk.connector import ZendeskConnector + + +def load_test_data(file_name: str = "test_zendesk_data.json") -> dict[str, dict]: + current_dir = Path(__file__).parent + with open(current_dir / file_name, "r") as f: + return json.load(f) + + +@pytest.fixture +def zendesk_article_connector() -> ZendeskConnector: + connector = ZendeskConnector(content_type="articles") + connector.load_credentials(get_credentials()) + return connector + + +@pytest.fixture +def zendesk_ticket_connector() -> ZendeskConnector: + connector = ZendeskConnector(content_type="tickets") + connector.load_credentials(get_credentials()) + return connector + + +def get_credentials(): + return { + "zendesk_subdomain": os.environ["ZENDESK_SUBDOMAIN"], + "zendesk_email": os.environ["ZENDESK_EMAIL"], + "zendesk_token": os.environ["ZENDESK_TOKEN"], + } + + +@pytest.mark.parametrize( + "connector_fixture", ["zendesk_article_connector", "zendesk_ticket_connector"] +) +def test_zendesk_connector_basic(request, connector_fixture): + connector = request.getfixturevalue(connector_fixture) + test_data = load_test_data() + all_docs: list[Document] = [] + target_test_doc_id: str + if connector.content_type == "articles": + target_test_doc_id = f"article:{test_data['article']['id']}" + else: + target_test_doc_id = f"zendesk_ticket_{test_data['ticket']['id']}" + + target_doc: Document | None = None + + for doc_batch in connector.poll_source(0, time.time()): + for doc in doc_batch: + all_docs.append(doc) + if doc.id == target_test_doc_id: + target_doc = doc + + assert len(all_docs) > 0, "No documents were retrieved from the connector" + assert ( + target_doc is not None + ), "Target document was not found in the retrieved documents" + assert target_doc.source == DocumentSource.ZENDESK, "Document source is not ZENDESK" + + if connector.content_type == "articles": + print(f"target_doc.semantic_identifier {target_doc.semantic_identifier}") + assert ( + target_doc.semantic_identifier + == test_data["article"]["semantic_identifier"] + ), "Article title does not match" + else: + assert target_doc.semantic_identifier.startswith( + f"Ticket #{test_data['ticket']['id']}" + ), "Ticket ID does not match" + + +def test_zendesk_connector_slim(zendesk_article_connector: ZendeskConnector): + # Get full doc IDs + all_full_doc_ids = set() + for doc_batch in zendesk_article_connector.load_from_state(): + all_full_doc_ids.update([doc.id for doc in doc_batch]) + + # Get slim doc IDs + all_slim_doc_ids = set() + for slim_doc_batch in zendesk_article_connector.retrieve_all_slim_documents(): + all_slim_doc_ids.update([doc.id for doc in slim_doc_batch]) + + # Full docs should be subset of slim docs + assert all_full_doc_ids.issubset( + all_slim_doc_ids + ), f"Full doc IDs {all_full_doc_ids} not subset of slim doc IDs {all_slim_doc_ids}" diff --git a/backend/tests/daily/connectors/zendesk/test_zendesk_data.json b/backend/tests/daily/connectors/zendesk/test_zendesk_data.json new file mode 100644 index 00000000000..0b508715502 --- /dev/null +++ b/backend/tests/daily/connectors/zendesk/test_zendesk_data.json @@ -0,0 +1,11 @@ +{ + "article": { + "id": "17275801227804", + "semantic_identifier": "How can agents leverage knowledge to help customers?" + + }, + "ticket": { + "id": "1" + + } +} \ No newline at end of file From ad6e663447c36b37a5f8441cec91f8ff1acd9c88 Mon Sep 17 00:00:00 2001 From: Subash-Mohan Date: Tue, 10 Dec 2024 14:27:04 +0530 Subject: [PATCH 3/3] code formating --- .../daily/connectors/zendesk/test_zendesk_connector.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/backend/tests/daily/connectors/zendesk/test_zendesk_connector.py b/backend/tests/daily/connectors/zendesk/test_zendesk_connector.py index 5a3ebd3338b..321ca298b79 100644 --- a/backend/tests/daily/connectors/zendesk/test_zendesk_connector.py +++ b/backend/tests/daily/connectors/zendesk/test_zendesk_connector.py @@ -30,7 +30,7 @@ def zendesk_ticket_connector() -> ZendeskConnector: return connector -def get_credentials(): +def get_credentials() -> dict[str, str]: return { "zendesk_subdomain": os.environ["ZENDESK_SUBDOMAIN"], "zendesk_email": os.environ["ZENDESK_EMAIL"], @@ -41,7 +41,9 @@ def get_credentials(): @pytest.mark.parametrize( "connector_fixture", ["zendesk_article_connector", "zendesk_ticket_connector"] ) -def test_zendesk_connector_basic(request, connector_fixture): +def test_zendesk_connector_basic( + request: pytest.FixtureRequest, connector_fixture: str +) -> None: connector = request.getfixturevalue(connector_fixture) test_data = load_test_data() all_docs: list[Document] = [] @@ -77,7 +79,7 @@ def test_zendesk_connector_basic(request, connector_fixture): ), "Ticket ID does not match" -def test_zendesk_connector_slim(zendesk_article_connector: ZendeskConnector): +def test_zendesk_connector_slim(zendesk_article_connector: ZendeskConnector) -> None: # Get full doc IDs all_full_doc_ids = set() for doc_batch in zendesk_article_connector.load_from_state():