Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Slim connector for Zendesk #3367

Merged
merged 4 commits into from
Jan 6, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .github/workflows/pr-python-connector-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@ env:
GOOGLE_GMAIL_OAUTH_CREDENTIALS_JSON_STR: ${{ secrets.GOOGLE_GMAIL_OAUTH_CREDENTIALS_JSON_STR }}
# Slab
SLAB_BOT_TOKEN: ${{ secrets.SLAB_BOT_TOKEN }}
# Zendesk
ZENDESK_SUBDOMAIN: ${{ secrets.ZENDESK_SUBDOMAIN }}
ZENDESK_EMAIL: ${{ secrets.ZENDESK_EMAIL }}
ZENDESK_TOKEN: ${{ secrets.ZENDESK_TOKEN }}
# Salesforce
SF_USERNAME: ${{ secrets.SF_USERNAME }}
SF_PASSWORD: ${{ secrets.SF_PASSWORD }}
Expand Down
43 changes: 42 additions & 1 deletion backend/onyx/connectors/zendesk/connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,21 @@
time_str_to_utc,
)
from onyx.connectors.interfaces import GenerateDocumentsOutput
from onyx.connectors.interfaces import GenerateSlimDocumentOutput
from onyx.connectors.interfaces import LoadConnector
from onyx.connectors.interfaces import PollConnector
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.interfaces import SlimConnector
from onyx.connectors.models import BasicExpertInfo
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import SlimDocument
from onyx.file_processing.html_utils import parse_html_page_basic
from onyx.utils.retry_wrapper import retry_builder


MAX_PAGE_SIZE = 30 # Zendesk API maximum
_SLIM_BATCH_SIZE = 1000


class ZendeskCredentialsNotSetUpError(PermissionError):
Expand Down Expand Up @@ -272,7 +276,7 @@ def _ticket_to_document(
)


class ZendeskConnector(LoadConnector, PollConnector):
class ZendeskConnector(LoadConnector, PollConnector, SlimConnector):
def __init__(
self,
batch_size: int = INDEX_BATCH_SIZE,
Expand Down Expand Up @@ -397,6 +401,43 @@ def _poll_tickets(
if doc_batch:
yield doc_batch

def retrieve_all_slim_documents(
    self,
    start: SecondsSinceUnixEpoch | None = None,
    end: SecondsSinceUnixEpoch | None = None,
) -> GenerateSlimDocumentOutput:
    """Yield batches of ID-only ``SlimDocument`` objects for this connector.

    Depending on ``self.content_type`` the IDs come from ``_get_articles``
    ("articles") or ``_get_tickets`` ("tickets"). Batches are emitted once
    they reach ``_SLIM_BATCH_SIZE`` entries; a final partial batch is emitted
    if anything is left over.

    Args:
        start: Lower time bound. A falsy value (``None`` or ``0``) is passed
            on as ``None``; anything else is truncated with ``int``.
        end: Accepted for interface compatibility; not used by this method.

    Raises:
        ValueError: If ``self.content_type`` is neither "articles" nor
            "tickets". Because this is a generator, the error surfaces on
            first iteration rather than at call time.
    """
    # Pick the ID stream for the configured content type. The fetch helper
    # is invoked right here; only the per-item ID formatting is deferred.
    if self.content_type == "articles":
        doc_ids = (
            f"article:{article['id']}"
            for article in _get_articles(
                self.client, start_time=int(start) if start else None
            )
        )
    elif self.content_type == "tickets":
        doc_ids = (
            f"zendesk_ticket_{ticket['id']}"
            for ticket in _get_tickets(
                self.client, start_time=int(start) if start else None
            )
        )
    else:
        raise ValueError(f"Unsupported content_type: {self.content_type}")

    pending: list[SlimDocument] = []
    for doc_id in doc_ids:
        pending.append(SlimDocument(id=doc_id))
        if len(pending) >= _SLIM_BATCH_SIZE:
            yield pending
            # Rebind instead of clearing so the yielded list is not mutated.
            pending = []

    # Flush whatever did not fill a complete batch.
    if pending:
        yield pending


if __name__ == "__main__":
import os
Expand Down
96 changes: 96 additions & 0 deletions backend/tests/daily/connectors/zendesk/test_zendesk_connector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import json
import os
import time
from pathlib import Path

import pytest

from danswer.configs.constants import DocumentSource
from danswer.connectors.models import Document
from danswer.connectors.zendesk.connector import ZendeskConnector


def load_test_data(file_name: str = "test_zendesk_data.json") -> dict[str, dict]:
    """Load a JSON fixture that lives next to this test module.

    Args:
        file_name: Name of the fixture file, resolved against the directory
            containing this module.

    Returns:
        The parsed JSON content.
    """
    data_path = Path(__file__).parent / file_name
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default locale encoding.
    with data_path.open("r", encoding="utf-8") as data_file:
        return json.load(data_file)


@pytest.fixture
def zendesk_article_connector() -> ZendeskConnector:
    """Provide a ``ZendeskConnector`` for articles with credentials loaded."""
    article_connector = ZendeskConnector(content_type="articles")
    # Credentials are read only after the connector has been constructed.
    credentials = get_credentials()
    article_connector.load_credentials(credentials)
    return article_connector


@pytest.fixture
def zendesk_ticket_connector() -> ZendeskConnector:
    """Provide a ``ZendeskConnector`` for tickets with credentials loaded."""
    ticket_connector = ZendeskConnector(content_type="tickets")
    # Credentials are read only after the connector has been constructed.
    credentials = get_credentials()
    ticket_connector.load_credentials(credentials)
    return ticket_connector


def get_credentials() -> dict[str, str]:
    """Build the Zendesk credential mapping from environment variables.

    Reads ``ZENDESK_SUBDOMAIN``, ``ZENDESK_EMAIL`` and ``ZENDESK_TOKEN``.

    Raises:
        KeyError: If any of the three variables is not set.
    """
    subdomain = os.environ["ZENDESK_SUBDOMAIN"]
    email = os.environ["ZENDESK_EMAIL"]
    token = os.environ["ZENDESK_TOKEN"]
    return dict(
        zendesk_subdomain=subdomain,
        zendesk_email=email,
        zendesk_token=token,
    )


@pytest.mark.parametrize(
    "connector_fixture", ["zendesk_article_connector", "zendesk_ticket_connector"]
)
def test_zendesk_connector_basic(
    request: pytest.FixtureRequest, connector_fixture: str
) -> None:
    """Poll the connector and check that the known fixture document shows up.

    Runs once per connector fixture. The expected document ID and title are
    taken from the JSON fixture returned by ``load_test_data``.
    """
    connector = request.getfixturevalue(connector_fixture)
    test_data = load_test_data()
    checking_articles = connector.content_type == "articles"

    # The ID format differs between articles and tickets.
    target_test_doc_id: str = (
        f"article:{test_data['article']['id']}"
        if checking_articles
        else f"zendesk_ticket_{test_data['ticket']['id']}"
    )

    # Flatten every polled batch into a single list of documents.
    all_docs: list[Document] = [
        doc for doc_batch in connector.poll_source(0, time.time()) for doc in doc_batch
    ]

    # If the ID appears more than once, keep the last occurrence.
    matching_docs = [doc for doc in all_docs if doc.id == target_test_doc_id]
    target_doc: Document | None = matching_docs[-1] if matching_docs else None

    assert len(all_docs) > 0, "No documents were retrieved from the connector"
    assert (
        target_doc is not None
    ), "Target document was not found in the retrieved documents"
    assert target_doc.source == DocumentSource.ZENDESK, "Document source is not ZENDESK"

    if checking_articles:
        print(f"target_doc.semantic_identifier {target_doc.semantic_identifier}")
        expected_title = test_data["article"]["semantic_identifier"]
        assert (
            target_doc.semantic_identifier == expected_title
        ), "Article title does not match"
    else:
        assert target_doc.semantic_identifier.startswith(
            f"Ticket #{test_data['ticket']['id']}"
        ), "Ticket ID does not match"


def test_zendesk_connector_slim(zendesk_article_connector: ZendeskConnector) -> None:
    """Check that every fully loaded article ID is also reported by the slim pass."""
    # IDs from the full load.
    all_full_doc_ids = {
        doc.id
        for doc_batch in zendesk_article_connector.load_from_state()
        for doc in doc_batch
    }

    # IDs from the slim (ID-only) retrieval.
    all_slim_doc_ids = {
        doc.id
        for slim_doc_batch in zendesk_article_connector.retrieve_all_slim_documents()
        for doc in slim_doc_batch
    }

    # The slim listing may contain extra IDs, but must not miss any.
    assert all_full_doc_ids.issubset(
        all_slim_doc_ids
    ), f"Full doc IDs {all_full_doc_ids} not subset of slim doc IDs {all_slim_doc_ids}"
11 changes: 11 additions & 0 deletions backend/tests/daily/connectors/zendesk/test_zendesk_data.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"article": {
"id": "17275801227804",
"semantic_identifier": "How can agents leverage knowledge to help customers?"

},
"ticket": {
"id": "1"

}
}
Loading