-
Notifications
You must be signed in to change notification settings - Fork 264
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Initial contact center base scenario setup and added summarization scenario #2569
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
""" | ||
Base scenario for the contact center conversation. | ||
|
||
This scenario defines the basic input structure of the conversation | ||
and shared functions for the contact center conversation scenarios. | ||
|
||
The conversation structure is json format with the following fields: | ||
[ | ||
{ | ||
"conversation_name": "cresta-helm-cc-2018-01-12T19:31:31.404000", | ||
"body": [ | ||
{ | ||
"id": "abcd-1", | ||
"text": "Hello?", | ||
"speaker_role": "visitor", | ||
}, | ||
{ | ||
"id": "abcd-2", | ||
"text": "Thank you for contacting xxx! My name is Jack. I am here to help you. How can I help you today?", | ||
"speaker_role": "agent", | ||
} | ||
], | ||
}, | ||
... | ||
] | ||
""" | ||
|
||
import json | ||
from .scenario import Scenario | ||
|
||
class ContactCenterConversationScenario(Scenario):
    """Shared base for the contact center conversation scenarios.

    Keeps track of where the dataset lives and offers the JSON loading helper
    that the task-specific subclasses use to build their instances.
    """

    name = "cc_conversation"
    description = "Base scenario for contact center conversation tasks"
    tags = ["cc_conversation"]

    def __init__(self, dataset_path: str) -> None:
        """
        Set up the contact center base scenario.

        Args:
            dataset_path: path of dataset to load from.
        """
        super().__init__()
        self.dataset_path = dataset_path

    def _load_conversations(self, dataset_path):
        """
        Read the conversations stored at the given path.

        The file is parsed as UTF-8 encoded JSON and handed back untouched;
        turning the raw conversations into task-specific input/output text is
        left to the subclass scenario.

        Args:
            dataset_path: path of the JSON file to load from.

        Returns:
            The parsed JSON content (per the module docstring, a list of
            conversation dictionaries).
        """
        with open(dataset_path, mode="r", encoding="utf-8") as conversation_file:
            return json.load(conversation_file)
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
""" | ||
Scenario for contact center conversational summarization. | ||
|
||
Loads input conversations defined in the ContactCenterConversationScenario | ||
and packs task specific input/output format for summarization tasks. | ||
|
||
Task structure | ||
|
||
Conversation: | ||
agent: message1 | ||
visitor: message2 | ||
.... | ||
Summary: | ||
summary of the conversation | ||
|
||
|
||
Example from the dataset | ||
Conversation: | ||
agent: hi how can i help you today | ||
visitor: i need help with my account | ||
agent: sure what is your account number | ||
visitor: 123456 | ||
..... | ||
Summary: | ||
- agent helped visitor with account number 123456 | ||
""" | ||
|
||
|
||
import json | ||
import os | ||
from typing import List, Optional | ||
from helm.benchmark.scenarios.contactcenter_convo_base_scenario import ContactCenterConversationScenario | ||
from helm.common.general import ensure_file_downloaded, ensure_directory_exists | ||
from .scenario import Instance, Reference, TEST_SPLIT, CORRECT_TAG, Input, Output | ||
|
||
|
||
class ContactCenterConversationSummarizationScenario(ContactCenterConversationScenario):
    """
    Scenario for contact center conversational summarization.

    Reads ``conversations.json`` and ``summaries.json`` from ``dataset_path``,
    renders every conversation as one ``speaker_role:text`` line per message
    and pairs it with its reference summary. All instances are emitted in the
    test split.
    """

    name = "cc_convo_summarization"
    # NOTE(review): "centern" looks like a typo for "center"; the string is
    # left untouched here because it is emitted at runtime.
    description = "Scenario for contact centern summarization tasks"
    tags = ["cc_conversation_summarization"]

    # Conversations rendered with at most this many lines are dropped by
    # `_filter`.
    _SHORT_CONVERSATION_MAX_LINES = 10

    def __init__(
        self,
        dataset_path: str,
        sampling_min_length: Optional[int] = None,
        sampling_max_length: Optional[int] = None,
        doc_max_length: Optional[int] = None,
    ) -> None:
        """
        Initializes summarization scenario.

        All lengths are counted in whitespace-delimited words of the rendered
        conversation text, not in characters or model tokens.

        Args:
            dataset_path: directory containing ``conversations.json`` and
                ``summaries.json``.
            sampling_min_length: minimum conversation length. Conversations
                shorter than this are skipped. ``None`` (or 0) disables the
                lower bound.
            sampling_max_length: maximum conversation length. Conversations
                longer than this are skipped. ``None`` (or 0) disables the
                upper bound.
            doc_max_length: intended maximum length to truncate documents to.
                NOTE: the value is only stored; no truncation is currently
                applied by `get_instances`.
        """
        # The base class requires the dataset path and stores it on the
        # instance; calling it without arguments raises TypeError.
        super().__init__(dataset_path)
        self.sampling_min_length = sampling_min_length
        self.sampling_max_length = sampling_max_length
        self.doc_max_length = doc_max_length

    def _filter(self, convo: str, summary: str) -> bool:
        """
        Decide whether a conversation should be dropped for being too short.

        Args:
            convo: rendered conversation text, one message per line. A message
                whose text itself contains newlines counts as several lines.
            summary: reference summary; currently not inspected.

        Returns:
            True when the conversation has at most
            ``_SHORT_CONVERSATION_MAX_LINES`` lines and should be skipped.
        """
        return len(convo.split("\n")) <= self._SHORT_CONVERSATION_MAX_LINES

    def _load_summaries(self, dataset_path: str) -> dict:
        """
        Load the reference summaries from a JSON file.

        Args:
            dataset_path: path of a JSON file holding a list of objects with
                ``conversation_name`` and ``summary`` keys.

        Returns:
            Mapping from conversation name to its summary. If a name occurs
            more than once, the last entry wins.
        """
        with open(dataset_path, "r", encoding="utf-8") as f:
            summaries_list = json.load(f)
        return {item["conversation_name"]: item["summary"] for item in summaries_list}

    def get_instances(self) -> List[Instance]:
        """
        Build the summarization instances from the dataset directory.

        Conversations are skipped when their word count falls outside the
        configured sampling bounds or when `_filter` flags them as too short.

        Returns:
            One test-split instance per retained conversation, with the
            rendered conversation as input and its summary as the single
            correct reference.

        Raises:
            KeyError: if a conversation has no entry in ``summaries.json``.
        """
        conversation_path = os.path.join(self.dataset_path, "conversations.json")
        summary_path = os.path.join(self.dataset_path, "summaries.json")
        conversations = self._load_conversations(conversation_path)
        summaries = self._load_summaries(summary_path)

        instances: List[Instance] = []

        for example in conversations:
            conversation_name = example["conversation_name"]
            full_conversation_text = "\n".join(
                f"{item['speaker_role']}:{item['text']}" for item in example["body"]
            )
            summary = summaries[conversation_name]

            # TODO: use better tokenization to count tokens; for now the
            # length is the number of whitespace-delimited words.
            conversation_len = len(full_conversation_text.split())
            if self.sampling_max_length and conversation_len > self.sampling_max_length:
                continue
            if self.sampling_min_length and conversation_len < self.sampling_min_length:
                continue

            if self._filter(full_conversation_text, summary):
                continue

            # always load TEST split as we don't offer train data
            instances.append(
                Instance(
                    input=Input(text=full_conversation_text),
                    references=[Reference(Output(text=summary), tags=[CORRECT_TAG])],
                    split=TEST_SPLIT,
                )
            )

        return instances
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You can just delete `name`, `description` and `tags`, since this is not a concrete class and can't be instantiated.