Stop retrieving more docs if all docs have been returned (microsoft#3282)
thinkall authored Aug 6, 2024
1 parent 03bfb8f commit b370170
Showing 2 changed files with 56 additions and 83 deletions.
4 changes: 2 additions & 2 deletions autogen/agentchat/contrib/retrieve_user_proxy_agent.py
@@ -519,7 +519,7 @@ def _generate_retrieve_user_reply(
self.problem, self.n_results * (2 * _tmp_retrieve_count + 1), self._search_string
)
doc_contents = self._get_context(self._results)
-                if doc_contents:
+                if doc_contents or self.n_results * (2 * _tmp_retrieve_count + 1) >= len(self._results[0]):
break
elif update_context_case2:
# Use the current intermediate info as the query text to retrieve docs, and each time we append the top similar
@@ -531,7 +531,7 @@ def _generate_retrieve_user_reply(
)
self._get_context(self._results)
doc_contents = "\n".join(self._doc_contents) # + "\n" + "\n".join(self._intermediate_answers)
-                if doc_contents:
+                if doc_contents or self.n_results * (2 * _tmp_retrieve_count + 1) >= len(self._results[0]):
break

self.clear_history()
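In effect, the retry loop keeps growing the number of requested results, and the patched condition exits once the request already covers every document the index returned, instead of exhausting the remaining retries. A minimal, self-contained sketch of that exit condition (`query_index` and `new_context` are hypothetical stand-ins for the agent's vector-DB query and context extraction, not AutoGen APIs):

```python
# Hypothetical, simplified model of the patched loop; only the break
# condition mirrors the actual change.

def query_index(query: str, n: int) -> list[list[str]]:
    # Stand-in for the vector-DB query: an index holding only 2 documents
    # returns at most 2 ids no matter how many are requested.
    all_doc_ids = ["bdfbc921", "another-doc"]
    return [all_doc_ids[:n]]

def new_context(results: list[list[str]]) -> str:
    # Stand-in: pretend every returned doc was already consumed, so no new
    # context is produced -- pre-patch, the loop would burn all retries here.
    return ""

def retrieve_with_growing_n(query: str, n_results: int, max_retries: int = 5) -> str:
    doc_contents = ""
    for attempt in range(max_retries):
        requested = n_results * (2 * attempt + 1)  # same growth schedule as the patch
        results = query_index(query, requested)
        doc_contents = new_context(results)
        # Patched condition: stop when new context was found OR the request
        # already covers every document the index can return.
        if doc_contents or requested >= len(results[0]):
            break
    return doc_contents

retrieve_with_growing_n("How to use spark?", n_results=20)  # exits on the first pass
```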
135 changes: 54 additions & 81 deletions notebook/agentchat_RetrieveChat.ipynb
@@ -48,14 +48,14 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"models to use: ['gpt-3.5-turbo-0125']\n"
"models to use: ['gpt-35-turbo', 'gpt4-1106-preview', 'gpt-4o']\n"
]
}
],
@@ -73,9 +73,7 @@
"# a vector database instance\n",
"from autogen.retrieve_utils import TEXT_FORMATS\n",
"\n",
"config_list = [\n",
" {\"model\": \"gpt-3.5-turbo-0125\", \"api_key\": \"<YOUR_API_KEY>\", \"api_type\": \"openai\"},\n",
"]\n",
"config_list = autogen.config_list_from_json(\"OAI_CONFIG_LIST\")\n",
"\n",
"assert len(config_list) > 0\n",
"print(\"models to use: \", [config_list[i][\"model\"] for i in range(len(config_list))])"
Expand Down Expand Up @@ -107,7 +105,7 @@
"output_type": "stream",
"text": [
"Accepted file formats for `docs_path`:\n",
"['odt', 'xml', 'pdf', 'docx', 'html', 'md', 'htm', 'csv', 'rst', 'org', 'ppt', 'doc', 'log', 'json', 'epub', 'jsonl', 'pptx', 'yml', 'xlsx', 'tsv', 'txt', 'yaml', 'msg', 'rtf']\n"
"['txt', 'json', 'csv', 'tsv', 'md', 'html', 'htm', 'rtf', 'rst', 'jsonl', 'log', 'xml', 'yaml', 'yml', 'pdf']\n"
]
}
],
@@ -120,7 +118,16 @@
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/workspace/anaconda3/envs/autogen312/lib/python3.12/site-packages/sentence_transformers/cross_encoder/CrossEncoder.py:11: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n",
" from tqdm.autonotebook import tqdm, trange\n"
]
}
],
"source": [
"# 1. create an RetrieveAssistantAgent instance named \"assistant\"\n",
"assistant = RetrieveAssistantAgent(\n",
Expand Down Expand Up @@ -160,6 +167,7 @@
" # \"client\": chromadb.PersistentClient(path=\"/tmp/chromadb\"), # deprecated, use \"vector_db\" instead\n",
" \"vector_db\": \"chroma\", # to use the deprecated `client` parameter, set to None and uncomment the line above\n",
" \"overwrite\": False, # set to True if you want to overwrite an existing collection\n",
" \"get_or_create\": True, # set to False if don't want to reuse an existing collection\n",
" },\n",
" code_execution_config=False, # set to False if you don't want to execute the code\n",
")"
Expand Down Expand Up @@ -188,7 +196,8 @@
"name": "stderr",
"output_type": "stream",
"text": [
"2024-04-07 17:30:56,955 - autogen.agentchat.contrib.retrieve_user_proxy_agent - INFO - \u001b[32mUse the existing collection `autogen-docs`.\u001b[0m\n"
"2024-08-02 06:30:11,303 - autogen.agentchat.contrib.retrieve_user_proxy_agent - INFO - \u001b[32mUse the existing collection `autogen-docs`.\u001b[0m\n",
"2024-08-02 06:30:11,485 - autogen.agentchat.contrib.retrieve_user_proxy_agent - INFO - Found 2 chunks.\u001b[0m\n"
]
},
{
@@ -202,7 +211,6 @@
"name": "stderr",
"output_type": "stream",
"text": [
"2024-04-07 17:30:59,609 - autogen.agentchat.contrib.retrieve_user_proxy_agent - INFO - Found 2 chunks.\u001b[0m\n",
"Number of requested results 20 is greater than number of elements in index 2, updating n_results = 2\n"
]
},
@@ -361,65 +369,53 @@
"--------------------------------------------------------------------------------\n",
"\u001b[33massistant\u001b[0m (to ragproxyagent):\n",
"\n",
"To perform a classification task using FLAML and use Spark to do parallel training for 30 seconds and force cancel jobs if the time limit is reached, you can follow these steps:\n",
"\n",
"1. First, convert your data into Spark dataframe format using `to_pandas_on_spark` function from `flaml.automl.spark.utils` module.\n",
"2. Then, format your data for use SparkML models by using `VectorAssembler`.\n",
"3. Define your AutoML settings, including the `metric`, `time_budget`, and `task`.\n",
"4. Use `AutoML` from `flaml` to run AutoML with SparkML models by setting `use_spark` to `true`, and `estimator_list` to a list of spark-based estimators, like `[\"lgbm_spark\"]`.\n",
"5. Set `n_concurrent_trials` to the desired number of parallel jobs and `force_cancel` to `True` to cancel the jobs if the time limit is reached.\n",
"\n",
"Here's an example code snippet for performing classification using FLAML and Spark:\n",
"\n",
"```python\n",
"import pandas as pd\n",
"import flaml\n",
"from flaml.automl.spark.utils import to_pandas_on_spark\n",
"from pyspark.ml.feature import VectorAssembler\n",
"import flaml\n",
"import pandas as pd\n",
"\n",
"# Creating a dictionary\n",
"# Example Data (Please provide real data in practice)\n",
"data = {\n",
" \"sepal_length\": [5.1, 4.9, 4.7, 4.6, 5.0],\n",
" \"sepal_width\": [3.5, 3.0, 3.2, 3.1, 3.6],\n",
" \"petal_length\": [1.4, 1.4, 1.3, 1.5, 1.4],\n",
" \"petal_width\": [0.2, 0.2, 0.2, 0.2, 0.2],\n",
" \"species\": [\"setosa\", \"setosa\", \"setosa\", \"setosa\", \"setosa\"]\n",
" \"feature1\": [0, 1, 2, 3, 4],\n",
" \"feature2\": [1, 2, 3, 4, 5],\n",
" # ... add all features you need for your classification\n",
" \"label\": ['a', 'b', 'a', 'a', 'b'], # assuming binary classification with labels 'a' and 'b'\n",
"}\n",
"\n",
"# Creating a pandas DataFrame\n",
"dataframe = pd.DataFrame(data)\n",
"label = \"species\"\n",
"# Convert to Pandas DataFrame\n",
"pdf = pd.DataFrame(data)\n",
"\n",
"# Convert to pandas-on-spark dataframe\n",
"psdf = to_pandas_on_spark(dataframe)\n",
"# Generate pandas-on-spark dataframe\n",
"psdf = to_pandas_on_spark(pdf)\n",
"\n",
"# Format data for SparkML models\n",
"columns = psdf.columns\n",
"feature_cols = [col for col in columns if col != label]\n",
"# Organize data into feature vectors and labels\n",
"label_col = \"label\"\n",
"feature_cols = [col for col in psdf.columns if col != label_col]\n",
"featurizer = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n",
"psdf = featurizer.transform(psdf.to_spark(index_col=\"index\"))[\"index\", \"features\"]\n",
"\n",
"# Define AutoML settings\n",
"settings = {\n",
"# Apply the transformation\n",
"psdf = featurizer.transform(psdf.to_spark(index_col=\"index\"))[\"index\", \"features\", label_col]\n",
"\n",
"# Prepare AutoML settings\n",
"automl_settings = {\n",
" \"time_budget\": 30,\n",
" \"metric\": \"accuracy\",\n",
" \"metric\": \"accuracy\", # Change this to a classification metric you prefer\n",
" \"task\": \"classification\",\n",
" \"n_concurrent_trials\": 2, # Or other number that fits your Spark cluster configuration\n",
" \"use_spark\": True,\n",
" \"force_cancel\": True, # Enable force cancel to obey the time constraint\n",
" \"estimator_list\": [\"lgbm_spark\"], # Specify SparkML estimators you want to try\n",
"}\n",
"\n",
"# Use AutoML with SparkML models and parallel jobs\n",
"# Create an AutoML instance\n",
"automl = flaml.AutoML()\n",
"automl.fit(\n",
" dataframe=psdf,\n",
" label=label,\n",
" estimator_list=[\"lgbm_spark\"],\n",
" use_spark=True,\n",
" n_concurrent_trials=2,\n",
" force_cancel=True,\n",
" **settings,\n",
")\n",
"```\n",
"\n",
"Note that the above code assumes the data is small enough to train within 30 seconds. If you have a larger dataset, you may need to increase the `time_budget` and adjust the number of parallel jobs accordingly.\n",
"# Run the AutoML search\n",
"automl.fit(dataframe=psdf, label=label_col, **automl_settings)\n",
"``` \n",
"\n",
"Remember to replace the example data with your real dataset and choose an appropriate metric for your classification task. You'll also need a configured and running Spark environment to utilize the \"use_spark\" feature.\n",
"\n",
"--------------------------------------------------------------------------------\n",
"\u001b[33mragproxyagent\u001b[0m (to assistant):\n",
@@ -439,25 +435,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"Number of requested results 60 is greater than number of elements in index 2, updating n_results = 2\n",
"Number of requested results 100 is greater than number of elements in index 2, updating n_results = 2\n",
"Number of requested results 140 is greater than number of elements in index 2, updating n_results = 2\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"VectorDB returns doc_ids: [['bdfbc921']]\n",
"VectorDB returns doc_ids: [['bdfbc921']]\n",
"VectorDB returns doc_ids: [['bdfbc921']]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Number of requested results 180 is greater than number of elements in index 2, updating n_results = 2\n"
"Number of requested results 60 is greater than number of elements in index 2, updating n_results = 2\n"
]
},
{
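The counts in these warnings follow the retry loop's growth schedule, `n_results * (2 * retry + 1)`, with the notebook's `n_results = 20`; a quick arithmetic check (illustration only, not part of the notebook):

```python
# Requested-result counts per retry for n_results = 20.
n_results = 20
print([n_results * (2 * retry + 1) for retry in range(1, 5)])  # [60, 100, 140, 180]

# The index holds only 2 chunks, so the patched loop stops at the first retry
# (60 >= 2 documents returned): the old output shows all four warnings, the
# new output only the first.
```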
@@ -470,18 +448,13 @@
"\n",
"TERMINATE\n",
"\n",
"--------------------------------------------------------------------------------\n",
"\u001b[33mragproxyagent\u001b[0m (to assistant):\n",
"\n",
"TERMINATE\n",
"\n",
"--------------------------------------------------------------------------------\n"
]
},
{
"data": {
"text/plain": [
"ChatResult(chat_id=None, chat_history=[{'content': 'TERMINATE', 'role': 'assistant'}], summary='', cost=({'total_cost': 0.007691, 'gpt-35-turbo': {'cost': 0.007691, 'prompt_tokens': 4242, 'completion_tokens': 664, 'total_tokens': 4906}}, {'total_cost': 0}), human_input=[])"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
Expand Down Expand Up @@ -2836,7 +2809,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
"version": "3.12.4"
},
"skip_test": "Requires interactive usage"
},