From 370ebf5e0042a8ea030474eda6f532b10d9e048f Mon Sep 17 00:00:00 2001 From: Li Jiang Date: Fri, 17 Nov 2023 21:56:11 +0800 Subject: [PATCH] Update speaker selector in GroupChat and update some notebooks (#688) * Add speaker selection methods * Update groupchat RAG * Update seed to cache_seed * Update RetrieveChat notebook * Update parameter name * Add test * Add more tests * Add mock to test * Add mock to test * Fix typo speaking * Add gracefully exit manual input * Update round_robin docstring * Add method checking * Remove participant roles * Fix versions in notebooks * Minimize installation overhead * Fix missing lower() * Add comments for try_count 3 * Update warning for n_agents < 3 * Update warning for n_agents < 3 * Add test_n_agents_less_than_3 * Add a function for manual select * Update version in notebooks * Fixed bugs that allow speakers to go twice in a row even when allow_repeat_speaker = False --------- Co-authored-by: Adam Fourney --- .github/workflows/build.yml | 2 +- autogen/agentchat/groupchat.py | 101 +- notebook/agentchat_RetrieveChat.ipynb | 3959 ++++++------------ notebook/agentchat_groupchat_RAG.ipynb | 1130 +---- notebook/agentchat_qdrant_RetrieveChat.ipynb | 15 +- notebook/agentchat_teaching.ipynb | 6 +- setup.py | 1 + test/agentchat/test_groupchat.py | 158 +- 8 files changed, 1842 insertions(+), 3530 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 3cdb6293b27a..9e5332b58c9d 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -40,7 +40,7 @@ jobs: python -m pip install --upgrade pip wheel pip install -e . python -c "import autogen" - pip install -e. pytest + pip install -e. pytest mock pip uninstall -y openai - name: Install unstructured if not windows if: matrix.os != 'windows-2019' diff --git a/autogen/agentchat/groupchat.py b/autogen/agentchat/groupchat.py index 0f3bfa7a4792..a59f035fb89e 100644 --- a/autogen/agentchat/groupchat.py +++ b/autogen/agentchat/groupchat.py @@ -1,5 +1,6 @@ import logging import sys +import random from dataclasses import dataclass from typing import Dict, List, Optional, Union import re @@ -21,6 +22,13 @@ class GroupChat: When set to True and when a message is a function call suggestion, the next speaker will be chosen from an agent which contains the corresponding function name in its `function_map`. + - speaker_selection_method: the method for selecting the next speaker. Default is "auto". + Could be any of the following (case insensitive), will raise ValueError if not recognized: + - "auto": the next speaker is selected automatically by LLM. + - "manual": the next speaker is selected manually by user input. + - "random": the next speaker is selected randomly. + - "round_robin": the next speaker is selected in a round robin fashion, i.e., iterating in the same order as provided in `agents`. + - allow_repeat_speaker: whether to allow the same speaker to speak consecutively. Default is True. """ agents: List[Agent] @@ -28,6 +36,10 @@ class GroupChat: max_round: int = 10 admin_name: str = "Admin" func_call_filter: bool = True + speaker_selection_method: str = "auto" + allow_repeat_speaker: bool = True + + _VALID_SPEAKER_SELECTION_METHODS = ["auto", "manual", "random", "round_robin"] @property def agent_names(self) -> List[str]: @@ -55,13 +67,61 @@ def next_agent(self, agent: Agent, agents: List[Agent]) -> Agent: def select_speaker_msg(self, agents: List[Agent]): """Return the message for selecting the next speaker.""" return f"""You are in a role play game. 
The following roles are available:
-{self._participant_roles()}.
+{self._participant_roles(agents)}.

 Read the following conversation.
 Then select the next role from {[agent.name for agent in agents]} to play. Only return the role."""

+    def manual_select_speaker(self, agents: List[Agent]) -> Optional[Agent]:
+        """Manually select the next speaker. Returns None if the user opts out of manual selection."""
+
+        print("Please select the next speaker from the following list:")
+        _n_agents = len(agents)
+        for i in range(_n_agents):
+            print(f"{i+1}: {agents[i].name}")
+        try_count = 0
+        # Assume the user will enter a valid number within 3 tries; otherwise fall back to auto selection to avoid blocking.
+        while try_count < 3:
+            try_count += 1
+            try:
+                i = input("Enter the number of the next speaker (enter nothing or `q` to use auto selection): ")
+                if i == "" or i == "q":
+                    break
+                i = int(i)
+                if i > 0 and i <= _n_agents:
+                    return agents[i - 1]
+                else:
+                    raise ValueError
+            except ValueError:
+                print(f"Invalid input. Please enter a number between 1 and {_n_agents}.")
+        else:  # the loop exhausted all 3 tries without a break
+            print(f"You have tried {try_count} times. The next speaker will be selected automatically.")
+        return None
+
     def select_speaker(self, last_speaker: Agent, selector: ConversableAgent):
         """Select the next speaker."""
+        if self.speaker_selection_method.lower() not in self._VALID_SPEAKER_SELECTION_METHODS:
+            raise ValueError(
+                f"GroupChat speaker_selection_method is set to '{self.speaker_selection_method}'. "
+                f"It should be one of {self._VALID_SPEAKER_SELECTION_METHODS} (case insensitive). "
+            )
+
+        agents = self.agents
+        n_agents = len(agents)
+        # Raise if the GroupChat has fewer than 2 agents; warn if it has exactly 2 and repeat speakers are possible.
+        if n_agents < 2:
+            raise ValueError(
+                f"GroupChat is underpopulated with {n_agents} agents. "
+                "Please add more agents to the GroupChat or use direct communication instead."
+            )
+        elif n_agents == 2 and self.speaker_selection_method.lower() != "round_robin" and self.allow_repeat_speaker:
+            logger.warning(
+                f"GroupChat is underpopulated with {n_agents} agents. "
+                "It is recommended to set speaker_selection_method to 'round_robin' or allow_repeat_speaker to False. "
+                "Or, use direct communication instead."
+            )
+
         if self.func_call_filter and self.messages and "function_call" in self.messages[-1]:
             # find agents with the right function_map which contains the function name
             agents = [
@@ -80,14 +140,20 @@ def select_speaker(self, last_speaker: Agent, selector: ConversableAgent):
                 f"No agent can execute the function {self.messages[-1]['name']}. "
                 "Please check the function_map of the agents."
             )
-        else:
-            agents = self.agents
-
-        # Warn if GroupChat is underpopulated
-        n_agents = len(agents)
-        if n_agents < 3:
-            logger.warning(
-                f"GroupChat is underpopulated with {n_agents} agents. Direct communication would be more efficient." 
- ) + + # remove the last speaker from the list to avoid selecting the same speaker if allow_repeat_speaker is False + agents = agents if self.allow_repeat_speaker else [agent for agent in agents if agent != last_speaker] + + if self.speaker_selection_method.lower() == "manual": + selected_agent = self.manual_select_speaker(agents) + if selected_agent: + return selected_agent + elif self.speaker_selection_method.lower() == "round_robin": + return self.next_agent(last_speaker, agents) + elif self.speaker_selection_method.lower() == "random": + return random.choice(agents) + + # auto speaker selection selector.update_system_message(self.select_speaker_msg(agents)) final, name = selector.generate_oai_reply( self.messages @@ -99,26 +165,31 @@ def select_speaker(self, last_speaker: Agent, selector: ConversableAgent): ] ) if not final: - # i = self._random.randint(0, len(self._agent_names) - 1) # randomly pick an id + # the LLM client is None, thus no reply is generated. Use round robin instead. return self.next_agent(last_speaker, agents) # If exactly one agent is mentioned, use it. Otherwise, leave the OAI response unmodified mentions = self._mentioned_agents(name, agents) if len(mentions) == 1: name = next(iter(mentions)) + else: + logger.warning( + f"GroupChat select_speaker failed to resolve the next speaker's name. This is because the speaker selection OAI call returned:\n{name}" + ) # Return the result try: return self.agent_by_name(name) except ValueError: - logger.warning( - f"GroupChat select_speaker failed to resolve the next speaker's name. Speaker selection will default to the next speaker in the list. This is because the speaker selection OAI call returned:\n{name}" - ) return self.next_agent(last_speaker, agents) - def _participant_roles(self): + def _participant_roles(self, agents: List[Agent] = None) -> str: + # Default to all agents registered + if agents is None: + agents = self.agents + roles = [] - for agent in self.agents: + for agent in agents: if agent.system_message.strip() == "": logger.warning( f"The agent '{agent.name}' has an empty system_message, and may not work well with GroupChat." diff --git a/notebook/agentchat_RetrieveChat.ipynb b/notebook/agentchat_RetrieveChat.ipynb index 4aabc52b01e0..cc7fea721c5e 100644 --- a/notebook/agentchat_RetrieveChat.ipynb +++ b/notebook/agentchat_RetrieveChat.ipynb @@ -42,7 +42,7 @@ "\n", "AutoGen requires `Python>=3.8`. To run this notebook example, please install the [retrievechat] option.\n", "```bash\n", - "pip install \"pyautogen[retrievechat]\" \"flaml[automl]\"\n", + "pip install \"pyautogen[retrievechat]~=0.2.0b5\" \"flaml[automl]\"\n", "```" ] }, @@ -52,7 +52,7 @@ "metadata": {}, "outputs": [], "source": [ - "# %pip install \"pyautogen[retrievechat]~=0.1.2\" \"flaml[automl]\"" + "# %pip install \"pyautogen[retrievechat]~=0.2.0b5\" \"flaml[automl]\"" ] }, { @@ -67,14 +67,14 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "models to use: ['gpt-4']\n" + "models to use: ['gpt-35-turbo']\n" ] } ], @@ -148,15 +148,25 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 3, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/lijiang1/anaconda3/envs/autogen/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "/home/lijiang1/anaconda3/envs/autogen/lib/python3.10/site-packages/torch/cuda/__init__.py:138: UserWarning: CUDA initialization: The NVIDIA driver on your system is too old (found version 11060). Please update your GPU driver by downloading and installing a new version from the URL: http://www.nvidia.com/Download/index.aspx Alternatively, go to: https://pytorch.org to install a PyTorch version that has been compiled with your version of the CUDA driver. (Triggered internally at ../c10/cuda/CUDAFunctions.cpp:108.)\n", + " return torch._C._cuda_getDeviceCount() > 0\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ "Accepted file formats for `docs_path`:\n", - "['txt', 'json', 'csv', 'tsv', 'md', 'html', 'htm', 'rtf', 'rst', 'jsonl', 'log', 'xml', 'yaml', 'yml', 'pdf']\n" + "['doc', 'docx', 'eml', 'epub', 'xml', 'tsv', 'pdf', 'pptx', 'ppt', 'rtf', 'html', 'csv', 'htm', 'msg', 'yml', 'xlsx', 'yaml', 'rst', 'jsonl', 'txt', 'md', 'json', 'log', 'odt']\n" ] } ], @@ -171,7 +181,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -181,7 +191,7 @@ "\n", "# 1. create an RetrieveAssistantAgent instance named \"assistant\"\n", "assistant = RetrieveAssistantAgent(\n", - " name=\"assistant\", \n", + " name=\"assistant\",\n", " system_message=\"You are a helpful assistant.\",\n", " llm_config={\n", " \"timeout\": 600,\n", @@ -192,22 +202,20 @@ "\n", "# 2. create the RetrieveUserProxyAgent instance named \"ragproxyagent\"\n", "# By default, the human_input_mode is \"ALWAYS\", which means the agent will ask for human input at every step. We set it to \"NEVER\" here.\n", - "# `docs_path` is the path to the docs directory. It can also be the path to a single file, or the url to a single file. By default, \n", + "# `docs_path` is the path to the docs directory. It can also be the path to a single file, or the url to a single file. By default,\n", "# it is set to None, which works only if the collection is already created.\n", - "# \n", - "# Here we generated the documentations from FLAML's docstrings. Not needed if you just want to try this notebook but not to reproduce the\n", - "# outputs. Clone the FLAML (https://github.com/microsoft/FLAML) repo and navigate to its website folder. Pip install and run `pydoc-markdown`\n", - "# and it will generate folder `reference` under `website/docs`.\n", - "#\n", "# `task` indicates the kind of task we're working on. In this example, it's a `code` task.\n", "# `chunk_token_size` is the chunk token size for the retrieve chat. 
By default, it is set to `max_tokens * 0.6`, here we set it to 2000.\n", "ragproxyagent = RetrieveUserProxyAgent(\n", " name=\"ragproxyagent\",\n", " human_input_mode=\"NEVER\",\n", - " max_consecutive_auto_reply=10,\n", + " max_consecutive_auto_reply=3,\n", " retrieve_config={\n", " \"task\": \"code\",\n", - " \"docs_path\": \"~/code/FLAML/website/docs/reference\", # change this to your own path, such as https://raw.githubusercontent.com/microsoft/autogen/main/README.md\n", + " \"docs_path\": [\n", + " \"https://raw.githubusercontent.com/microsoft/FLAML/main/website/docs/Examples/Integrate%20-%20Spark.md\",\n", + " \"https://raw.githubusercontent.com/microsoft/FLAML/main/website/docs/Research.md\",\n", + " ],\n", " \"chunk_token_size\": 2000,\n", " \"model\": config_list[0][\"model\"],\n", " \"client\": chromadb.PersistentClient(path=\"/tmp/chromadb\"),\n", @@ -237,14 +245,19 @@ "execution_count": 6, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Number of requested results 20 is greater than number of elements in index 2, updating n_results = 2\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "doc_ids: [['doc_36', 'doc_40', 'doc_15', 'doc_22', 'doc_16', 'doc_51', 'doc_44', 'doc_41', 'doc_45', 'doc_14', 'doc_0', 'doc_37', 'doc_38', 'doc_9']]\n", - "\u001b[32mAdding doc_id doc_36 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_40 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_15 to context.\u001b[0m\n", + "doc_ids: [['doc_0']]\n", + "\u001b[32mAdding doc_id doc_0 to context.\u001b[0m\n", "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", "\n", "You're a retrieve augmented coding assistant. You answer user's questions based on your own knowledge and the\n", @@ -259,542 +272,124 @@ "\n", "User's question is: How can I use FLAML to perform a classification task and use spark to do parallel training. Train 30 seconds and force cancel jobs if time limit is reached.\n", "\n", - "Context is: \n", - "- `seed` - int or None, default=None | The random seed for hpo.\n", - "- `n_concurrent_trials` - [Experimental] int, default=1 | The number of\n", - " concurrent trials. When n_concurrent_trials > 1, flaml performes\n", - " [parallel tuning](../../Use-Cases/Task-Oriented-AutoML#parallel-tuning)\n", - " and installation of ray or spark is required: `pip install flaml[ray]`\n", - " or `pip install flaml[spark]`. Please check\n", - " [here](https://spark.apache.org/docs/latest/api/python/getting_started/install.html)\n", - " for more details about installing Spark.\n", - "- `keep_search_state` - boolean, default=False | Whether to keep data needed\n", - " for model search after fit(). By default the state is deleted for\n", - " space saving.\n", - "- `preserve_checkpoint` - boolean, default=True | Whether to preserve the saved checkpoint\n", - " on disk when deleting automl. 
By default the checkpoint is preserved.\n", - "- `early_stop` - boolean, default=False | Whether to stop early if the\n", - " search is considered to converge.\n", - "- `force_cancel` - boolean, default=False | Whether to forcely cancel Spark jobs if the\n", - " search time exceeded the time budget.\n", - "- `append_log` - boolean, default=False | Whetehr to directly append the log\n", - " records to the input log file if it exists.\n", - "- `auto_augment` - boolean, default=True | Whether to automatically\n", - " augment rare classes.\n", - "- `min_sample_size` - int, default=MIN_SAMPLE_TRAIN | the minimal sample\n", - " size when sample=True.\n", - "- `use_ray` - boolean or dict.\n", - " If boolean: default=False | Whether to use ray to run the training\n", - " in separate processes. This can be used to prevent OOM for large\n", - " datasets, but will incur more overhead in time.\n", - " If dict: the dict contains the keywords arguments to be passed to\n", - " [ray.tune.run](https://docs.ray.io/en/latest/tune/api_docs/execution.html).\n", - "- `use_spark` - boolean, default=False | Whether to use spark to run the training\n", - " in parallel spark jobs. This can be used to accelerate training on large models\n", - " and large datasets, but will incur more overhead in time and thus slow down\n", - " training in some cases. GPU training is not supported yet when use_spark is True.\n", - " For Spark clusters, by default, we will launch one trial per executor. However,\n", - " sometimes we want to launch more trials than the number of executors (e.g., local mode).\n", - " In this case, we can set the environment variable `FLAML_MAX_CONCURRENT` to override\n", - " the detected `num_executors`. The final number of concurrent trials will be the minimum\n", - " of `n_concurrent_trials` and `num_executors`.\n", - "- `free_mem_ratio` - float between 0 and 1, default=0. The free memory ratio to keep during training.\n", - "- `metric_constraints` - list, default=[] | The list of metric constraints.\n", - " Each element in this list is a 3-tuple, which shall be expressed\n", - " in the following format: the first element of the 3-tuple is the name of the\n", - " metric, the second element is the inequality sign chosen from \">=\" and \"<=\",\n", - " and the third element is the constraint value. E.g., `('val_loss', '<=', 0.1)`.\n", - " Note that all the metric names in metric_constraints need to be reported via\n", - " the metrics_to_log dictionary returned by a customized metric function.\n", - " The customized metric function shall be provided via the `metric` key word\n", - " argument of the fit() function or the automl constructor.\n", - " Find an example in the 4th constraint type in this [doc](../../Use-Cases/Task-Oriented-AutoML#constraint).\n", - " If `pred_time_limit` is provided as one of keyword arguments to fit() function or\n", - " the automl constructor, flaml will automatically (and under the hood)\n", - " add it as an additional element in the metric_constraints. Essentially 'pred_time_limit'\n", - " specifies a constraint about the prediction latency constraint in seconds.\n", - "- `custom_hp` - dict, default=None | The custom search space specified by user.\n", - " It is a nested dict with keys being the estimator names, and values being dicts\n", - " per estimator search space. 
In the per estimator search space dict,\n", - " the keys are the hyperparameter names, and values are dicts of info (\"domain\",\n", - " \"init_value\", and \"low_cost_init_value\") about the search space associated with\n", - " the hyperparameter (i.e., per hyperparameter search space dict). When custom_hp\n", - " is provided, the built-in search space which is also a nested dict of per estimator\n", - " search space dict, will be updated with custom_hp. Note that during this nested dict update,\n", - " the per hyperparameter search space dicts will be replaced (instead of updated) by the ones\n", - " provided in custom_hp. Note that the value for \"domain\" can either be a constant\n", - " or a sample.Domain object.\n", - " e.g.,\n", - " \n", - "```python\n", - "custom_hp = {\n", - " \"transformer_ms\": {\n", - " \"model_path\": {\n", - " \"domain\": \"albert-base-v2\",\n", - " },\n", - " \"learning_rate\": {\n", - " \"domain\": tune.choice([1e-4, 1e-5]),\n", - " }\n", - " }\n", - " }\n", - "```\n", - "- `skip_transform` - boolean, default=False | Whether to pre-process data prior to modeling.\n", - "- `fit_kwargs_by_estimator` - dict, default=None | The user specified keywords arguments, grouped by estimator name.\n", - " e.g.,\n", - " \n", - "```python\n", - "fit_kwargs_by_estimator = {\n", - " \"transformer\": {\n", - " \"output_dir\": \"test/data/output/\",\n", - " \"fp16\": False,\n", - " }\n", - "}\n", - "```\n", - "- `mlflow_logging` - boolean, default=True | Whether to log the training results to mlflow.\n", - " This requires mlflow to be installed and to have an active mlflow run.\n", - " FLAML will create nested runs.\n", - "\n", - "#### config\\_history\n", - "\n", - "```python\n", - "@property\n", - "def config_history() -> dict\n", - "```\n", - "\n", - "A dictionary of iter->(estimator, config, time),\n", - "storing the best estimator, config, and the time when the best\n", - "model is updated each time.\n", - "\n", - "#### model\n", - "\n", - "```python\n", - "@property\n", - "def model()\n", - "```\n", + "Context is: # Integrate - Spark\n", "\n", - "An object with `predict()` and `predict_proba()` method (for\n", - "classification), storing the best trained model.\n", + "FLAML has integrated Spark for distributed training. There are two main aspects of integration with Spark:\n", + "- Use Spark ML estimators for AutoML.\n", + "- Use Spark to run training in parallel spark jobs.\n", "\n", - "#### best\\_model\\_for\\_estimator\n", - "\n", - "```python\n", - "def best_model_for_estimator(estimator_name: str)\n", - "```\n", + "## Spark ML Estimators\n", "\n", - "Return the best model found for a particular estimator.\n", + "FLAML integrates estimators based on Spark ML models. These models are trained in parallel using Spark, so we called them Spark estimators. To use these models, you first need to organize your data in the required format.\n", "\n", - "**Arguments**:\n", + "### Data\n", "\n", - "- `estimator_name` - a str of the estimator's name.\n", - " \n", + "For Spark estimators, AutoML only consumes Spark data. FLAML provides a convenient function `to_pandas_on_spark` in the `flaml.automl.spark.utils` module to convert your data into a pandas-on-spark (`pyspark.pandas`) dataframe/series, which Spark estimators require.\n", "\n", - "**Returns**:\n", + "This utility function takes data in the form of a `pandas.Dataframe` or `pyspark.sql.Dataframe` and converts it into a pandas-on-spark dataframe. 
It also takes `pandas.Series` or `pyspark.sql.Dataframe` and converts it into a [pandas-on-spark](https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/index.html) series. If you pass in a `pyspark.pandas.Dataframe`, it will not make any changes.\n", "\n", - " An object storing the best model for estimator_name.\n", - " If `model_history` was set to False during fit(), then the returned model\n", - " is untrained unless estimator_name is the best estimator.\n", - " If `model_history` was set to True, then the returned model is trained.\n", + "This function also accepts optional arguments `index_col` and `default_index_type`.\n", + "- `index_col` is the column name to use as the index, default is None.\n", + "- `default_index_type` is the default index type, default is \"distributed-sequence\". More info about default index type could be found on Spark official [documentation](https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/options.html#default-index-type)\n", "\n", - "#### best\\_estimator\n", + "Here is an example code snippet for Spark Data:\n", "\n", "```python\n", - "@property\n", - "def best_estimator()\n", - "```\n", - "\n", - "A string indicating the best estimator found.\n", + "import pandas as pd\n", + "from flaml.automl.spark.utils import to_pandas_on_spark\n", + "# Creating a dictionary\n", + "data = {\"Square_Feet\": [800, 1200, 1800, 1500, 850],\n", + " \"Age_Years\": [20, 15, 10, 7, 25],\n", + " \"Price\": [100000, 200000, 300000, 240000, 120000]}\n", "\n", - "#### best\\_iteration\n", + "# Creating a pandas DataFrame\n", + "dataframe = pd.DataFrame(data)\n", + "label = \"Price\"\n", "\n", - "```python\n", - "@property\n", - "def best_iteration()\n", + "# Convert to pandas-on-spark dataframe\n", + "psdf = to_pandas_on_spark(dataframe)\n", "```\n", "\n", - "An integer of the iteration number where the best\n", - "config is found.\n", - "\n", - "#### best\\_config\n", + "To use Spark ML models you need to format your data appropriately. 
Specifically, use [`VectorAssembler`](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.VectorAssembler.html) to merge all feature columns into a single vector column.\n", "\n", + "Here is an example of how to use it:\n", "```python\n", - "@property\n", - "def best_config()\n", + "from pyspark.ml.feature import VectorAssembler\n", + "columns = psdf.columns\n", + "feature_cols = [col for col in columns if col != label]\n", + "featurizer = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n", + "psdf = featurizer.transform(psdf.to_spark(index_col=\"index\"))[\"index\", \"features\"]\n", "```\n", "\n", - "A dictionary of the best configuration.\n", + "Later in conducting the experiment, use your pandas-on-spark data like non-spark data and pass them using `X_train, y_train` or `dataframe, label`.\n", "\n", - "#### best\\_config\\_per\\_estimator\n", + "### Estimators\n", + "#### Model List\n", + "- `lgbm_spark`: The class for fine-tuning Spark version LightGBM models, using [SynapseML](https://microsoft.github.io/SynapseML/docs/features/lightgbm/about/) API.\n", "\n", - "```python\n", - "@property\n", - "def best_config_per_estimator()\n", - "```\n", + "#### Usage\n", + "First, prepare your data in the required format as described in the previous section.\n", "\n", - "A dictionary of all estimators' best configuration.\n", + "By including the models you intend to try in the `estimators_list` argument to `flaml.automl`, FLAML will start trying configurations for these models. If your input is Spark data, FLAML will also use estimators with the `_spark` postfix by default, even if you haven't specified them.\n", "\n", - "#### best\\_loss\\_per\\_estimator\n", + "Here is an example code snippet using SparkML models in AutoML:\n", "\n", "```python\n", - "@property\n", - "def best_loss_per_estimator()\n", - "```\n", - "\n", - "A dictionary of all estimators' best loss.\n", + "import flaml\n", + "# prepare your data in pandas-on-spark format as we previously mentioned\n", "\n", - "#### best\\_loss\n", + "automl = flaml.AutoML()\n", + "settings = {\n", + " \"time_budget\": 30,\n", + " \"metric\": \"r2\",\n", + " \"estimator_list\": [\"lgbm_spark\"], # this setting is optional\n", + " \"task\": \"regression\",\n", + "}\n", "\n", - "```python\n", - "@property\n", - "def best_loss()\n", + "automl.fit(\n", + " dataframe=psdf,\n", + " label=label,\n", + " **settings,\n", + ")\n", "```\n", "\n", - "A float of the best loss found.\n", "\n", - "#### best\\_result\n", + "[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/automl_bankrupt_synapseml.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/automl_bankrupt_synapseml.ipynb)\n", "\n", - "```python\n", - "@property\n", - "def best_result()\n", - "```\n", + "## Parallel Spark Jobs\n", + "You can activate Spark as the parallel backend during parallel tuning in both [AutoML](/docs/Use-Cases/Task-Oriented-AutoML#parallel-tuning) and [Hyperparameter Tuning](/docs/Use-Cases/Tune-User-Defined-Function#parallel-tuning), by setting the `use_spark` to `true`. FLAML will dispatch your job to the distributed Spark backend using [`joblib-spark`](https://github.com/joblib/joblib-spark).\n", "\n", - "Result dictionary for model trained with the best config.\n", + "Please note that you should not set `use_spark` to `true` when applying AutoML and Tuning for Spark Data. This is because only SparkML models will be used for Spark Data in AutoML and Tuning. 
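A minimal sketch of the two modes (assuming `flaml[spark]` is installed and a Spark session is available; the iris data and the `psdf`/`label` names from the snippets above are only illustrative):

```python
import flaml
from sklearn.datasets import load_iris

# Mode 1: non-Spark (pandas) data -- set use_spark=True so the *tuning* trials
# run as parallel Spark jobs.
X, y = load_iris(return_X_y=True, as_frame=True)
automl = flaml.AutoML()
automl.fit(X_train=X, y_train=y, task="classification", time_budget=30,
           use_spark=True, n_concurrent_trials=2, force_cancel=True)

# Mode 2: Spark (pandas-on-spark) data -- use Spark estimators and leave
# use_spark at its default of False, since SparkML models such as
# `lgbm_spark` already train in parallel.
# automl.fit(dataframe=psdf, label=label, task="classification",
#            time_budget=30, estimator_list=["lgbm_spark"])
```

In both modes, `force_cancel=True` is what stops Spark jobs that outlive the time budget.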
As SparkML models run in parallel, there is no need to distribute them with `use_spark` again.\n", "\n", - "#### metrics\\_for\\_best\\_config\n", + "All the Spark-related arguments are stated below. These arguments are available in both Hyperparameter Tuning and AutoML:\n", "\n", - "```python\n", - "@property\n", - "def metrics_for_best_config()\n", - "```\n", "\n", - "Returns a float of the best loss, and a dictionary of the auxiliary metrics to log\n", - "associated with the best config. These two objects correspond to the returned\n", - "objects by the customized metric function for the config with the best loss.\n", - "\n", - "#### best\\_config\\_train\\_time\n", - " \n", - "- `seed` - int or None, default=None | The random seed for hpo.\n", - "- `n_concurrent_trials` - [Experimental] int, default=1 | The number of\n", - " concurrent trials. When n_concurrent_trials > 1, flaml performes\n", - " [parallel tuning](../../Use-Cases/Task-Oriented-AutoML#parallel-tuning)\n", - " and installation of ray or spark is required: `pip install flaml[ray]`\n", - " or `pip install flaml[spark]`. Please check\n", - " [here](https://spark.apache.org/docs/latest/api/python/getting_started/install.html)\n", - " for more details about installing Spark.\n", - "- `keep_search_state` - boolean, default=False | Whether to keep data needed\n", - " for model search after fit(). By default the state is deleted for\n", - " space saving.\n", - "- `preserve_checkpoint` - boolean, default=True | Whether to preserve the saved checkpoint\n", - " on disk when deleting automl. By default the checkpoint is preserved.\n", - "- `early_stop` - boolean, default=False | Whether to stop early if the\n", - " search is considered to converge.\n", - "- `force_cancel` - boolean, default=False | Whether to forcely cancel the PySpark job if overtime.\n", - "- `append_log` - boolean, default=False | Whetehr to directly append the log\n", - " records to the input log file if it exists.\n", - "- `auto_augment` - boolean, default=True | Whether to automatically\n", - " augment rare classes.\n", - "- `min_sample_size` - int, default=MIN_SAMPLE_TRAIN | the minimal sample\n", - " size when sample=True.\n", - "- `use_ray` - boolean or dict.\n", - " If boolean: default=False | Whether to use ray to run the training\n", - " in separate processes. This can be used to prevent OOM for large\n", - " datasets, but will incur more overhead in time.\n", - " If dict: the dict contains the keywords arguments to be passed to\n", - " [ray.tune.run](https://docs.ray.io/en/latest/tune/api_docs/execution.html).\n", - "- `use_spark` - boolean, default=False | Whether to use spark to run the training\n", - " in parallel spark jobs. This can be used to accelerate training on large models\n", - " and large datasets, but will incur more overhead in time and thus slow down\n", - " training in some cases.\n", - "- `free_mem_ratio` - float between 0 and 1, default=0. The free memory ratio to keep during training.\n", - "- `metric_constraints` - list, default=[] | The list of metric constraints.\n", - " Each element in this list is a 3-tuple, which shall be expressed\n", - " in the following format: the first element of the 3-tuple is the name of the\n", - " metric, the second element is the inequality sign chosen from \">=\" and \"<=\",\n", - " and the third element is the constraint value. 
E.g., `('precision', '>=', 0.9)`.\n", - " Note that all the metric names in metric_constraints need to be reported via\n", - " the metrics_to_log dictionary returned by a customized metric function.\n", - " The customized metric function shall be provided via the `metric` key word argument\n", - " of the fit() function or the automl constructor.\n", - " Find examples in this [test](https://github.com/microsoft/FLAML/tree/main/test/automl/test_constraints.py).\n", - " If `pred_time_limit` is provided as one of keyword arguments to fit() function or\n", - " the automl constructor, flaml will automatically (and under the hood)\n", - " add it as an additional element in the metric_constraints. Essentially 'pred_time_limit'\n", - " specifies a constraint about the prediction latency constraint in seconds.\n", - "- `custom_hp` - dict, default=None | The custom search space specified by user\n", - " Each key is the estimator name, each value is a dict of the custom search space for that estimator. Notice the\n", - " domain of the custom search space can either be a value of a sample.Domain object.\n", - " \n", - " \n", - " \n", - "```python\n", - "custom_hp = {\n", - " \"transformer_ms\": {\n", - " \"model_path\": {\n", - " \"domain\": \"albert-base-v2\",\n", - " },\n", - " \"learning_rate\": {\n", - " \"domain\": tune.choice([1e-4, 1e-5]),\n", - " }\n", - " }\n", - "}\n", - "```\n", - "- `time_col` - for a time series task, name of the column containing the timestamps. If not\n", - " provided, defaults to the first column of X_train/X_val\n", - " \n", - "- `cv_score_agg_func` - customized cross-validation scores aggregate function. Default to average metrics across folds. If specificed, this function needs to\n", - " have the following input arguments:\n", - " \n", - " * val_loss_folds: list of floats, the loss scores of each fold;\n", - " * log_metrics_folds: list of dicts/floats, the metrics of each fold to log.\n", - " \n", - " This function should return the final aggregate result of all folds. 
A float number of the minimization objective, and a dictionary as the metrics to log or None.\n", - " E.g.,\n", - " \n", - "```python\n", - "def cv_score_agg_func(val_loss_folds, log_metrics_folds):\n", - " metric_to_minimize = sum(val_loss_folds)/len(val_loss_folds)\n", - " metrics_to_log = None\n", - " for single_fold in log_metrics_folds:\n", - " if metrics_to_log is None:\n", - " metrics_to_log = single_fold\n", - " elif isinstance(metrics_to_log, dict):\n", - " metrics_to_log = {k: metrics_to_log[k] + v for k, v in single_fold.items()}\n", - " else:\n", - " metrics_to_log += single_fold\n", - " if metrics_to_log:\n", - " n = len(val_loss_folds)\n", - " metrics_to_log = (\n", - " {k: v / n for k, v in metrics_to_log.items()}\n", - " if isinstance(metrics_to_log, dict)\n", - " else metrics_to_log / n\n", - " )\n", - " return metric_to_minimize, metrics_to_log\n", - "```\n", - " \n", - "- `skip_transform` - boolean, default=False | Whether to pre-process data prior to modeling.\n", - "- `mlflow_logging` - boolean, default=None | Whether to log the training results to mlflow.\n", - " Default value is None, which means the logging decision is made based on\n", - " AutoML.__init__'s mlflow_logging argument.\n", - " This requires mlflow to be installed and to have an active mlflow run.\n", - " FLAML will create nested runs.\n", - "- `fit_kwargs_by_estimator` - dict, default=None | The user specified keywords arguments, grouped by estimator name.\n", - " For TransformersEstimator, available fit_kwargs can be found from\n", - " [TrainingArgumentsForAuto](nlp/huggingface/training_args).\n", - " e.g.,\n", - " \n", - "```python\n", - "fit_kwargs_by_estimator = {\n", - " \"transformer\": {\n", - " \"output_dir\": \"test/data/output/\",\n", - " \"fp16\": False,\n", - " },\n", - " \"tft\": {\n", - " \"max_encoder_length\": 1,\n", - " \"min_encoder_length\": 1,\n", - " \"static_categoricals\": [],\n", - " \"static_reals\": [],\n", - " \"time_varying_known_categoricals\": [],\n", - " \"time_varying_known_reals\": [],\n", - " \"time_varying_unknown_categoricals\": [],\n", - " \"time_varying_unknown_reals\": [],\n", - " \"variable_groups\": {},\n", - " \"lags\": {},\n", - " }\n", - "}\n", - "```\n", - " \n", - "- `**fit_kwargs` - Other key word arguments to pass to fit() function of\n", - " the searched learners, such as sample_weight. Below are a few examples of\n", - " estimator-specific parameters:\n", - "- `period` - int | forecast horizon for all time series forecast tasks.\n", - "- `gpu_per_trial` - float, default = 0 | A float of the number of gpus per trial,\n", - " only used by TransformersEstimator, XGBoostSklearnEstimator, and\n", - " TemporalFusionTransformerEstimator.\n", - "- `group_ids` - list of strings of column names identifying a time series, only\n", - " used by TemporalFusionTransformerEstimator, required for\n", - " 'ts_forecast_panel' task. `group_ids` is a parameter for TimeSeriesDataSet object\n", - " from PyTorchForecasting.\n", - " For other parameters to describe your dataset, refer to\n", - " [TimeSeriesDataSet PyTorchForecasting](https://pytorch-forecasting.readthedocs.io/en/stable/api/pytorch_forecasting.data.timeseries.TimeSeriesDataSet.html).\n", - " To specify your variables, use `static_categoricals`, `static_reals`,\n", - " `time_varying_known_categoricals`, `time_varying_known_reals`,\n", - " `time_varying_unknown_categoricals`, `time_varying_unknown_reals`,\n", - " `variable_groups`. 
To provide more information on your data, use\n",
"  `max_encoder_length`, `min_encoder_length`, `lags`.\n",
"- `log_dir` - str, default = \"lightning_logs\" | Folder into which to log results\n",
"  for tensorboard, only used by TemporalFusionTransformerEstimator.\n",
"- `max_epochs` - int, default = 20 | Maximum number of epochs to run training,\n",
"  only used by TemporalFusionTransformerEstimator.\n",
"- `batch_size` - int, default = 64 | Batch size for training model, only\n",
"  used by TemporalFusionTransformerEstimator.\n",
"\n",
"\n",
"  \n",
"```python\n",
"from flaml import BlendSearch\n",
"algo = BlendSearch(metric='val_loss', mode='min',\n",
"                   space=search_space,\n",
"                   low_cost_partial_config=low_cost_partial_config)\n",
"for i in range(10):\n",
"    analysis = tune.run(compute_with_config,\n",
"                        search_alg=algo, use_ray=False)\n",
"    print(analysis.trials[-1].last_result)\n",
"```\n",
"  \n",
"- `verbose` - 0, 1, 2, or 3. If ray or spark backend is used, their verbosity will be\n",
"  affected by this argument. 0 = silent, 1 = only status updates,\n",
"  2 = status and brief trial results, 3 = status and detailed trial results.\n",
"  Defaults to 2.\n",
"- `local_dir` - A string of the local dir to save ray logs if ray backend is\n",
"  used; or a local dir to save the tuning log.\n",
"- `num_samples` - An integer of the number of configs to try. Defaults to 1.\n",
"- `resources_per_trial` - A dictionary of the hardware resources to allocate\n",
"  per trial, e.g., `{'cpu': 1}`. It is only valid when using ray backend\n",
"  (by setting 'use_ray = True'). It shall be used when you need to do\n",
"  [parallel tuning](../../Use-Cases/Tune-User-Defined-Function#parallel-tuning).\n",
"- `config_constraints` - A list of config constraints to be satisfied.\n",
"  e.g., ```config_constraints = [(mem_size, '<=', 1024**3)]```\n",
"  \n",
"  mem_size is a function which produces a float number for the bytes\n",
"  needed for a config.\n",
"  It is used to skip configs which do not fit in memory.\n",
"- `metric_constraints` - A list of metric constraints to be satisfied.\n",
"  e.g., `[('precision', '>=', 0.9)]`. The sign can be \">=\" or \"<=\".\n",
"- `max_failure` - int | the maximal consecutive number of failures to sample\n",
"  a trial before the tuning is terminated.\n",
"- `use_ray` - A boolean of whether to use ray as the backend.\n",
"- `use_spark` - A boolean of whether to use spark as the backend.\n",
"- `log_file_name` - A string of the log file name. Defaults to None.\n",
"  When set to None:\n",
"  if local_dir is not given, no log file is created;\n",
"  if local_dir is given, the log file name will be autogenerated under local_dir.\n",
"  Only valid when verbose > 0 or use_ray is True.\n",
"- `lexico_objectives` - dict, default=None | It specifies information needed to perform multi-objective\n",
"  optimization with lexicographic preferences. When lexico_objectives is not None, the arguments `metric` and\n",
"  `mode` will be invalid, and flaml's tune uses CFO\n",
"  as the `search_alg`, which makes the input (if provided) `search_alg` invalid.\n",
"  This dictionary shall contain the following fields of key-value pairs:\n",
"  - \"metrics\": a list of optimization objectives with the orders reflecting the priorities/preferences of the\n",
"  objectives.\n",
"  - \"modes\" (optional): a list of optimization modes (each mode either \"min\" or \"max\") corresponding to the\n",
"  objectives in the metric list. If not provided, we use \"min\" as the default mode for all the objectives.\n",
"  - \"targets\" (optional): a dictionary to specify the optimization targets on the objectives. The keys are the\n",
"  metric names (provided in \"metric\"), and the values are the numerical target values.\n",
"  - \"tolerances\" (optional): a dictionary to specify the optimality tolerances on objectives. The keys are the metric names (provided in \"metrics\"), and the values are the absolute/percentage tolerance in the form of numeric/string.\n",
"  E.g.,\n",
"```python\n",
"lexico_objectives = {\n",
"    \"metrics\": [\"error_rate\", \"pred_time\"],\n",
"    \"modes\": [\"min\", \"min\"],\n",
"    \"tolerances\": {\"error_rate\": 0.01, \"pred_time\": 0.0},\n",
"    \"targets\": {\"error_rate\": 0.0},\n",
"}\n",
"```\n",
"  We also support percentage tolerance.\n",
"  E.g.,\n",
"```python\n",
"lexico_objectives = {\n",
"    \"metrics\": [\"error_rate\", \"pred_time\"],\n",
"    \"modes\": [\"min\", \"min\"],\n",
"    \"tolerances\": {\"error_rate\": \"5%\", \"pred_time\": \"0%\"},\n",
"    \"targets\": {\"error_rate\": 0.0},\n",
"}\n",
"```\n",
"- `force_cancel` - boolean, default=False | Whether to forcibly cancel the PySpark job if it runs over the time budget.\n",
"- `n_concurrent_trials` - int, default=0 | The number of concurrent trials when performing hyperparameter\n",
"  tuning with Spark. Only valid when use_spark=True and spark is required:\n",
"  `pip install flaml[spark]`. Please check\n",
"  [here](https://spark.apache.org/docs/latest/api/python/getting_started/install.html)\n",
"  for more details about installing Spark. When tune.run() is called from AutoML, it will be\n",
"  overwritten by the value of `n_concurrent_trials` in AutoML. When <= 0, the concurrent trials\n",
"  will be set to the number of executors.\n",
"- `**ray_args` - keyword arguments to pass to ray.tune.run().\n",
"  Only valid when use_ray=True.\n",
"\n",
"## Tuner Objects\n",
"\n",
"```python\n",
"class Tuner()\n",
"```\n",
"\n",
"Tuner is the class-based way of launching hyperparameter tuning jobs compatible with Ray Tune 2.\n",
"\n",
"**Arguments**:\n",
"\n",
"- `trainable` - A user-defined evaluation function.\n",
"  It takes a configuration as input, outputs an evaluation\n",
"  result (can be a numerical value or a dictionary of string\n",
"  and numerical value pairs) for the input configuration.\n",
"  For machine learning tasks, it usually involves training and\n",
"  scoring a machine learning model, e.g., through validation loss.\n",
"- `param_space` - Search space of the tuning job.\n",
"  One thing to note is that both preprocessor and dataset can be tuned here.\n",
"- `tune_config` - Tuning algorithm specific configs.\n",
"  Refer to ray.tune.tune_config.TuneConfig for more info.\n",
"- `run_config` - Runtime configuration that is specific to individual trials.\n",
"  If passed, this will overwrite the run config passed to the Trainer,\n",
"  if applicable. Refer to ray.air.config.RunConfig for more info.\n",
"  \n",
"  Usage pattern:\n",
"  \n",
"  .. code-block:: python\n",
"  \n",
"      from sklearn.datasets import load_breast_cancer\n",
"  \n",
"      from ray import tune\n",
"      from ray.data import from_pandas\n",
"      from ray.air.config import RunConfig, ScalingConfig\n",
"      from ray.train.xgboost import XGBoostTrainer\n",
"      from ray.tune.tuner import Tuner\n",
"  \n",
"      def get_dataset():\n",
"          data_raw = load_breast_cancer(as_frame=True)\n",
"          dataset_df = data_raw[\"data\"]\n",
"          dataset_df[\"target\"] = data_raw[\"target\"]\n",
"          dataset = from_pandas(dataset_df)\n",
"          return dataset\n",
"  \n",
"      trainer = XGBoostTrainer(\n",
"          label_column=\"target\",\n",
"          params={},\n",
"          datasets={\"train\": get_dataset()},\n",
"      )\n",
"  \n",
"      param_space = {\n",
"          \"scaling_config\": ScalingConfig(\n",
"              num_workers=tune.grid_search([2, 4]),\n",
"              resources_per_worker={\n",
"                  \"CPU\": tune.grid_search([1, 2]),\n",
"              },\n",
"          ),\n",
"          # You can even grid search various datasets in Tune.\n",
"          # \"datasets\": {\n",
"          #     \"train\": tune.grid_search(\n",
"          #         [ds1, ds2]\n",
"          #     ),\n",
"          # },\n",
"          \"params\": {\n",
"              \"objective\": \"binary:logistic\",\n",
"              \"tree_method\": \"approx\",\n",
"              \"eval_metric\": [\"logloss\", \"error\"],\n",
"              \"eta\": tune.loguniform(1e-4, 1e-1),\n",
"              \"subsample\": tune.uniform(0.5, 1.0),\n",
"              \"max_depth\": tune.randint(1, 9),\n",
"          },\n",
"      }\n",
"      tuner = Tuner(trainable=trainer, param_space=param_space,\n",
"                    run_config=RunConfig(name=\"my_tune_run\"))\n",
"      analysis = tuner.fit()\n",
"  \n",
"  To retry a failed tune run, you can then do\n",
"  \n",
"  .. 
code-block:: python\n", - " \n", - " tuner = Tuner.restore(experiment_checkpoint_dir)\n", - " tuner.fit()\n", - " \n", - " ``experiment_checkpoint_dir`` can be easily located near the end of the\n", - " console output of your first failed run.\n", "\n", + "[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/integrate_spark.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/integrate_spark.ipynb)\n", "\n", "\n", "\n", @@ -802,385 +397,289 @@ "--------------------------------------------------------------------------------\n", "\u001b[33massistant\u001b[0m (to ragproxyagent):\n", "\n", - "To perform a classification task using FLAML and parallel training with Spark, you need to install FLAML with Spark support first, if you haven't done it yet:\n", - "\n", - "```\n", - "pip install flaml[spark]\n", - "```\n", - "\n", - "And then, you can use the following code example:\n", + "To perform a classification task and use Spark to do parallel training with FLAML, you can use the `lgbm_spark` estimator in the `estimator_list` argument and set `use_spark` to `True` with some additional arguments for parallel tuning. Here is an example code snippet:\n", "\n", "```python\n", - "from flaml import AutoML\n", - "from flaml.data import load_openml_dataset\n", - "from sklearn.metrics import accuracy_score\n", - "\n", - "# Load the dataset\n", - "X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id=21, data_dir='./')\n", + "import flaml\n", + "from flaml.automl.spark.utils import to_pandas_on_spark\n", "\n", - "# Initialize the AutoML instance\n", - "automl = AutoML()\n", + "# prepare your data in pandas-on-spark format as we previously mentioned\n", "\n", - "# Configure AutoML settings for classification\n", + "automl = flaml.AutoML()\n", "settings = {\n", - " \"time_budget\": 30, # Train for 30 seconds\n", - " \"n_concurrent_trials\": 4, # Parallel training using Spark\n", - " \"force_cancel\": True, # Force cancel jobs if time limit is reached\n", - " \"use_spark\": True, # Use spark for parallel training\n", + " \"time_budget\": 30,\n", " \"metric\": \"accuracy\",\n", " \"task\": \"classification\",\n", - " \"log_file_name\": \"flaml.log\",\n", + " \"estimator_list\": [\"lgbm_spark\"],\n", + " \"use_spark\": True,\n", + " \"n_concurrent_trials\": 2,\n", + " \"force_cancel\": True,\n", "}\n", "\n", - "# Train the model\n", - "automl.fit(X_train, y_train, **settings)\n", - "\n", - "# Make predictions and calculate accuracy\n", - "y_pred = automl.predict(X_test)\n", - "accuracy = accuracy_score(y_test, y_pred)\n", - "print(\"Test accuracy:\", accuracy)\n", + "automl.fit(\n", + " dataframe=psdf,\n", + " label=label,\n", + " **settings,\n", + ")\n", "```\n", "\n", - "This code will perform a classification task using FLAML AutoML with parallel training on Spark. FLAML will try different models and hyperparameters, and it will automatically stop after 30 seconds. Jobs will be force-cancelled if the time limit is reached.\n", + "Make sure that your data is in the proper format as described in the context information under the `Data` section. 
Additionally, the `force_cancel` option can immediately halt Spark jobs once they exceed the allocated time budget.\n", "\n", "--------------------------------------------------------------------------------\n", "\u001b[31m\n", - ">>>>>>>> EXECUTING CODE BLOCK 0 (inferred language is sh)...\u001b[0m\n", - "\u001b[31m\n", - ">>>>>>>> EXECUTING CODE BLOCK 1 (inferred language is python)...\u001b[0m\n", - "load dataset from ./openml_ds21.pkl\n", - "Dataset name: car\n", - "X_train.shape: (1296, 6), y_train.shape: (1296,);\n", - "X_test.shape: (432, 6), y_test.shape: (432,)\n", - "[flaml.automl.logger: 08-11 17:25:31] {1679} INFO - task = classification\n", - "[flaml.automl.logger: 08-11 17:25:31] {1690} INFO - Evaluation method: cv\n", - "[flaml.automl.logger: 08-11 17:25:31] {1788} INFO - Minimizing error metric: 1-accuracy\n", - "[flaml.automl.logger: 08-11 17:25:31] {1900} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'catboost', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'lrl1']\n" + ">>>>>>>> EXECUTING CODE BLOCK 0 (inferred language is python)...\u001b[0m\n" ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[32m[I 2023-08-11 17:25:31,670]\u001b[0m A new study created in memory with name: optuna\u001b[0m\n", - "\u001b[32m[I 2023-08-11 17:25:31,701]\u001b[0m A new study created in memory with name: optuna\u001b[0m\n" + "ename": "NameError", + "evalue": "name 'psdf' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[6], line 18\u001b[0m\n\u001b[1;32m 6\u001b[0m automl \u001b[38;5;241m=\u001b[39m flaml\u001b[38;5;241m.\u001b[39mAutoML()\n\u001b[1;32m 7\u001b[0m settings \u001b[38;5;241m=\u001b[39m {\n\u001b[1;32m 8\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtime_budget\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;241m30\u001b[39m,\n\u001b[1;32m 9\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmetric\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124maccuracy\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mforce_cancel\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[1;32m 15\u001b[0m }\n\u001b[1;32m 17\u001b[0m automl\u001b[38;5;241m.\u001b[39mfit(\n\u001b[0;32m---> 18\u001b[0m dataframe\u001b[38;5;241m=\u001b[39m\u001b[43mpsdf\u001b[49m,\n\u001b[1;32m 19\u001b[0m label\u001b[38;5;241m=\u001b[39mlabel,\n\u001b[1;32m 20\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39msettings,\n\u001b[1;32m 21\u001b[0m )\n", + "\u001b[0;31mNameError\u001b[0m: name 'psdf' is not defined" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "[flaml.tune.tune: 08-11 17:25:31] {729} INFO - Number of trials: 1/1000000, 1 RUNNING, 0 TERMINATED\n" + "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", + "\n", + "exitcode: 1 (execution failed)\n", + "Code output: \n", + "None\n", + "name 'psdf' is not defined\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33massistant\u001b[0m (to ragproxyagent):\n", + "\n", + "I apologize for the error in my previous response. 
To prepare your data in the proper format for Spark and FLAML's `lgbm_spark` estimator, you can use the following code snippet:\n", + "\n", + "```python\n", + "import pandas as pd\n", + "from flaml.automl.spark.utils import to_pandas_on_spark\n", + "from pyspark.ml.feature import VectorAssembler\n", + "\n", + "# Creating a dictionary\n", + "data = {\"feature1\": [1, 2, 3, 4, 5],\n", + " \"feature2\": [10, 20, 30, 40, 50],\n", + " \"target\": [0, 1, 0, 1, 0]}\n", + "\n", + "# Creating a pandas DataFrame\n", + "df = pd.DataFrame(data)\n", + "label = 'target'\n", + "\n", + "# Convert to pandas-on-spark dataframe\n", + "psdf = to_pandas_on_spark(df, index_col=None)\n", + "\n", + "columns = psdf.columns\n", + "feature_cols = [col for col in columns if col != label]\n", + "featurizer = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n", + "psdf = featurizer.transform(psdf.to_spark(index_col=None))[\"features\", label]\n", + "```\n", + "\n", + "Once you have your data in the appropriate format, you can use the `lgbm_spark` estimator in FLAML, set `use_spark` to `True` and configure additional options for parallel tuning. Here is an example code snippet:\n", + "\n", + "```python\n", + "import flaml\n", + "\n", + "automl = flaml.AutoML()\n", + "settings = {\n", + " \"time_budget\": 30,\n", + " \"metric\": \"accuracy\",\n", + " \"task\": \"classification\",\n", + " \"estimator_list\": [\"lgbm_spark\"],\n", + " \"use_spark\": True,\n", + " \"n_concurrent_trials\": 2,\n", + " \"force_cancel\": True,\n", + "}\n", + "\n", + "automl.fit(\n", + " dataframe=psdf,\n", + " label=label,\n", + " **settings,\n", + ")\n", + "```\n", + "\n", + "Note that you may need to modify the `features_col` and `label_col` names in the `VectorAssembler` and `fit` function arguments respectively based on the names in your data.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[31m\n", + ">>>>>>>> EXECUTING CODE BLOCK 0 (inferred language is python)...\u001b[0m\n" ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "2023-08-11 17:25:37.042724: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. 
To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", - "2023-08-11 17:25:37.108934: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", - "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", - "2023-08-11 17:25:38.540404: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n" + "ename": "TypeError", + "evalue": "'NoneType' object is not callable", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[6], line 15\u001b[0m\n\u001b[1;32m 12\u001b[0m label \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtarget\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m 14\u001b[0m \u001b[38;5;66;03m# Convert to pandas-on-spark dataframe\u001b[39;00m\n\u001b[0;32m---> 15\u001b[0m psdf \u001b[38;5;241m=\u001b[39m \u001b[43mto_pandas_on_spark\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindex_col\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 17\u001b[0m columns \u001b[38;5;241m=\u001b[39m psdf\u001b[38;5;241m.\u001b[39mcolumns\n\u001b[1;32m 18\u001b[0m feature_cols \u001b[38;5;241m=\u001b[39m [col \u001b[38;5;28;01mfor\u001b[39;00m col \u001b[38;5;129;01min\u001b[39;00m columns \u001b[38;5;28;01mif\u001b[39;00m col \u001b[38;5;241m!=\u001b[39m label]\n", + "File \u001b[0;32m~/anaconda3/envs/autogen/lib/python3.10/site-packages/flaml/automl/spark/utils.py:59\u001b[0m, in \u001b[0;36mto_pandas_on_spark\u001b[0;34m(df, index_col, default_index_type)\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mto_pandas_on_spark\u001b[39m(\n\u001b[1;32m 25\u001b[0m df: Union[DataFrame, sparkDataFrame, Series, psDataFrame, psSeries],\n\u001b[1;32m 26\u001b[0m index_col: Optional[\u001b[38;5;28mstr\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 27\u001b[0m default_index_type: Optional[\u001b[38;5;28mstr\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdistributed-sequence\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 28\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Union[psDataFrame, psSeries]:\n\u001b[1;32m 29\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Convert pandas or pyspark dataframe/series to pandas_on_Spark dataframe/series.\u001b[39;00m\n\u001b[1;32m 30\u001b[0m \n\u001b[1;32m 31\u001b[0m \u001b[38;5;124;03m Args:\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 57\u001b[0m \u001b[38;5;124;03m ```\u001b[39;00m\n\u001b[1;32m 58\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 59\u001b[0m \u001b[43mset_option\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcompute.default_index_type\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdefault_index_type\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 60\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(df, (DataFrame, Series)):\n\u001b[1;32m 61\u001b[0m 
\u001b[38;5;28;01mreturn\u001b[39;00m ps\u001b[38;5;241m.\u001b[39mfrom_pandas(df)\n", + "\u001b[0;31mTypeError\u001b[0m: 'NoneType' object is not callable" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "[flaml.tune.tune: 08-11 17:25:42] {749} INFO - Brief result: {'pred_time': 2.349200360598676e-05, 'wall_clock_time': 10.836093425750732, 'metric_for_logging': {'pred_time': 2.349200360598676e-05}, 'val_loss': 0.29475200475200475, 'trained_estimator': }\n", - "[flaml.tune.tune: 08-11 17:25:42] {729} INFO - Number of trials: 2/1000000, 1 RUNNING, 1 TERMINATED\n" + "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", + "\n", + "exitcode: 1 (execution failed)\n", + "Code output: \n", + "None\n", + "'NoneType' object is not callable\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33massistant\u001b[0m (to ragproxyagent):\n", + "\n", + "I apologize for the mistake in my previous response. It looks like I had an error in my code that caused the error you received. Here is the updated code snippet for preparing your data in the proper format for the `lgbm_spark` estimator in FLAML:\n", + "\n", + "```python\n", + "import pandas as pd\n", + "from flaml.automl.spark.utils import to_pandas_on_spark\n", + "from pyspark.ml.feature import VectorAssembler\n", + "\n", + "# Creating a dictionary\n", + "data = {\"feature1\": [1, 2, 3, 4, 5],\n", + " \"feature2\": [10, 20, 30, 40, 50],\n", + " \"target\": [0, 1, 0, 1, 0]}\n", + "\n", + "# Creating a pandas DataFrame\n", + "df = pd.DataFrame(data)\n", + "label = 'target'\n", + "\n", + "# Convert to pandas-on-spark dataframe\n", + "psdf = to_pandas_on_spark(df, index_col=None)\n", + "\n", + "columns = psdf.columns\n", + "feature_cols = [col for col in columns if col != label]\n", + "featurizer = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n", + "psdf = featurizer.transform(psdf.to_spark(index_col=None)).select(\"features\", label)\n", + "psdf.printSchema()\n", + "```\n", + "\n", + "This code snippet should properly create a Spark dataframe with the vectorized features column and the label column required for the `lgbm_spark` estimator in FLAML. The `printSchema()` function call will print the schema of the resulting Spark dataframe. You may need to modify the `features_col` and `label_col` names in the `VectorAssembler` and `select` function arguments respectively based on the names in your data. 
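An editorial aside on the failure recorded above: the traceback shows `set_option(...)` evaluating to `None` inside `flaml.automl.spark.utils.to_pandas_on_spark`, which typically indicates that the pandas-on-Spark dependencies were not importable in the execution environment, rather than a bug in the snippet itself. A minimal, hedged sanity check is sketched below; the module list is an assumption inferred from the traceback, not part of the recorded conversation:

```python
# Quick dependency check before calling flaml.automl.spark.utils.to_pandas_on_spark.
# If anything is reported missing, install the Spark extras, e.g. `pip install flaml[spark]`.
import importlib.util

for mod in ("pyspark", "pyspark.pandas", "flaml"):
    try:
        found = importlib.util.find_spec(mod) is not None
    except ModuleNotFoundError:
        # find_spec raises if a parent package (e.g. pyspark) is absent
        found = False
    if not found:
        print(f"Missing dependency: {mod}")
```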
Once you have your data in this format, you can use the example code snippet from my previous response to run FLAML's `lgbm_spark` estimator while utilizing parallel training with Spark.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[31m\n", + ">>>>>>>> EXECUTING CODE BLOCK 0 (inferred language is python)...\u001b[0m\n" ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - " \r" + "ename": "TypeError", + "evalue": "'NoneType' object is not callable", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[6], line 15\u001b[0m\n\u001b[1;32m 12\u001b[0m label \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtarget\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m 14\u001b[0m \u001b[38;5;66;03m# Convert to pandas-on-spark dataframe\u001b[39;00m\n\u001b[0;32m---> 15\u001b[0m psdf \u001b[38;5;241m=\u001b[39m \u001b[43mto_pandas_on_spark\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindex_col\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 17\u001b[0m columns \u001b[38;5;241m=\u001b[39m psdf\u001b[38;5;241m.\u001b[39mcolumns\n\u001b[1;32m 18\u001b[0m feature_cols \u001b[38;5;241m=\u001b[39m [col \u001b[38;5;28;01mfor\u001b[39;00m col \u001b[38;5;129;01min\u001b[39;00m columns \u001b[38;5;28;01mif\u001b[39;00m col \u001b[38;5;241m!=\u001b[39m label]\n", + "File \u001b[0;32m~/anaconda3/envs/autogen/lib/python3.10/site-packages/flaml/automl/spark/utils.py:59\u001b[0m, in \u001b[0;36mto_pandas_on_spark\u001b[0;34m(df, index_col, default_index_type)\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mto_pandas_on_spark\u001b[39m(\n\u001b[1;32m 25\u001b[0m df: Union[DataFrame, sparkDataFrame, Series, psDataFrame, psSeries],\n\u001b[1;32m 26\u001b[0m index_col: Optional[\u001b[38;5;28mstr\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 27\u001b[0m default_index_type: Optional[\u001b[38;5;28mstr\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdistributed-sequence\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 28\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Union[psDataFrame, psSeries]:\n\u001b[1;32m 29\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Convert pandas or pyspark dataframe/series to pandas_on_Spark dataframe/series.\u001b[39;00m\n\u001b[1;32m 30\u001b[0m \n\u001b[1;32m 31\u001b[0m \u001b[38;5;124;03m Args:\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 57\u001b[0m \u001b[38;5;124;03m ```\u001b[39;00m\n\u001b[1;32m 58\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 59\u001b[0m \u001b[43mset_option\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcompute.default_index_type\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdefault_index_type\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 60\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(df, (DataFrame, Series)):\n\u001b[1;32m 61\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m ps\u001b[38;5;241m.\u001b[39mfrom_pandas(df)\n", + "\u001b[0;31mTypeError\u001b[0m: 
'NoneType' object is not callable" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "[flaml.tune.tune: 08-11 17:25:42] {749} INFO - Brief result: {'pred_time': 1.638828344999381e-05, 'wall_clock_time': 11.25049901008606, 'metric_for_logging': {'pred_time': 1.638828344999381e-05}, 'val_loss': 0.20062964062964062, 'trained_estimator': }\n", - "[flaml.tune.tune: 08-11 17:25:42] {729} INFO - Number of trials: 3/1000000, 1 RUNNING, 2 TERMINATED\n" + "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", + "\n", + "exitcode: 1 (execution failed)\n", + "Code output: \n", + "None\n", + "'NoneType' object is not callable\n", + "\n", + "--------------------------------------------------------------------------------\n", + "exitcode: 1 (execution failed)\n", + "Code output: \n", + "None\n", + "'NoneType' object is not callable\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33massistant\u001b[0m (to ragproxyagent):\n", + "\n", + "I apologize for the mistake in my previous response. It looks like I had an error in my code that caused the error you received. Here is the updated code snippet for preparing your data in the proper format for the `lgbm_spark` estimator in FLAML:\n", + "\n", + "```python\n", + "import pandas as pd\n", + "from flaml.automl.spark.utils import to_pandas_on_spark\n", + "from pyspark.ml.feature import VectorAssembler\n", + "\n", + "# Creating a dictionary\n", + "data = {\"feature1\": [1, 2, 3, 4, 5],\n", + " \"feature2\": [10, 20, 30, 40, 50],\n", + " \"target\": [0, 1, 0, 1, 0]}\n", + "\n", + "# Creating a pandas DataFrame\n", + "df = pd.DataFrame(data)\n", + "label = 'target'\n", + "\n", + "# Convert to pandas-on-spark dataframe\n", + "psdf = to_pandas_on_spark(df, index_col=None)\n", + "\n", + "columns = psdf.columns\n", + "feature_cols = [col for col in columns if col != label]\n", + "featurizer = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n", + "psdf = featurizer.transform(psdf.to_spark(index_col=None)).select(\"features\", label)\n", + "psdf.printSchema()\n", + "```\n", + "\n", + "This code snippet should properly create a Spark dataframe with the vectorized features column and the label column required for the `lgbm_spark` estimator in FLAML. The `printSchema()` function call will print the schema of the resulting Spark dataframe. You may need to modify the `features_col` and `label_col` names in the `VectorAssembler` and `select` function arguments respectively based on the names in your data. Once you have your data in this format, you can use the example code snippet from my previous response to run FLAML's `lgbm_spark` estimator while utilizing parallel training with Spark.\n", + "\n", + "--------------------------------------------------------------------------------\n" ] - }, + } + ], + "source": [ + "# reset the assistant. Always reset the assistant before starting a new conversation.\n", + "assistant.reset()\n", + "\n", + "# given a problem, we use the ragproxyagent to generate a prompt to be sent to the assistant as the initial message.\n", + "# the assistant receives the message and generates a response. 
The response will be sent back to the ragproxyagent for processing.\n", + "# The conversation continues until the termination condition is met, in RetrieveChat, the termination condition when no human-in-loop is no code block detected.\n", + "# With human-in-loop, the conversation will continue until the user says \"exit\".\n", + "code_problem = \"How can I use FLAML to perform a classification task and use spark to do parallel training. Train 30 seconds and force cancel jobs if time limit is reached.\"\n", + "ragproxyagent.initiate_chat(assistant, problem=code_problem, search_string=\"spark\") # search_string is used as an extra filter for the embeddings search, in this case, we only want to search documents that contain \"spark\"." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "### Example 2\n", + "\n", + "[back to top](#toc)\n", + "\n", + "Use RetrieveChat to answer a question that is not related to code generation.\n", + "\n", + "Problem: Who is the author of FLAML?" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "[Stage 3:> (0 + 1) / 1]\r" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[flaml.tune.tune: 08-11 17:25:50] {749} INFO - Brief result: {'pred_time': 3.0794482150416296e-05, 'wall_clock_time': 18.99154567718506, 'metric_for_logging': {'pred_time': 3.0794482150416296e-05}, 'val_loss': 0.0663855063855064, 'trained_estimator': }\n", - "[flaml.tune.tune: 08-11 17:25:50] {729} INFO - Number of trials: 4/1000000, 1 RUNNING, 3 TERMINATED\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[flaml.tune.tune: 08-11 17:25:51] {749} INFO - Brief result: {'pred_time': 2.8759363960150548e-05, 'wall_clock_time': 19.68805766105652, 'metric_for_logging': {'pred_time': 2.8759363960150548e-05}, 'val_loss': 0.152019602019602, 'trained_estimator': }\n", - "[flaml.tune.tune: 08-11 17:25:51] {729} INFO - Number of trials: 5/1000000, 1 RUNNING, 4 TERMINATED\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[flaml.tune.tune: 08-11 17:25:51] {749} INFO - Brief result: {'pred_time': 3.691017574608273e-05, 'wall_clock_time': 20.165640115737915, 'metric_for_logging': {'pred_time': 3.691017574608273e-05}, 'val_loss': 0.2608167508167508, 'trained_estimator': }\n", - "[flaml.tune.tune: 08-11 17:25:51] {729} INFO - Number of trials: 6/1000000, 1 RUNNING, 5 TERMINATED\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[flaml.tune.tune: 08-11 17:25:52] {749} INFO - Brief result: {'pred_time': 1.7430177597394853e-05, 'wall_clock_time': 20.693061351776123, 'metric_for_logging': {'pred_time': 1.7430177597394853e-05}, 'val_loss': 0.03318978318978323, 'trained_estimator': }\n", - "[flaml.tune.tune: 08-11 17:25:52] {729} INFO - Number of trials: 7/1000000, 1 RUNNING, 6 TERMINATED\n", - "[flaml.tune.tune: 08-11 17:25:53] {749} INFO - Brief result: {'pred_time': 3.5216659617275313e-05, 'wall_clock_time': 21.475266218185425, 'metric_for_logging': {'pred_time': 3.5216659617275313e-05}, 'val_loss': 0.16745173745173744, 'trained_estimator': }\n", - "[flaml.tune.tune: 08-11 17:25:53] {729} INFO - Number of trials: 8/1000000, 1 RUNNING, 7 TERMINATED\n" + "Number of requested results 20 is greater than number of elements in index 2, updating n_results = 2\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "[flaml.tune.tune: 08-11 17:25:54] {749} 
INFO - Brief result: {'pred_time': 4.353435378702026e-05, 'wall_clock_time': 22.360871076583862, 'metric_for_logging': {'pred_time': 4.353435378702026e-05}, 'val_loss': 0.034725274725274737, 'trained_estimator': }\n", - "[flaml.tune.tune: 08-11 17:25:54] {729} INFO - Number of trials: 9/1000000, 1 RUNNING, 8 TERMINATED\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[flaml.tune.tune: 08-11 17:25:54] {749} INFO - Brief result: {'pred_time': 2.568628159906236e-05, 'wall_clock_time': 23.031129837036133, 'metric_for_logging': {'pred_time': 2.568628159906236e-05}, 'val_loss': 0.07177012177012176, 'trained_estimator': }\n", - "[flaml.tune.tune: 08-11 17:25:54] {729} INFO - Number of trials: 10/1000000, 1 RUNNING, 9 TERMINATED\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[flaml.tune.tune: 08-11 17:25:55] {749} INFO - Brief result: {'pred_time': 3.6701016019634797e-05, 'wall_clock_time': 23.525509119033813, 'metric_for_logging': {'pred_time': 3.6701016019634797e-05}, 'val_loss': 0.78009207009207, 'trained_estimator': }\n", - "[flaml.tune.tune: 08-11 17:25:55] {729} INFO - Number of trials: 11/1000000, 1 RUNNING, 10 TERMINATED\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[flaml.tune.tune: 08-11 17:25:55] {749} INFO - Brief result: {'pred_time': 3.9799592953107814e-05, 'wall_clock_time': 24.326939582824707, 'metric_for_logging': {'pred_time': 3.9799592953107814e-05}, 'val_loss': 0.011577071577071552, 'trained_estimator': }\n", - "[flaml.tune.tune: 08-11 17:25:55] {729} INFO - Number of trials: 12/1000000, 1 RUNNING, 11 TERMINATED\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[flaml.tune.tune: 08-11 17:25:56] {749} INFO - Brief result: {'pred_time': 1.9423383118527775e-05, 'wall_clock_time': 24.820234775543213, 'metric_for_logging': {'pred_time': 1.9423383118527775e-05}, 'val_loss': 0.037817047817047825, 'trained_estimator': }\n", - "[flaml.tune.tune: 08-11 17:25:56] {729} INFO - Number of trials: 13/1000000, 1 RUNNING, 12 TERMINATED\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[flaml.tune.tune: 08-11 17:25:57] {749} INFO - Brief result: {'pred_time': 2.987599351620653e-05, 'wall_clock_time': 25.54983139038086, 'metric_for_logging': {'pred_time': 2.987599351620653e-05}, 'val_loss': 0.030873180873180896, 'trained_estimator': }\n", - "[flaml.tune.tune: 08-11 17:25:57] {729} INFO - Number of trials: 14/1000000, 1 RUNNING, 13 TERMINATED\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[flaml.tune.tune: 08-11 17:25:57] {749} INFO - Brief result: {'pred_time': 2.351036190738797e-05, 'wall_clock_time': 26.08720564842224, 'metric_for_logging': {'pred_time': 2.351036190738797e-05}, 'val_loss': 0.020065340065340043, 'trained_estimator': }\n", - "[flaml.tune.tune: 08-11 17:25:57] {729} INFO - Number of trials: 15/1000000, 1 RUNNING, 14 TERMINATED\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[flaml.tune.tune: 08-11 17:25:58] {749} INFO - Brief result: {'pred_time': 2.2003395747883512e-05, 'wall_clock_time': 26.587312698364258, 'metric_for_logging': {'pred_time': 2.2003395747883512e-05}, 'val_loss': 0.03936144936144936, 'trained_estimator': }\n", - "[flaml.tune.tune: 08-11 17:25:58] {729} INFO - Number of trials: 16/1000000, 1 RUNNING, 15 TERMINATED\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[flaml.tune.tune: 08-11 17:25:58] {749} INFO - Brief result: 
{'pred_time': 2.1086723400146556e-05, 'wall_clock_time': 27.126797914505005, 'metric_for_logging': {'pred_time': 2.1086723400146556e-05}, 'val_loss': 0.015444015444015413, 'trained_estimator': }\n", - "[flaml.tune.tune: 08-11 17:25:58] {729} INFO - Number of trials: 17/1000000, 1 RUNNING, 16 TERMINATED\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[flaml.tune.tune: 08-11 17:25:59] {749} INFO - Brief result: {'pred_time': 1.6717643811435773e-05, 'wall_clock_time': 27.661753177642822, 'metric_for_logging': {'pred_time': 1.6717643811435773e-05}, 'val_loss': 0.07254232254232254, 'trained_estimator': }\n", - "[flaml.tune.tune: 08-11 17:25:59] {729} INFO - Number of trials: 18/1000000, 1 RUNNING, 17 TERMINATED\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[flaml.tune.tune: 08-11 17:26:00] {749} INFO - Brief result: {'pred_time': 3.0297818083348173e-05, 'wall_clock_time': 28.433676958084106, 'metric_for_logging': {'pred_time': 3.0297818083348173e-05}, 'val_loss': 0.020068310068310048, 'trained_estimator': }\n", - "[flaml.tune.tune: 08-11 17:26:00] {729} INFO - Number of trials: 19/1000000, 1 RUNNING, 18 TERMINATED\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[flaml.tune.tune: 08-11 17:26:00] {749} INFO - Brief result: {'pred_time': 2.0136982600838343e-05, 'wall_clock_time': 28.9714093208313, 'metric_for_logging': {'pred_time': 2.0136982600838343e-05}, 'val_loss': 0.010807840807840785, 'trained_estimator': }\n", - "[flaml.tune.tune: 08-11 17:26:00] {729} INFO - Number of trials: 20/1000000, 1 RUNNING, 19 TERMINATED\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[flaml.tune.tune: 08-11 17:26:01] {749} INFO - Brief result: {'pred_time': 2.0759203400709594e-05, 'wall_clock_time': 29.460874795913696, 'metric_for_logging': {'pred_time': 2.0759203400709594e-05}, 'val_loss': 0.017751707751707736, 'trained_estimator': }\n", - "[flaml.tune.tune: 08-11 17:26:01] {729} INFO - Number of trials: 21/1000000, 1 RUNNING, 20 TERMINATED\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "[flaml.automl.logger: 08-11 17:26:01] {2493} INFO - selected model: None\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[flaml.automl.logger: 08-11 17:26:02] {2627} INFO - retrain xgb_limitdepth for 0.7s\n", - "[flaml.automl.logger: 08-11 17:26:02] {2630} INFO - retrained model: XGBClassifier(base_score=None, booster=None, callbacks=[],\n", - " colsample_bylevel=1.0, colsample_bynode=None,\n", - " colsample_bytree=1.0, early_stopping_rounds=None,\n", - " enable_categorical=False, eval_metric=None, feature_types=None,\n", - " gamma=None, gpu_id=None, grow_policy=None, importance_type=None,\n", - " interaction_constraints=None, learning_rate=1.0, max_bin=None,\n", - " max_cat_threshold=None, max_cat_to_onehot=None,\n", - " max_delta_step=None, max_depth=5, max_leaves=None,\n", - " min_child_weight=0.4411564712550587, missing=nan,\n", - " monotone_constraints=None, n_estimators=12, n_jobs=-1,\n", - " num_parallel_tree=None, objective='multi:softprob',\n", - " predictor=None, ...)\n", - "[flaml.automl.logger: 08-11 17:26:02] {2630} INFO - retrained model: XGBClassifier(base_score=None, booster=None, callbacks=[],\n", - " colsample_bylevel=1.0, colsample_bynode=None,\n", - " colsample_bytree=1.0, early_stopping_rounds=None,\n", - " enable_categorical=False, eval_metric=None, feature_types=None,\n", - " gamma=None, gpu_id=None, grow_policy=None, 
importance_type=None,\n", - " interaction_constraints=None, learning_rate=1.0, max_bin=None,\n", - " max_cat_threshold=None, max_cat_to_onehot=None,\n", - " max_delta_step=None, max_depth=5, max_leaves=None,\n", - " min_child_weight=0.4411564712550587, missing=nan,\n", - " monotone_constraints=None, n_estimators=12, n_jobs=-1,\n", - " num_parallel_tree=None, objective='multi:softprob',\n", - " predictor=None, ...)\n", - "[flaml.automl.logger: 08-11 17:26:02] {1930} INFO - fit succeeded\n", - "[flaml.automl.logger: 08-11 17:26:02] {1931} INFO - Time taken to find the best model: 28.9714093208313\n", - "Test accuracy: 0.9837962962962963\n", - "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", - "\n", - "exitcode: 0 (execution succeeded)\n", - "Code output: \n", - "You MUST NOT install any packages because all the packages needed are already installed.\n", - "None\n", - "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[33massistant\u001b[0m (to ragproxyagent):\n", - "\n", - "TERMINATE\n", - "\n", - "--------------------------------------------------------------------------------\n" - ] - } - ], - "source": [ - "# reset the assistant. Always reset the assistant before starting a new conversation.\n", - "assistant.reset()\n", - "\n", - "# given a problem, we use the ragproxyagent to generate a prompt to be sent to the assistant as the initial message.\n", - "# the assistant receives the message and generates a response. The response will be sent back to the ragproxyagent for processing.\n", - "# The conversation continues until the termination condition is met, in RetrieveChat, the termination condition when no human-in-loop is no code block detected.\n", - "# With human-in-loop, the conversation will continue until the user says \"exit\".\n", - "code_problem = \"How can I use FLAML to perform a classification task and use spark to do parallel training. Train 30 seconds and force cancel jobs if time limit is reached.\"\n", - "ragproxyagent.initiate_chat(assistant, problem=code_problem, search_string=\"spark\") # search_string is used as an extra filter for the embeddings search, in this case, we only want to search documents that contain \"spark\"." - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "### Example 2\n", - "\n", - "[back to top](#toc)\n", - "\n", - "Use RetrieveChat to answer a question that is not related to code generation.\n", - "\n", - "Problem: Who is the author of FLAML?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# reset the assistant. Always reset the assistant before starting a new conversation.\n", - "assistant.reset()\n", - "\n", - "qa_problem = \"Who is the author of FLAML?\"\n", - "ragproxyagent.initiate_chat(assistant, problem=qa_problem)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "### Example 3\n", - "\n", - "[back to top](#toc)\n", - "\n", - "Use RetrieveChat to help generate sample code and ask for human-in-loop feedbacks.\n", - "\n", - "Problem: how to build a time series forecasting model for stock price using FLAML?" 
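As an editorial aside before the recorded exchange: a hedged sketch of what such a forecasting call could look like, pieced together from the time-series notes quoted in this cell's context. The toy data and all parameter values are illustrative assumptions, not output from this notebook:

```python
# Hedged sketch: univariate time-series forecasting with FLAML.
# Per the quoted docs, the first dataframe column must be a datetime
# timestamp, `label` names the target column, and `period` is the
# forecast horizon.
import pandas as pd
from flaml import AutoML

df = pd.DataFrame({
    "timestamp": pd.date_range("2023-01-01", periods=100, freq="D"),
    "price": [100 + i * 0.5 for i in range(100)],  # toy stock prices
})

automl = AutoML()
automl.fit(
    dataframe=df,
    label="price",
    task="ts_forecast",
    period=7,        # forecast horizon
    time_budget=15,  # seconds
)
```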
- ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "doc_ids: [['doc_39', 'doc_46', 'doc_49', 'doc_36', 'doc_38', 'doc_51', 'doc_37', 'doc_58', 'doc_48', 'doc_40', 'doc_47', 'doc_41', 'doc_15', 'doc_52', 'doc_14', 'doc_60', 'doc_59', 'doc_43', 'doc_11', 'doc_35']]\n", - "\u001b[32mAdding doc_id doc_39 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_46 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_49 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_36 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_38 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_46 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_49 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_36 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_38 to context.\u001b[0m\n", + "doc_ids: [['doc_0', 'doc_1']]\n", + "\u001b[32mAdding doc_id doc_0 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id doc_1 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id doc_1 to context.\u001b[0m\n", "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", "\n", "You're a retrieve augmented coding assistant. You answer user's questions based on your own knowledge and the\n", @@ -1193,804 +692,639 @@ "# your code\n", "```\n", "\n", - "User's question is: how to build a time series forecasting model for stock price using FLAML?\n", + "User's question is: Who is the author of FLAML?\n", "\n", - "Context is: \n", - "- `X_train` - A numpy array or a pandas dataframe of training data in\n", - " shape (n, m). For time series forecsat tasks, the first column of X_train\n", - " must be the timestamp column (datetime type). Other columns in\n", - " the dataframe are assumed to be exogenous variables (categorical or numeric).\n", - " When using ray, X_train can be a ray.ObjectRef.\n", - "- `y_train` - A numpy array or a pandas series of labels in shape (n, ).\n", - "- `dataframe` - A dataframe of training data including label column.\n", - " For time series forecast tasks, dataframe must be specified and must have\n", - " at least two columns, timestamp and label, where the first\n", - " column is the timestamp column (datetime type). Other columns in\n", - " the dataframe are assumed to be exogenous variables (categorical or numeric).\n", - " When using ray, dataframe can be a ray.ObjectRef.\n", - "- `label` - A str of the label column name for, e.g., 'label';\n", - "- `Note` - If X_train and y_train are provided,\n", - " dataframe and label are ignored;\n", - " If not, dataframe and label must be provided.\n", - "- `metric` - A string of the metric name or a function,\n", - " e.g., 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_weighted',\n", - " 'roc_auc_ovo_weighted', 'roc_auc_ovr_weighted', 'f1', 'micro_f1', 'macro_f1',\n", - " 'log_loss', 'mae', 'mse', 'r2', 'mape'. Default is 'auto'.\n", - " If passing a customized metric function, the function needs to\n", - " have the following input arguments:\n", - " \n", - "```python\n", - "def custom_metric(\n", - " X_test, y_test, estimator, labels,\n", - " X_train, y_train, weight_test=None, weight_train=None,\n", - " config=None, groups_test=None, groups_train=None,\n", - "):\n", - " return metric_to_minimize, metrics_to_log\n", - "```\n", - " which returns a float number as the minimization objective,\n", - " and a dictionary as the metrics to log. 
E.g.,\n", - " \n", - "```python\n", - "def custom_metric(\n", - " X_val, y_val, estimator, labels,\n", - " X_train, y_train, weight_val=None, weight_train=None,\n", - " *args,\n", - "):\n", - " from sklearn.metrics import log_loss\n", - " import time\n", - "\n", - " start = time.time()\n", - " y_pred = estimator.predict_proba(X_val)\n", - " pred_time = (time.time() - start) / len(X_val)\n", - " val_loss = log_loss(y_val, y_pred, labels=labels, sample_weight=weight_val)\n", - " y_pred = estimator.predict_proba(X_train)\n", - " train_loss = log_loss(y_train, y_pred, labels=labels, sample_weight=weight_train)\n", - " alpha = 0.5\n", - " return val_loss * (1 + alpha) - alpha * train_loss, {\n", - " \"val_loss\": val_loss,\n", - " \"train_loss\": train_loss,\n", - " \"pred_time\": pred_time,\n", - " }\n", - "```\n", - "- `task` - A string of the task type, e.g.,\n", - " 'classification', 'regression', 'ts_forecast_regression',\n", - " 'ts_forecast_classification', 'rank', 'seq-classification',\n", - " 'seq-regression', 'summarization', or an instance of Task class\n", - "- `n_jobs` - An integer of the number of threads for training | default=-1.\n", - " Use all available resources when n_jobs == -1.\n", - "- `log_file_name` - A string of the log file name | default=\"\". To disable logging,\n", - " set it to be an empty string \"\".\n", - "- `estimator_list` - A list of strings for estimator names, or 'auto'.\n", - " e.g., ```['lgbm', 'xgboost', 'xgb_limitdepth', 'catboost', 'rf', 'extra_tree']```.\n", - "- `time_budget` - A float number of the time budget in seconds.\n", - " Use -1 if no time limit.\n", - "- `max_iter` - An integer of the maximal number of iterations.\n", - "- `NOTE` - when both time_budget and max_iter are unspecified,\n", - " only one model will be trained per estimator.\n", - "- `sample` - A boolean of whether to sample the training data during\n", - " search.\n", - "- `ensemble` - boolean or dict | default=False. Whether to perform\n", - " ensemble after search. Can be a dict with keys 'passthrough'\n", - " and 'final_estimator' to specify the passthrough and\n", - " final_estimator in the stacker. The dict can also contain\n", - " 'n_jobs' as the key to specify the number of jobs for the stacker.\n", - "- `eval_method` - A string of resampling strategy, one of\n", - " ['auto', 'cv', 'holdout'].\n", - "- `split_ratio` - A float of the valiation data percentage for holdout.\n", - "- `n_splits` - An integer of the number of folds for cross - validation.\n", - "- `log_type` - A string of the log type, one of\n", - " ['better', 'all'].\n", - " 'better' only logs configs with better loss than previos iters\n", - " 'all' logs all the tried configs.\n", - "- `model_history` - A boolean of whether to keep the trained best\n", - " model per estimator. 
Make sure memory is large enough if setting to True.\n", - " Default value is False: best_model_for_estimator would return a\n", - " untrained model for non-best learner.\n", - "- `log_training_metric` - A boolean of whether to log the training\n", - " metric for each model.\n", - "- `mem_thres` - A float of the memory size constraint in bytes.\n", - "- `pred_time_limit` - A float of the prediction latency constraint in seconds.\n", - " It refers to the average prediction time per row in validation data.\n", - "- `train_time_limit` - None or a float of the training time constraint in seconds.\n", - "- `X_val` - None or a numpy array or a pandas dataframe of validation data.\n", - "- `y_val` - None or a numpy array or a pandas series of validation labels.\n", - "- `sample_weight_val` - None or a numpy array of the sample weight of\n", - " validation data of the same shape as y_val.\n", - "- `groups_val` - None or array-like | group labels (with matching length\n", - " to y_val) or group counts (with sum equal to length of y_val)\n", - " for validation data. Need to be consistent with groups.\n", - "- `groups` - None or array-like | Group labels (with matching length to\n", - " y_train) or groups counts (with sum equal to length of y_train)\n", - " for training data.\n", - "- `verbose` - int, default=3 | Controls the verbosity, higher means more\n", - " messages.\n", - "- `retrain_full` - bool or str, default=True | whether to retrain the\n", - " selected model on the full training data when using holdout.\n", - " True - retrain only after search finishes; False - no retraining;\n", - " 'budget' - do best effort to retrain without violating the time\n", - " budget.\n", - "- `split_type` - str or splitter object, default=\"auto\" | the data split type.\n", - " * A valid splitter object is an instance of a derived class of scikit-learn\n", - " [KFold](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html#sklearn.model_selection.KFold)\n", - " and have ``split`` and ``get_n_splits`` methods with the same signatures.\n", - " Set eval_method to \"cv\" to use the splitter object.\n", - " * Valid str options depend on different tasks.\n", - " For classification tasks, valid choices are\n", - " [\"auto\", 'stratified', 'uniform', 'time', 'group']. \"auto\" -> stratified.\n", - " For regression tasks, valid choices are [\"auto\", 'uniform', 'time'].\n", - " \"auto\" -> uniform.\n", - " For time series forecast tasks, must be \"auto\" or 'time'.\n", - " For ranking task, must be \"auto\" or 'group'.\n", - "- `hpo_method` - str, default=\"auto\" | The hyperparameter\n", - " optimization method. By default, CFO is used for sequential\n", - " search and BlendSearch is used for parallel search.\n", - " No need to set when using flaml's default search space or using\n", - " a simple customized search space. When set to 'bs', BlendSearch\n", - " is used. BlendSearch can be tried when the search space is\n", - " complex, for example, containing multiple disjoint, discontinuous\n", - " subspaces. 
When set to 'random', random search is used.\n", - "- `starting_points` - A dictionary or a str to specify the starting hyperparameter\n", - " config for the estimators | default=\"data\".\n", - " If str:\n", - " - if \"data\", use data-dependent defaults;\n", - " - if \"data:path\" use data-dependent defaults which are stored at path;\n", - " - if \"static\", use data-independent defaults.\n", - " If dict, keys are the name of the estimators, and values are the starting\n", - " hyperparamter configurations for the corresponding estimators.\n", - " The value can be a single hyperparamter configuration dict or a list\n", - " of hyperparamter configuration dicts.\n", - " In the following code example, we get starting_points from the\n", - " `automl` object and use them in the `new_automl` object.\n", - " e.g.,\n", - " \n", - "```python\n", - "from flaml import AutoML\n", - "automl = AutoML()\n", - "X_train, y_train = load_iris(return_X_y=True)\n", - "automl.fit(X_train, y_train)\n", - "starting_points = automl.best_config_per_estimator\n", - "\n", - "new_automl = AutoML()\n", - "new_automl.fit(X_train, y_train, starting_points=starting_points)\n", - "```\n", - "---\n", - "sidebar_label: ts_model\n", - "title: automl.time_series.ts_model\n", - "---\n", + "Context is: # Integrate - Spark\n", "\n", - "## Prophet Objects\n", + "FLAML has integrated Spark for distributed training. There are two main aspects of integration with Spark:\n", + "- Use Spark ML estimators for AutoML.\n", + "- Use Spark to run training in parallel spark jobs.\n", "\n", - "```python\n", - "class Prophet(TimeSeriesEstimator)\n", - "```\n", + "## Spark ML Estimators\n", "\n", - "The class for tuning Prophet.\n", + "FLAML integrates estimators based on Spark ML models. These models are trained in parallel using Spark, so we called them Spark estimators. To use these models, you first need to organize your data in the required format.\n", "\n", - "## ARIMA Objects\n", + "### Data\n", "\n", - "```python\n", - "class ARIMA(StatsModelsEstimator)\n", - "```\n", - "\n", - "The class for tuning ARIMA.\n", - "\n", - "## SARIMAX Objects\n", + "For Spark estimators, AutoML only consumes Spark data. FLAML provides a convenient function `to_pandas_on_spark` in the `flaml.automl.spark.utils` module to convert your data into a pandas-on-spark (`pyspark.pandas`) dataframe/series, which Spark estimators require.\n", "\n", - "```python\n", - "class SARIMAX(StatsModelsEstimator)\n", - "```\n", + "This utility function takes data in the form of a `pandas.Dataframe` or `pyspark.sql.Dataframe` and converts it into a pandas-on-spark dataframe. It also takes `pandas.Series` or `pyspark.sql.Dataframe` and converts it into a [pandas-on-spark](https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/index.html) series. If you pass in a `pyspark.pandas.Dataframe`, it will not make any changes.\n", "\n", - "The class for tuning SARIMA.\n", + "This function also accepts optional arguments `index_col` and `default_index_type`.\n", + "- `index_col` is the column name to use as the index, default is None.\n", + "- `default_index_type` is the default index type, default is \"distributed-sequence\". 
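Before continuing with the quoted documentation: the two optional arguments above are easy to misread, so here is a hedged sketch of `to_pandas_on_spark` with both arguments passed explicitly. The signature matches the one visible in the traceback earlier in this notebook; the `"index"` column name is an assumption for illustration:

```python
# Hedged sketch: passing the optional arguments of to_pandas_on_spark explicitly.
# Assumes `dataframe` is a pandas.DataFrame that contains an "index" column.
from flaml.automl.spark.utils import to_pandas_on_spark

psdf = to_pandas_on_spark(
    dataframe,
    index_col="index",                          # column to use as the index
    default_index_type="distributed-sequence",  # the documented default
)
```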
More info about the default index type can be found in the official Spark [documentation](https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/options.html#default-index-type)\n",
 "\n",
 "Here is an example code snippet for Spark Data:\n",
 "\n",
 "```python\n",
 "import pandas as pd\n",
 "from flaml.automl.spark.utils import to_pandas_on_spark\n",
 "# Creating a dictionary\n",
 "data = {\"Square_Feet\": [800, 1200, 1800, 1500, 850],\n",
 " \"Age_Years\": [20, 15, 10, 7, 25],\n",
 " \"Price\": [100000, 200000, 300000, 240000, 120000]}\n",
 "\n",
 "# Creating a pandas DataFrame\n",
 "dataframe = pd.DataFrame(data)\n",
 "label = \"Price\"\n",
 "\n",
 "# Convert to pandas-on-spark dataframe\n",
 "psdf = to_pandas_on_spark(dataframe)\n",
 "```\n",
 "\n",
 "To use Spark ML models you need to format your data appropriately. Specifically, use [`VectorAssembler`](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.VectorAssembler.html) to merge all feature columns into a single vector column.\n",
 "\n",
 "Here is an example of how to use it:\n",
 "```python\n",
 "from pyspark.ml.feature import VectorAssembler\n",
 "columns = psdf.columns\n",
 "feature_cols = [col for col in columns if col != label]\n",
 "featurizer = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n",
 "psdf = featurizer.transform(psdf.to_spark(index_col=\"index\"))[\"index\", \"features\"]\n",
 "```\n",
 "\n",
 "Later, when conducting the experiment, use your pandas-on-spark data like non-Spark data and pass it using `X_train, y_train` or `dataframe, label`.\n",
 "\n",
 "### Estimators\n",
 "#### Model List\n",
 "- `lgbm_spark`: The class for fine-tuning Spark version LightGBM models, using [SynapseML](https://microsoft.github.io/SynapseML/docs/features/lightgbm/about/) API.\n",
 "\n",
 "#### Usage\n",
 "First, prepare your data in the required format as described in the previous section.\n",
 "\n",
 "By including the models you intend to try in the `estimator_list` argument to `flaml.automl`, FLAML will start trying configurations for these models. 
If your input is Spark data, FLAML will also use estimators with the `_spark` postfix by default, even if you haven't specified them.\n",
 "\n",
 "Here is an example code snippet using SparkML models in AutoML:\n",
 "\n",
 "```python\n",
 "import flaml\n",
 "# prepare your data in pandas-on-spark format as we previously mentioned\n",
 "\n",
 "automl = flaml.AutoML()\n",
 "settings = {\n",
 " \"time_budget\": 30,\n",
 " \"metric\": \"r2\",\n",
 " \"estimator_list\": [\"lgbm_spark\"], # this setting is optional\n",
 " \"task\": \"regression\",\n",
 "}\n",
 "\n",
 "automl.fit(\n",
 " dataframe=psdf,\n",
 " label=label,\n",
 " **settings,\n",
 ")\n",
 "```\n",
 "\n",
 "\n",
 "[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/automl_bankrupt_synapseml.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/automl_bankrupt_synapseml.ipynb)\n",
 "\n",
 "## Parallel Spark Jobs\n",
 "You can activate Spark as the parallel backend during parallel tuning in both [AutoML](/docs/Use-Cases/Task-Oriented-AutoML#parallel-tuning) and [Hyperparameter Tuning](/docs/Use-Cases/Tune-User-Defined-Function#parallel-tuning), by setting `use_spark` to `true`. FLAML will dispatch your job to the distributed Spark backend using [`joblib-spark`](https://github.com/joblib/joblib-spark).\n",
 "\n",
 "Please note that you should not set `use_spark` to `true` when applying AutoML and Tuning for Spark Data. This is because only SparkML models will be used for Spark Data in AutoML and Tuning. As SparkML models run in parallel, there is no need to distribute them with `use_spark` again.\n",
 "\n",
 "All the Spark-related arguments are listed below. These arguments are available in both Hyperparameter Tuning and AutoML:\n",
 "\n",
 "\n",
 "- `use_spark`: boolean, default=False | Whether to use spark to run the training in parallel spark jobs. This can be used to accelerate training on large models and large datasets, but will incur more overhead in time and thus slow down training in some cases. GPU training is not supported yet when use_spark is True. For Spark clusters, by default, we will launch one trial per executor. However, sometimes we want to launch more trials than the number of executors (e.g., local mode). In this case, we can set the environment variable `FLAML_MAX_CONCURRENT` to override the detected `num_executors` (a minimal sketch follows below). The final number of concurrent trials will be the minimum of `n_concurrent_trials` and `num_executors`.\n",
 "- `n_concurrent_trials`: int, default=1 | The number of concurrent trials. When n_concurrent_trials > 1, FLAML performs parallel tuning.\n",
 "- `force_cancel`: boolean, default=False | Whether to forcibly cancel Spark jobs if the search time exceeds the time budget. 
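As referenced in the `use_spark` bullet above, here is a minimal, hedged sketch of overriding the detected executor count. The value `"2"` is an illustrative assumption, not part of the quoted documentation:

```python
# Minimal sketch: force two concurrent trials in Spark local mode by
# overriding the detected num_executors via FLAML_MAX_CONCURRENT.
# The effective concurrency is min(n_concurrent_trials, num_executors).
import os

os.environ["FLAML_MAX_CONCURRENT"] = "2"  # illustrative override
```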
Spark jobs include parallel tuning jobs and Spark-based model training jobs.\n", "\n", + "An example code snippet for using parallel Spark jobs:\n", "```python\n", - "@dataclass\n", - "class TimeSeriesDataset()\n", - "```\n", - "\n", - "#### to\\_univariate\n", + "import flaml\n", + "automl_experiment = flaml.AutoML()\n", + "automl_settings = {\n", + " \"time_budget\": 30,\n", + " \"metric\": \"r2\",\n", + " \"task\": \"regression\",\n", + " \"n_concurrent_trials\": 2,\n", + " \"use_spark\": True,\n", + " \"force_cancel\": True, # Activating the force_cancel option can immediately halt Spark jobs once they exceed the allocated time_budget.\n", + "}\n", "\n", - "```python\n", - "def to_univariate() -> Dict[str, \"TimeSeriesDataset\"]\n", + "automl.fit(\n", + " dataframe=dataframe,\n", + " label=label,\n", + " **automl_settings,\n", + ")\n", "```\n", "\n", - "Convert a multivariate TrainingData to a dict of univariate ones\n", - "@param df:\n", - "@return:\n", "\n", - "#### fourier\\_series\n", + "[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/integrate_spark.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/integrate_spark.ipynb)\n", "\n", - "```python\n", - "def fourier_series(feature: pd.Series, name: str)\n", - "```\n", + "# Research\n", "\n", - "Assume feature goes from 0 to 1 cyclically, transform that into Fourier\n", - "@param feature: input feature\n", - "@return: sin(2pi*feature), cos(2pi*feature)\n", + "For technical details, please check our research publications.\n", "\n", - "## DataTransformerTS Objects\n", + "* [FLAML: A Fast and Lightweight AutoML Library](https://www.microsoft.com/en-us/research/publication/flaml-a-fast-and-lightweight-automl-library/). Chi Wang, Qingyun Wu, Markus Weimer, Erkang Zhu. MLSys 2021.\n", "\n", - "```python\n", - "class DataTransformerTS()\n", + "```bibtex\n", + "@inproceedings{wang2021flaml,\n", + " title={FLAML: A Fast and Lightweight AutoML Library},\n", + " author={Chi Wang and Qingyun Wu and Markus Weimer and Erkang Zhu},\n", + " year={2021},\n", + " booktitle={MLSys},\n", + "}\n", "```\n", "\n", - "Transform input time series training data.\n", - "\n", - "#### fit\n", + "* [Frugal Optimization for Cost-related Hyperparameters](https://arxiv.org/abs/2005.01571). Qingyun Wu, Chi Wang, Silu Huang. AAAI 2021.\n", "\n", - "```python\n", - "def fit(X: Union[DataFrame, np.array], y)\n", + "```bibtex\n", + "@inproceedings{wu2021cfo,\n", + " title={Frugal Optimization for Cost-related Hyperparameters},\n", + " author={Qingyun Wu and Chi Wang and Silu Huang},\n", + " year={2021},\n", + " booktitle={AAAI},\n", + "}\n", "```\n", "\n", - "Fit transformer.\n", - "\n", - "**Arguments**:\n", - "\n", - "- `X` - A numpy array or a pandas dataframe of training data.\n", - "- `y` - A numpy array or a pandas series of labels.\n", - " \n", - "\n", - "**Returns**:\n", - "\n", - "- `X` - Processed numpy array or pandas dataframe of training data.\n", - "- `y` - Processed numpy array or pandas series of labels.\n", - "\n", - "\n", - " \n", - "- `seed` - int or None, default=None | The random seed for hpo.\n", - "- `n_concurrent_trials` - [Experimental] int, default=1 | The number of\n", - " concurrent trials. When n_concurrent_trials > 1, flaml performes\n", - " [parallel tuning](../../Use-Cases/Task-Oriented-AutoML#parallel-tuning)\n", - " and installation of ray or spark is required: `pip install flaml[ray]`\n", - " or `pip install flaml[spark]`. 
Please check\n", - " [here](https://spark.apache.org/docs/latest/api/python/getting_started/install.html)\n", - " for more details about installing Spark.\n", - "- `keep_search_state` - boolean, default=False | Whether to keep data needed\n", - " for model search after fit(). By default the state is deleted for\n", - " space saving.\n", - "- `preserve_checkpoint` - boolean, default=True | Whether to preserve the saved checkpoint\n", - " on disk when deleting automl. By default the checkpoint is preserved.\n", - "- `early_stop` - boolean, default=False | Whether to stop early if the\n", - " search is considered to converge.\n", - "- `force_cancel` - boolean, default=False | Whether to forcely cancel Spark jobs if the\n", - " search time exceeded the time budget.\n", - "- `append_log` - boolean, default=False | Whetehr to directly append the log\n", - " records to the input log file if it exists.\n", - "- `auto_augment` - boolean, default=True | Whether to automatically\n", - " augment rare classes.\n", - "- `min_sample_size` - int, default=MIN_SAMPLE_TRAIN | the minimal sample\n", - " size when sample=True.\n", - "- `use_ray` - boolean or dict.\n", - " If boolean: default=False | Whether to use ray to run the training\n", - " in separate processes. This can be used to prevent OOM for large\n", - " datasets, but will incur more overhead in time.\n", - " If dict: the dict contains the keywords arguments to be passed to\n", - " [ray.tune.run](https://docs.ray.io/en/latest/tune/api_docs/execution.html).\n", - "- `use_spark` - boolean, default=False | Whether to use spark to run the training\n", - " in parallel spark jobs. This can be used to accelerate training on large models\n", - " and large datasets, but will incur more overhead in time and thus slow down\n", - " training in some cases. GPU training is not supported yet when use_spark is True.\n", - " For Spark clusters, by default, we will launch one trial per executor. However,\n", - " sometimes we want to launch more trials than the number of executors (e.g., local mode).\n", - " In this case, we can set the environment variable `FLAML_MAX_CONCURRENT` to override\n", - " the detected `num_executors`. The final number of concurrent trials will be the minimum\n", - " of `n_concurrent_trials` and `num_executors`.\n", - "- `free_mem_ratio` - float between 0 and 1, default=0. The free memory ratio to keep during training.\n", - "- `metric_constraints` - list, default=[] | The list of metric constraints.\n", - " Each element in this list is a 3-tuple, which shall be expressed\n", - " in the following format: the first element of the 3-tuple is the name of the\n", - " metric, the second element is the inequality sign chosen from \">=\" and \"<=\",\n", - " and the third element is the constraint value. E.g., `('val_loss', '<=', 0.1)`.\n", - " Note that all the metric names in metric_constraints need to be reported via\n", - " the metrics_to_log dictionary returned by a customized metric function.\n", - " The customized metric function shall be provided via the `metric` key word\n", - " argument of the fit() function or the automl constructor.\n", - " Find an example in the 4th constraint type in this [doc](../../Use-Cases/Task-Oriented-AutoML#constraint).\n", - " If `pred_time_limit` is provided as one of keyword arguments to fit() function or\n", - " the automl constructor, flaml will automatically (and under the hood)\n", - " add it as an additional element in the metric_constraints. 
Essentially 'pred_time_limit'\n", - " specifies a constraint about the prediction latency constraint in seconds.\n", - "- `custom_hp` - dict, default=None | The custom search space specified by user.\n", - " It is a nested dict with keys being the estimator names, and values being dicts\n", - " per estimator search space. In the per estimator search space dict,\n", - " the keys are the hyperparameter names, and values are dicts of info (\"domain\",\n", - " \"init_value\", and \"low_cost_init_value\") about the search space associated with\n", - " the hyperparameter (i.e., per hyperparameter search space dict). When custom_hp\n", - " is provided, the built-in search space which is also a nested dict of per estimator\n", - " search space dict, will be updated with custom_hp. Note that during this nested dict update,\n", - " the per hyperparameter search space dicts will be replaced (instead of updated) by the ones\n", - " provided in custom_hp. Note that the value for \"domain\" can either be a constant\n", - " or a sample.Domain object.\n", - " e.g.,\n", - " \n", - "```python\n", - "custom_hp = {\n", - " \"transformer_ms\": {\n", - " \"model_path\": {\n", - " \"domain\": \"albert-base-v2\",\n", - " },\n", - " \"learning_rate\": {\n", - " \"domain\": tune.choice([1e-4, 1e-5]),\n", - " }\n", - " }\n", - " }\n", - "```\n", - "- `skip_transform` - boolean, default=False | Whether to pre-process data prior to modeling.\n", - "- `fit_kwargs_by_estimator` - dict, default=None | The user specified keywords arguments, grouped by estimator name.\n", - " e.g.,\n", - " \n", - "```python\n", - "fit_kwargs_by_estimator = {\n", - " \"transformer\": {\n", - " \"output_dir\": \"test/data/output/\",\n", - " \"fp16\": False,\n", - " }\n", + "* [Economical Hyperparameter Optimization With Blended Search Strategy](https://www.microsoft.com/en-us/research/publication/economical-hyperparameter-optimization-with-blended-search-strategy/). Chi Wang, Qingyun Wu, Silu Huang, Amin Saied. ICLR 2021.\n", + "\n", + "```bibtex\n", + "@inproceedings{wang2021blendsearch,\n", + " title={Economical Hyperparameter Optimization With Blended Search Strategy},\n", + " author={Chi Wang and Qingyun Wu and Silu Huang and Amin Saied},\n", + " year={2021},\n", + " booktitle={ICLR},\n", "}\n", "```\n", - "- `mlflow_logging` - boolean, default=True | Whether to log the training results to mlflow.\n", - " This requires mlflow to be installed and to have an active mlflow run.\n", - " FLAML will create nested runs.\n", "\n", - "#### config\\_history\n", + "* [An Empirical Study on Hyperparameter Optimization for Fine-Tuning Pre-trained Language Models](https://aclanthology.org/2021.acl-long.178.pdf). Susan Xueqing Liu, Chi Wang. ACL 2021.\n", "\n", - "```python\n", - "@property\n", - "def config_history() -> dict\n", + "```bibtex\n", + "@inproceedings{liuwang2021hpolm,\n", + " title={An Empirical Study on Hyperparameter Optimization for Fine-Tuning Pre-trained Language Models},\n", + " author={Susan Xueqing Liu and Chi Wang},\n", + " year={2021},\n", + " booktitle={ACL},\n", + "}\n", "```\n", "\n", - "A dictionary of iter->(estimator, config, time),\n", - "storing the best estimator, config, and the time when the best\n", - "model is updated each time.\n", - "\n", - "#### model\n", + "* [ChaCha for Online AutoML](https://www.microsoft.com/en-us/research/publication/chacha-for-online-automl/). Qingyun Wu, Chi Wang, John Langford, Paul Mineiro and Marco Rossi. 
ICML 2021.\n", "\n", - "```python\n", - "@property\n", - "def model()\n", + "```bibtex\n", + "@inproceedings{wu2021chacha,\n", + " title={ChaCha for Online AutoML},\n", + " author={Qingyun Wu and Chi Wang and John Langford and Paul Mineiro and Marco Rossi},\n", + " year={2021},\n", + " booktitle={ICML},\n", + "}\n", "```\n", "\n", - "An object with `predict()` and `predict_proba()` method (for\n", - "classification), storing the best trained model.\n", - "\n", - "#### best\\_model\\_for\\_estimator\n", + "* [Fair AutoML](https://arxiv.org/abs/2111.06495). Qingyun Wu, Chi Wang. ArXiv preprint arXiv:2111.06495 (2021).\n", "\n", - "```python\n", - "def best_model_for_estimator(estimator_name: str)\n", + "```bibtex\n", + "@inproceedings{wuwang2021fairautoml,\n", + " title={Fair AutoML},\n", + " author={Qingyun Wu and Chi Wang},\n", + " year={2021},\n", + " booktitle={ArXiv preprint arXiv:2111.06495},\n", + "}\n", "```\n", "\n", - "Return the best model found for a particular estimator.\n", + "* [Mining Robust Default Configurations for Resource-constrained AutoML](https://arxiv.org/abs/2202.09927). Moe Kayali, Chi Wang. ArXiv preprint arXiv:2202.09927 (2022).\n", "\n", - "**Arguments**:\n", - "\n", - "- `estimator_name` - a str of the estimator's name.\n", - " \n", + "```bibtex\n", + "@inproceedings{kayaliwang2022default,\n", + " title={Mining Robust Default Configurations for Resource-constrained AutoML},\n", + " author={Moe Kayali and Chi Wang},\n", + " year={2022},\n", + " booktitle={ArXiv preprint arXiv:2202.09927},\n", + "}\n", + "```\n", "\n", - "**Returns**:\n", + "* [Targeted Hyperparameter Optimization with Lexicographic Preferences Over Multiple Objectives](https://openreview.net/forum?id=0Ij9_q567Ma). Shaokun Zhang, Feiran Jia, Chi Wang, Qingyun Wu. ICLR 2023 (notable-top-5%).\n", "\n", - " An object storing the best model for estimator_name.\n", - " If `model_history` was set to False during fit(), then the returned model\n", - " is untrained unless estimator_name is the best estimator.\n", - " If `model_history` was set to True, then the returned model is trained.\n", + "```bibtex\n", + "@inproceedings{zhang2023targeted,\n", + " title={Targeted Hyperparameter Optimization with Lexicographic Preferences Over Multiple Objectives},\n", + " author={Shaokun Zhang and Feiran Jia and Chi Wang and Qingyun Wu},\n", + " booktitle={International Conference on Learning Representations},\n", + " year={2023},\n", + " url={https://openreview.net/forum?id=0Ij9_q567Ma},\n", + "}\n", + "```\n", "\n", - "#### best\\_estimator\n", + "* [Cost-Effective Hyperparameter Optimization for Large Language Model Generation Inference](https://arxiv.org/abs/2303.04673). Chi Wang, Susan Xueqing Liu, Ahmed H. Awadallah. ArXiv preprint arXiv:2303.04673 (2023).\n", "\n", - "```python\n", - "@property\n", - "def best_estimator()\n", + "```bibtex\n", + "@inproceedings{wang2023EcoOptiGen,\n", + " title={Cost-Effective Hyperparameter Optimization for Large Language Model Generation Inference},\n", + " author={Chi Wang and Susan Xueqing Liu and Ahmed H. Awadallah},\n", + " year={2023},\n", + " booktitle={ArXiv preprint arXiv:2303.04673},\n", + "}\n", "```\n", "\n", - "A string indicating the best estimator found.\n", - "\n", - "#### best\\_iteration\n", + "* [An Empirical Study on Challenging Math Problem Solving with GPT-4](https://arxiv.org/abs/2306.01337). Yiran Wu, Feiran Jia, Shaokun Zhang, Hangyu Li, Erkang Zhu, Yue Wang, Yin Tat Lee, Richard Peng, Qingyun Wu, Chi Wang. 
ArXiv preprint arXiv:2306.01337 (2023).\n", "\n", - "```python\n", - "@property\n", - "def best_iteration()\n", + "```bibtex\n", + "@inproceedings{wu2023empirical,\n", + " title={An Empirical Study on Challenging Math Problem Solving with GPT-4},\n", + " author={Yiran Wu and Feiran Jia and Shaokun Zhang and Hangyu Li and Erkang Zhu and Yue Wang and Yin Tat Lee and Richard Peng and Qingyun Wu and Chi Wang},\n", + " year={2023},\n", + " booktitle={ArXiv preprint arXiv:2306.01337},\n", + "}\n", "```\n", "\n", - "An integer of the iteration number where the best\n", - "config is found.\n", "\n", - "#### best\\_config\n", "\n", - "```python\n", - "@property\n", - "def best_config()\n", - "```\n", "\n", - "A dictionary of the best configuration.\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33massistant\u001b[0m (to ragproxyagent):\n", "\n", - "#### best\\_config\\_per\\_estimator\n", + "The author of FLAML is Microsoft.\n", "\n", - "```python\n", - "@property\n", - "def best_config_per_estimator()\n", - "```\n", + "--------------------------------------------------------------------------------\n" + ] + } + ], + "source": [ + "# reset the assistant. Always reset the assistant before starting a new conversation.\n", + "assistant.reset()\n", + "\n", + "qa_problem = \"Who is the author of FLAML?\"\n", + "ragproxyagent.initiate_chat(assistant, problem=qa_problem)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "### Example 3\n", + "\n", + "[back to top](#toc)\n", + "\n", + "Use RetrieveChat to help generate sample code and ask for human-in-loop feedbacks.\n", + "\n", + "Problem: how to build a time series forecasting model for stock price using FLAML?" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Number of requested results 20 is greater than number of elements in index 2, updating n_results = 2\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "doc_ids: [['doc_0', 'doc_1']]\n", + "\u001b[32mAdding doc_id doc_0 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id doc_1 to context.\u001b[0m\n", + "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", "\n", - "A dictionary of all estimators' best configuration.\n", + "You're a retrieve augmented coding assistant. You answer user's questions based on your own knowledge and the\n", + "context provided by the user.\n", + "If you can't answer the question with or without the current context, you should reply exactly `UPDATE CONTEXT`.\n", + "For code generation, you must obey the following rules:\n", + "Rule 1. You MUST NOT install any packages because all the packages needed are already installed.\n", + "Rule 2. You must follow the formats below to write your code:\n", + "```language\n", + "# your code\n", + "```\n", "\n", - "#### best\\_loss\\_per\\_estimator\n", + "User's question is: how to build a time series forecasting model for stock price using FLAML?\n", "\n", - "```python\n", - "@property\n", - "def best_loss_per_estimator()\n", - "```\n", + "Context is: # Integrate - Spark\n", "\n", - "A dictionary of all estimators' best loss.\n", + "FLAML has integrated Spark for distributed training. 
There are two main aspects of integration with Spark:\n", + "- Use Spark ML estimators for AutoML.\n", + "- Use Spark to run training in parallel spark jobs.\n", "\n", - "#### best\\_loss\n", + "## Spark ML Estimators\n", "\n", - "```python\n", - "@property\n", - "def best_loss()\n", - "```\n", + "FLAML integrates estimators based on Spark ML models. These models are trained in parallel using Spark, so we called them Spark estimators. To use these models, you first need to organize your data in the required format.\n", "\n", - "A float of the best loss found.\n", + "### Data\n", "\n", - "#### best\\_result\n", + "For Spark estimators, AutoML only consumes Spark data. FLAML provides a convenient function `to_pandas_on_spark` in the `flaml.automl.spark.utils` module to convert your data into a pandas-on-spark (`pyspark.pandas`) dataframe/series, which Spark estimators require.\n", "\n", - "```python\n", - "@property\n", - "def best_result()\n", - "```\n", + "This utility function takes data in the form of a `pandas.Dataframe` or `pyspark.sql.Dataframe` and converts it into a pandas-on-spark dataframe. It also takes `pandas.Series` or `pyspark.sql.Dataframe` and converts it into a [pandas-on-spark](https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/index.html) series. If you pass in a `pyspark.pandas.Dataframe`, it will not make any changes.\n", "\n", - "Result dictionary for model trained with the best config.\n", + "This function also accepts optional arguments `index_col` and `default_index_type`.\n", + "- `index_col` is the column name to use as the index, default is None.\n", + "- `default_index_type` is the default index type, default is \"distributed-sequence\". More info about default index type could be found on Spark official [documentation](https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/options.html#default-index-type)\n", "\n", - "#### metrics\\_for\\_best\\_config\n", + "Here is an example code snippet for Spark Data:\n", "\n", "```python\n", - "@property\n", - "def metrics_for_best_config()\n", - "```\n", + "import pandas as pd\n", + "from flaml.automl.spark.utils import to_pandas_on_spark\n", + "# Creating a dictionary\n", + "data = {\"Square_Feet\": [800, 1200, 1800, 1500, 850],\n", + " \"Age_Years\": [20, 15, 10, 7, 25],\n", + " \"Price\": [100000, 200000, 300000, 240000, 120000]}\n", "\n", - "Returns a float of the best loss, and a dictionary of the auxiliary metrics to log\n", - "associated with the best config. 
These two objects correspond to the returned\n", - "objects by the customized metric function for the config with the best loss.\n", + "# Creating a pandas DataFrame\n", + "dataframe = pd.DataFrame(data)\n", + "label = \"Price\"\n", "\n", - "#### best\\_config\\_train\\_time\n", - " \n", - "```python\n", - "custom_hp = {\n", - " \"transformer_ms\": {\n", - " \"model_path\": {\n", - " \"domain\": \"albert-base-v2\",\n", - " },\n", - " \"learning_rate\": {\n", - " \"domain\": tune.choice([1e-4, 1e-5]),\n", - " }\n", - " }\n", - "}\n", + "# Convert to pandas-on-spark dataframe\n", + "psdf = to_pandas_on_spark(dataframe)\n", "```\n", - "- `fit_kwargs_by_estimator` - dict, default=None | The user specified keywords arguments, grouped by estimator name.\n", - " e.g.,\n", - " \n", - "```python\n", - "fit_kwargs_by_estimator = {\n", - " \"transformer\": {\n", - " \"output_dir\": \"test/data/output/\",\n", - " \"fp16\": False,\n", - " }\n", - "}\n", - "```\n", - " \n", - "- `**fit_kwargs` - Other key word arguments to pass to fit() function of\n", - " the searched learners, such as sample_weight. Below are a few examples of\n", - " estimator-specific parameters:\n", - "- `period` - int | forecast horizon for all time series forecast tasks.\n", - "- `gpu_per_trial` - float, default = 0 | A float of the number of gpus per trial,\n", - " only used by TransformersEstimator, XGBoostSklearnEstimator, and\n", - " TemporalFusionTransformerEstimator.\n", - "- `group_ids` - list of strings of column names identifying a time series, only\n", - " used by TemporalFusionTransformerEstimator, required for\n", - " 'ts_forecast_panel' task. `group_ids` is a parameter for TimeSeriesDataSet object\n", - " from PyTorchForecasting.\n", - " For other parameters to describe your dataset, refer to\n", - " [TimeSeriesDataSet PyTorchForecasting](https://pytorch-forecasting.readthedocs.io/en/stable/api/pytorch_forecasting.data.timeseries.TimeSeriesDataSet.html).\n", - " To specify your variables, use `static_categoricals`, `static_reals`,\n", - " `time_varying_known_categoricals`, `time_varying_known_reals`,\n", - " `time_varying_unknown_categoricals`, `time_varying_unknown_reals`,\n", - " `variable_groups`. To provide more information on your data, use\n", - " `max_encoder_length`, `min_encoder_length`, `lags`.\n", - "- `log_dir` - str, default = \"lightning_logs\" | Folder into which to log results\n", - " for tensorboard, only used by TemporalFusionTransformerEstimator.\n", - "- `max_epochs` - int, default = 20 | Maximum number of epochs to run training,\n", - " only used by TemporalFusionTransformerEstimator.\n", - "- `batch_size` - int, default = 64 | Batch size for training model, only\n", - " used by TemporalFusionTransformerEstimator.\n", - "\n", - "#### search\\_space\n", "\n", + "To use Spark ML models you need to format your data appropriately. 
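As a brief aside on the conversion above: `to_pandas_on_spark` can also pin the index explicitly via its optional `index_col` argument. A minimal sketch, assuming the same toy data plus a hypothetical `id` column (the column name is an illustrative assumption, not part of the original docs):

```python
import pandas as pd
from flaml.automl.spark.utils import to_pandas_on_spark

# Toy data with a hypothetical id column to serve as the index.
data = {"id": [0, 1, 2, 3, 4],
        "Square_Feet": [800, 1200, 1800, 1500, 850],
        "Age_Years": [20, 15, 10, 7, 25],
        "Price": [100000, 200000, 300000, 240000, 120000]}
dataframe = pd.DataFrame(data)

# index_col pins the pandas-on-spark index to the id column instead of
# relying on the default "distributed-sequence" index type.
psdf = to_pandas_on_spark(dataframe, index_col="id")
```

The remaining formatting requirement is the vector column described next.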
Specifically, use [`VectorAssembler`](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.VectorAssembler.html) to merge all feature columns into a single vector column.\n", + "\n", + "Here is an example of how to use it:\n", "```python\n", - "@property\n", - "def search_space() -> dict\n", + "from pyspark.ml.feature import VectorAssembler\n", + "columns = psdf.columns\n", + "feature_cols = [col for col in columns if col != label]\n", + "featurizer = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n", + "psdf = featurizer.transform(psdf.to_spark(index_col=\"index\"))[\"index\", \"features\"]\n", "```\n", "\n", - "Search space.\n", + "Later in conducting the experiment, use your pandas-on-spark data like non-spark data and pass them using `X_train, y_train` or `dataframe, label`.\n", "\n", - "Must be called after fit(...)\n", - "(use max_iter=0 and retrain_final=False to prevent actual fitting).\n", + "### Estimators\n", + "#### Model List\n", + "- `lgbm_spark`: The class for fine-tuning Spark version LightGBM models, using [SynapseML](https://microsoft.github.io/SynapseML/docs/features/lightgbm/about/) API.\n", "\n", - "**Returns**:\n", + "#### Usage\n", + "First, prepare your data in the required format as described in the previous section.\n", "\n", - " A dict of the search space.\n", + "By including the models you intend to try in the `estimators_list` argument to `flaml.automl`, FLAML will start trying configurations for these models. If your input is Spark data, FLAML will also use estimators with the `_spark` postfix by default, even if you haven't specified them.\n", "\n", - "#### low\\_cost\\_partial\\_config\n", + "Here is an example code snippet using SparkML models in AutoML:\n", "\n", "```python\n", - "@property\n", - "def low_cost_partial_config() -> dict\n", - "```\n", + "import flaml\n", + "# prepare your data in pandas-on-spark format as we previously mentioned\n", "\n", - "Low cost partial config.\n", + "automl = flaml.AutoML()\n", + "settings = {\n", + " \"time_budget\": 30,\n", + " \"metric\": \"r2\",\n", + " \"estimator_list\": [\"lgbm_spark\"], # this setting is optional\n", + " \"task\": \"regression\",\n", + "}\n", "\n", - "**Returns**:\n", + "automl.fit(\n", + " dataframe=psdf,\n", + " label=label,\n", + " **settings,\n", + ")\n", + "```\n", "\n", - " A dict.\n", - " (a) if there is only one estimator in estimator_list, each key is a\n", - " hyperparameter name.\n", - " (b) otherwise, it is a nested dict with 'ml' as the key, and\n", - " a list of the low_cost_partial_configs as the value, corresponding\n", - " to each learner's low_cost_partial_config; the estimator index as\n", - " an integer corresponding to the cheapest learner is appended to the\n", - " list at the end.\n", "\n", - "#### cat\\_hp\\_cost\n", + "[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/automl_bankrupt_synapseml.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/automl_bankrupt_synapseml.ipynb)\n", "\n", - "```python\n", - "@property\n", - "def cat_hp_cost() -> dict\n", - "```\n", + "## Parallel Spark Jobs\n", + "You can activate Spark as the parallel backend during parallel tuning in both [AutoML](/docs/Use-Cases/Task-Oriented-AutoML#parallel-tuning) and [Hyperparameter Tuning](/docs/Use-Cases/Tune-User-Defined-Function#parallel-tuning), by setting the `use_spark` to `true`. 
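For the Hyperparameter Tuning case, a minimal sketch of what activating the Spark backend looks like; the evaluation function and search space below are illustrative assumptions, not part of the original docs:

```python
from flaml import tune

# Illustrative objective: report a "loss" value for each sampled config.
def evaluate_config(config):
    return {"loss": (config["x"] - 3) ** 2}

analysis = tune.run(
    evaluate_config,
    config={"x": tune.uniform(0, 10)},  # illustrative search space
    metric="loss",
    mode="min",
    num_samples=8,
    use_spark=True,         # run trials as parallel Spark jobs
    n_concurrent_trials=2,  # number of trials to run concurrently
)
print(analysis.best_config)
```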
FLAML will dispatch your job to the distributed Spark backend using [`joblib-spark`](https://github.com/joblib/joblib-spark).\n", "\n", - "Categorical hyperparameter cost\n", + "Please note that you should not set `use_spark` to `true` when applying AutoML and Tuning for Spark Data. This is because only SparkML models will be used for Spark Data in AutoML and Tuning. As SparkML models run in parallel, there is no need to distribute them with `use_spark` again.\n", "\n", - "**Returns**:\n", + "All the Spark-related arguments are stated below. These arguments are available in both Hyperparameter Tuning and AutoML:\n", "\n", - " A dict.\n", - " (a) if there is only one estimator in estimator_list, each key is a\n", - " hyperparameter name.\n", - " (b) otherwise, it is a nested dict with 'ml' as the key, and\n", - " a list of the cat_hp_cost's as the value, corresponding\n", - " to each learner's cat_hp_cost; the cost relative to lgbm for each\n", - " learner (as a list itself) is appended to the list at the end.\n", "\n", - "#### points\\_to\\_evaluate\n", "\n", + "- `use_spark`: boolean, default=False | Whether to use spark to run the training in parallel spark jobs. This can be used to accelerate training on large models and large datasets, but will incur more overhead in time and thus slow down training in some cases. GPU training is not supported yet when use_spark is True. For Spark clusters, by default, we will launch one trial per executor. However, sometimes we want to launch more trials than the number of executors (e.g., local mode). In this case, we can set the environment variable `FLAML_MAX_CONCURRENT` to override the detected `num_executors`. The final number of concurrent trials will be the minimum of `n_concurrent_trials` and `num_executors`.\n", + "- `n_concurrent_trials`: int, default=1 | The number of concurrent trials. When n_concurrent_trials > 1, FLAML performs parallel tuning.\n", + "- `force_cancel`: boolean, default=False | Whether to forcibly cancel Spark jobs if the search time exceeds the time budget. Spark jobs include parallel tuning jobs and Spark-based model training jobs.\n", "\n", + "An example code snippet for using parallel Spark jobs:\n", "```python\n", - "@property\n", - "def points_to_evaluate() -> dict\n", - "```\n", - "\n", - "Initial points to evaluate.\n", + "import flaml\n", + "automl_experiment = flaml.AutoML()\n", + "automl_settings = {\n", + "    \"time_budget\": 30,\n", + "    \"metric\": \"r2\",\n", + "    \"task\": \"regression\",\n", + "    \"n_concurrent_trials\": 2,\n", + "    \"use_spark\": True,\n", + "    \"force_cancel\": True,  # Activating the force_cancel option can immediately halt Spark jobs once they exceed the allocated time_budget.\n", + "}\n", "\n", + "automl_experiment.fit(\n", + "    dataframe=dataframe,\n", + "    label=label,\n", + "    **automl_settings,\n", + ")\n", + "```\n", "\n", "\n", - "**Returns**:\n", "\n", - " A list of dicts. 
Each dict is the initial point for each learner.\n", "\n", - "#### resource\\_attr\n", + "[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/integrate_spark.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/integrate_spark.ipynb)\n", "\n", - "```python\n", - "@property\n", - "def resource_attr() -> Optional[str]\n", - "```\n", + "# Research\n", "\n", - "Attribute of the resource dimension.\n", + "For technical details, please check our research publications.\n", "\n", - "**Returns**:\n", + "* [FLAML: A Fast and Lightweight AutoML Library](https://www.microsoft.com/en-us/research/publication/flaml-a-fast-and-lightweight-automl-library/). Chi Wang, Qingyun Wu, Markus Weimer, Erkang Zhu. MLSys 2021.\n", "\n", - " A string for the sample size attribute\n", - " (the resource attribute in AutoML) or None.\n", + "```bibtex\n", + "@inproceedings{wang2021flaml,\n", + " title={FLAML: A Fast and Lightweight AutoML Library},\n", + " author={Chi Wang and Qingyun Wu and Markus Weimer and Erkang Zhu},\n", + " year={2021},\n", + " booktitle={MLSys},\n", + "}\n", + "```\n", "\n", - "#### min\\_resource\n", + "* [Frugal Optimization for Cost-related Hyperparameters](https://arxiv.org/abs/2005.01571). Qingyun Wu, Chi Wang, Silu Huang. AAAI 2021.\n", "\n", - "```python\n", - "@property\n", - "def min_resource() -> Optional[float]\n", + "```bibtex\n", + "@inproceedings{wu2021cfo,\n", + " title={Frugal Optimization for Cost-related Hyperparameters},\n", + " author={Qingyun Wu and Chi Wang and Silu Huang},\n", + " year={2021},\n", + " booktitle={AAAI},\n", + "}\n", "```\n", "\n", - "Attribute for pruning.\n", + "* [Economical Hyperparameter Optimization With Blended Search Strategy](https://www.microsoft.com/en-us/research/publication/economical-hyperparameter-optimization-with-blended-search-strategy/). Chi Wang, Qingyun Wu, Silu Huang, Amin Saied. ICLR 2021.\n", "\n", - "**Returns**:\n", - "\n", - " A float for the minimal sample size or None.\n", + "```bibtex\n", + "@inproceedings{wang2021blendsearch,\n", + " title={Economical Hyperparameter Optimization With Blended Search Strategy},\n", + " author={Chi Wang and Qingyun Wu and Silu Huang and Amin Saied},\n", + " year={2021},\n", + " booktitle={ICLR},\n", + "}\n", + "```\n", "\n", - "#### max\\_resource\n", + "* [An Empirical Study on Hyperparameter Optimization for Fine-Tuning Pre-trained Language Models](https://aclanthology.org/2021.acl-long.178.pdf). Susan Xueqing Liu, Chi Wang. ACL 2021.\n", "\n", - "```python\n", - "@property\n", - "def max_resource() -> Optional[float]\n", + "```bibtex\n", + "@inproceedings{liuwang2021hpolm,\n", + " title={An Empirical Study on Hyperparameter Optimization for Fine-Tuning Pre-trained Language Models},\n", + " author={Susan Xueqing Liu and Chi Wang},\n", + " year={2021},\n", + " booktitle={ACL},\n", + "}\n", "```\n", "\n", - "Attribute for pruning.\n", - "\n", - "**Returns**:\n", + "* [ChaCha for Online AutoML](https://www.microsoft.com/en-us/research/publication/chacha-for-online-automl/). Qingyun Wu, Chi Wang, John Langford, Paul Mineiro and Marco Rossi. 
ICML 2021.\n", "\n", - " A float for the maximal sample size or None.\n", + "```bibtex\n", + "@inproceedings{wu2021chacha,\n", + " title={ChaCha for Online AutoML},\n", + " author={Qingyun Wu and Chi Wang and John Langford and Paul Mineiro and Marco Rossi},\n", + " year={2021},\n", + " booktitle={ICML},\n", + "}\n", + "```\n", "\n", - "#### trainable\n", + "* [Fair AutoML](https://arxiv.org/abs/2111.06495). Qingyun Wu, Chi Wang. ArXiv preprint arXiv:2111.06495 (2021).\n", "\n", - "```python\n", - "@property\n", - "def trainable() -> Callable[[dict], Optional[float]]\n", + "```bibtex\n", + "@inproceedings{wuwang2021fairautoml,\n", + " title={Fair AutoML},\n", + " author={Qingyun Wu and Chi Wang},\n", + " year={2021},\n", + " booktitle={ArXiv preprint arXiv:2111.06495},\n", + "}\n", "```\n", "\n", - "Training function.\n", - "\n", - "**Returns**:\n", + "* [Mining Robust Default Configurations for Resource-constrained AutoML](https://arxiv.org/abs/2202.09927). Moe Kayali, Chi Wang. ArXiv preprint arXiv:2202.09927 (2022).\n", "\n", - " A function that evaluates each config and returns the loss.\n", + "```bibtex\n", + "@inproceedings{kayaliwang2022default,\n", + " title={Mining Robust Default Configurations for Resource-constrained AutoML},\n", + " author={Moe Kayali and Chi Wang},\n", + " year={2022},\n", + " booktitle={ArXiv preprint arXiv:2202.09927},\n", + "}\n", + "```\n", "\n", - "#### metric\\_constraints\n", + "* [Targeted Hyperparameter Optimization with Lexicographic Preferences Over Multiple Objectives](https://openreview.net/forum?id=0Ij9_q567Ma). Shaokun Zhang, Feiran Jia, Chi Wang, Qingyun Wu. ICLR 2023 (notable-top-5%).\n", "\n", - "```python\n", - "@property\n", - "def metric_constraints() -> list\n", + "```bibtex\n", + "@inproceedings{zhang2023targeted,\n", + " title={Targeted Hyperparameter Optimization with Lexicographic Preferences Over Multiple Objectives},\n", + " author={Shaokun Zhang and Feiran Jia and Chi Wang and Qingyun Wu},\n", + " booktitle={International Conference on Learning Representations},\n", + " year={2023},\n", + " url={https://openreview.net/forum?id=0Ij9_q567Ma},\n", + "}\n", "```\n", "\n", - "Metric constraints.\n", + "* [Cost-Effective Hyperparameter Optimization for Large Language Model Generation Inference](https://arxiv.org/abs/2303.04673). Chi Wang, Susan Xueqing Liu, Ahmed H. Awadallah. ArXiv preprint arXiv:2303.04673 (2023).\n", "\n", - "**Returns**:\n", - "\n", - " A list of the metric constraints.\n", + "```bibtex\n", + "@inproceedings{wang2023EcoOptiGen,\n", + " title={Cost-Effective Hyperparameter Optimization for Large Language Model Generation Inference},\n", + " author={Chi Wang and Susan Xueqing Liu and Ahmed H. Awadallah},\n", + " year={2023},\n", + " booktitle={ArXiv preprint arXiv:2303.04673},\n", + "}\n", + "```\n", "\n", - "#### fit\n", + "* [An Empirical Study on Challenging Math Problem Solving with GPT-4](https://arxiv.org/abs/2306.01337). Yiran Wu, Feiran Jia, Shaokun Zhang, Hangyu Li, Erkang Zhu, Yue Wang, Yin Tat Lee, Richard Peng, Qingyun Wu, Chi Wang. 
ArXiv preprint arXiv:2306.01337 (2023).\n", "\n", - "```python\n", - "def fit(X_train=None, y_train=None, dataframe=None, label=None, metric=None, task: Optional[Union[str, Task]] = None, n_jobs=None, log_file_name=None, estimator_list=None, time_budget=None, max_iter=None, sample=None, ensemble=None, eval_method=None, log_type=None, model_history=None, split_ratio=None, n_splits=None, log_training_metric=None, mem_thres=None, pred_time_limit=None, train_time_limit=None, X_val=None, y_val=None, sample_weight_val=None, groups_val=None, groups=None, verbose=None, retrain_full=None, split_type=None, learner_selector=None, hpo_method=None, starting_points=None, seed=None, n_concurrent_trials=None, keep_search_state=None, preserve_checkpoint=True, early_stop=None, force_cancel=None, append_log=None, auto_augment=None, min_sample_size=None, use_ray=None, use_spark=None, free_mem_ratio=0, metric_constraints=None, custom_hp=None, time_col=None, cv_score_agg_func=None, skip_transform=None, mlflow_logging=None, fit_kwargs_by_estimator=None, **fit_kwargs, ,)\n", + "```bibtex\n", + "@inproceedings{wu2023empirical,\n", + " title={An Empirical Study on Challenging Math Problem Solving with GPT-4},\n", + " author={Yiran Wu and Feiran Jia and Shaokun Zhang and Hangyu Li and Erkang Zhu and Yue Wang and Yin Tat Lee and Richard Peng and Qingyun Wu and Chi Wang},\n", + " year={2023},\n", + " booktitle={ArXiv preprint arXiv:2306.01337},\n", + "}\n", "```\n", "\n", - "Find a model for a given task.\n", - "\n", - "**Arguments**:\n", "\n", "\n", "\n", "--------------------------------------------------------------------------------\n", "\u001b[33massistant\u001b[0m (to ragproxyagent):\n", "\n", - "To build a time series forecasting model for stock price using FLAML, you can follow these steps:\n", + "To build a time series forecasting model for stock price using FLAML, you can use the `lgbm_spark` estimator in FLAML. First, you need to organize your data into a pandas-on-spark dataframe, then merge all feature columns into a single vector column using Spark's VectorAssembler. Here is an example code snippet:\n", "\n", - "1. Install the FLAML library if you haven't already:\n", - "```bash\n", - "pip install flaml\n", - "```\n", - "\n", - "2. Import required libraries:\n", "```python\n", "import pandas as pd\n", - "from flaml import AutoML\n", - "```\n", + "from flaml.automl.spark.utils import to_pandas_on_spark\n", + "from pyspark.ml.feature import VectorAssembler\n", + "import flaml\n", "\n", - "3. Load your stock price dataset and preprocess it as needed. The dataset must have at least two columns: a timestamp column (datetime type) and a label column (numeric type). For example, if your dataset is named `stock_data` and has columns 'Date' as timestamps and 'Close' as stock prices:\n", + "# load data as Pandas DataFrame\n", + "df = pd.read_csv(\"your_stock_data.csv\")\n", "\n", - "```python\n", - "stock_data['Date'] = pd.to_datetime(stock_data['Date'])\n", - "stock_data = stock_data.sort_values(by='Date')\n", - "```\n", - "\n", - "4. Define the task as 'ts_forecast_regression' and split your dataset into training and test sets:\n", - "\n", - "```python\n", - "task = 'ts_forecast_regression'\n", - "data = stock_data[['Date', 'Close']]\n", - "train_data = data[:-30] # Use the last 30 days as test data\n", - "test_data = data[-30:]\n", - "```\n", + "# convert to pandas-on-spark dataframe\n", + "psdf = to_pandas_on_spark(df)\n", "\n", - "5. 
Specify the forecasting horizon (e.g., next 5 days):\n", + "# merge all feature columns into a single vector column\n", + "columns = psdf.columns\n", + "label = \"stock_price\"\n", + "feature_cols = [col for col in columns if col != label]\n", + "featurizer = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n", + "psdf = featurizer.transform(psdf.to_spark(index_col=\"index\"))[\"index\", \"features\"]\n", "\n", - "```python\n", - "forecast_horizon = 5\n", - "```\n", - "\n", - "6. Create an `AutoML` object and fit it to the training dataset with the `time_series` option:\n", - "\n", - "```python\n", - "automl = AutoML()\n", + "# run AutoML with lgbm_spark estimator\n", + "automl = flaml.AutoML()\n", + "settings = {\n", + " \"time_budget\": 30, # in seconds\n", + " \"metric\": \"r2\", # or other scoring metrics\n", + " \"estimator_list\": [\"lgbm_spark\"], # use lgbm_spark estimator\n", + " \"task\": \"regression\", # or other tasks\n", + "}\n", "automl.fit(\n", - " dataframe=train_data,\n", - " label=\"Close\",\n", - " task=task,\n", - " metric=\"mape\",\n", - " time_budget=600,\n", - " period=forecast_horizon,\n", + " dataframe=psdf,\n", + " label=label,\n", + " **settings,\n", ")\n", "```\n", "\n", - "7. Use the fitted model for prediction:\n", - "\n", - "```python\n", - "predicted_values = automl.predict(train_data, test_data[\"Date\"].values[:forecast_horizon])\n", - "```\n", + "--------------------------------------------------------------------------------\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", "\n", - "`predicted_values` will contain the predicted stock prices for the specified forecasting horizon.\n", + "I want the time_budget to be 1 hour\n", "\n", "--------------------------------------------------------------------------------\n", - "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", - "\n", - "time budget is 2 mins\n", + "I want the time_budget to be 1 hour\n", "\n", "--------------------------------------------------------------------------------\n", "\u001b[33massistant\u001b[0m (to ragproxyagent):\n", "\n", - "To set the time budget to 2 minutes, you can simply update the `time_budget` parameter when calling the `fit()` method. Given that 2 minutes is equal to 120 seconds, the updated code will be:\n", + "Sure, you can set the `time_budget` to 3600 seconds (which is 1 hour in seconds) in the `settings` dictionary. 
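Only the `time_budget` entry changes relative to the previous snippet; as a minimal illustration (assuming the earlier `settings` dict is in scope):

```python
settings["time_budget"] = 60 * 60  # 1 hour, expressed in seconds
```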
Here is the updated code snippet:\n", "\n", "```python\n", - "automl = AutoML()\n", + "import pandas as pd\n", + "from flaml.automl.spark.utils import to_pandas_on_spark\n", + "from pyspark.ml.feature import VectorAssembler\n", + "import flaml\n", + "\n", + "# load data as Pandas DataFrame\n", + "df = pd.read_csv(\"your_stock_data.csv\")\n", + "\n", + "# convert to pandas-on-spark dataframe\n", + "psdf = to_pandas_on_spark(df)\n", + "\n", + "# merge all feature columns into a single vector column\n", + "columns = psdf.columns\n", + "label = \"stock_price\"\n", + "feature_cols = [col for col in columns if col != label]\n", + "featurizer = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n", + "psdf = featurizer.transform(psdf.to_spark(index_col=\"index\"))[\"index\", \"features\"]\n", + "\n", + "# run AutoML with lgbm_spark estimator\n", + "automl = flaml.AutoML()\n", + "settings = {\n", + " \"time_budget\": 3600, # set time_budget to 1 hour (in seconds)\n", + " \"metric\": \"r2\", # or other scoring metrics\n", + " \"estimator_list\": [\"lgbm_spark\"], # use lgbm_spark estimator\n", + " \"task\": \"regression\", # or other tasks\n", + "}\n", "automl.fit(\n", - " dataframe=train_data,\n", - " label=\"Close\",\n", - " task=task,\n", - " metric=\"mape\",\n", - " time_budget=120, # Set the time budget to 2 minutes (120 seconds)\n", - " period=forecast_horizon,\n", + " dataframe=psdf,\n", + " label=label,\n", + " **settings,\n", ")\n", "```\n", "\n", - "This will ensure the model search and training process doesn't exceed 2 minutes.\n", - "\n", "--------------------------------------------------------------------------------\n" ] } @@ -2022,17 +1356,23 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Number of requested results 20 is greater than number of elements in index 2, updating n_results = 2\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "doc_ids: [['doc_36', 'doc_40', 'doc_15', 'doc_14', 'doc_52', 'doc_51', 'doc_58', 'doc_21', 'doc_27', 'doc_35', 'doc_23', 'doc_12', 'doc_59', 'doc_4', 'doc_56', 'doc_47', 'doc_53', 'doc_20', 'doc_29', 'doc_33']]\n", - "\u001b[32mAdding doc_id doc_36 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_40 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_15 to context.\u001b[0m\n", + "doc_ids: [['doc_0', 'doc_1']]\n", + "\u001b[32mAdding doc_id doc_0 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id doc_1 to context.\u001b[0m\n", "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", "\n", "You're a retrieve augmented coding assistant. You answer user's questions based on your own knowledge and the\n", @@ -2045,549 +1385,245 @@ "# your code\n", "```\n", "\n", - "User's question is: Is there a function named `tune_automl` in FLAML?\n", + "User's question is: Is there a function named `tune_automl` in FLAML?\n", + "\n", + "Context is: # Integrate - Spark\n", + "\n", + "FLAML has integrated Spark for distributed training. There are two main aspects of integration with Spark:\n", + "- Use Spark ML estimators for AutoML.\n", + "- Use Spark to run training in parallel spark jobs.\n", + "\n", + "## Spark ML Estimators\n", + "\n", + "FLAML integrates estimators based on Spark ML models. These models are trained in parallel using Spark, so we called them Spark estimators. 
To use these models, you first need to organize your data in the required format.\n", + "\n", + "### Data\n", + "\n", + "For Spark estimators, AutoML only consumes Spark data. FLAML provides a convenient function `to_pandas_on_spark` in the `flaml.automl.spark.utils` module to convert your data into a pandas-on-spark (`pyspark.pandas`) dataframe/series, which Spark estimators require.\n", + "\n", + "This utility function takes data in the form of a `pandas.Dataframe` or `pyspark.sql.Dataframe` and converts it into a pandas-on-spark dataframe. It also takes `pandas.Series` or `pyspark.sql.Dataframe` and converts it into a [pandas-on-spark](https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/index.html) series. If you pass in a `pyspark.pandas.Dataframe`, it will not make any changes.\n", + "\n", + "This function also accepts optional arguments `index_col` and `default_index_type`.\n", + "- `index_col` is the column name to use as the index, default is None.\n", + "- `default_index_type` is the default index type, default is \"distributed-sequence\". More info about default index type could be found on Spark official [documentation](https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/options.html#default-index-type)\n", + "\n", + "Here is an example code snippet for Spark Data:\n", + "\n", + "```python\n", + "import pandas as pd\n", + "from flaml.automl.spark.utils import to_pandas_on_spark\n", + "# Creating a dictionary\n", + "data = {\"Square_Feet\": [800, 1200, 1800, 1500, 850],\n", + " \"Age_Years\": [20, 15, 10, 7, 25],\n", + " \"Price\": [100000, 200000, 300000, 240000, 120000]}\n", + "\n", + "# Creating a pandas DataFrame\n", + "dataframe = pd.DataFrame(data)\n", + "label = \"Price\"\n", "\n", - "Context is: \n", - "- `seed` - int or None, default=None | The random seed for hpo.\n", - "- `n_concurrent_trials` - [Experimental] int, default=1 | The number of\n", - " concurrent trials. When n_concurrent_trials > 1, flaml performes\n", - " [parallel tuning](../../Use-Cases/Task-Oriented-AutoML#parallel-tuning)\n", - " and installation of ray or spark is required: `pip install flaml[ray]`\n", - " or `pip install flaml[spark]`. Please check\n", - " [here](https://spark.apache.org/docs/latest/api/python/getting_started/install.html)\n", - " for more details about installing Spark.\n", - "- `keep_search_state` - boolean, default=False | Whether to keep data needed\n", - " for model search after fit(). By default the state is deleted for\n", - " space saving.\n", - "- `preserve_checkpoint` - boolean, default=True | Whether to preserve the saved checkpoint\n", - " on disk when deleting automl. By default the checkpoint is preserved.\n", - "- `early_stop` - boolean, default=False | Whether to stop early if the\n", - " search is considered to converge.\n", - "- `force_cancel` - boolean, default=False | Whether to forcely cancel Spark jobs if the\n", - " search time exceeded the time budget.\n", - "- `append_log` - boolean, default=False | Whetehr to directly append the log\n", - " records to the input log file if it exists.\n", - "- `auto_augment` - boolean, default=True | Whether to automatically\n", - " augment rare classes.\n", - "- `min_sample_size` - int, default=MIN_SAMPLE_TRAIN | the minimal sample\n", - " size when sample=True.\n", - "- `use_ray` - boolean or dict.\n", - " If boolean: default=False | Whether to use ray to run the training\n", - " in separate processes. 
This can be used to prevent OOM for large\n", - " datasets, but will incur more overhead in time.\n", - " If dict: the dict contains the keywords arguments to be passed to\n", - " [ray.tune.run](https://docs.ray.io/en/latest/tune/api_docs/execution.html).\n", - "- `use_spark` - boolean, default=False | Whether to use spark to run the training\n", - " in parallel spark jobs. This can be used to accelerate training on large models\n", - " and large datasets, but will incur more overhead in time and thus slow down\n", - " training in some cases. GPU training is not supported yet when use_spark is True.\n", - " For Spark clusters, by default, we will launch one trial per executor. However,\n", - " sometimes we want to launch more trials than the number of executors (e.g., local mode).\n", - " In this case, we can set the environment variable `FLAML_MAX_CONCURRENT` to override\n", - " the detected `num_executors`. The final number of concurrent trials will be the minimum\n", - " of `n_concurrent_trials` and `num_executors`.\n", - "- `free_mem_ratio` - float between 0 and 1, default=0. The free memory ratio to keep during training.\n", - "- `metric_constraints` - list, default=[] | The list of metric constraints.\n", - " Each element in this list is a 3-tuple, which shall be expressed\n", - " in the following format: the first element of the 3-tuple is the name of the\n", - " metric, the second element is the inequality sign chosen from \">=\" and \"<=\",\n", - " and the third element is the constraint value. E.g., `('val_loss', '<=', 0.1)`.\n", - " Note that all the metric names in metric_constraints need to be reported via\n", - " the metrics_to_log dictionary returned by a customized metric function.\n", - " The customized metric function shall be provided via the `metric` key word\n", - " argument of the fit() function or the automl constructor.\n", - " Find an example in the 4th constraint type in this [doc](../../Use-Cases/Task-Oriented-AutoML#constraint).\n", - " If `pred_time_limit` is provided as one of keyword arguments to fit() function or\n", - " the automl constructor, flaml will automatically (and under the hood)\n", - " add it as an additional element in the metric_constraints. Essentially 'pred_time_limit'\n", - " specifies a constraint about the prediction latency constraint in seconds.\n", - "- `custom_hp` - dict, default=None | The custom search space specified by user.\n", - " It is a nested dict with keys being the estimator names, and values being dicts\n", - " per estimator search space. In the per estimator search space dict,\n", - " the keys are the hyperparameter names, and values are dicts of info (\"domain\",\n", - " \"init_value\", and \"low_cost_init_value\") about the search space associated with\n", - " the hyperparameter (i.e., per hyperparameter search space dict). When custom_hp\n", - " is provided, the built-in search space which is also a nested dict of per estimator\n", - " search space dict, will be updated with custom_hp. Note that during this nested dict update,\n", - " the per hyperparameter search space dicts will be replaced (instead of updated) by the ones\n", - " provided in custom_hp. 
Note that the value for \"domain\" can either be a constant\n", - " or a sample.Domain object.\n", - " e.g.,\n", - " \n", - "```python\n", - "custom_hp = {\n", - " \"transformer_ms\": {\n", - " \"model_path\": {\n", - " \"domain\": \"albert-base-v2\",\n", - " },\n", - " \"learning_rate\": {\n", - " \"domain\": tune.choice([1e-4, 1e-5]),\n", - " }\n", - " }\n", - " }\n", - "```\n", - "- `skip_transform` - boolean, default=False | Whether to pre-process data prior to modeling.\n", - "- `fit_kwargs_by_estimator` - dict, default=None | The user specified keywords arguments, grouped by estimator name.\n", - " e.g.,\n", - " \n", - "```python\n", - "fit_kwargs_by_estimator = {\n", - " \"transformer\": {\n", - " \"output_dir\": \"test/data/output/\",\n", - " \"fp16\": False,\n", - " }\n", - "}\n", + "# Convert to pandas-on-spark dataframe\n", + "psdf = to_pandas_on_spark(dataframe)\n", "```\n", - "- `mlflow_logging` - boolean, default=True | Whether to log the training results to mlflow.\n", - " This requires mlflow to be installed and to have an active mlflow run.\n", - " FLAML will create nested runs.\n", "\n", - "#### config\\_history\n", + "To use Spark ML models you need to format your data appropriately. Specifically, use [`VectorAssembler`](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.VectorAssembler.html) to merge all feature columns into a single vector column.\n", "\n", + "Here is an example of how to use it:\n", "```python\n", - "@property\n", - "def config_history() -> dict\n", + "from pyspark.ml.feature import VectorAssembler\n", + "columns = psdf.columns\n", + "feature_cols = [col for col in columns if col != label]\n", + "featurizer = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n", + "psdf = featurizer.transform(psdf.to_spark(index_col=\"index\"))[\"index\", \"features\"]\n", "```\n", "\n", - "A dictionary of iter->(estimator, config, time),\n", - "storing the best estimator, config, and the time when the best\n", - "model is updated each time.\n", + "Later in conducting the experiment, use your pandas-on-spark data like non-spark data and pass them using `X_train, y_train` or `dataframe, label`.\n", "\n", - "#### model\n", + "### Estimators\n", + "#### Model List\n", + "- `lgbm_spark`: The class for fine-tuning Spark version LightGBM models, using [SynapseML](https://microsoft.github.io/SynapseML/docs/features/lightgbm/about/) API.\n", "\n", - "```python\n", - "@property\n", - "def model()\n", - "```\n", + "#### Usage\n", + "First, prepare your data in the required format as described in the previous section.\n", "\n", - "An object with `predict()` and `predict_proba()` method (for\n", - "classification), storing the best trained model.\n", + "By including the models you intend to try in the `estimators_list` argument to `flaml.automl`, FLAML will start trying configurations for these models. 
If your input is Spark data, FLAML will also use estimators with the `_spark` postfix by default, even if you haven't specified them.\n", "\n", - "#### best\\_model\\_for\\_estimator\n", + "Here is an example code snippet using SparkML models in AutoML:\n", "\n", "```python\n", - "def best_model_for_estimator(estimator_name: str)\n", - "```\n", + "import flaml\n", + "# prepare your data in pandas-on-spark format as we previously mentioned\n", "\n", - "Return the best model found for a particular estimator.\n", + "automl = flaml.AutoML()\n", + "settings = {\n", + " \"time_budget\": 30,\n", + " \"metric\": \"r2\",\n", + " \"estimator_list\": [\"lgbm_spark\"], # this setting is optional\n", + " \"task\": \"regression\",\n", + "}\n", "\n", - "**Arguments**:\n", + "automl.fit(\n", + " dataframe=psdf,\n", + " label=label,\n", + " **settings,\n", + ")\n", + "```\n", "\n", - "- `estimator_name` - a str of the estimator's name.\n", - " \n", "\n", - "**Returns**:\n", + "[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/automl_bankrupt_synapseml.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/automl_bankrupt_synapseml.ipynb)\n", "\n", - " An object storing the best model for estimator_name.\n", - " If `model_history` was set to False during fit(), then the returned model\n", - " is untrained unless estimator_name is the best estimator.\n", - " If `model_history` was set to True, then the returned model is trained.\n", + "## Parallel Spark Jobs\n", + "You can activate Spark as the parallel backend during parallel tuning in both [AutoML](/docs/Use-Cases/Task-Oriented-AutoML#parallel-tuning) and [Hyperparameter Tuning](/docs/Use-Cases/Tune-User-Defined-Function#parallel-tuning), by setting the `use_spark` to `true`. FLAML will dispatch your job to the distributed Spark backend using [`joblib-spark`](https://github.com/joblib/joblib-spark).\n", "\n", - "#### best\\_estimator\n", + "Please note that you should not set `use_spark` to `true` when applying AutoML and Tuning for Spark Data. This is because only SparkML models will be used for Spark Data in AutoML and Tuning. As SparkML models run in parallel, there is no need to distribute them with `use_spark` again.\n", "\n", - "```python\n", - "@property\n", - "def best_estimator()\n", - "```\n", + "All the Spark-related arguments are stated below. These arguments are available in both Hyperparameter Tuning and AutoML:\n", "\n", - "A string indicating the best estimator found.\n", "\n", - "#### best\\_iteration\n", + "- `use_spark`: boolean, default=False | Whether to use spark to run the training in parallel spark jobs. This can be used to accelerate training on large models and large datasets, but will incur more overhead in time and thus slow down training in some cases. GPU training is not supported yet when use_spark is True. For Spark clusters, by default, we will launch one trial per executor. However, sometimes we want to launch more trials than the number of executors (e.g., local mode). In this case, we can set the environment variable `FLAML_MAX_CONCURRENT` to override the detected `num_executors`. The final number of concurrent trials will be the minimum of `n_concurrent_trials` and `num_executors`.\n", + "- `n_concurrent_trials`: int, default=1 | The number of concurrent trials. 
When n_concurrent_trials > 1, FLAML performs parallel tuning.\n", + "- `force_cancel`: boolean, default=False | Whether to forcibly cancel Spark jobs if the search time exceeds the time budget. Spark jobs include parallel tuning jobs and Spark-based model training jobs.\n", "\n", "An example code snippet for using parallel Spark jobs:\n", "```python\n", "import flaml\n", "automl_experiment = flaml.AutoML()\n", "automl_settings = {\n", "    \"time_budget\": 30,\n", "    \"metric\": \"r2\",\n", "    \"task\": \"regression\",\n", "    \"n_concurrent_trials\": 2,\n", "    \"use_spark\": True,\n", "    \"force_cancel\": True,  # Activating the force_cancel option can immediately halt Spark jobs once they exceed the allocated time_budget.\n", "}\n", "\n", "automl_experiment.fit(\n", "    dataframe=dataframe,\n", "    label=label,\n", "    **automl_settings,\n", ")\n", "```\n", "\n", "\n", "[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/integrate_spark.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/integrate_spark.ipynb)\n", "\n", "# Research\n", "\n", "For technical details, please check our research publications.\n", "\n", "* [FLAML: A Fast and Lightweight AutoML Library](https://www.microsoft.com/en-us/research/publication/flaml-a-fast-and-lightweight-automl-library/). Chi Wang, Qingyun Wu, Markus Weimer, Erkang Zhu. MLSys 2021.\n", "\n", "```bibtex\n", "@inproceedings{wang2021flaml,\n", "    title={FLAML: A Fast and Lightweight AutoML Library},\n", "    author={Chi Wang and Qingyun Wu and Markus Weimer and Erkang Zhu},\n", "    year={2021},\n", "    booktitle={MLSys},\n", "}\n", "```\n", "\n", "* [Frugal Optimization for Cost-related Hyperparameters](https://arxiv.org/abs/2005.01571). Qingyun Wu, Chi Wang, Silu Huang. AAAI 2021.\n", "\n", "```bibtex\n", "@inproceedings{wu2021cfo,\n", "    title={Frugal Optimization for Cost-related Hyperparameters},\n", "    author={Qingyun Wu and Chi Wang and Silu Huang},\n", "    year={2021},\n", "    booktitle={AAAI},\n", "}\n", "```\n", "\n", "* [Economical Hyperparameter Optimization With Blended Search Strategy](https://www.microsoft.com/en-us/research/publication/economical-hyperparameter-optimization-with-blended-search-strategy/). Chi Wang, Qingyun Wu, Silu Huang, Amin Saied. 
ICLR 2021.\n", "\n", - "#### best\\_loss\n", - "\n", - "```python\n", - "@property\n", - "def best_loss()\n", + "```bibtex\n", + "@inproceedings{wang2021blendsearch,\n", + " title={Economical Hyperparameter Optimization With Blended Search Strategy},\n", + " author={Chi Wang and Qingyun Wu and Silu Huang and Amin Saied},\n", + " year={2021},\n", + " booktitle={ICLR},\n", + "}\n", "```\n", "\n", - "A float of the best loss found.\n", - "\n", - "#### best\\_result\n", + "* [An Empirical Study on Hyperparameter Optimization for Fine-Tuning Pre-trained Language Models](https://aclanthology.org/2021.acl-long.178.pdf). Susan Xueqing Liu, Chi Wang. ACL 2021.\n", "\n", - "```python\n", - "@property\n", - "def best_result()\n", + "```bibtex\n", + "@inproceedings{liuwang2021hpolm,\n", + " title={An Empirical Study on Hyperparameter Optimization for Fine-Tuning Pre-trained Language Models},\n", + " author={Susan Xueqing Liu and Chi Wang},\n", + " year={2021},\n", + " booktitle={ACL},\n", + "}\n", "```\n", "\n", - "Result dictionary for model trained with the best config.\n", + "* [ChaCha for Online AutoML](https://www.microsoft.com/en-us/research/publication/chacha-for-online-automl/). Qingyun Wu, Chi Wang, John Langford, Paul Mineiro and Marco Rossi. ICML 2021.\n", "\n", - "#### metrics\\_for\\_best\\_config\n", - "\n", - "```python\n", - "@property\n", - "def metrics_for_best_config()\n", + "```bibtex\n", + "@inproceedings{wu2021chacha,\n", + " title={ChaCha for Online AutoML},\n", + " author={Qingyun Wu and Chi Wang and John Langford and Paul Mineiro and Marco Rossi},\n", + " year={2021},\n", + " booktitle={ICML},\n", + "}\n", "```\n", "\n", - "Returns a float of the best loss, and a dictionary of the auxiliary metrics to log\n", - "associated with the best config. These two objects correspond to the returned\n", - "objects by the customized metric function for the config with the best loss.\n", - "\n", - "#### best\\_config\\_train\\_time\n", - " \n", - "- `seed` - int or None, default=None | The random seed for hpo.\n", - "- `n_concurrent_trials` - [Experimental] int, default=1 | The number of\n", - " concurrent trials. When n_concurrent_trials > 1, flaml performes\n", - " [parallel tuning](../../Use-Cases/Task-Oriented-AutoML#parallel-tuning)\n", - " and installation of ray or spark is required: `pip install flaml[ray]`\n", - " or `pip install flaml[spark]`. Please check\n", - " [here](https://spark.apache.org/docs/latest/api/python/getting_started/install.html)\n", - " for more details about installing Spark.\n", - "- `keep_search_state` - boolean, default=False | Whether to keep data needed\n", - " for model search after fit(). By default the state is deleted for\n", - " space saving.\n", - "- `preserve_checkpoint` - boolean, default=True | Whether to preserve the saved checkpoint\n", - " on disk when deleting automl. 
By default the checkpoint is preserved.\n", - "- `early_stop` - boolean, default=False | Whether to stop early if the\n", - " search is considered to converge.\n", - "- `force_cancel` - boolean, default=False | Whether to forcely cancel the PySpark job if overtime.\n", - "- `append_log` - boolean, default=False | Whetehr to directly append the log\n", - " records to the input log file if it exists.\n", - "- `auto_augment` - boolean, default=True | Whether to automatically\n", - " augment rare classes.\n", - "- `min_sample_size` - int, default=MIN_SAMPLE_TRAIN | the minimal sample\n", - " size when sample=True.\n", - "- `use_ray` - boolean or dict.\n", - " If boolean: default=False | Whether to use ray to run the training\n", - " in separate processes. This can be used to prevent OOM for large\n", - " datasets, but will incur more overhead in time.\n", - " If dict: the dict contains the keywords arguments to be passed to\n", - " [ray.tune.run](https://docs.ray.io/en/latest/tune/api_docs/execution.html).\n", - "- `use_spark` - boolean, default=False | Whether to use spark to run the training\n", - " in parallel spark jobs. This can be used to accelerate training on large models\n", - " and large datasets, but will incur more overhead in time and thus slow down\n", - " training in some cases.\n", - "- `free_mem_ratio` - float between 0 and 1, default=0. The free memory ratio to keep during training.\n", - "- `metric_constraints` - list, default=[] | The list of metric constraints.\n", - " Each element in this list is a 3-tuple, which shall be expressed\n", - " in the following format: the first element of the 3-tuple is the name of the\n", - " metric, the second element is the inequality sign chosen from \">=\" and \"<=\",\n", - " and the third element is the constraint value. E.g., `('precision', '>=', 0.9)`.\n", - " Note that all the metric names in metric_constraints need to be reported via\n", - " the metrics_to_log dictionary returned by a customized metric function.\n", - " The customized metric function shall be provided via the `metric` key word argument\n", - " of the fit() function or the automl constructor.\n", - " Find examples in this [test](https://github.com/microsoft/FLAML/tree/main/test/automl/test_constraints.py).\n", - " If `pred_time_limit` is provided as one of keyword arguments to fit() function or\n", - " the automl constructor, flaml will automatically (and under the hood)\n", - " add it as an additional element in the metric_constraints. Essentially 'pred_time_limit'\n", - " specifies a constraint about the prediction latency constraint in seconds.\n", - "- `custom_hp` - dict, default=None | The custom search space specified by user\n", - " Each key is the estimator name, each value is a dict of the custom search space for that estimator. Notice the\n", - " domain of the custom search space can either be a value of a sample.Domain object.\n", - " \n", - " \n", - " \n", - "```python\n", - "custom_hp = {\n", - " \"transformer_ms\": {\n", - " \"model_path\": {\n", - " \"domain\": \"albert-base-v2\",\n", - " },\n", - " \"learning_rate\": {\n", - " \"domain\": tune.choice([1e-4, 1e-5]),\n", - " }\n", - " }\n", + "* [Fair AutoML](https://arxiv.org/abs/2111.06495). Qingyun Wu, Chi Wang. 
ArXiv preprint arXiv:2111.06495 (2021).\n", + "\n", + "```bibtex\n", + "@inproceedings{wuwang2021fairautoml,\n", + " title={Fair AutoML},\n", + " author={Qingyun Wu and Chi Wang},\n", + " year={2021},\n", + " booktitle={ArXiv preprint arXiv:2111.06495},\n", "}\n", "```\n", - "- `time_col` - for a time series task, name of the column containing the timestamps. If not\n", - " provided, defaults to the first column of X_train/X_val\n", - " \n", - "- `cv_score_agg_func` - customized cross-validation scores aggregate function. Default to average metrics across folds. If specificed, this function needs to\n", - " have the following input arguments:\n", - " \n", - " * val_loss_folds: list of floats, the loss scores of each fold;\n", - " * log_metrics_folds: list of dicts/floats, the metrics of each fold to log.\n", - " \n", - " This function should return the final aggregate result of all folds. A float number of the minimization objective, and a dictionary as the metrics to log or None.\n", - " E.g.,\n", - " \n", - "```python\n", - "def cv_score_agg_func(val_loss_folds, log_metrics_folds):\n", - " metric_to_minimize = sum(val_loss_folds)/len(val_loss_folds)\n", - " metrics_to_log = None\n", - " for single_fold in log_metrics_folds:\n", - " if metrics_to_log is None:\n", - " metrics_to_log = single_fold\n", - " elif isinstance(metrics_to_log, dict):\n", - " metrics_to_log = {k: metrics_to_log[k] + v for k, v in single_fold.items()}\n", - " else:\n", - " metrics_to_log += single_fold\n", - " if metrics_to_log:\n", - " n = len(val_loss_folds)\n", - " metrics_to_log = (\n", - " {k: v / n for k, v in metrics_to_log.items()}\n", - " if isinstance(metrics_to_log, dict)\n", - " else metrics_to_log / n\n", - " )\n", - " return metric_to_minimize, metrics_to_log\n", - "```\n", - " \n", - "- `skip_transform` - boolean, default=False | Whether to pre-process data prior to modeling.\n", - "- `mlflow_logging` - boolean, default=None | Whether to log the training results to mlflow.\n", - " Default value is None, which means the logging decision is made based on\n", - " AutoML.__init__'s mlflow_logging argument.\n", - " This requires mlflow to be installed and to have an active mlflow run.\n", - " FLAML will create nested runs.\n", - "- `fit_kwargs_by_estimator` - dict, default=None | The user specified keywords arguments, grouped by estimator name.\n", - " For TransformersEstimator, available fit_kwargs can be found from\n", - " [TrainingArgumentsForAuto](nlp/huggingface/training_args).\n", - " e.g.,\n", - " \n", - "```python\n", - "fit_kwargs_by_estimator = {\n", - " \"transformer\": {\n", - " \"output_dir\": \"test/data/output/\",\n", - " \"fp16\": False,\n", - " },\n", - " \"tft\": {\n", - " \"max_encoder_length\": 1,\n", - " \"min_encoder_length\": 1,\n", - " \"static_categoricals\": [],\n", - " \"static_reals\": [],\n", - " \"time_varying_known_categoricals\": [],\n", - " \"time_varying_known_reals\": [],\n", - " \"time_varying_unknown_categoricals\": [],\n", - " \"time_varying_unknown_reals\": [],\n", - " \"variable_groups\": {},\n", - " \"lags\": {},\n", - " }\n", + "\n", + "* [Mining Robust Default Configurations for Resource-constrained AutoML](https://arxiv.org/abs/2202.09927). Moe Kayali, Chi Wang. 
ArXiv preprint arXiv:2202.09927 (2022).\n", + "\n", + "```bibtex\n", + "@inproceedings{kayaliwang2022default,\n", + " title={Mining Robust Default Configurations for Resource-constrained AutoML},\n", + " author={Moe Kayali and Chi Wang},\n", + " year={2022},\n", + " booktitle={ArXiv preprint arXiv:2202.09927},\n", "}\n", "```\n", - " \n", - "- `**fit_kwargs` - Other key word arguments to pass to fit() function of\n", - " the searched learners, such as sample_weight. Below are a few examples of\n", - " estimator-specific parameters:\n", - "- `period` - int | forecast horizon for all time series forecast tasks.\n", - "- `gpu_per_trial` - float, default = 0 | A float of the number of gpus per trial,\n", - " only used by TransformersEstimator, XGBoostSklearnEstimator, and\n", - " TemporalFusionTransformerEstimator.\n", - "- `group_ids` - list of strings of column names identifying a time series, only\n", - " used by TemporalFusionTransformerEstimator, required for\n", - " 'ts_forecast_panel' task. `group_ids` is a parameter for TimeSeriesDataSet object\n", - " from PyTorchForecasting.\n", - " For other parameters to describe your dataset, refer to\n", - " [TimeSeriesDataSet PyTorchForecasting](https://pytorch-forecasting.readthedocs.io/en/stable/api/pytorch_forecasting.data.timeseries.TimeSeriesDataSet.html).\n", - " To specify your variables, use `static_categoricals`, `static_reals`,\n", - " `time_varying_known_categoricals`, `time_varying_known_reals`,\n", - " `time_varying_unknown_categoricals`, `time_varying_unknown_reals`,\n", - " `variable_groups`. To provide more information on your data, use\n", - " `max_encoder_length`, `min_encoder_length`, `lags`.\n", - "- `log_dir` - str, default = \"lightning_logs\" | Folder into which to log results\n", - " for tensorboard, only used by TemporalFusionTransformerEstimator.\n", - "- `max_epochs` - int, default = 20 | Maximum number of epochs to run training,\n", - " only used by TemporalFusionTransformerEstimator.\n", - "- `batch_size` - int, default = 64 | Batch size for training model, only\n", - " used by TemporalFusionTransformerEstimator.\n", - "\n", - "\n", - " \n", - "```python\n", - "from flaml import BlendSearch\n", - "algo = BlendSearch(metric='val_loss', mode='min',\n", - " space=search_space,\n", - " low_cost_partial_config=low_cost_partial_config)\n", - "for i in range(10):\n", - " analysis = tune.run(compute_with_config,\n", - " search_alg=algo, use_ray=False)\n", - " print(analysis.trials[-1].last_result)\n", - "```\n", - " \n", - "- `verbose` - 0, 1, 2, or 3. If ray or spark backend is used, their verbosity will be\n", - " affected by this argument. 0 = silent, 1 = only status updates,\n", - " 2 = status and brief trial results, 3 = status and detailed trial results.\n", - " Defaults to 2.\n", - "- `local_dir` - A string of the local dir to save ray logs if ray backend is\n", - " used; or a local dir to save the tuning log.\n", - "- `num_samples` - An integer of the number of configs to try. Defaults to 1.\n", - "- `resources_per_trial` - A dictionary of the hardware resources to allocate\n", - " per trial, e.g., `{'cpu': 1}`. It is only valid when using ray backend\n", - " (by setting 'use_ray = True'). 
It shall be used when you need to do\n", - " [parallel tuning](../../Use-Cases/Tune-User-Defined-Function#parallel-tuning).\n", - "- `config_constraints` - A list of config constraints to be satisfied.\n", - " e.g., ```config_constraints = [(mem_size, '<=', 1024**3)]```\n", - " \n", - " mem_size is a function which produces a float number for the bytes\n", - " needed for a config.\n", - " It is used to skip configs which do not fit in memory.\n", - "- `metric_constraints` - A list of metric constraints to be satisfied.\n", - " e.g., `['precision', '>=', 0.9]`. The sign can be \">=\" or \"<=\".\n", - "- `max_failure` - int | the maximal consecutive number of failures to sample\n", - " a trial before the tuning is terminated.\n", - "- `use_ray` - A boolean of whether to use ray as the backend.\n", - "- `use_spark` - A boolean of whether to use spark as the backend.\n", - "- `log_file_name` - A string of the log file name. Default to None.\n", - " When set to None:\n", - " if local_dir is not given, no log file is created;\n", - " if local_dir is given, the log file name will be autogenerated under local_dir.\n", - " Only valid when verbose > 0 or use_ray is True.\n", - "- `lexico_objectives` - dict, default=None | It specifics information needed to perform multi-objective\n", - " optimization with lexicographic preferences. When lexico_objectives is not None, the arguments metric,\n", - " mode, will be invalid, and flaml's tune uses CFO\n", - " as the `search_alg`, which makes the input (if provided) `search_alg' invalid.\n", - " This dictionary shall contain the following fields of key-value pairs:\n", - " - \"metrics\": a list of optimization objectives with the orders reflecting the priorities/preferences of the\n", - " objectives.\n", - " - \"modes\" (optional): a list of optimization modes (each mode either \"min\" or \"max\") corresponding to the\n", - " objectives in the metric list. If not provided, we use \"min\" as the default mode for all the objectives.\n", - " - \"targets\" (optional): a dictionary to specify the optimization targets on the objectives. The keys are the\n", - " metric names (provided in \"metric\"), and the values are the numerical target values.\n", - " - \"tolerances\" (optional): a dictionary to specify the optimality tolerances on objectives. The keys are the metric names (provided in \"metrics\"), and the values are the absolute/percentage tolerance in the form of numeric/string.\n", - " E.g.,\n", - "```python\n", - "lexico_objectives = {\n", - " \"metrics\": [\"error_rate\", \"pred_time\"],\n", - " \"modes\": [\"min\", \"min\"],\n", - " \"tolerances\": {\"error_rate\": 0.01, \"pred_time\": 0.0},\n", - " \"targets\": {\"error_rate\": 0.0},\n", + "\n", + "* [Targeted Hyperparameter Optimization with Lexicographic Preferences Over Multiple Objectives](https://openreview.net/forum?id=0Ij9_q567Ma). Shaokun Zhang, Feiran Jia, Chi Wang, Qingyun Wu. 
ICLR 2023 (notable-top-5%).\n", + "\n", + "```bibtex\n", + "@inproceedings{zhang2023targeted,\n", + " title={Targeted Hyperparameter Optimization with Lexicographic Preferences Over Multiple Objectives},\n", + " author={Shaokun Zhang and Feiran Jia and Chi Wang and Qingyun Wu},\n", + " booktitle={International Conference on Learning Representations},\n", + " year={2023},\n", + " url={https://openreview.net/forum?id=0Ij9_q567Ma},\n", "}\n", "```\n", - " We also support percentage tolerance.\n", - " E.g.,\n", - "```python\n", - "lexico_objectives = {\n", - " \"metrics\": [\"error_rate\", \"pred_time\"],\n", - " \"modes\": [\"min\", \"min\"],\n", - " \"tolerances\": {\"error_rate\": \"5%\", \"pred_time\": \"0%\"},\n", - " \"targets\": {\"error_rate\": 0.0},\n", + "\n", + "* [Cost-Effective Hyperparameter Optimization for Large Language Model Generation Inference](https://arxiv.org/abs/2303.04673). Chi Wang, Susan Xueqing Liu, Ahmed H. Awadallah. ArXiv preprint arXiv:2303.04673 (2023).\n", + "\n", + "```bibtex\n", + "@inproceedings{wang2023EcoOptiGen,\n", + " title={Cost-Effective Hyperparameter Optimization for Large Language Model Generation Inference},\n", + " author={Chi Wang and Susan Xueqing Liu and Ahmed H. Awadallah},\n", + " year={2023},\n", + " booktitle={ArXiv preprint arXiv:2303.04673},\n", "}\n", "```\n", - "- `force_cancel` - boolean, default=False | Whether to forcely cancel the PySpark job if overtime.\n", - "- `n_concurrent_trials` - int, default=0 | The number of concurrent trials when perform hyperparameter\n", - " tuning with Spark. Only valid when use_spark=True and spark is required:\n", - " `pip install flaml[spark]`. Please check\n", - " [here](https://spark.apache.org/docs/latest/api/python/getting_started/install.html)\n", - " for more details about installing Spark. When tune.run() is called from AutoML, it will be\n", - " overwritten by the value of `n_concurrent_trials` in AutoML. When <= 0, the concurrent trials\n", - " will be set to the number of executors.\n", - "- `**ray_args` - keyword arguments to pass to ray.tune.run().\n", - " Only valid when use_ray=True.\n", - "\n", - "## Tuner Objects\n", "\n", - "```python\n", - "class Tuner()\n", + "* [An Empirical Study on Challenging Math Problem Solving with GPT-4](https://arxiv.org/abs/2306.01337). Yiran Wu, Feiran Jia, Shaokun Zhang, Hangyu Li, Erkang Zhu, Yue Wang, Yin Tat Lee, Richard Peng, Qingyun Wu, Chi Wang. 
ArXiv preprint arXiv:2306.01337 (2023).\n", + "\n", + "```bibtex\n", + "@inproceedings{wu2023empirical,\n", + " title={An Empirical Study on Challenging Math Problem Solving with GPT-4},\n", + " author={Yiran Wu and Feiran Jia and Shaokun Zhang and Hangyu Li and Erkang Zhu and Yue Wang and Yin Tat Lee and Richard Peng and Qingyun Wu and Chi Wang},\n", + " year={2023},\n", + " booktitle={ArXiv preprint arXiv:2306.01337},\n", + "}\n", "```\n", "\n", - "Tuner is the class-based way of launching hyperparameter tuning jobs compatible with Ray Tune 2.\n", - "\n", - "**Arguments**:\n", - "\n", - "- `trainable` - A user-defined evaluation function.\n", - " It takes a configuration as input, outputs a evaluation\n", - " result (can be a numerical value or a dictionary of string\n", - " and numerical value pairs) for the input configuration.\n", - " For machine learning tasks, it usually involves training and\n", - " scoring a machine learning model, e.g., through validation loss.\n", - "- `param_space` - Search space of the tuning job.\n", - " One thing to note is that both preprocessor and dataset can be tuned here.\n", - "- `tune_config` - Tuning algorithm specific configs.\n", - " Refer to ray.tune.tune_config.TuneConfig for more info.\n", - "- `run_config` - Runtime configuration that is specific to individual trials.\n", - " If passed, this will overwrite the run config passed to the Trainer,\n", - " if applicable. Refer to ray.air.config.RunConfig for more info.\n", - " \n", - " Usage pattern:\n", - " \n", - " .. code-block:: python\n", - " \n", - " from sklearn.datasets import load_breast_cancer\n", - " \n", - " from ray import tune\n", - " from ray.data import from_pandas\n", - " from ray.air.config import RunConfig, ScalingConfig\n", - " from ray.train.xgboost import XGBoostTrainer\n", - " from ray.tune.tuner import Tuner\n", - " \n", - " def get_dataset():\n", - " data_raw = load_breast_cancer(as_frame=True)\n", - " dataset_df = data_raw[\"data\"]\n", - " dataset_df[\"target\"] = data_raw[\"target\"]\n", - " dataset = from_pandas(dataset_df)\n", - " return dataset\n", - " \n", - " trainer = XGBoostTrainer(\n", - " label_column=\"target\",\n", - " params={},\n", - "- `datasets={\"train\"` - get_dataset()},\n", - " )\n", - " \n", - " param_space = {\n", - "- `\"scaling_config\"` - ScalingConfig(\n", - " num_workers=tune.grid_search([2, 4]),\n", - " resources_per_worker={\n", - "- `\"CPU\"` - tune.grid_search([1, 2]),\n", - " },\n", - " ),\n", - " # You can even grid search various datasets in Tune.\n", - " # \"datasets\": {\n", - " # \"train\": tune.grid_search(\n", - " # [ds1, ds2]\n", - " # ),\n", - " # },\n", - "- `\"params\"` - {\n", - "- `\"objective\"` - \"binary:logistic\",\n", - "- `\"tree_method\"` - \"approx\",\n", - "- `\"eval_metric\"` - [\"logloss\", \"error\"],\n", - "- `\"eta\"` - tune.loguniform(1e-4, 1e-1),\n", - "- `\"subsample\"` - tune.uniform(0.5, 1.0),\n", - "- `\"max_depth\"` - tune.randint(1, 9),\n", - " },\n", - " }\n", - " tuner = Tuner(trainable=trainer, param_space=param_space,\n", - " run_config=RunConfig(name=\"my_tune_run\"))\n", - " analysis = tuner.fit()\n", - " \n", - " To retry a failed tune run, you can then do\n", - " \n", - " .. 
code-block:: python\n", - " \n", - " tuner = Tuner.restore(experiment_checkpoint_dir)\n", - " tuner.fit()\n", - " \n", - " ``experiment_checkpoint_dir`` can be easily located near the end of the\n", - " console output of your first failed run.\n", - "\n", - "\n", - "\n", - "\n", - "\u001b[32mAdding doc_id doc_40 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_15 to context.\u001b[0m\n", + "\n", + "\n", + "\u001b[32mAdding doc_id doc_1 to context.\u001b[0m\n", "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", "\n", "You're a retrieve augmented coding assistant. You answer user's questions based on your own knowledge and the\n", @@ -2602,560 +1638,247 @@ "\n", "User's question is: Is there a function named `tune_automl` in FLAML?\n", "\n", - "Context is: \n", - "- `seed` - int or None, default=None | The random seed for hpo.\n", - "- `n_concurrent_trials` - [Experimental] int, default=1 | The number of\n", - " concurrent trials. When n_concurrent_trials > 1, flaml performes\n", - " [parallel tuning](../../Use-Cases/Task-Oriented-AutoML#parallel-tuning)\n", - " and installation of ray or spark is required: `pip install flaml[ray]`\n", - " or `pip install flaml[spark]`. Please check\n", - " [here](https://spark.apache.org/docs/latest/api/python/getting_started/install.html)\n", - " for more details about installing Spark.\n", - "- `keep_search_state` - boolean, default=False | Whether to keep data needed\n", - " for model search after fit(). By default the state is deleted for\n", - " space saving.\n", - "- `preserve_checkpoint` - boolean, default=True | Whether to preserve the saved checkpoint\n", - " on disk when deleting automl. By default the checkpoint is preserved.\n", - "- `early_stop` - boolean, default=False | Whether to stop early if the\n", - " search is considered to converge.\n", - "- `force_cancel` - boolean, default=False | Whether to forcely cancel Spark jobs if the\n", - " search time exceeded the time budget.\n", - "- `append_log` - boolean, default=False | Whetehr to directly append the log\n", - " records to the input log file if it exists.\n", - "- `auto_augment` - boolean, default=True | Whether to automatically\n", - " augment rare classes.\n", - "- `min_sample_size` - int, default=MIN_SAMPLE_TRAIN | the minimal sample\n", - " size when sample=True.\n", - "- `use_ray` - boolean or dict.\n", - " If boolean: default=False | Whether to use ray to run the training\n", - " in separate processes. This can be used to prevent OOM for large\n", - " datasets, but will incur more overhead in time.\n", - " If dict: the dict contains the keywords arguments to be passed to\n", - " [ray.tune.run](https://docs.ray.io/en/latest/tune/api_docs/execution.html).\n", - "- `use_spark` - boolean, default=False | Whether to use spark to run the training\n", - " in parallel spark jobs. This can be used to accelerate training on large models\n", - " and large datasets, but will incur more overhead in time and thus slow down\n", - " training in some cases. GPU training is not supported yet when use_spark is True.\n", - " For Spark clusters, by default, we will launch one trial per executor. However,\n", - " sometimes we want to launch more trials than the number of executors (e.g., local mode).\n", - " In this case, we can set the environment variable `FLAML_MAX_CONCURRENT` to override\n", - " the detected `num_executors`. 
The final number of concurrent trials will be the minimum\n", - " of `n_concurrent_trials` and `num_executors`.\n", - "- `free_mem_ratio` - float between 0 and 1, default=0. The free memory ratio to keep during training.\n", - "- `metric_constraints` - list, default=[] | The list of metric constraints.\n", - " Each element in this list is a 3-tuple, which shall be expressed\n", - " in the following format: the first element of the 3-tuple is the name of the\n", - " metric, the second element is the inequality sign chosen from \">=\" and \"<=\",\n", - " and the third element is the constraint value. E.g., `('val_loss', '<=', 0.1)`.\n", - " Note that all the metric names in metric_constraints need to be reported via\n", - " the metrics_to_log dictionary returned by a customized metric function.\n", - " The customized metric function shall be provided via the `metric` key word\n", - " argument of the fit() function or the automl constructor.\n", - " Find an example in the 4th constraint type in this [doc](../../Use-Cases/Task-Oriented-AutoML#constraint).\n", - " If `pred_time_limit` is provided as one of keyword arguments to fit() function or\n", - " the automl constructor, flaml will automatically (and under the hood)\n", - " add it as an additional element in the metric_constraints. Essentially 'pred_time_limit'\n", - " specifies a constraint about the prediction latency constraint in seconds.\n", - "- `custom_hp` - dict, default=None | The custom search space specified by user.\n", - " It is a nested dict with keys being the estimator names, and values being dicts\n", - " per estimator search space. In the per estimator search space dict,\n", - " the keys are the hyperparameter names, and values are dicts of info (\"domain\",\n", - " \"init_value\", and \"low_cost_init_value\") about the search space associated with\n", - " the hyperparameter (i.e., per hyperparameter search space dict). When custom_hp\n", - " is provided, the built-in search space which is also a nested dict of per estimator\n", - " search space dict, will be updated with custom_hp. Note that during this nested dict update,\n", - " the per hyperparameter search space dicts will be replaced (instead of updated) by the ones\n", - " provided in custom_hp. 
Note that the value for \"domain\" can either be a constant\n", - " or a sample.Domain object.\n", - " e.g.,\n", - " \n", - "```python\n", - "custom_hp = {\n", - " \"transformer_ms\": {\n", - " \"model_path\": {\n", - " \"domain\": \"albert-base-v2\",\n", - " },\n", - " \"learning_rate\": {\n", - " \"domain\": tune.choice([1e-4, 1e-5]),\n", - " }\n", - " }\n", - " }\n", - "```\n", - "- `skip_transform` - boolean, default=False | Whether to pre-process data prior to modeling.\n", - "- `fit_kwargs_by_estimator` - dict, default=None | The user specified keywords arguments, grouped by estimator name.\n", - " e.g.,\n", - " \n", - "```python\n", - "fit_kwargs_by_estimator = {\n", - " \"transformer\": {\n", - " \"output_dir\": \"test/data/output/\",\n", - " \"fp16\": False,\n", - " }\n", - "}\n", - "```\n", - "- `mlflow_logging` - boolean, default=True | Whether to log the training results to mlflow.\n", - " This requires mlflow to be installed and to have an active mlflow run.\n", - " FLAML will create nested runs.\n", - "\n", - "#### config\\_history\n", - "\n", - "```python\n", - "@property\n", - "def config_history() -> dict\n", - "```\n", + "Context is: # Integrate - Spark\n", "\n", - "A dictionary of iter->(estimator, config, time),\n", - "storing the best estimator, config, and the time when the best\n", - "model is updated each time.\n", + "FLAML has integrated Spark for distributed training. There are two main aspects of integration with Spark:\n", + "- Use Spark ML estimators for AutoML.\n", + "- Use Spark to run training in parallel spark jobs.\n", "\n", - "#### model\n", + "## Spark ML Estimators\n", "\n", - "```python\n", - "@property\n", - "def model()\n", - "```\n", + "FLAML integrates estimators based on Spark ML models. These models are trained in parallel using Spark, so we called them Spark estimators. To use these models, you first need to organize your data in the required format.\n", "\n", - "An object with `predict()` and `predict_proba()` method (for\n", - "classification), storing the best trained model.\n", + "### Data\n", "\n", - "#### best\\_model\\_for\\_estimator\n", + "For Spark estimators, AutoML only consumes Spark data. FLAML provides a convenient function `to_pandas_on_spark` in the `flaml.automl.spark.utils` module to convert your data into a pandas-on-spark (`pyspark.pandas`) dataframe/series, which Spark estimators require.\n", "\n", - "```python\n", - "def best_model_for_estimator(estimator_name: str)\n", - "```\n", + "This utility function takes data in the form of a `pandas.Dataframe` or `pyspark.sql.Dataframe` and converts it into a pandas-on-spark dataframe. It also takes `pandas.Series` or `pyspark.sql.Dataframe` and converts it into a [pandas-on-spark](https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/index.html) series. If you pass in a `pyspark.pandas.Dataframe`, it will not make any changes.\n", "\n", - "Return the best model found for a particular estimator.\n", + "This function also accepts optional arguments `index_col` and `default_index_type`.\n", + "- `index_col` is the column name to use as the index, default is None.\n", + "- `default_index_type` is the default index type, default is \"distributed-sequence\". 
More info about default index type can be found in the official Spark [documentation](https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/options.html#default-index-type).\n",
+ "\n",
+ "Here is an example code snippet for Spark Data:\n",
+ "\n",
+ "```python\n",
+ "import pandas as pd\n",
+ "from flaml.automl.spark.utils import to_pandas_on_spark\n",
+ "# Creating a dictionary\n",
+ "data = {\"Square_Feet\": [800, 1200, 1800, 1500, 850],\n",
+ " \"Age_Years\": [20, 15, 10, 7, 25],\n",
+ " \"Price\": [100000, 200000, 300000, 240000, 120000]}\n",
+ "\n",
+ "# Creating a pandas DataFrame\n",
+ "dataframe = pd.DataFrame(data)\n",
+ "label = \"Price\"\n",
+ "\n",
+ "# Convert to pandas-on-spark dataframe\n",
+ "psdf = to_pandas_on_spark(dataframe)\n",
+ "```\n",
+ "\n",
+ "To use Spark ML models you need to format your data appropriately. Specifically, use [`VectorAssembler`](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.VectorAssembler.html) to merge all feature columns into a single vector column.\n",
+ "\n",
+ "Here is an example of how to use it:\n",
+ "```python\n",
+ "from pyspark.ml.feature import VectorAssembler\n",
+ "columns = psdf.columns\n",
+ "feature_cols = [col for col in columns if col != label]\n",
+ "featurizer = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n",
+ "psdf = featurizer.transform(psdf.to_spark(index_col=\"index\"))[\"index\", \"features\"]\n",
+ "```\n",
+ "\n",
+ "Later in conducting the experiment, use your pandas-on-spark data like non-spark data and pass them using `X_train, y_train` or `dataframe, label`.\n",
+ "\n",
+ "### Estimators\n",
+ "#### Model List\n",
+ "- `lgbm_spark`: The class for fine-tuning Spark version LightGBM models, using [SynapseML](https://microsoft.github.io/SynapseML/docs/features/lightgbm/about/) API.\n",
+ "\n",
+ "#### Usage\n",
+ "First, prepare your data in the required format as described in the previous section.\n",
+ "\n",
+ "By including the models you intend to try in the `estimator_list` argument to `flaml.automl`, FLAML will start trying configurations for these models. 
If your input is Spark data, FLAML will also use estimators with the `_spark` postfix by default, even if you haven't specified them.\n",
+ "\n",
+ "Here is an example code snippet using SparkML models in AutoML:\n",
+ "\n",
+ "```python\n",
+ "import flaml\n",
+ "# prepare your data in pandas-on-spark format as we previously mentioned\n",
+ "\n",
+ "automl = flaml.AutoML()\n",
+ "settings = {\n",
+ " \"time_budget\": 30,\n",
+ " \"metric\": \"r2\",\n",
+ " \"estimator_list\": [\"lgbm_spark\"], # this setting is optional\n",
+ " \"task\": \"regression\",\n",
+ "}\n",
+ "\n",
+ "automl.fit(\n",
+ " dataframe=psdf,\n",
+ " label=label,\n",
+ " **settings,\n",
+ ")\n",
+ "```\n",
+ "\n",
+ "\n",
+ "[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/automl_bankrupt_synapseml.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/automl_bankrupt_synapseml.ipynb)\n",
+ "\n",
+ "## Parallel Spark Jobs\n",
+ "You can activate Spark as the parallel backend during parallel tuning in both [AutoML](/docs/Use-Cases/Task-Oriented-AutoML#parallel-tuning) and [Hyperparameter Tuning](/docs/Use-Cases/Tune-User-Defined-Function#parallel-tuning) by setting `use_spark` to `true`. FLAML will dispatch your job to the distributed Spark backend using [`joblib-spark`](https://github.com/joblib/joblib-spark).\n",
+ "\n",
+ "Please note that you should not set `use_spark` to `true` when applying AutoML and Tuning for Spark Data. This is because only SparkML models will be used for Spark Data in AutoML and Tuning. As SparkML models run in parallel, there is no need to distribute them with `use_spark` again.\n",
+ "\n",
+ "All the Spark-related arguments are stated below. These arguments are available in both Hyperparameter Tuning and AutoML:\n",
+ "\n",
+ "\n",
+ "- `use_spark`: boolean, default=False | Whether to use spark to run the training in parallel spark jobs. This can be used to accelerate training on large models and large datasets, but will incur more overhead in time and thus slow down training in some cases. GPU training is not supported yet when use_spark is True. For Spark clusters, by default, we will launch one trial per executor. However, sometimes we want to launch more trials than the number of executors (e.g., local mode). In this case, we can set the environment variable `FLAML_MAX_CONCURRENT` to override the detected `num_executors`. The final number of concurrent trials will be the minimum of `n_concurrent_trials` and `num_executors`.\n",
+ "- `n_concurrent_trials`: int, default=1 | The number of concurrent trials. When n_concurrent_trials > 1, FLAML performs parallel tuning.\n",
+ "- `force_cancel`: boolean, default=False | Whether to forcibly cancel Spark jobs if the search time exceeds the time budget. 
Spark jobs include parallel tuning jobs and Spark-based model training jobs.\n",
+ "\n",
+ "An example code snippet for using parallel Spark jobs:\n",
+ "```python\n",
+ "import flaml\n",
+ "automl_experiment = flaml.AutoML()\n",
+ "automl_settings = {\n",
+ " \"time_budget\": 30,\n",
+ " \"metric\": \"r2\",\n",
+ " \"task\": \"regression\",\n",
+ " \"n_concurrent_trials\": 2,\n",
+ " \"use_spark\": True,\n",
+ " \"force_cancel\": True, # Activating the force_cancel option can immediately halt Spark jobs once they exceed the allocated time_budget.\n",
+ "}\n",
+ "\n",
+ "automl_experiment.fit(\n",
+ " dataframe=dataframe,\n",
+ " label=label,\n",
+ " **automl_settings,\n",
+ ")\n",
+ "```\n",
+ "\n",
+ "\n",
+ "[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/integrate_spark.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/integrate_spark.ipynb)\n",
+ "\n",
+ "# Research\n",
+ "\n",
+ "For technical details, please check our research publications.\n",
+ "\n",
+ "* [FLAML: A Fast and Lightweight AutoML Library](https://www.microsoft.com/en-us/research/publication/flaml-a-fast-and-lightweight-automl-library/). Chi Wang, Qingyun Wu, Markus Weimer, Erkang Zhu. MLSys 2021.\n",
+ "\n",
+ "```bibtex\n",
+ "@inproceedings{wang2021flaml,\n",
+ " title={FLAML: A Fast and Lightweight AutoML Library},\n",
+ " author={Chi Wang and Qingyun Wu and Markus Weimer and Erkang Zhu},\n",
+ " year={2021},\n",
+ " booktitle={MLSys},\n",
+ "}\n",
+ "```\n",
+ "\n",
- "#### best\_config\_train\_time\n",
- " \n",
- "- `seed` - int or None, default=None | The random seed for hpo.\n",
- "- `n_concurrent_trials` - [Experimental] int, default=1 | The number of\n",
- " concurrent trials. When n_concurrent_trials > 1, flaml performes\n",
- " [parallel tuning](../../Use-Cases/Task-Oriented-AutoML#parallel-tuning)\n",
- " and installation of ray or spark is required: `pip install flaml[ray]`\n",
- " or `pip install flaml[spark]`. Please check\n",
- " [here](https://spark.apache.org/docs/latest/api/python/getting_started/install.html)\n",
- " for more details about installing Spark.\n",
- "- `keep_search_state` - boolean, default=False | Whether to keep data needed\n",
- " for model search after fit(). By default the state is deleted for\n",
- " space saving.\n",
- "- `preserve_checkpoint` - boolean, default=True | Whether to preserve the saved checkpoint\n",
- " on disk when deleting automl. 
By default the checkpoint is preserved.\n", - "- `early_stop` - boolean, default=False | Whether to stop early if the\n", - " search is considered to converge.\n", - "- `force_cancel` - boolean, default=False | Whether to forcely cancel the PySpark job if overtime.\n", - "- `append_log` - boolean, default=False | Whetehr to directly append the log\n", - " records to the input log file if it exists.\n", - "- `auto_augment` - boolean, default=True | Whether to automatically\n", - " augment rare classes.\n", - "- `min_sample_size` - int, default=MIN_SAMPLE_TRAIN | the minimal sample\n", - " size when sample=True.\n", - "- `use_ray` - boolean or dict.\n", - " If boolean: default=False | Whether to use ray to run the training\n", - " in separate processes. This can be used to prevent OOM for large\n", - " datasets, but will incur more overhead in time.\n", - " If dict: the dict contains the keywords arguments to be passed to\n", - " [ray.tune.run](https://docs.ray.io/en/latest/tune/api_docs/execution.html).\n", - "- `use_spark` - boolean, default=False | Whether to use spark to run the training\n", - " in parallel spark jobs. This can be used to accelerate training on large models\n", - " and large datasets, but will incur more overhead in time and thus slow down\n", - " training in some cases.\n", - "- `free_mem_ratio` - float between 0 and 1, default=0. The free memory ratio to keep during training.\n", - "- `metric_constraints` - list, default=[] | The list of metric constraints.\n", - " Each element in this list is a 3-tuple, which shall be expressed\n", - " in the following format: the first element of the 3-tuple is the name of the\n", - " metric, the second element is the inequality sign chosen from \">=\" and \"<=\",\n", - " and the third element is the constraint value. E.g., `('precision', '>=', 0.9)`.\n", - " Note that all the metric names in metric_constraints need to be reported via\n", - " the metrics_to_log dictionary returned by a customized metric function.\n", - " The customized metric function shall be provided via the `metric` key word argument\n", - " of the fit() function or the automl constructor.\n", - " Find examples in this [test](https://github.com/microsoft/FLAML/tree/main/test/automl/test_constraints.py).\n", - " If `pred_time_limit` is provided as one of keyword arguments to fit() function or\n", - " the automl constructor, flaml will automatically (and under the hood)\n", - " add it as an additional element in the metric_constraints. Essentially 'pred_time_limit'\n", - " specifies a constraint about the prediction latency constraint in seconds.\n", - "- `custom_hp` - dict, default=None | The custom search space specified by user\n", - " Each key is the estimator name, each value is a dict of the custom search space for that estimator. Notice the\n", - " domain of the custom search space can either be a value of a sample.Domain object.\n", - " \n", - " \n", - " \n", - "```python\n", - "custom_hp = {\n", - " \"transformer_ms\": {\n", - " \"model_path\": {\n", - " \"domain\": \"albert-base-v2\",\n", - " },\n", - " \"learning_rate\": {\n", - " \"domain\": tune.choice([1e-4, 1e-5]),\n", - " }\n", - " }\n", + "* [Frugal Optimization for Cost-related Hyperparameters](https://arxiv.org/abs/2005.01571). Qingyun Wu, Chi Wang, Silu Huang. 
AAAI 2021.\n", + "\n", + "```bibtex\n", + "@inproceedings{wu2021cfo,\n", + " title={Frugal Optimization for Cost-related Hyperparameters},\n", + " author={Qingyun Wu and Chi Wang and Silu Huang},\n", + " year={2021},\n", + " booktitle={AAAI},\n", "}\n", "```\n", - "- `time_col` - for a time series task, name of the column containing the timestamps. If not\n", - " provided, defaults to the first column of X_train/X_val\n", - " \n", - "- `cv_score_agg_func` - customized cross-validation scores aggregate function. Default to average metrics across folds. If specificed, this function needs to\n", - " have the following input arguments:\n", - " \n", - " * val_loss_folds: list of floats, the loss scores of each fold;\n", - " * log_metrics_folds: list of dicts/floats, the metrics of each fold to log.\n", - " \n", - " This function should return the final aggregate result of all folds. A float number of the minimization objective, and a dictionary as the metrics to log or None.\n", - " E.g.,\n", - " \n", - "```python\n", - "def cv_score_agg_func(val_loss_folds, log_metrics_folds):\n", - " metric_to_minimize = sum(val_loss_folds)/len(val_loss_folds)\n", - " metrics_to_log = None\n", - " for single_fold in log_metrics_folds:\n", - " if metrics_to_log is None:\n", - " metrics_to_log = single_fold\n", - " elif isinstance(metrics_to_log, dict):\n", - " metrics_to_log = {k: metrics_to_log[k] + v for k, v in single_fold.items()}\n", - " else:\n", - " metrics_to_log += single_fold\n", - " if metrics_to_log:\n", - " n = len(val_loss_folds)\n", - " metrics_to_log = (\n", - " {k: v / n for k, v in metrics_to_log.items()}\n", - " if isinstance(metrics_to_log, dict)\n", - " else metrics_to_log / n\n", - " )\n", - " return metric_to_minimize, metrics_to_log\n", - "```\n", - " \n", - "- `skip_transform` - boolean, default=False | Whether to pre-process data prior to modeling.\n", - "- `mlflow_logging` - boolean, default=None | Whether to log the training results to mlflow.\n", - " Default value is None, which means the logging decision is made based on\n", - " AutoML.__init__'s mlflow_logging argument.\n", - " This requires mlflow to be installed and to have an active mlflow run.\n", - " FLAML will create nested runs.\n", - "- `fit_kwargs_by_estimator` - dict, default=None | The user specified keywords arguments, grouped by estimator name.\n", - " For TransformersEstimator, available fit_kwargs can be found from\n", - " [TrainingArgumentsForAuto](nlp/huggingface/training_args).\n", - " e.g.,\n", - " \n", - "```python\n", - "fit_kwargs_by_estimator = {\n", - " \"transformer\": {\n", - " \"output_dir\": \"test/data/output/\",\n", - " \"fp16\": False,\n", - " },\n", - " \"tft\": {\n", - " \"max_encoder_length\": 1,\n", - " \"min_encoder_length\": 1,\n", - " \"static_categoricals\": [],\n", - " \"static_reals\": [],\n", - " \"time_varying_known_categoricals\": [],\n", - " \"time_varying_known_reals\": [],\n", - " \"time_varying_unknown_categoricals\": [],\n", - " \"time_varying_unknown_reals\": [],\n", - " \"variable_groups\": {},\n", - " \"lags\": {},\n", - " }\n", + "\n", + "* [Economical Hyperparameter Optimization With Blended Search Strategy](https://www.microsoft.com/en-us/research/publication/economical-hyperparameter-optimization-with-blended-search-strategy/). Chi Wang, Qingyun Wu, Silu Huang, Amin Saied. 
ICLR 2021.\n", + "\n", + "```bibtex\n", + "@inproceedings{wang2021blendsearch,\n", + " title={Economical Hyperparameter Optimization With Blended Search Strategy},\n", + " author={Chi Wang and Qingyun Wu and Silu Huang and Amin Saied},\n", + " year={2021},\n", + " booktitle={ICLR},\n", "}\n", "```\n", - " \n", - "- `**fit_kwargs` - Other key word arguments to pass to fit() function of\n", - " the searched learners, such as sample_weight. Below are a few examples of\n", - " estimator-specific parameters:\n", - "- `period` - int | forecast horizon for all time series forecast tasks.\n", - "- `gpu_per_trial` - float, default = 0 | A float of the number of gpus per trial,\n", - " only used by TransformersEstimator, XGBoostSklearnEstimator, and\n", - " TemporalFusionTransformerEstimator.\n", - "- `group_ids` - list of strings of column names identifying a time series, only\n", - " used by TemporalFusionTransformerEstimator, required for\n", - " 'ts_forecast_panel' task. `group_ids` is a parameter for TimeSeriesDataSet object\n", - " from PyTorchForecasting.\n", - " For other parameters to describe your dataset, refer to\n", - " [TimeSeriesDataSet PyTorchForecasting](https://pytorch-forecasting.readthedocs.io/en/stable/api/pytorch_forecasting.data.timeseries.TimeSeriesDataSet.html).\n", - " To specify your variables, use `static_categoricals`, `static_reals`,\n", - " `time_varying_known_categoricals`, `time_varying_known_reals`,\n", - " `time_varying_unknown_categoricals`, `time_varying_unknown_reals`,\n", - " `variable_groups`. To provide more information on your data, use\n", - " `max_encoder_length`, `min_encoder_length`, `lags`.\n", - "- `log_dir` - str, default = \"lightning_logs\" | Folder into which to log results\n", - " for tensorboard, only used by TemporalFusionTransformerEstimator.\n", - "- `max_epochs` - int, default = 20 | Maximum number of epochs to run training,\n", - " only used by TemporalFusionTransformerEstimator.\n", - "- `batch_size` - int, default = 64 | Batch size for training model, only\n", - " used by TemporalFusionTransformerEstimator.\n", - "\n", - "\n", - " \n", - "```python\n", - "from flaml import BlendSearch\n", - "algo = BlendSearch(metric='val_loss', mode='min',\n", - " space=search_space,\n", - " low_cost_partial_config=low_cost_partial_config)\n", - "for i in range(10):\n", - " analysis = tune.run(compute_with_config,\n", - " search_alg=algo, use_ray=False)\n", - " print(analysis.trials[-1].last_result)\n", + "\n", + "* [An Empirical Study on Hyperparameter Optimization for Fine-Tuning Pre-trained Language Models](https://aclanthology.org/2021.acl-long.178.pdf). Susan Xueqing Liu, Chi Wang. ACL 2021.\n", + "\n", + "```bibtex\n", + "@inproceedings{liuwang2021hpolm,\n", + " title={An Empirical Study on Hyperparameter Optimization for Fine-Tuning Pre-trained Language Models},\n", + " author={Susan Xueqing Liu and Chi Wang},\n", + " year={2021},\n", + " booktitle={ACL},\n", + "}\n", "```\n", - " \n", - "- `verbose` - 0, 1, 2, or 3. If ray or spark backend is used, their verbosity will be\n", - " affected by this argument. 0 = silent, 1 = only status updates,\n", - " 2 = status and brief trial results, 3 = status and detailed trial results.\n", - " Defaults to 2.\n", - "- `local_dir` - A string of the local dir to save ray logs if ray backend is\n", - " used; or a local dir to save the tuning log.\n", - "- `num_samples` - An integer of the number of configs to try. 
Defaults to 1.\n", - "- `resources_per_trial` - A dictionary of the hardware resources to allocate\n", - " per trial, e.g., `{'cpu': 1}`. It is only valid when using ray backend\n", - " (by setting 'use_ray = True'). It shall be used when you need to do\n", - " [parallel tuning](../../Use-Cases/Tune-User-Defined-Function#parallel-tuning).\n", - "- `config_constraints` - A list of config constraints to be satisfied.\n", - " e.g., ```config_constraints = [(mem_size, '<=', 1024**3)]```\n", - " \n", - " mem_size is a function which produces a float number for the bytes\n", - " needed for a config.\n", - " It is used to skip configs which do not fit in memory.\n", - "- `metric_constraints` - A list of metric constraints to be satisfied.\n", - " e.g., `['precision', '>=', 0.9]`. The sign can be \">=\" or \"<=\".\n", - "- `max_failure` - int | the maximal consecutive number of failures to sample\n", - " a trial before the tuning is terminated.\n", - "- `use_ray` - A boolean of whether to use ray as the backend.\n", - "- `use_spark` - A boolean of whether to use spark as the backend.\n", - "- `log_file_name` - A string of the log file name. Default to None.\n", - " When set to None:\n", - " if local_dir is not given, no log file is created;\n", - " if local_dir is given, the log file name will be autogenerated under local_dir.\n", - " Only valid when verbose > 0 or use_ray is True.\n", - "- `lexico_objectives` - dict, default=None | It specifics information needed to perform multi-objective\n", - " optimization with lexicographic preferences. When lexico_objectives is not None, the arguments metric,\n", - " mode, will be invalid, and flaml's tune uses CFO\n", - " as the `search_alg`, which makes the input (if provided) `search_alg' invalid.\n", - " This dictionary shall contain the following fields of key-value pairs:\n", - " - \"metrics\": a list of optimization objectives with the orders reflecting the priorities/preferences of the\n", - " objectives.\n", - " - \"modes\" (optional): a list of optimization modes (each mode either \"min\" or \"max\") corresponding to the\n", - " objectives in the metric list. If not provided, we use \"min\" as the default mode for all the objectives.\n", - " - \"targets\" (optional): a dictionary to specify the optimization targets on the objectives. The keys are the\n", - " metric names (provided in \"metric\"), and the values are the numerical target values.\n", - " - \"tolerances\" (optional): a dictionary to specify the optimality tolerances on objectives. The keys are the metric names (provided in \"metrics\"), and the values are the absolute/percentage tolerance in the form of numeric/string.\n", - " E.g.,\n", - "```python\n", - "lexico_objectives = {\n", - " \"metrics\": [\"error_rate\", \"pred_time\"],\n", - " \"modes\": [\"min\", \"min\"],\n", - " \"tolerances\": {\"error_rate\": 0.01, \"pred_time\": 0.0},\n", - " \"targets\": {\"error_rate\": 0.0},\n", + "\n", + "* [ChaCha for Online AutoML](https://www.microsoft.com/en-us/research/publication/chacha-for-online-automl/). Qingyun Wu, Chi Wang, John Langford, Paul Mineiro and Marco Rossi. 
ICML 2021.\n", + "\n", + "```bibtex\n", + "@inproceedings{wu2021chacha,\n", + " title={ChaCha for Online AutoML},\n", + " author={Qingyun Wu and Chi Wang and John Langford and Paul Mineiro and Marco Rossi},\n", + " year={2021},\n", + " booktitle={ICML},\n", "}\n", "```\n", - " We also support percentage tolerance.\n", - " E.g.,\n", - "```python\n", - "lexico_objectives = {\n", - " \"metrics\": [\"error_rate\", \"pred_time\"],\n", - " \"modes\": [\"min\", \"min\"],\n", - " \"tolerances\": {\"error_rate\": \"5%\", \"pred_time\": \"0%\"},\n", - " \"targets\": {\"error_rate\": 0.0},\n", + "\n", + "* [Fair AutoML](https://arxiv.org/abs/2111.06495). Qingyun Wu, Chi Wang. ArXiv preprint arXiv:2111.06495 (2021).\n", + "\n", + "```bibtex\n", + "@inproceedings{wuwang2021fairautoml,\n", + " title={Fair AutoML},\n", + " author={Qingyun Wu and Chi Wang},\n", + " year={2021},\n", + " booktitle={ArXiv preprint arXiv:2111.06495},\n", "}\n", "```\n", - "- `force_cancel` - boolean, default=False | Whether to forcely cancel the PySpark job if overtime.\n", - "- `n_concurrent_trials` - int, default=0 | The number of concurrent trials when perform hyperparameter\n", - " tuning with Spark. Only valid when use_spark=True and spark is required:\n", - " `pip install flaml[spark]`. Please check\n", - " [here](https://spark.apache.org/docs/latest/api/python/getting_started/install.html)\n", - " for more details about installing Spark. When tune.run() is called from AutoML, it will be\n", - " overwritten by the value of `n_concurrent_trials` in AutoML. When <= 0, the concurrent trials\n", - " will be set to the number of executors.\n", - "- `**ray_args` - keyword arguments to pass to ray.tune.run().\n", - " Only valid when use_ray=True.\n", - "\n", - "## Tuner Objects\n", "\n", - "```python\n", - "class Tuner()\n", + "* [Mining Robust Default Configurations for Resource-constrained AutoML](https://arxiv.org/abs/2202.09927). Moe Kayali, Chi Wang. ArXiv preprint arXiv:2202.09927 (2022).\n", + "\n", + "```bibtex\n", + "@inproceedings{kayaliwang2022default,\n", + " title={Mining Robust Default Configurations for Resource-constrained AutoML},\n", + " author={Moe Kayali and Chi Wang},\n", + " year={2022},\n", + " booktitle={ArXiv preprint arXiv:2202.09927},\n", + "}\n", "```\n", "\n", - "Tuner is the class-based way of launching hyperparameter tuning jobs compatible with Ray Tune 2.\n", - "\n", - "**Arguments**:\n", - "\n", - "- `trainable` - A user-defined evaluation function.\n", - " It takes a configuration as input, outputs a evaluation\n", - " result (can be a numerical value or a dictionary of string\n", - " and numerical value pairs) for the input configuration.\n", - " For machine learning tasks, it usually involves training and\n", - " scoring a machine learning model, e.g., through validation loss.\n", - "- `param_space` - Search space of the tuning job.\n", - " One thing to note is that both preprocessor and dataset can be tuned here.\n", - "- `tune_config` - Tuning algorithm specific configs.\n", - " Refer to ray.tune.tune_config.TuneConfig for more info.\n", - "- `run_config` - Runtime configuration that is specific to individual trials.\n", - " If passed, this will overwrite the run config passed to the Trainer,\n", - " if applicable. Refer to ray.air.config.RunConfig for more info.\n", - " \n", - " Usage pattern:\n", - " \n", - " .. 
code-block:: python\n", - " \n", - " from sklearn.datasets import load_breast_cancer\n", - " \n", - " from ray import tune\n", - " from ray.data import from_pandas\n", - " from ray.air.config import RunConfig, ScalingConfig\n", - " from ray.train.xgboost import XGBoostTrainer\n", - " from ray.tune.tuner import Tuner\n", - " \n", - " def get_dataset():\n", - " data_raw = load_breast_cancer(as_frame=True)\n", - " dataset_df = data_raw[\"data\"]\n", - " dataset_df[\"target\"] = data_raw[\"target\"]\n", - " dataset = from_pandas(dataset_df)\n", - " return dataset\n", - " \n", - " trainer = XGBoostTrainer(\n", - " label_column=\"target\",\n", - " params={},\n", - "- `datasets={\"train\"` - get_dataset()},\n", - " )\n", - " \n", - " param_space = {\n", - "- `\"scaling_config\"` - ScalingConfig(\n", - " num_workers=tune.grid_search([2, 4]),\n", - " resources_per_worker={\n", - "- `\"CPU\"` - tune.grid_search([1, 2]),\n", - " },\n", - " ),\n", - " # You can even grid search various datasets in Tune.\n", - " # \"datasets\": {\n", - " # \"train\": tune.grid_search(\n", - " # [ds1, ds2]\n", - " # ),\n", - " # },\n", - "- `\"params\"` - {\n", - "- `\"objective\"` - \"binary:logistic\",\n", - "- `\"tree_method\"` - \"approx\",\n", - "- `\"eval_metric\"` - [\"logloss\", \"error\"],\n", - "- `\"eta\"` - tune.loguniform(1e-4, 1e-1),\n", - "- `\"subsample\"` - tune.uniform(0.5, 1.0),\n", - "- `\"max_depth\"` - tune.randint(1, 9),\n", - " },\n", - " }\n", - " tuner = Tuner(trainable=trainer, param_space=param_space,\n", - " run_config=RunConfig(name=\"my_tune_run\"))\n", - " analysis = tuner.fit()\n", - " \n", - " To retry a failed tune run, you can then do\n", - " \n", - " .. code-block:: python\n", - " \n", - " tuner = Tuner.restore(experiment_checkpoint_dir)\n", - " tuner.fit()\n", - " \n", - " ``experiment_checkpoint_dir`` can be easily located near the end of the\n", - " console output of your first failed run.\n", + "* [Targeted Hyperparameter Optimization with Lexicographic Preferences Over Multiple Objectives](https://openreview.net/forum?id=0Ij9_q567Ma). Shaokun Zhang, Feiran Jia, Chi Wang, Qingyun Wu. ICLR 2023 (notable-top-5%).\n", "\n", + "```bibtex\n", + "@inproceedings{zhang2023targeted,\n", + " title={Targeted Hyperparameter Optimization with Lexicographic Preferences Over Multiple Objectives},\n", + " author={Shaokun Zhang and Feiran Jia and Chi Wang and Qingyun Wu},\n", + " booktitle={International Conference on Learning Representations},\n", + " year={2023},\n", + " url={https://openreview.net/forum?id=0Ij9_q567Ma},\n", + "}\n", + "```\n", "\n", + "* [Cost-Effective Hyperparameter Optimization for Large Language Model Generation Inference](https://arxiv.org/abs/2303.04673). Chi Wang, Susan Xueqing Liu, Ahmed H. Awadallah. ArXiv preprint arXiv:2303.04673 (2023).\n", "\n", + "```bibtex\n", + "@inproceedings{wang2023EcoOptiGen,\n", + " title={Cost-Effective Hyperparameter Optimization for Large Language Model Generation Inference},\n", + " author={Chi Wang and Susan Xueqing Liu and Ahmed H. Awadallah},\n", + " year={2023},\n", + " booktitle={ArXiv preprint arXiv:2303.04673},\n", + "}\n", + "```\n", "\n", + "* [An Empirical Study on Challenging Math Problem Solving with GPT-4](https://arxiv.org/abs/2306.01337). Yiran Wu, Feiran Jia, Shaokun Zhang, Hangyu Li, Erkang Zhu, Yue Wang, Yin Tat Lee, Richard Peng, Qingyun Wu, Chi Wang. 
ArXiv preprint arXiv:2306.01337 (2023).\n", "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[33massistant\u001b[0m (to ragproxyagent):\n", + "```bibtex\n", + "@inproceedings{wu2023empirical,\n", + " title={An Empirical Study on Challenging Math Problem Solving with GPT-4},\n", + " author={Yiran Wu and Feiran Jia and Shaokun Zhang and Hangyu Li and Erkang Zhu and Yue Wang and Yin Tat Lee and Richard Peng and Qingyun Wu and Chi Wang},\n", + " year={2023},\n", + " booktitle={ArXiv preprint arXiv:2306.01337},\n", + "}\n", + "```\n", "\n", - "Yes, there is a function named `tune_automl` in FLAML. It is a method of the `AutoML` class and is used for hyperparameter tuning and model selection for a specific AutoML setting. You can use this method to find the best model and its configuration based on the provided search space and constraints.\n", "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", "\n", - "are you sure?\n", "\n", "--------------------------------------------------------------------------------\n", "\u001b[33massistant\u001b[0m (to ragproxyagent):\n", "\n", - "I apologize for the confusion. I made a mistake in my previous response. There is no function named `tune_automl` in FLAML. Instead, you can use the `fit()` method of the `AutoML` class to perform hyperparameter tuning and model selection. \n", + "There is no function called `tune_automl` in FLAML. However, there is a function called `flaml.automl` which can be used for performing AutoML.\n", "\n", "--------------------------------------------------------------------------------\n" ] @@ -3188,7 +1911,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -3197,7 +1920,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -3224,7 +1947,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -3255,7 +1978,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -3267,6 +1990,97 @@ ">>>>>>>>>>>> Below are outputs of Case 1 <<<<<<<<<<<<\n", "\n", "\n", + "Trying to create collection.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "max_tokens is too small to fit a single line of text. Breaking this line:\n", + "\t
 Film Year Fuck count Minutes Uses / mi ...\n",
    "    ... (the same max_tokens warning repeats for several dozen additional oversized table-like lines in the corpus; omitted) ...\n",
    "max_tokens is too small to fit a single line of text. Breaking this line:\n",
    "\t
    ...\n", + "max_tokens is too small to fit a single line of text. Breaking this line:\n", + "\t ...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ "doc_ids: [['doc_0', 'doc_3334', 'doc_720', 'doc_2732', 'doc_2510', 'doc_5084', 'doc_5068', 'doc_3727', 'doc_1938', 'doc_4689', 'doc_5249', 'doc_1751', 'doc_480', 'doc_3989', 'doc_2115', 'doc_1233', 'doc_2264', 'doc_633', 'doc_2376', 'doc_2293', 'doc_5274', 'doc_5213', 'doc_3991', 'doc_2880', 'doc_2737', 'doc_1257', 'doc_1748', 'doc_2038', 'doc_4073', 'doc_2876']]\n", "\u001b[32mAdding doc_id doc_0 to context.\u001b[0m\n", "\u001b[32mAdding doc_id doc_3334 to context.\u001b[0m\n", @@ -3282,6 +2096,19 @@ "\u001b[32mAdding doc_id doc_1751 to context.\u001b[0m\n", "\u001b[32mAdding doc_id doc_480 to context.\u001b[0m\n", "\u001b[32mAdding doc_id doc_3989 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id doc_3334 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id doc_720 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id doc_2732 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id doc_2510 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id doc_5084 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id doc_5068 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id doc_3727 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id doc_1938 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id doc_4689 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id doc_5249 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id doc_1751 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id doc_480 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id doc_3989 to context.\u001b[0m\n", "\u001b[32mAdding doc_id doc_2115 to context.\u001b[0m\n", "\u001b[32mAdding doc_id doc_1233 to context.\u001b[0m\n", "\u001b[32mAdding doc_id doc_2264 to context.\u001b[0m\n", @@ -3321,7 +2148,7 @@ "--------------------------------------------------------------------------------\n", "\u001b[33massistant\u001b[0m (to ragproxyagent):\n", "\n", - "Non controlling interest on a balance sheet refers to the portion of a subsidiary's stock that is not owned by the parent company. It represents the equity stake held by outside investors in the subsidiary.\n", + "Non controlling interest on balance sheet refers to the portion of a subsidiary corporation's stock that is not owned by the parent corporation. It represents ownership of less than 50% of the outstanding shares. 
It is shown as a separate line item in the equity section of the balance sheet.\n", "\n", "--------------------------------------------------------------------------------\n", "\n", @@ -3329,32 +2156,25 @@ ">>>>>>>>>>>> Below are outputs of Case 2 <<<<<<<<<<<<\n", "\n", "\n", - "doc_ids: [['doc_1', 'doc_1097', 'doc_4221', 'doc_4972', 'doc_1352', 'doc_96', 'doc_4301', 'doc_988', 'doc_2370', 'doc_2414', 'doc_5038', 'doc_302', 'doc_1608', 'doc_980', 'doc_2112', 'doc_1699', 'doc_562', 'doc_4204', 'doc_3298', 'doc_3978', 'doc_1258', 'doc_2971', 'doc_2171', 'doc_1065', 'doc_17', 'doc_2683', 'doc_87', 'doc_1767', 'doc_158', 'doc_482']]\n", + "doc_ids: [['doc_1', 'doc_1097', 'doc_4221', 'doc_4972', 'doc_1352', 'doc_96', 'doc_988', 'doc_2370', 'doc_2414', 'doc_5038', 'doc_302', 'doc_1608', 'doc_980', 'doc_2112', 'doc_562', 'doc_4204', 'doc_3298', 'doc_2995', 'doc_3978', 'doc_1258', 'doc_2971', 'doc_2171', 'doc_1065', 'doc_17', 'doc_2683', 'doc_87', 'doc_1767', 'doc_158', 'doc_482', 'doc_3850']]\n", "\u001b[32mAdding doc_id doc_1 to context.\u001b[0m\n", "\u001b[32mAdding doc_id doc_1097 to context.\u001b[0m\n", "\u001b[32mAdding doc_id doc_4221 to context.\u001b[0m\n", "\u001b[32mAdding doc_id doc_4972 to context.\u001b[0m\n", "\u001b[32mAdding doc_id doc_1352 to context.\u001b[0m\n", "\u001b[32mAdding doc_id doc_96 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_4301 to context.\u001b[0m\n", "\u001b[32mAdding doc_id doc_988 to context.\u001b[0m\n", "\u001b[32mAdding doc_id doc_2370 to context.\u001b[0m\n", "\u001b[32mAdding doc_id doc_2414 to context.\u001b[0m\n", "\u001b[32mAdding doc_id doc_5038 to context.\u001b[0m\n", "\u001b[32mAdding doc_id doc_302 to context.\u001b[0m\n", "\u001b[32mAdding doc_id doc_1608 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_980 to context.\u001b[0m\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ + "\u001b[32mAdding doc_id doc_980 to context.\u001b[0m\n", "\u001b[32mAdding doc_id doc_2112 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_1699 to context.\u001b[0m\n", "\u001b[32mAdding doc_id doc_562 to context.\u001b[0m\n", "\u001b[32mAdding doc_id doc_4204 to context.\u001b[0m\n", "\u001b[32mAdding doc_id doc_3298 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id doc_2995 to context.\u001b[0m\n", "\u001b[32mAdding doc_id doc_3978 to context.\u001b[0m\n", "\u001b[32mAdding doc_id doc_1258 to context.\u001b[0m\n", "\u001b[32mAdding doc_id doc_2971 to context.\u001b[0m\n", @@ -3377,7 +2197,6 @@ "

    The fifth season of Chicago P.D. , an American police drama television series with executive producer Dick Wolf , and producers Derek Haas , Michael Brandt , and Rick Eid , premiered on September 27 , 2017 . This season featured its 100th episode .

    \n", "

    This was the city of Chicago 's first professional sports championship since the Chicago Fire won MLS Cup ' 98 ( which came four months after the Chicago Bulls ' sixth NBA championship that year ) . The next major Chicago sports championship came in 2010 , when the NHL 's Chicago Blackhawks ended a 49 - year Stanley Cup title drought . With the Chicago Bears ' win in Super Bowl XX and the Chicago Cubs ' own World Series championship in 2016 , all Chicago sports teams have won at least one major championship since 1985 . Meanwhile , the Astros themselves made it back to the World Series in 2017 , but this time as an AL team , where they defeated the Los Angeles Dodgers in seven games , resulting in Houston 's first professional sports championship since the 2006 -- 07 Houston Dynamo won their back - to - back MLS Championships .

    \n", "

    The season was ordered in May 2017 , and production began the following month . Ben McKenzie stars as Gordon , alongside Donal Logue , David Mazouz , Morena Baccarin , Sean Pertwee , Robin Lord Taylor , Erin Richards , Camren Bicondova , Cory Michael Smith , Jessica Lucas , Chris Chalk , Drew Powell , Crystal Reed and Alexander Siddig . The fourth season premiered on September 21 , 2017 , on Fox , while the second half premiered on March 1 , 2018 .

    \n", - "

    The Eagle Creek Fire was a destructive wildfire in the Columbia River Gorge in the U.S. states of Oregon and Washington . The fire was started on September 2 , 2017 , reportedly caused by teenagers igniting fireworks during a burn ban . In mid-September , highway closures and local evacuations were gradually being lifted . As of September 28 , 2017 , the fire had consumed 48,831 acres ( 19,761 ha ) and was 46 % contained . In late October , fire growth was slowed by rain . On November 30 , 2017 , the fire was declared fully contained but not yet completely out .

    \n", "

    As of May 24 , 2017 , 58 episodes of The 100 have aired , concluding the fourth season . In March 2017 , The CW renewed the series for a fifth season , set to premiere on April 24 , 2018 .

    \n", "

    The fifth book , River of Fire , is scheduled to be released on April 10 , 2018 .

    \n", "

    On September 10 , 2013 , AMC officially cancelled the series after 38 episodes and three seasons . However , on November 15 , 2013 , Netflix ordered a fourth and final season of six episodes , that was released on Netflix on August 1 , 2014 .

    \n", @@ -3386,10 +2205,10 @@ "

    The first season consisted of eight one - hour - long episodes which were released worldwide on Netflix on July 15 , 2016 , in Ultra HD 4K . The second season , consisting of nine episodes , was released on October 27 , 2017 in HDR . A teaser for the second season , which also announced the release date , aired during Super Bowl LI .

    \n", "

    `` Two Days Before the Day After Tomorrow '' is the eighth episode in the ninth season of the American animated television series South Park . The 133rd episode overall , it originally aired on Comedy Central in the United States on October 19 , 2005 . In the episode , Stan and Cartman accidentally destroy a dam , causing the town of Beaverton to be destroyed .

    \n", "

    The fourth season consists of a double order of twenty episodes , split into two parts of ten episodes ; the second half premiered on November 30 , 2016 . The season follows the battles between Ragnar and Rollo in Francia , Bjorn 's raid into the Mediterranean , and the Viking invasion of England . It concluded in its entirety on February 1 , 2017 .

    \n", - "
    • Elizabeth Banks as Gail Abernathy - McKadden - Feinberger , an a cappella commentator making an insulting documentary about The Bellas
    • John Michael Higgins as John Smith , an a cappella commentator making an insulting documentary about The Bellas
    • John Lithgow as Fergus Hobart , Fat Amy 's estranged criminal father
    • Matt Lanter as Chicago Walp , a U.S. soldier guiding the Bellas during the tour , and Chloe 's love interest .
    • Guy Burnet as Theo , DJ Khaled 's music producer , who takes a liking to Beca
    • DJ Khaled as himself
    • Troy Ian Hall as Zeke , a U.S. soldier , partners with Chicago
    • Michael Rose as Aubrey 's father
    • Jessica Chaffin as Evan
    • Moises Arias as Pimp - Lo
    • Ruby Rose , Andy Allo , Venzella Joy Williams , and Hannah Fairlight as Calamity , Serenity , Charity , and Veracity , respectively , members of the band Evermoist
    • Whiskey Shivers as Saddle Up , a country - bluegrass - based band competing against the Bellas
    • Trinidad James and D.J. Looney as Young Sparrow and DJ Dragon Nutz , respectively
    \n", "

    This is an episode list for Sabrina the Teenage Witch , an American sitcom that debuted on ABC in 1996 . From Season 5 , the program was aired on The WB . The series ran for seven seasons totaling 163 episodes . It originally premiered on September 27 , 1996 on ABC and ended on April 24 , 2003 on The WB .

    \n", "

    Hart of Dixie was renewed by The CW for a 10 - episode season on May 8 , 2014 . The show 's fourth and final season premiered on November 15 , 2014 . The series was later cancelled on May 7 , 2015 .

    \n", "

    The Burning Maze is the third book in the series . It is scheduled to be released on May 1 , 2018 .

    \n", + "
    My Name Is Earl ( season 4 )
    DVD cover
    Country of origin United States
    No. of episodes 27
    Release
    Original network NBC
    Original release September 25 , 2008 -- May 14 , 2009
    Season chronology
    ← Previous Season 3
    List of My Name Is Earl episodes
    \n", "

    The eighteenth season of Law & Order : Special Victims Unit debuted on Wednesday , September 21 , 2016 , on NBC and finished on Wednesday , May 24 , 2017 , with a two - hour season finale .

    \n", "

    The eighth and final season of the fantasy drama television series Game of Thrones was announced by HBO in July 2016 . Unlike the first six seasons that each had ten episodes and the seventh that had seven episodes , the eighth season will have only six episodes . Like the previous season , it will largely consist of original content not found currently in George R.R. Martin 's A Song of Ice and Fire series , and will instead adapt material Martin has revealed to showrunners about the upcoming novels in the series , The Winds of Winter and A Dream of Spring .

    \n", "

    A total of 49 episodes of The Glades were produced and aired over four seasons .

    \n", @@ -3403,7 +2222,7 @@ "--------------------------------------------------------------------------------\n", "\u001b[33massistant\u001b[0m (to ragproxyagent):\n", "\n", - "Chicago Fire season 4 has 23 episodes.\n", + "There are 23 episodes in Chicago Fire season 4.\n", "\n", "--------------------------------------------------------------------------------\n", "\n", @@ -3411,7 +2230,7 @@ ">>>>>>>>>>>> Below are outputs of Case 3 <<<<<<<<<<<<\n", "\n", "\n", - "doc_ids: [['doc_47', 'doc_45', 'doc_2570', 'doc_2851', 'doc_4033', 'doc_5320', 'doc_3849', 'doc_4172', 'doc_3202', 'doc_2282', 'doc_1896', 'doc_949', 'doc_103', 'doc_1552', 'doc_2791', 'doc_392', 'doc_1175', 'doc_5315', 'doc_832', 'doc_3185', 'doc_2532', 'doc_3409', 'doc_824', 'doc_4075', 'doc_1201', 'doc_4116', 'doc_2545', 'doc_2251', 'doc_2485', 'doc_2280']]\n", + "doc_ids: [['doc_47', 'doc_45', 'doc_2570', 'doc_2851', 'doc_4033', 'doc_5320', 'doc_3849', 'doc_4172', 'doc_3202', 'doc_2282', 'doc_1896', 'doc_949', 'doc_103', 'doc_1552', 'doc_2791', 'doc_392', 'doc_1175', 'doc_5315', 'doc_832', 'doc_3185', 'doc_2532', 'doc_3409', 'doc_824', 'doc_4075', 'doc_1201', 'doc_4116', 'doc_1448', 'doc_2545', 'doc_2251', 'doc_2485']]\n", "\u001b[32mAdding doc_id doc_47 to context.\u001b[0m\n", "\u001b[32mAdding doc_id doc_45 to context.\u001b[0m\n", "\u001b[32mAdding doc_id doc_2570 to context.\u001b[0m\n", @@ -3469,38 +2288,7 @@ "--------------------------------------------------------------------------------\n", "\u001b[33massistant\u001b[0m (to ragproxyagent):\n", "\n", - "Bulls are used for breeding purposes on farms. UPDATE CONTEXT.\n", - "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[32mUpdating context and resetting conversation.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_3409 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_824 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_4075 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_1201 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_4116 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_2545 to context.\u001b[0m\n", - "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", - "\n", - "You're a retrieve augmented chatbot. You answer user's questions based on your own knowledge and the\n", - "context provided by the user.\n", - "If you can't answer the question with or without the current context, you should reply exactly `UPDATE CONTEXT`.\n", - "You must give as short an answer as possible.\n", - "\n", - "User's question is: what are bulls used for on a farm\n", - "\n", - "Context is:

    The term was originally used in the United States in the late - 19th and early - 20th centuries to refer to small traveling circuses that toured through small towns and rural areas . The name derives from the common use of performing dogs and ponies as the main attractions of the events . Performances were generally held in open - air arenas , such as race tracks or public spaces in localities that were too small or remote to attract larger , more elaborate performers or performances . The most notorious was `` Prof. Gentry 's Famous Dog & Pony Show , '' started when teenager Henry Gentry and his brothers started touring in 1886 with their act , originally entitled `` Gentry 's Equine and Canine Paradox . '' It started small , but evolved into a full circus show . Other early dog and pony shows included Morris ' Equine and Canine Paradoxes ( 1883 ) and Hurlburt 's Dog and Pony Show ( late 1880s ) .

    \n", - "

    The Dust Bowl , also known as the Dirty Thirties , was a period of severe dust storms that greatly damaged the ecology and agriculture of the American and Canadian prairies during the 1930s ; severe drought and a failure to apply dryland farming methods to prevent wind erosion ( the Aeolian processes ) caused the phenomenon . The drought came in three waves , 1934 , 1936 , and 1939 -- 1940 , but some regions of the high plains experienced drought conditions for as many as eight years . With insufficient understanding of the ecology of the plains , farmers had conducted extensive deep plowing of the virgin topsoil of the Great Plains during the previous decade ; this had displaced the native , deep - rooted grasses that normally trapped soil and moisture even during periods of drought and high winds . The rapid mechanization of farm equipment , especially small gasoline tractors , and widespread use of the combine harvester contributed to farmers ' decisions to convert arid grassland ( much of which received no more than 10 inches ( 250 mm ) of precipitation per year ) to cultivated cropland .

    \n", - "

    A camel is an even - toed ungulate in the genus Camelus , bearing distinctive fatty deposits known as `` humps '' on its back . The three surviving species of camel are the dromedary , or one - humped camel ( C. dromedarius ) , which inhabits the Middle East and the Horn of Africa ; the Bactrian , or two - humped camel ( C. bactrianus ) , which inhabits Central Asia ; and the critically endangered wild Bactrian camel ( C. ferus ) that has limited populations in remote areas of northwest China and Mongolia . Bactrian camels take their name from the historical Bactria region of Central Asia . Additionally one other species of camel in the separate genus Camelops , C. hesternus lived in western North America and became extinct when humans entered the continent at the end of the Pleistocene . Both the dromedary and the Bactrian camels have been domesticated ; they provide milk , meat , hair for textiles or goods such as felted pouches , and are working animals with tasks ranging from human transport to bearing loads .

    \n", - "
    Country Name of animal Scientific name Pictures Ref .
    Algeria Fennec fox Vulpes zerda
    Angola Red - crested turaco ( national bird ) Tauraco erythrolophus
    Anguilla Zenaida dove Zenaida aurita
    Antigua and Barbuda Fallow deer ( national animal ) Dama dama
    Frigate ( national bird ) Fregata magnificens
    Hawksbill turtle ( national sea creature ) Eretmochelys imbricata
    Argentina Rufous hornero Furnarius rufus
    Australia Red kangaroo ( national animal ) Macropus rufus
    Emu ( national bird ) Dromaius novaehollandiae
    Austria Black eagle Ictinaetus malaiensis
    Azerbaijan Karabakh horse Equus ferus caballus
    Bangladesh Royal Bengal tiger ( national animal ) Panthera tigris tigris
    Magpie robin ( national bird ) Copsychus saularis
    Ilish ( national fish ) Tenualosa ilisha
    Belarus European bison Bison bonasus
    Belgium Lion ( heraldic Leo Belgicus ) Panthera leo
    Belize Baird 's tapir ( national animal ) Tapirus bairdii
    Keel - billed toucan ( national bird ) Ramphastos sulfuratus
    Bhutan Druk Mythical
    Takin Budorcas taxicolor
    Brazil Rufous - bellied thrush Turdus rufiventris
    Cambodia Kouprey Bos sauveli
    Canada North American beaver ( sovereignty animal symbol ) Castor canadensis
    Canadian horse ( national horse ) Equus ferus caballus
    China Giant panda ( national animal ) Ailuropoda melanoleuca
    Chinese dragon ( national animal ) Mythical
    Red - crowned crane ( national bird ) Grus japonensis
    Democratic Republic of the Congo Okapi Okapia johnstoni
    Colombia Andean condor Vultur gryphus
    Costa Rica Yigüirro ( national bird ) Turdus grayi
    White - tailed deer ( national animal ) Odocoileus virginianus
    West Indian manatee ( national aquatic animal ) Trichechus manatus
    Croatia Pine marten Martes martes
    Cuba Cuban trogon Priotelus temnurus
    Cyprus Cypriot mouflon Ovis orientalis
    Czech Republic Double - tailed lion Mythical
    Denmark Mute swan ( national bird ) Cygnus olor
    Small tortoiseshell ( national butterfly ) Aglais urticae
    Egypt Steppe eagle Aquila nipalensis
    Estonia Barn swallow ( national bird ) Hirundo rustica
    Eritrea Arabian camel Camelus dromedarius
    Ethiopia Lion Panthera\n", - "

    The history of agriculture records the domestication of plants and animals and the development and dissemination of techniques for raising them productively . Agriculture began independently in different parts of the globe , and included a diverse range of taxa . At least eleven separate regions of the Old and New World were involved as independent centers of origin .

    \n", - "

    It is generally accepted that sustainable gray wolf packs had been extirpated from Yellowstone National Park by 1926 , although the National Park Service maintained its policies of predator control in the park until 1933 . However , a 1975 -- 77 National Park Service sponsored study revealed that during the period 1927 to 1977 , there were several hundred probable sightings of wolves in the park . Between 1977 and the re-introduction in 1995 , there were additional reliable sightings of wolves in the park , most believed to be singles or pairs transiting the region .

    \n", - "\n", - "\n", - "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[33massistant\u001b[0m (to ragproxyagent):\n", - "\n", - "Bulls are typically used for breeding purposes on farms.\n", + "Bulls are used for breeding and often kept for their semen to sell for AI purposes. Some male cattle are also kept as work oxen for haulage. The vast majority, however, are slaughtered for meat before the age of three years.\n", "\n", "--------------------------------------------------------------------------------\n", "\n", @@ -3508,7 +2296,7 @@ ">>>>>>>>>>>> Below are outputs of Case 4 <<<<<<<<<<<<\n", "\n", "\n", - "doc_ids: [['doc_3031', 'doc_819', 'doc_4521', 'doc_3980', 'doc_3423', 'doc_5275', 'doc_745', 'doc_753', 'doc_3562', 'doc_4139', 'doc_3678', 'doc_4931', 'doc_2347', 'doc_1115', 'doc_2806', 'doc_5204', 'doc_2707', 'doc_3653', 'doc_1122', 'doc_2398', 'doc_309', 'doc_3891', 'doc_2087', 'doc_330', 'doc_4844', 'doc_2155', 'doc_2987', 'doc_2674', 'doc_5357', 'doc_1581']]\n", + "doc_ids: [['doc_3031', 'doc_819', 'doc_4521', 'doc_3980', 'doc_3423', 'doc_5275', 'doc_745', 'doc_753', 'doc_3562', 'doc_4139', 'doc_3678', 'doc_4931', 'doc_2347', 'doc_1115', 'doc_2806', 'doc_5204', 'doc_2707', 'doc_3653', 'doc_1122', 'doc_2398', 'doc_309', 'doc_3891', 'doc_2087', 'doc_330', 'doc_4844', 'doc_2155', 'doc_2674', 'doc_5357', 'doc_1581', 'doc_9']]\n", "\u001b[32mAdding doc_id doc_3031 to context.\u001b[0m\n", "\u001b[32mAdding doc_id doc_819 to context.\u001b[0m\n", "\u001b[32mAdding doc_id doc_4521 to context.\u001b[0m\n", @@ -3542,7 +2330,7 @@ "--------------------------------------------------------------------------------\n", "\u001b[33massistant\u001b[0m (to ragproxyagent):\n", "\n", - "Sorry, there is no information provided about who has been honoured with the Wisden Leading Cricketer in the World award for 2016. UPDATE CONTEXT.\n", + "I'm sorry, I couldn't find any information about who has been honoured with the Wisden Leading Cricketer in the World award for 2016. UPDATE CONTEXT.\n", "\n", "--------------------------------------------------------------------------------\n", "\u001b[32mUpdating context and resetting conversation.\u001b[0m\n", @@ -3579,7 +2367,7 @@ "--------------------------------------------------------------------------------\n", "\u001b[33massistant\u001b[0m (to ragproxyagent):\n", "\n", - "UPDATE CONTEXT. The context does not provide any information about the Wisden Leading Cricketer in the world award for 2016.\n", + "UPDATE CONTEXT. The current context does not provide information related to the question.\n", "\n", "--------------------------------------------------------------------------------\n", "\u001b[32mUpdating context and resetting conversation.\u001b[0m\n", @@ -3612,7 +2400,7 @@ "--------------------------------------------------------------------------------\n", "\u001b[33massistant\u001b[0m (to ragproxyagent):\n", "\n", - "Virat Kohli has been honored with the Wisden Leading Cricketer in the World Award for 2016.\n", + "I'm sorry, the provided context doesn't contain information about any cricketer being honored with the Wisden Leading Cricketer in the World award for 2016. 
UPDATE CONTEXT if you have any other query.\n", "\n", "--------------------------------------------------------------------------------\n", "\n", @@ -3620,7 +2408,7 @@ ">>>>>>>>>>>> Below are outputs of Case 5 <<<<<<<<<<<<\n", "\n", "\n", - "doc_ids: [['doc_20', 'doc_2943', 'doc_2059', 'doc_3293', 'doc_4056', 'doc_1914', 'doc_2749', 'doc_1796', 'doc_3468', 'doc_1793', 'doc_876', 'doc_2577', 'doc_27', 'doc_2780', 'doc_366', 'doc_321', 'doc_3103', 'doc_715', 'doc_3534', 'doc_142', 'doc_5337', 'doc_2426', 'doc_5346', 'doc_3021', 'doc_1596', 'doc_316', 'doc_1103', 'doc_1670', 'doc_2853', 'doc_3256']]\n", + "doc_ids: [['doc_20', 'doc_2943', 'doc_2059', 'doc_3293', 'doc_4056', 'doc_1914', 'doc_2749', 'doc_1796', 'doc_3468', 'doc_1793', 'doc_876', 'doc_2577', 'doc_27', 'doc_366', 'doc_321', 'doc_3103', 'doc_715', 'doc_3534', 'doc_142', 'doc_5337', 'doc_2426', 'doc_5346', 'doc_3021', 'doc_1596', 'doc_316', 'doc_1103', 'doc_1602', 'doc_1677', 'doc_1670', 'doc_2853']]\n", "\u001b[32mAdding doc_id doc_20 to context.\u001b[0m\n", "\u001b[32mAdding doc_id doc_2943 to context.\u001b[0m\n", "\u001b[32mAdding doc_id doc_2059 to context.\u001b[0m\n", @@ -3634,7 +2422,6 @@ "\u001b[32mAdding doc_id doc_876 to context.\u001b[0m\n", "\u001b[32mAdding doc_id doc_2577 to context.\u001b[0m\n", "\u001b[32mAdding doc_id doc_27 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_2780 to context.\u001b[0m\n", "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", "\n", "You're a retrieve augmented chatbot. You answer user's questions based on your own knowledge and the\n", @@ -3657,14 +2444,13 @@ "

    The United States Oath of Allegiance , officially referred to as the `` Oath of Allegiance , '' 8 C.F.R. Part 337 ( 2008 ) , is an allegiance oath that must be taken by all immigrants who wish to become United States citizens .

    \n", "

    During the first half of the 19th century , seven stars were added to the flag to represent the seven signatories to the Venezuelan declaration of independence , being the provinces of Caracas , Cumaná , Barcelona , Barinas , Margarita , Mérida , and Trujillo .

    \n", "

    With the annexation of Hawaii in 1898 and the seizure of Guam and the Philippines during the Spanish -- American War that same year , the United States began to consider unclaimed and uninhabited Wake Island , located approximately halfway between Honolulu and Manila , as a good location for a telegraph cable station and coaling station for refueling warships of the rapidly expanding United States Navy and passing merchant and passenger steamships . On July 4 , 1898 , United States Army Brigadier General Francis V. Greene of the 2nd Brigade , Philippine Expeditionary Force , of the Eighth Army Corps , stopped at Wake Island and raised the American flag while en route to the Philippines on the steamship liner SS China .

    \n", - "

    On Opening Day , April 9 , 1965 , a sold - out crowd of 47,879 watched an exhibition game between the Houston Astros and the New York Yankees . President Lyndon B. Johnson and his wife Lady Bird were in attendance , as well as Texas Governor John Connally and Houston Mayor Louie Welch . Governor Connally tossed out the first ball for the first game ever played indoors . Dick `` Turk '' Farrell of the Astros threw the first pitch . Mickey Mantle had both the first hit ( a single ) and the first home run in the Astrodome . The Astros beat the Yankees that night , 2 - 1 .

    \n", "\n", "\n", "\n", "--------------------------------------------------------------------------------\n", "\u001b[33massistant\u001b[0m (to ragproxyagent):\n", "\n", - "Sorry, I cannot find any information about who carried the USA flag in the opening ceremony. UPDATE CONTEXT.\n", + "I don't have the answer with the provided context. UPDATE CONTEXT.\n", "\n", "--------------------------------------------------------------------------------\n", "\u001b[32mUpdating context and resetting conversation.\u001b[0m\n", @@ -3727,7 +2513,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -3772,7 +2558,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -3801,7 +2587,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -3829,26 +2615,35 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 18, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + ">>>>>>>>>>>> Below are outputs of Case 1 <<<<<<<<<<<<\n", + "\n", + "\n", + "Trying to create collection.\n" + ] + }, { "name": "stderr", "output_type": "stream", "text": [ - "Collection 2wikimultihopqa already exists.\n" + "max_tokens is too small to fit a single line of text. Breaking this line:\n", + "\tClyde Thompson: Clyde Thompson( 1910 – July 1, 1979) was an American prisoner turned chaplain. He is ...\n", + "max_tokens is too small to fit a single line of text. Breaking this line:\n", + "\tAustralian Historical Monographs: The Australian Historical Monographs are a series of Historical st ...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "\n", - "\n", - ">>>>>>>>>>>> Below are outputs of Case 1 <<<<<<<<<<<<\n", - "\n", - "\n", - "Trying to create collection.\n", "doc_ids: [['doc_12', 'doc_11', 'doc_16', 'doc_19', 'doc_13116', 'doc_14', 'doc_13', 'doc_18', 'doc_977', 'doc_10']]\n", "\u001b[32mAdding doc_id doc_12 to context.\u001b[0m\n", "\u001b[32mAdding doc_id doc_11 to context.\u001b[0m\n", @@ -3911,27 +2706,15 @@ "\n", "\n", "--------------------------------------------------------------------------------\n", - "\u001b[33massistant\u001b[0m (to ragproxyagent):\n", - "\n", - "Blind Shaft is a film directed by Li Yang and was released in the year 2003. The Mask of Fu Manchu, on the other hand, is a pre-Code adventure film directed by Charles Brabin and was released in the year 1932. Thus, The Mask of Fu Manchu came out earlier than Blind Shaft. 
So the answer is: The Mask of Fu Manchu.\n", - "\n", - "--------------------------------------------------------------------------------\n", - "\n", - "\n", - ">>>>>>>>>>>> Below are outputs of Case 2 <<<<<<<<<<<<\n", - "\n", - "\n", - "doc_ids: [['doc_50790', 'doc_20244', 'doc_1013', 'doc_4364', 'doc_4366', 'doc_57051', 'doc_2851', 'doc_57053', 'doc_13524', 'doc_1316']]\n", - "\u001b[32mAdding doc_id doc_50790 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_20244 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_1013 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_4364 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_4366 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_57051 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_2851 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_57053 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_13524 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_1316 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id doc_11 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id doc_16 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id doc_19 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id doc_13116 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id doc_14 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id doc_13 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id doc_18 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id doc_977 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id doc_10 to context.\u001b[0m\n", "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", "\n", "You're a retrieve augmented chatbot. You answer user's questions based on your own knowledge and the context provided by the user. You must think step-by-step.\n", @@ -3967,39 +2750,43 @@ "Second, please complete the answer by thinking step-by-step.\n", "\n", "Context:\n", - "Princess Josephine of Baden: Princess Josephine Friederike Luise of Baden( 21 October 1813 – 19 June 1900) was born at Mannheim, the second daughter of Charles, Grand Duke of Baden and his wife, Stéphanie de Beauharnais. Through her son, Carol I, she is the ancestress of the Romanian royal family and the Yugoslav Royal family. Through her younger daughter Marie, she is also the ancestress of the Belgian royal family and the Grand Ducal family of Luxembourg.\n", - "Archduchess Marie Astrid of Austria: Archduchess Marie Astrid of Austria( née\" Princess Marie Astrid of Luxembourg\"; born 17 February 1954 at Castle Betzdorf) is the elder daughter and eldest child of Grand Duke Jean of Luxembourg and Joséphine- Charlotte of Belgium, and the wife of Archduke Carl Christian of Austria.\n", - "Princess Joséphine Marie of Belgium: Princess Joséphine Marie of Belgium( 30 November 1870 — 18 January 1871) was the daughter of Prince Philippe, Count of Flanders, and Princess Marie of Hohenzollern- Sigmaringen. She was the older twin to Princess Henriette of Belgium. In 1872 Joséphine Marie's mother gave birth to another daughter, who was named Joséphine in her memory.\n", - "Princess Joséphine Marie of Belgium: Princess Joséphine Marie of Belgium (30 November 1870 — 18 January 1871) was the daughter of Prince Philippe, Count of Flanders, and Princess Marie of Hohenzollern-Sigmaringen. She was the older twin to Princess Henriette of Belgium. 
In 1872 Joséphine Marie's mother gave birth to another daughter, who was named Joséphine in her memory.\n", - "Princess Joséphine Caroline of Belgium: Princess Joséphine Caroline of Belgium( 18 October 1872 – 6 January 1958) was the youngest daughter of Prince Philippe, Count of Flanders and Princess Marie of Hohenzollern- Sigmaringen. She was an older sister of Albert I of Belgium.\n", - "Federal University of Maranhão: The Federal University of Maranhão( UFMA) is a federal university in the northeastern state of Maranhão, Brazil.\n", - "Princess Margaretha of Liechtenstein: Princess Margaretha of Liechtenstein( born Princess Margaretha of Luxembourg on 15 May 1957) is the fourth child and second and youngest daughter of Grand Duke Jean of Luxembourg and Princess Joséphine- Charlotte of Belgium. As the sister of Grand Duke Henri of Luxembourg and the sister- in- law of Prince Hans- Adam II of Liechtenstein, she is a princess of two current realms and a member of the Luxembourg and Liechtenstein reigning dynasties.\n", - "Federal University, Lokoja: The Federal University, Lokoja, popularly known as Fulokoja, is a federal university in the confluence city of Lokoja, the capital of Kogi State, North- Central Nigeria. Lokoja lies at the confluence of the Niger and Benue rivers. The Federal University, Lokoja was established in February 2011 by the Federal Government of Nigeria as a result of indispensable need to create more universities in the country.\n", - "Princess Luisa Maria of Belgium, Archduchess of Austria-Este: Princess Luisa Maria of Belgium, Archduchess of Austria- Este( Luisa Maria Anna Martine Pilar; born 11 October 1995) is the fourth child and second daughter of Lorenz, Archduke of Austria- Este, and Princess Astrid of Belgium. She was born at the Saint Jean Hospital in Brussels, Belgium, and is currently ninth in line to the Belgian throne.\n", - "Princess Sophie of Greece and Denmark: Princess Sophie of Greece and Denmark( 26 June 1914 – 24 November 2001) was the fourth child and youngest daughter of Prince Andrew of Greece and Denmark and Princess Alice of Battenberg. The Duke of Edinburgh is her younger brother. Sophie was born at the villa Mon Repos on the island of Corfu in Greece.\n", + "The Mask of Fu Manchu: The Mask of Fu Manchu is a 1932 pre-Code adventure film directed by Charles Brabin. It was written by Irene Kuhn, Edgar Allan Woolf and John Willard based on the 1932 novel of the same name by Sax Rohmer. Starring Boris Karloff as Fu Manchu, and featuring Myrna Loy as his depraved daughter, the movie revolves around Fu Manchu's quest for the golden sword and mask of Genghis Khan. Lewis Stone plays his nemesis. Dr. Petrie is absent from this film.\n", + "The Mysterious Dr. Fu Manchu: The Mysterious Dr. Fu Manchu is a 1929 American pre-Code drama film directed by Rowland V. Lee and starring Warner Oland as Dr. Fu Manchu. It was the first Fu Manchu film of the talkie era. Since this was during the transition period to sound, a silent version was also released in the United States.\n", + "The Face of Fu Manchu: The Face of Fu Manchu is a 1965 thriller film directed by Don Sharp and based on the characters created by Sax Rohmer. It stars Christopher Lee as the eponymous villain, a Chinese criminal mastermind, and Nigel Green as his pursuing rival Nayland Smith, a Scotland Yard detective. 
The film was a British- West German co-production, and was the first in a five- part series starring Lee and produced by Harry Alan Towers for Constantin Film, the second of which was\" The Brides of Fu Manchu\" released the next year, with the final entry being\" The Castle of Fu Manchu\" in 1969. It was shot in Technicolor and Techniscope, on- location in County Dublin, Ireland.\n", + "The Return of Dr. Fu Manchu: The Return of Dr. Fu Manchu is a 1930 American pre-Code film directed by Rowland V. Lee. It is the second of three films starring Warner Oland as the fiendish Fu Manchu, who returns from apparent death in the previous film,\" The Mysterious Dr. Fu Manchu\"( 1929), to seek revenge on those he holds responsible for the death of his wife and child.\n", + "The Vengeance of Fu Manchu: The Vengeance of Fu Manchu is a 1967 British film directed by Jeremy Summers and starring Christopher Lee, Horst Frank, Douglas Wilmer and Tsai Chin. It was the third British/ West German Constantin Film co-production of the Dr. Fu Manchu series and the first to be filmed in Hong Kong. It was generally released in the U.K. through Warner- Pathé( as a support feature to the Lindsay Shonteff film\" The Million Eyes of Sumuru\") on 3 December 1967.\n", + "The Brides of Fu Manchu: The Brides of Fu Manchu is a 1966 British/ West German Constantin Film co-production adventure crime film based on the fictional Chinese villain Dr. Fu Manchu, created by Sax Rohmer. It was the second film in a series, and was preceded by\" The Face of Fu ManchuThe Vengeance of Fu Manchu\" followed in 1967,\" The Blood of Fu Manchu\" in 1968, and\" The Castle of Fu Manchu\" in 1969. It was produced by Harry Alan Towers for Hallam Productions. Like the first film, it was directed by Don Sharp, and starred Christopher Lee as Fu Manchu. Nigel Green was replaced by Douglas Wilmer as Scotland Yard detective Nayland Smith. The action takes place mainly in London, where much of the location filming took place.\n", + "The Castle of Fu Manchu: The Castle of Fu Manchu( also known as The Torture Chamber of Dr. Fu Manchu and also known by its German title Die Folterkammer des Dr. Fu Man Chu) is a 1969 film and the fifth and final Dr. Fu Manchu film with Christopher Lee portraying the title character.\n", + "The Blood of Fu Manchu: The Blood of Fu Manchu, also known as Fu Manchu and the Kiss of Death, Kiss of Death, Kiss and Kill( U.S. title) and Against All Odds( original U.S. video title), is a 1968 British adventure crime film directed by Jesús Franco, based on the fictional Asian villain Dr. Fu Manchu created by Sax Rohmer. It was the fourth film in a series, and was preceded by\" The Vengeance of Fu Manchu The Castle of Fu Manchu\" followed in 1969. It was produced by Harry Alan Towers for Udastex Films. It starred Christopher Lee as Dr. Fu Manchu, Richard Greene as Scotland Yard detective Nayland Smith, and Howard Marion- Crawford as Dr. Petrie. The movie was filmed in Spain and Brazil. Shirley Eaton appears in a scene that she claimed she was never paid for; apparently, the director Jesús Franco had inserted some stock footage of her from one of her films(\" The Girl from Rio\"( 1968)) into the film without telling her. She only found out years later that she had been in a Fu Manchu film.\n", + "Don Sharp: Donald Herman Sharp( 19 April 192114 December 2011) was an Australian- born British film director. 
His best known films were made for Hammer in the 1960s, and included\" The Kiss of the Vampire\"( 1962) and\" Rasputin, the Mad Monk\"( 1966). In 1965 he directed\" The Face of Fu Manchu\", based on the character created by Sax Rohmer, and starring Christopher Lee. Sharp also directed the sequel\" The Brides of Fu Manchu\"( 1966). In the 1980s he was also responsible for several hugely popular miniseries adapted from the novels of Barbara Taylor Bradford.\n", + "Blind Shaft: Blind Shaft is a 2003 film about a pair of brutal con artists operating in the illegal coal mines of present- day northern China. The film was written and directed by Li Yang( 李杨), and is based on Chinese writer Liu Qingbang's short novel\" Shen MuSacred Wood\").\n", "\n", - "Q: Are North Marion High School (Oregon) and Seoul High School both located in the same country?\n", + "Q: Which film came out first, Blind Shaft or The Mask Of Fu Manchu?\n", "A:\n", "\n", "\n", "--------------------------------------------------------------------------------\n", "\u001b[33massistant\u001b[0m (to ragproxyagent):\n", "\n", - "I'm sorry, I do not have enough information about North Marion High School and Seoul High School to provide an answer. Please provide more context or information about the schools.\n", + "Blind Shaft is a 2003 film while The Mask of Fu Manchu is a 1932 pre-Code adventure film. Thus, The Mask of Fu Manchu came out earlier than Blind Shaft. So the answer is: The Mask of Fu Manchu.\n", "\n", "--------------------------------------------------------------------------------\n", - "\u001b[32mUpdating context and resetting conversation.\u001b[0m\n", - "doc_ids: [['doc_74', 'doc_68', 'doc_75', 'doc_76', 'doc_19596', 'doc_23187', 'doc_7274', 'doc_11693', 'doc_10593', 'doc_11636']]\n", + "\n", + "\n", + ">>>>>>>>>>>> Below are outputs of Case 2 <<<<<<<<<<<<\n", + "\n", + "\n", + "doc_ids: [['doc_74', 'doc_76', 'doc_68', 'doc_42890', 'doc_75', 'doc_19596', 'doc_45135', 'doc_995', 'doc_7274', 'doc_23187']]\n", "\u001b[32mAdding doc_id doc_74 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id doc_76 to context.\u001b[0m\n", "\u001b[32mAdding doc_id doc_68 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id doc_42890 to context.\u001b[0m\n", "\u001b[32mAdding doc_id doc_75 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_76 to context.\u001b[0m\n", "\u001b[32mAdding doc_id doc_19596 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_23187 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id doc_45135 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id doc_995 to context.\u001b[0m\n", "\u001b[32mAdding doc_id doc_7274 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_11693 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_10593 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_11636 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id doc_23187 to context.\u001b[0m\n", "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", "\n", "You're a retrieve augmented chatbot. You answer user's questions based on your own knowledge and the context provided by the user. You must think step-by-step.\n", @@ -4035,27 +2822,17 @@ "Second, please complete the answer by thinking step-by-step.\n", "\n", "Context:\n", - "Princess Josephine of Baden: Princess Josephine Friederike Luise of Baden( 21 October 1813 – 19 June 1900) was born at Mannheim, the second daughter of Charles, Grand Duke of Baden and his wife, Stéphanie de Beauharnais. 
Through her son, Carol I, she is the ancestress of the Romanian royal family and the Yugoslav Royal family. Through her younger daughter Marie, she is also the ancestress of the Belgian royal family and the Grand Ducal family of Luxembourg.\n", - "Archduchess Marie Astrid of Austria: Archduchess Marie Astrid of Austria( née\" Princess Marie Astrid of Luxembourg\"; born 17 February 1954 at Castle Betzdorf) is the elder daughter and eldest child of Grand Duke Jean of Luxembourg and Joséphine- Charlotte of Belgium, and the wife of Archduke Carl Christian of Austria.\n", - "Princess Joséphine Marie of Belgium: Princess Joséphine Marie of Belgium( 30 November 1870 — 18 January 1871) was the daughter of Prince Philippe, Count of Flanders, and Princess Marie of Hohenzollern- Sigmaringen. She was the older twin to Princess Henriette of Belgium. In 1872 Joséphine Marie's mother gave birth to another daughter, who was named Joséphine in her memory.\n", - "Princess Joséphine Marie of Belgium: Princess Joséphine Marie of Belgium (30 November 1870 — 18 January 1871) was the daughter of Prince Philippe, Count of Flanders, and Princess Marie of Hohenzollern-Sigmaringen. She was the older twin to Princess Henriette of Belgium. In 1872 Joséphine Marie's mother gave birth to another daughter, who was named Joséphine in her memory.\n", - "Princess Joséphine Caroline of Belgium: Princess Joséphine Caroline of Belgium( 18 October 1872 – 6 January 1958) was the youngest daughter of Prince Philippe, Count of Flanders and Princess Marie of Hohenzollern- Sigmaringen. She was an older sister of Albert I of Belgium.\n", - "Federal University of Maranhão: The Federal University of Maranhão( UFMA) is a federal university in the northeastern state of Maranhão, Brazil.\n", - "Princess Margaretha of Liechtenstein: Princess Margaretha of Liechtenstein( born Princess Margaretha of Luxembourg on 15 May 1957) is the fourth child and second and youngest daughter of Grand Duke Jean of Luxembourg and Princess Joséphine- Charlotte of Belgium. As the sister of Grand Duke Henri of Luxembourg and the sister- in- law of Prince Hans- Adam II of Liechtenstein, she is a princess of two current realms and a member of the Luxembourg and Liechtenstein reigning dynasties.\n", - "Federal University, Lokoja: The Federal University, Lokoja, popularly known as Fulokoja, is a federal university in the confluence city of Lokoja, the capital of Kogi State, North- Central Nigeria. Lokoja lies at the confluence of the Niger and Benue rivers. The Federal University, Lokoja was established in February 2011 by the Federal Government of Nigeria as a result of indispensable need to create more universities in the country.\n", - "Princess Luisa Maria of Belgium, Archduchess of Austria-Este: Princess Luisa Maria of Belgium, Archduchess of Austria- Este( Luisa Maria Anna Martine Pilar; born 11 October 1995) is the fourth child and second daughter of Lorenz, Archduke of Austria- Este, and Princess Astrid of Belgium. She was born at the Saint Jean Hospital in Brussels, Belgium, and is currently ninth in line to the Belgian throne.\n", - "Princess Sophie of Greece and Denmark: Princess Sophie of Greece and Denmark( 26 June 1914 – 24 November 2001) was the fourth child and youngest daughter of Prince Andrew of Greece and Denmark and Princess Alice of Battenberg. The Duke of Edinburgh is her younger brother. 
Sophie was born at the villa Mon Repos on the island of Corfu in Greece.\n", "Seoul High School: Seoul High School( Hangul: 서울고등학교) is a public high school located in the heart of Seoul, South Korea.\n", + "North Marion High School (Oregon): North Marion High School is a public high school in Aurora, Oregon, United States. The school is part of the North Marion School District with all four schools being located on the same campus. The school draws students from the cities of Aurora, Hubbard, and Donald as well as the communities of Broadacres and Butteville.\n", "Marion High School (Kansas): Marion High School is a public high school in Marion, Kansas, USA. It is one of three schools operated by Marion USD 408, and is the sole high school in the district.\n", + "Northwest High School: Northwest High School or North West High School may refer to:\n", "Marion High School (Indiana): Marion High School is a high school in Marion, Indiana with more than 1,000 students.\n", - "North Marion High School (Oregon): North Marion High School is a public high school in Aurora, Oregon, United States. The school is part of the North Marion School District with all four schools being located on the same campus. The school draws students from the cities of Aurora, Hubbard, and Donald as well as the communities of Broadacres and Butteville.\n", "Macon County High School: Macon County High School is located in Montezuma, Georgia, United States, which is a part of Macon County. Enrollment as of the 2017- 2018 school year is 491.\n", - "International School of Koje: International School of Koje( ISK) is a privately funded international school located in Geoje, South Korea.\n", + "Canyon High School (Ogden, Utah): Canyon High School was a high school in Ogden, Utah.\n", + "Northside High School: Northside High School or North Side High School or Northside Christian School or similar can refer to:\n", "Springs Boys' High School: Springs Boys' High School is a high school in Springs, Gauteng, South Africa.\n", - "Cherokee High School (Georgia): Cherokee High School is one of six public high schools of the Cherokee County School District in Cherokee County, Georgia, United States. It is located in Canton. Established in 1956, it replaced Canton High School, the county's first high school. There are six high schools in the Cherokee County School District: Etowah High School, Sequoyah High School, Woodstock High School, Creekview High School, and River Ridge High School\n", - "Yoon Jong-hwan: Yoon Jong- Hwan( born 16 February 1973 in Gwangju, South Korea) is a South Korean manager and former football player.\n", - "Hikarigaoka Girls' High School: It was established in 1963.\n", - "I'm sorry, I do not have enough information about North Marion High School and Seoul High School to provide an answer.\n", + "International School of Koje: International School of Koje( ISK) is a privately funded international school located in Geoje, South Korea.\n", + "\n", "Q: Are North Marion High School (Oregon) and Seoul High School both located in the same country?\n", "A:\n", "\n", @@ -4063,14 +2840,13 @@ "--------------------------------------------------------------------------------\n", "\u001b[33massistant\u001b[0m (to ragproxyagent):\n", "\n", - "No, North Marion High School is located in the United States, specifically in Oregon, while Seoul High School is located in South Korea. 
They are not located in the same country.\n", + "No, North Marion High School (Oregon) is located in the United States, specifically in the state of Oregon, while Seoul High School is located in South Korea. So they are not in the same country.\n", "\n", "--------------------------------------------------------------------------------\n", "\u001b[32mUpdating context and resetting conversation.\u001b[0m\n", - "doc_ids: [['doc_68', 'doc_74', 'doc_76', 'doc_75', 'doc_19596', 'doc_69', 'doc_7274', 'doc_24819', 'doc_995', 'doc_23187']]\n", - "\u001b[32mAdding doc_id doc_69 to context.\u001b[0m\n", + "doc_ids: [['doc_76', 'doc_68', 'doc_74', 'doc_75', 'doc_19596', 'doc_42890', 'doc_24819', 'doc_69', 'doc_995', 'doc_7274']]\n", "\u001b[32mAdding doc_id doc_24819 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_995 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id doc_69 to context.\u001b[0m\n", "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", "\n", "You're a retrieve augmented chatbot. You answer user's questions based on your own knowledge and the context provided by the user. You must think step-by-step.\n", @@ -4106,31 +2882,18 @@ "Second, please complete the answer by thinking step-by-step.\n", "\n", "Context:\n", - "Princess Josephine of Baden: Princess Josephine Friederike Luise of Baden( 21 October 1813 – 19 June 1900) was born at Mannheim, the second daughter of Charles, Grand Duke of Baden and his wife, Stéphanie de Beauharnais. Through her son, Carol I, she is the ancestress of the Romanian royal family and the Yugoslav Royal family. Through her younger daughter Marie, she is also the ancestress of the Belgian royal family and the Grand Ducal family of Luxembourg.\n", - "Archduchess Marie Astrid of Austria: Archduchess Marie Astrid of Austria( née\" Princess Marie Astrid of Luxembourg\"; born 17 February 1954 at Castle Betzdorf) is the elder daughter and eldest child of Grand Duke Jean of Luxembourg and Joséphine- Charlotte of Belgium, and the wife of Archduke Carl Christian of Austria.\n", - "Princess Joséphine Marie of Belgium: Princess Joséphine Marie of Belgium( 30 November 1870 — 18 January 1871) was the daughter of Prince Philippe, Count of Flanders, and Princess Marie of Hohenzollern- Sigmaringen. She was the older twin to Princess Henriette of Belgium. In 1872 Joséphine Marie's mother gave birth to another daughter, who was named Joséphine in her memory.\n", - "Princess Joséphine Marie of Belgium: Princess Joséphine Marie of Belgium (30 November 1870 — 18 January 1871) was the daughter of Prince Philippe, Count of Flanders, and Princess Marie of Hohenzollern-Sigmaringen. She was the older twin to Princess Henriette of Belgium. In 1872 Joséphine Marie's mother gave birth to another daughter, who was named Joséphine in her memory.\n", - "Princess Joséphine Caroline of Belgium: Princess Joséphine Caroline of Belgium( 18 October 1872 – 6 January 1958) was the youngest daughter of Prince Philippe, Count of Flanders and Princess Marie of Hohenzollern- Sigmaringen. She was an older sister of Albert I of Belgium.\n", - "Federal University of Maranhão: The Federal University of Maranhão( UFMA) is a federal university in the northeastern state of Maranhão, Brazil.\n", - "Princess Margaretha of Liechtenstein: Princess Margaretha of Liechtenstein( born Princess Margaretha of Luxembourg on 15 May 1957) is the fourth child and second and youngest daughter of Grand Duke Jean of Luxembourg and Princess Joséphine- Charlotte of Belgium. 
As the sister of Grand Duke Henri of Luxembourg and the sister- in- law of Prince Hans- Adam II of Liechtenstein, she is a princess of two current realms and a member of the Luxembourg and Liechtenstein reigning dynasties.\n", - "Federal University, Lokoja: The Federal University, Lokoja, popularly known as Fulokoja, is a federal university in the confluence city of Lokoja, the capital of Kogi State, North- Central Nigeria. Lokoja lies at the confluence of the Niger and Benue rivers. The Federal University, Lokoja was established in February 2011 by the Federal Government of Nigeria as a result of indispensable need to create more universities in the country.\n", - "Princess Luisa Maria of Belgium, Archduchess of Austria-Este: Princess Luisa Maria of Belgium, Archduchess of Austria- Este( Luisa Maria Anna Martine Pilar; born 11 October 1995) is the fourth child and second daughter of Lorenz, Archduke of Austria- Este, and Princess Astrid of Belgium. She was born at the Saint Jean Hospital in Brussels, Belgium, and is currently ninth in line to the Belgian throne.\n", - "Princess Sophie of Greece and Denmark: Princess Sophie of Greece and Denmark( 26 June 1914 – 24 November 2001) was the fourth child and youngest daughter of Prince Andrew of Greece and Denmark and Princess Alice of Battenberg. The Duke of Edinburgh is her younger brother. Sophie was born at the villa Mon Repos on the island of Corfu in Greece.\n", "Seoul High School: Seoul High School( Hangul: 서울고등학교) is a public high school located in the heart of Seoul, South Korea.\n", + "North Marion High School (Oregon): North Marion High School is a public high school in Aurora, Oregon, United States. The school is part of the North Marion School District with all four schools being located on the same campus. The school draws students from the cities of Aurora, Hubbard, and Donald as well as the communities of Broadacres and Butteville.\n", "Marion High School (Kansas): Marion High School is a public high school in Marion, Kansas, USA. It is one of three schools operated by Marion USD 408, and is the sole high school in the district.\n", + "Northwest High School: Northwest High School or North West High School may refer to:\n", "Marion High School (Indiana): Marion High School is a high school in Marion, Indiana with more than 1,000 students.\n", - "North Marion High School (Oregon): North Marion High School is a public high school in Aurora, Oregon, United States. The school is part of the North Marion School District with all four schools being located on the same campus. The school draws students from the cities of Aurora, Hubbard, and Donald as well as the communities of Broadacres and Butteville.\n", "Macon County High School: Macon County High School is located in Montezuma, Georgia, United States, which is a part of Macon County. 
Enrollment as of the 2017- 2018 school year is 491.\n", - "International School of Koje: International School of Koje( ISK) is a privately funded international school located in Geoje, South Korea.\n", + "Canyon High School (Ogden, Utah): Canyon High School was a high school in Ogden, Utah.\n", + "Northside High School: Northside High School or North Side High School or Northside Christian School or similar can refer to:\n", "Springs Boys' High School: Springs Boys' High School is a high school in Springs, Gauteng, South Africa.\n", - "Cherokee High School (Georgia): Cherokee High School is one of six public high schools of the Cherokee County School District in Cherokee County, Georgia, United States. It is located in Canton. Established in 1956, it replaced Canton High School, the county's first high school. There are six high schools in the Cherokee County School District: Etowah High School, Sequoyah High School, Woodstock High School, Creekview High School, and River Ridge High School\n", - "Yoon Jong-hwan: Yoon Jong- Hwan( born 16 February 1973 in Gwangju, South Korea) is a South Korean manager and former football player.\n", - "Hikarigaoka Girls' High School: It was established in 1963.\n", - "North Marion High School (West Virginia): North Marion High School is a public Double A (\"AA\") high school in the U.S. state of West Virginia, with a current enrollment of 851 students. North Marion High School is located approximately 4 miles from Farmington, West Virginia on US Route 250 north. While it is closer to the city of Mannington, West Virginia, and is often considered to be located in Rachel, West Virginia, the school mailing address is Farmington. Rachel is a small coal mining community located adjacent to the school, and is an unincorporated municipality. North Marion High School is represented as \"Grantville High School\" in the popular alternative history novel \"1632\" by writer Eric Flint. The novel is set in the fictional town of Grantville, which is based on the real town and surroundings of Mannington.\n", + "International School of Koje: International School of Koje( ISK) is a privately funded international school located in Geoje, South Korea.\n", "Anderson High School (Anderson, Indiana): Anderson High School is a public high school located in Anderson, Indiana.\n", - "Northside High School: Northside High School or North Side High School or Northside Christian School or similar can refer to:\n", - "I'm sorry, I do not have enough information about North Marion High School and Seoul High School to provide an answer.\n", - "No, North Marion High School is located in the United States, specifically in Oregon, while Seoul High School is located in South Korea.\n", + "North Marion High School (West Virginia): North Marion High School is a public Double A (\"AA\") high school in the U.S. state of West Virginia, with a current enrollment of 851 students. North Marion High School is located approximately 4 miles from Farmington, West Virginia on US Route 250 north. While it is closer to the city of Mannington, West Virginia, and is often considered to be located in Rachel, West Virginia, the school mailing address is Farmington. Rachel is a small coal mining community located adjacent to the school, and is an unincorporated municipality. North Marion High School is represented as \"Grantville High School\" in the popular alternative history novel \"1632\" by writer Eric Flint. 
The novel is set in the fictional town of Grantville, which is based on the real town and surroundings of Mannington.\n", "Q: Are North Marion High School (Oregon) and Seoul High School both located in the same country?\n", "A:\n", "\n", @@ -4138,7 +2901,7 @@ "--------------------------------------------------------------------------------\n", "\u001b[33massistant\u001b[0m (to ragproxyagent):\n", "\n", - "No, North Marion High School is located in the United States, specifically in Oregon, while Seoul High School is located in South Korea. So the answer is: no.\n", + "North Marion High School (Oregon) is located in the country of United States. Seoul High School is located in the country of South Korea. Thus, they are not in the same country. So the answer is: no.\n", "\n", "--------------------------------------------------------------------------------\n" ] @@ -4172,7 +2935,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/notebook/agentchat_groupchat_RAG.ipynb b/notebook/agentchat_groupchat_RAG.ipynb index 89407d9933d4..122d97bb96f2 100644 --- a/notebook/agentchat_groupchat_RAG.ipynb +++ b/notebook/agentchat_groupchat_RAG.ipynb @@ -22,7 +22,7 @@ "\n", "AutoGen requires `Python>=3.8`. To run this notebook example, please install:\n", "```bash\n", - "pip install pyautogen\n", + "pip install \"pyautogen[retrievechat]~=0.2.0b5\"\n", "```" ] }, @@ -33,7 +33,7 @@ "outputs": [], "source": [ "%%capture --no-stderr\n", - "# %pip install pyautogen[retrievechat]~=0.1.11" + "# %pip install \"pyautogen[retrievechat]~=0.2.0b5\"" ] }, { @@ -48,7 +48,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -110,9 +110,20 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/lijiang1/anaconda3/envs/autogen/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "/home/lijiang1/anaconda3/envs/autogen/lib/python3.10/site-packages/torch/cuda/__init__.py:138: UserWarning: CUDA initialization: The NVIDIA driver on your system is too old (found version 11060). Please update your GPU driver by downloading and installing a new version from the URL: http://www.nvidia.com/Download/index.aspx Alternatively, go to: https://pytorch.org to install a PyTorch version that has been compiled with your version of the CUDA driver. 
(Triggered internally at ../c10/cuda/CUDAFunctions.cpp:108.)\n", + " return torch._C._cuda_getDeviceCount() > 0\n" + ] + } + ], "source": [ "from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent\n", "from autogen import AssistantAgent\n", @@ -120,7 +131,7 @@ "\n", "llm_config = {\n", " \"timeout\": 60,\n", - " \"seed\": 42,\n", + " \"cache_seed\": 42,\n", " \"config_list\": config_list,\n", " \"temperature\": 0,\n", "}\n", @@ -131,16 +142,17 @@ "boss = autogen.UserProxyAgent(\n", " name=\"Boss\",\n", " is_termination_msg=termination_msg,\n", - " human_input_mode=\"TERMINATE\",\n", + " human_input_mode=\"NEVER\",\n", " system_message=\"The boss who ask questions and give tasks.\",\n", " code_execution_config=False, # we don't want to execute code in this case.\n", + " default_auto_reply=\"Reply `TERMINATE` if the task is done.\",\n", ")\n", "\n", "boss_aid = RetrieveUserProxyAgent(\n", " name=\"Boss_Assistant\",\n", " is_termination_msg=termination_msg,\n", " system_message=\"Assistant who has extra content retrieval power for solving difficult problems.\",\n", - " human_input_mode=\"TERMINATE\",\n", + " human_input_mode=\"NEVER\",\n", " max_consecutive_auto_reply=3,\n", " retrieve_config={\n", " \"task\": \"code\",\n", @@ -177,6 +189,7 @@ "\n", "PROBLEM = \"How to use spark for parallel training in FLAML? Give me sample code.\"\n", "\n", + "\n", "def _reset_agents():\n", " boss.reset()\n", " boss_aid.reset()\n", @@ -184,10 +197,11 @@ " pm.reset()\n", " reviewer.reset()\n", "\n", + "\n", "def rag_chat():\n", " _reset_agents()\n", " groupchat = autogen.GroupChat(\n", - " agents=[boss_aid, coder, pm, reviewer], messages=[], max_round=12\n", + " agents=[boss_aid, coder, pm, reviewer], messages=[], max_round=12, speaker_selection_method=\"round_robin\"\n", " )\n", " manager = autogen.GroupChatManager(groupchat=groupchat, llm_config=llm_config)\n", "\n", @@ -198,10 +212,15 @@ " n_results=3,\n", " )\n", "\n", + "\n", "def norag_chat():\n", " _reset_agents()\n", " groupchat = autogen.GroupChat(\n", - " agents=[boss, coder, pm, reviewer], messages=[], max_round=12\n", + " agents=[boss, coder, pm, reviewer],\n", + " messages=[],\n", + " max_round=12,\n", + " speaker_selection_method=\"auto\",\n", + " allow_repeat_speaker=False,\n", " )\n", " manager = autogen.GroupChatManager(groupchat=groupchat, llm_config=llm_config)\n", "\n", @@ -211,8 +230,10 @@ " message=PROBLEM,\n", " )\n", "\n", + "\n", "def call_rag_chat():\n", " _reset_agents()\n", + "\n", " # In this case, we will have multiple user proxy agents and we don't initiate the chat\n", " # with RAG user proxy agent.\n", " # In order to use RAG user proxy agent, we need to wrap RAG agents in a function and call\n", @@ -227,9 +248,9 @@ " else:\n", " ret_msg = boss_aid.generate_init_message(message, n_results=n_results)\n", " return ret_msg if ret_msg else message\n", - " \n", - " boss_aid.human_input_mode = \"NEVER\" # Disable human input for boss_aid since it only retrieves content.\n", - " \n", + "\n", + " boss_aid.human_input_mode = \"NEVER\" # Disable human input for boss_aid since it only retrieves content.\n", + "\n", " llm_config = {\n", " \"functions\": [\n", " {\n", @@ -249,7 +270,7 @@ " ],\n", " \"config_list\": config_list,\n", " \"timeout\": 60,\n", - " \"seed\": 42,\n", + " \"cache_seed\": 42,\n", " }\n", "\n", " for agent in [coder, pm, reviewer]:\n", @@ -265,7 +286,11 @@ " )\n", "\n", " groupchat = autogen.GroupChat(\n", - " agents=[boss, coder, pm, reviewer], messages=[], max_round=12\n", + " 
agents=[boss, coder, pm, reviewer],\n", + " messages=[],\n", + " max_round=12,\n", + " speaker_selection_method=\"random\",\n", + " allow_repeat_speaker=False,\n", " )\n", " manager = autogen.GroupChatManager(groupchat=groupchat, llm_config=llm_config)\n", "\n", @@ -289,7 +314,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -301,9 +326,6 @@ "How to use spark for parallel training in FLAML? Give me sample code.\n", "\n", "--------------------------------------------------------------------------------\n", - "How to use spark for parallel training in FLAML? Give me sample code.\n", - "\n", - "--------------------------------------------------------------------------------\n", "\u001b[33mSenior_Python_Engineer\u001b[0m (to chat_manager):\n", "\n", "To use Spark for parallel training in FLAML, you can use the `SparkTrials` class provided by FLAML. Here is a sample code:\n", @@ -329,7 +351,7 @@ " task=\"classification\",\n", " n_jobs=1,\n", " ensemble_size=0,\n", - " max_time=60,\n", + " max_trials=10,\n", " trials=SparkTrials(parallelism=2),\n", ")\n", "\n", @@ -339,79 +361,56 @@ "# Evaluate the model\n", "print(\"Best model:\", automl.best_model)\n", "print(\"Best hyperparameters:\", automl.best_config)\n", - "print(\"Test accuracy:\", automl.score(X_test, y_test))\n", + "print(\"Test accuracy:\", automl.score(X_test=X_test, y_test=y_test))\n", "\n", "# Terminate\n", "TERMINATE\n", "```\n", "\n", - "In this code, we first load the credit dataset. Then, we define the search space for the hyperparameters. We create an `AutoML` instance with `SparkTrials` as the `trials` parameter. We set the `parallelism` parameter to 2 to use 2 Spark workers for parallel training. Finally, we fit the model and evaluate it.\n", + "In this code, we first load the credit dataset. Then, we define the search space for the hyperparameters. We create an `AutoML` instance with `SparkTrials` as the `trials` parameter. We set the `parallelism` parameter to 2, which means that FLAML will use 2 Spark workers to run the trials in parallel. Finally, we fit the model and evaluate it.\n", "\n", "--------------------------------------------------------------------------------\n", "\u001b[33mCode_Reviewer\u001b[0m (to chat_manager):\n", "\n", - "Great! This code looks good to me.\n", + "Great! That's a clear and concise example. No further questions from my side.\n", "\n", - "--------------------------------------------------------------------------------\n", + "--------------------------------------------------------------------------------\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ "\u001b[33mProduct_Manager\u001b[0m (to chat_manager):\n", "\n", "Thank you! Let me know if you have any other questions.\n", "\n", "--------------------------------------------------------------------------------\n", - "\u001b[31m\n", - ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", "\u001b[33mBoss\u001b[0m (to chat_manager):\n", "\n", + "Reply `TERMINATE` if the task is done.\n", "\n", + "--------------------------------------------------------------------------------\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "GroupChat select_speaker failed to resolve the next speaker's name. Speaker selection will default to the next speaker in the list. This is because the speaker selection OAI call returned:\n", + "The next role to play is not specified in the conversation. 
Please provide more information.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mSenior_Python_Engineer\u001b[0m (to chat_manager):\n", "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[31m\n", - ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", - "\u001b[33mBoss\u001b[0m (to chat_manager):\n", - "\n", - "\n", - "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[31m\n", - ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", - "\u001b[33mBoss\u001b[0m (to chat_manager):\n", - "\n", - "\n", - "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[31m\n", - ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", - "\u001b[33mBoss\u001b[0m (to chat_manager):\n", - "\n", - "\n", - "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[31m\n", - ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", - "\u001b[33mBoss\u001b[0m (to chat_manager):\n", - "\n", - "\n", - "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[31m\n", - ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", - "\u001b[33mBoss\u001b[0m (to chat_manager):\n", - "\n", - "\n", - "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[31m\n", - ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", - "\u001b[33mBoss\u001b[0m (to chat_manager):\n", - "\n", - "\n", + "TERMINATE\n", "\n", "--------------------------------------------------------------------------------\n", - "\u001b[31m\n", - ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", - "\u001b[33mBoss\u001b[0m (to chat_manager):\n", - "\n", - "\n", + "TERMINATE\n", "\n", "--------------------------------------------------------------------------------\n" ] @@ -432,7 +431,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -446,17 +445,16 @@ "name": "stderr", "output_type": "stream", "text": [ - "INFO:autogen.retrieve_utils:Found 2 chunks.\n" + "Number of requested results 3 is greater than number of elements in index 2, updating n_results = 2\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "doc_ids: [['doc_0', 'doc_1', 'doc_4']]\n", + "doc_ids: [['doc_0', 'doc_1']]\n", "\u001b[32mAdding doc_id doc_0 to context.\u001b[0m\n", "\u001b[32mAdding doc_id doc_1 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_4 to context.\u001b[0m\n", "\u001b[33mBoss_Assistant\u001b[0m (to chat_manager):\n", "\n", "You're a retrieve augmented coding assistant. You answer user's questions based on your own knowledge and the\n", @@ -591,886 +589,208 @@ "[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/integrate_spark.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/integrate_spark.ipynb)\n", "\n", "\n", + "\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mSenior_Python_Engineer\u001b[0m (to chat_manager):\n", + "\n", + "To use Spark for parallel training in FLAML, you can activate Spark as the parallel backend during parallel tuning in both AutoML and Hyperparameter Tuning, by setting the `use_spark` to `true`. FLAML will dispatch your job to the distributed Spark backend using `joblib-spark`. 
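The same switch applies to standalone tuning with `flaml.tune`. A minimal sketch (the toy objective and the one-dimensional search space here are illustrative placeholders, not from the retrieved docs):

```python
import flaml.tune


def func_to_tune(config):
    # Toy objective; flaml.tune expects a dict keyed by the metric name.
    return {"r2": -((config["x"] - 2) ** 2)}


analysis = flaml.tune.run(
    func_to_tune,
    {"x": flaml.tune.uniform(0, 4)},  # search space
    metric="r2",
    mode="max",
    num_samples=8,
    use_spark=True,  # dispatch trials to Spark executors via joblib-spark
)
print(analysis.best_config)
```

The `flaml.AutoML` counterpart takes the same flag, as the next snippet shows.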
Here is an example code snippet for using parallel Spark jobs:\n", "\n", "```python\n", "import flaml\n", "automl_experiment = flaml.AutoML()\n", "automl_settings = {\n", " \"time_budget\": 30,\n", " \"metric\": \"r2\",\n", " \"task\": \"regression\",\n", " \"n_concurrent_trials\": 2,\n", " \"use_spark\": True,\n", " \"force_cancel\": True, # Activating the force_cancel option can immediately halt Spark jobs once they exceed the allocated time_budget.\n", "}\n", "\n", "automl_experiment.fit(\n", " dataframe=dataframe,\n", " label=label,\n", " **automl_settings,\n", ")\n", "```\n", "\n", "Note that you should not set `use_spark` to `true` when applying AutoML and Tuning for Spark Data. This is because only SparkML models will be used for Spark Data in AutoML and Tuning.
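For contrast, a minimal sketch of the Spark-data path, reusing the `psdf` pandas-on-spark dataframe and `label` prepared as in the Spark ML estimators section above (both placeholders for your own data):

```python
# Spark data: a SparkML estimator such as lgbm_spark already trains in
# parallel on the cluster, so use_spark stays at its default of False.
spark_settings = {
    "time_budget": 30,
    "metric": "r2",
    "task": "regression",
    "estimator_list": ["lgbm_spark"],
}
automl_experiment.fit(dataframe=psdf, label=label, **spark_settings)
```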
As SparkML models run in parallel, there is no need to distribute them with `use_spark` again.\n", "\n", + "I hope this helps! Let me know if you have any further questions.\n", "\n", - "#### Details of Autologging\n", - "Autolog artifacts typically include model parameters, model files, and runtime metrics like the following:\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mProduct_Manager\u001b[0m (to chat_manager):\n", "\n", + "TERMINATE\n", "\n", - "![Autolog Example](Images/autolog_example.png)\n", + "--------------------------------------------------------------------------------\n" + ] + } + ], + "source": [ + "rag_chat()\n", + "# type exit to terminate the chat" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Call RetrieveUserProxyAgent while init chat with another user proxy agent\n", + "Sometimes, there might be a need to use RetrieveUserProxyAgent in group chat without initializing the chat with it. In such scenarios, it becomes essential to create a function that wraps the RAG agents and allows them to be called from other agents." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mBoss\u001b[0m (to chat_manager):\n", "\n", + "How to use spark for parallel training in FLAML? Give me sample code.\n", "\n", - "Artifacts can differ among various machine learning libraries. More detailed information can be found [here](https://mlflow.org/docs/latest/tracking.html#automatic-logging).\n", + "--------------------------------------------------------------------------------\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mProduct_Manager\u001b[0m (to chat_manager):\n", "\n", + "To use Spark for parallel training in FLAML, you can follow these steps:\n", "\n", + "1. Install PySpark and FLAML on your machine.\n", + "2. Start a Spark cluster using the `pyspark` command.\n", + "3. Import the necessary libraries and initialize a SparkSession object.\n", + "4. Load your data into a Spark DataFrame.\n", + "5. Define your search space and search strategy using FLAML's API.\n", + "6. Create a SparkEstimator object and pass it to FLAML's `fit()` method.\n", "\n", + "Here's some sample code to get you started:\n", "\n", - "## Plot Experiment Result\n", - "The `flaml.visualization` module provides utility functions for plotting the optimization process using [plotly](https://plotly.com/python/). Leveraging `plotly`, users can interactively explore experiment results. To use these plotting functions, simply provide your optimized `flaml.AutoML` or `flaml.tune.tune.ExperimentAnalysis` object as input. 
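As an illustration, a minimal sketch assuming `automl` is an already-fitted `flaml.AutoML` instance (and that, per the description above, these functions return plotly figures):

```python
import flaml.visualization as fviz

# How the best objective value evolved over trials.
fig = fviz.plot_optimization_history(automl)
fig.show()

# Feature importance for the best model found.
fig = fviz.plot_feature_importance(automl)
fig.show()
```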
Optional parameters can be added using keyword arguments.\n", + "```python\n", + "from pyspark.sql import SparkSession\n", + "from flaml import AutoML\n", + "from flaml.data import get_output_from_log\n", "\n", - "Avaliable plotting functions:\n", - "- `plot_optimization_history`: Plot optimization history of all trials in the experiment.\n", - "- `plot_feature_importance`: Plot importance for each feature in the dataset.\n", - "- `plot_parallel_coordinate`: Plot the high-dimensional parameter relationships in the experiment.\n", - "- `plot_contour`: Plot the parameter relationship as contour plot in the experiment.\n", - "- `plot_edf`: Plot the objective value EDF (empirical distribution function) of the experiment.\n", - "- `plot_timeline`: Plot the timeline of the experiment.\n", - "- `plot_slice`: Plot the parameter relationship as slice plot in a study.\n", + "# Initialize a SparkSession object\n", + "spark = SparkSession.builder.appName(\"FLAML-Spark\").getOrCreate()\n", "\n", - "### Figure Examples\n", - "![Plot Examples](Images/plot_samples.png)\n", + "# Load your data into a Spark DataFrame\n", + "data = spark.read.format(\"csv\").option(\"header\", \"true\").load(\"path/to/data.csv\")\n", "\n", - "Check out our example [notebook](../../notebook/trident/automl_plot.ipynb) for a preview of all interactive plots.\n", + "# Define your search space and search strategy\n", + "search_space = {\n", + " \"n_estimators\": {\"domain\": range(10, 100)},\n", + " \"max_depth\": {\"domain\": range(1, 10)},\n", + " \"learning_rate\": {\"domain\": [0.001, 0.01, 0.1]},\n", + "}\n", + "search_strategy = \"skopt\"\n", "\n", + "# Create a SparkEstimator object\n", + "from pyspark.ml.classification import GBTClassifier\n", + "estimator = GBTClassifier()\n", "\n", + "# Pass the SparkEstimator object to FLAML's fit() method\n", + "automl = AutoML()\n", + "automl.fit(\n", + " X_train=data,\n", + " estimator=estimator,\n", + " task=\"classification\",\n", + " search_space=search_space,\n", + " search_alg=search_strategy,\n", + " n_jobs=-1,\n", + ")\n", "\n", + "# Get the best model and its hyperparameters\n", + "best_model = automl.model\n", + "best_params = automl.best_config\n", "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[32mAdding doc_id doc_1 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_4 to context.\u001b[0m\n", - "\u001b[33mBoss_Assistant\u001b[0m (to chat_manager):\n", + "# Print the results\n", + "print(f\"Best model: {best_model}\")\n", + "print(f\"Best hyperparameters: {best_params}\")\n", "\n", - "You're a retrieve augmented coding assistant. You answer user's questions based on your own knowledge and the\n", - "context provided by the user.\n", - "If you can't answer the question with or without the current context, you should reply exactly `UPDATE CONTEXT`.\n", - "For code generation, you must obey the following rules:\n", - "Rule 1. You MUST NOT install any packages because all the packages needed are already installed.\n", - "Rule 2. You must follow the formats below to write your code:\n", - "```language\n", - "# your code\n", + "# Stop the SparkSession object\n", + "spark.stop()\n", "```\n", "\n", - "User's question is: How to use spark for parallel training in FLAML? Give me sample code.\n", - "\n", - "Context is: # Integrate - Spark\n", - "\n", - "FLAML has integrated Spark for distributed training. 
There are two main aspects of integration with Spark:\n", - "- Use Spark ML estimators for AutoML.\n", - "- Use Spark to run training in parallel spark jobs.\n", - "\n", - "## Spark ML Estimators\n", - "\n", - "FLAML integrates estimators based on Spark ML models. These models are trained in parallel using Spark, so we called them Spark estimators. To use these models, you first need to organize your data in the required format.\n", - "\n", - "### Data\n", + "Note that the `n_jobs` parameter is set to `-1` to use all available cores on the Spark cluster. You can adjust this value to control the level of parallelism. Also, the `get_output_from_log()` function can be used to extract the results from the FLAML log file. \n", "\n", - "For Spark estimators, AutoML only consumes Spark data. FLAML provides a convenient function `to_pandas_on_spark` in the `flaml.automl.spark.utils` module to convert your data into a pandas-on-spark (`pyspark.pandas`) dataframe/series, which Spark estimators require.\n", + "TERMINATE\n", "\n", - "This utility function takes data in the form of a `pandas.Dataframe` or `pyspark.sql.Dataframe` and converts it into a pandas-on-spark dataframe. It also takes `pandas.Series` or `pyspark.sql.Dataframe` and converts it into a [pandas-on-spark](https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/index.html) series. If you pass in a `pyspark.pandas.Dataframe`, it will not make any changes.\n", + "--------------------------------------------------------------------------------\n", + "To use Spark for parallel training in FLAML, you can follow these steps:\n", "\n", - "This function also accepts optional arguments `index_col` and `default_index_type`.\n", - "- `index_col` is the column name to use as the index, default is None.\n", - "- `default_index_type` is the default index type, default is \"distributed-sequence\". More info about default index type could be found on Spark official [documentation](https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/options.html#default-index-type)\n", + "1. Install PySpark and FLAML on your machine.\n", + "2. Start a Spark cluster using the `pyspark` command.\n", + "3. Import the necessary libraries and initialize a SparkSession object.\n", + "4. Load your data into a Spark DataFrame.\n", + "5. Define your search space and search strategy using FLAML's API.\n", + "6. Create a SparkEstimator object and pass it to FLAML's `fit()` method.\n", "\n", - "Here is an example code snippet for Spark Data:\n", + "Here's some sample code to get you started:\n", "\n", "```python\n", - "import pandas as pd\n", - "from flaml.automl.spark.utils import to_pandas_on_spark\n", - "# Creating a dictionary\n", - "data = {\"Square_Feet\": [800, 1200, 1800, 1500, 850],\n", - " \"Age_Years\": [20, 15, 10, 7, 25],\n", - " \"Price\": [100000, 200000, 300000, 240000, 120000]}\n", - "\n", - "# Creating a pandas DataFrame\n", - "dataframe = pd.DataFrame(data)\n", - "label = \"Price\"\n", + "from pyspark.sql import SparkSession\n", + "from flaml import AutoML\n", + "from flaml.data import get_output_from_log\n", "\n", - "# Convert to pandas-on-spark dataframe\n", - "psdf = to_pandas_on_spark(dataframe)\n", - "```\n", + "# Initialize a SparkSession object\n", + "spark = SparkSession.builder.appName(\"FLAML-Spark\").getOrCreate()\n", "\n", - "To use Spark ML models you need to format your data appropriately. 
Specifically, use [`VectorAssembler`](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.VectorAssembler.html) to merge all feature columns into a single vector column.\n", + "# Load your data into a Spark DataFrame\n", + "data = spark.read.format(\"csv\").option(\"header\", \"true\").load(\"path/to/data.csv\")\n", "\n", - "Here is an example of how to use it:\n", - "```python\n", - "from pyspark.ml.feature import VectorAssembler\n", - "columns = psdf.columns\n", - "feature_cols = [col for col in columns if col != label]\n", - "featurizer = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n", - "psdf = featurizer.transform(psdf.to_spark(index_col=\"index\"))[\"index\", \"features\"]\n", - "```\n", + "# Define your search space and search strategy\n", + "search_space = {\n", + " \"n_estimators\": {\"domain\": range(10, 100)},\n", + " \"max_depth\": {\"domain\": range(1, 10)},\n", + " \"learning_rate\": {\"domain\": [0.001, 0.01, 0.1]},\n", + "}\n", + "search_strategy = \"skopt\"\n", "\n", - "Later in conducting the experiment, use your pandas-on-spark data like non-spark data and pass them using `X_train, y_train` or `dataframe, label`.\n", + "# Create a SparkEstimator object\n", + "from pyspark.ml.classification import GBTClassifier\n", + "estimator = GBTClassifier()\n", "\n", - "### Estimators\n", - "#### Model List\n", - "- `lgbm_spark`: The class for fine-tuning Spark version LightGBM models, using [SynapseML](https://microsoft.github.io/SynapseML/docs/features/lightgbm/about/) API.\n", + "# Pass the SparkEstimator object to FLAML's fit() method\n", + "automl = AutoML()\n", + "automl.fit(\n", + " X_train=data,\n", + " estimator=estimator,\n", + " task=\"classification\",\n", + " search_space=search_space,\n", + " search_alg=search_strategy,\n", + " n_jobs=-1,\n", + ")\n", "\n", - "#### Usage\n", - "First, prepare your data in the required format as described in the previous section.\n", + "# Get the best model and its hyperparameters\n", + "best_model = automl.model\n", + "best_params = automl.best_config\n", "\n", - "By including the models you intend to try in the `estimators_list` argument to `flaml.automl`, FLAML will start trying configurations for these models. 
If your input is Spark data, FLAML will also use estimators with the `_spark` postfix by default, even if you haven't specified them.\n", + "# Print the results\n", + "print(f\"Best model: {best_model}\")\n", + "print(f\"Best hyperparameters: {best_params}\")\n", "\n", - "Here is an example code snippet using SparkML models in AutoML:\n", - "\n", - "```python\n", - "import flaml\n", - "# prepare your data in pandas-on-spark format as we previously mentioned\n", - "\n", - "automl = flaml.AutoML()\n", - "settings = {\n", - " \"time_budget\": 30,\n", - " \"metric\": \"r2\",\n", - " \"estimator_list\": [\"lgbm_spark\"], # this setting is optional\n", - " \"task\": \"regression\",\n", - "}\n", - "\n", - "automl.fit(\n", - " dataframe=psdf,\n", - " label=label,\n", - " **settings,\n", - ")\n", - "```\n", - "\n", - "\n", - "[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/automl_bankrupt_synapseml.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/automl_bankrupt_synapseml.ipynb)\n", - "\n", - "## Parallel Spark Jobs\n", - "You can activate Spark as the parallel backend during parallel tuning in both [AutoML](/docs/Use-Cases/Task-Oriented-AutoML#parallel-tuning) and [Hyperparameter Tuning](/docs/Use-Cases/Tune-User-Defined-Function#parallel-tuning), by setting the `use_spark` to `true`. FLAML will dispatch your job to the distributed Spark backend using [`joblib-spark`](https://github.com/joblib/joblib-spark).\n", - "\n", - "Please note that you should not set `use_spark` to `true` when applying AutoML and Tuning for Spark Data. This is because only SparkML models will be used for Spark Data in AutoML and Tuning. As SparkML models run in parallel, there is no need to distribute them with `use_spark` again.\n", - "\n", - "All the Spark-related arguments are stated below. These arguments are available in both Hyperparameter Tuning and AutoML:\n", - "\n", - "\n", - "- `use_spark`: boolean, default=False | Whether to use spark to run the training in parallel spark jobs. This can be used to accelerate training on large models and large datasets, but will incur more overhead in time and thus slow down training in some cases. GPU training is not supported yet when use_spark is True. For Spark clusters, by default, we will launch one trial per executor. However, sometimes we want to launch more trials than the number of executors (e.g., local mode). In this case, we can set the environment variable `FLAML_MAX_CONCURRENT` to override the detected `num_executors`. The final number of concurrent trials will be the minimum of `n_concurrent_trials` and `num_executors`.\n", - "- `n_concurrent_trials`: int, default=1 | The number of concurrent trials. When n_concurrent_trials > 1, FLAML performes parallel tuning.\n", - "- `force_cancel`: boolean, default=False | Whether to forcely cancel Spark jobs if the search time exceeded the time budget. 
Spark jobs include parallel tuning jobs and Spark-based model training jobs.\n", - "\n", - "An example code snippet for using parallel Spark jobs:\n", - "```python\n", - "import flaml\n", - "automl_experiment = flaml.AutoML()\n", - "automl_settings = {\n", - " \"time_budget\": 30,\n", - " \"metric\": \"r2\",\n", - " \"task\": \"regression\",\n", - " \"n_concurrent_trials\": 2,\n", - " \"use_spark\": True,\n", - " \"force_cancel\": True, # Activating the force_cancel option can immediately halt Spark jobs once they exceed the allocated time_budget.\n", - "}\n", - "\n", - "automl.fit(\n", - " dataframe=dataframe,\n", - " label=label,\n", - " **automl_settings,\n", - ")\n", - "```\n", - "\n", - "\n", - "[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/integrate_spark.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/integrate_spark.ipynb)\n", - "\n", - "\n", - "```python\n", - "import flaml\n", - "# for flaml.tune\n", - "with mlflow.start_run(run_name=f\"spark_auto_trials_1686631558\"):\n", - " analysis = flaml.tune.run(\n", - " func_to_tune,\n", - " params,\n", - " metric=\"r2\",\n", - " mode=\"max\",\n", - " mlflow_exp_name=\"test_doc\",\n", - " use_spark=True,\n", - " )\n", - "\n", - "# for flaml.automl\n", - "automl_experiment = flaml.AutoML()\n", - "automl_settings = {\n", - " \"metric\": \"r2\",\n", - " \"task\": \"regression\",\n", - " \"use_spark\": True,\n", - " \"mlflow_exp_name\": \"test_doc\",\n", - " \"estimator_list\": [\n", - " \"lgbm\",\n", - " \"rf\",\n", - " \"xgboost\",\n", - " \"extra_tree\",\n", - " \"xgb_limitdepth\",\n", - " ], # catboost does not yet support mlflow autologging\n", - "}\n", - "with mlflow.start_run(run_name=f\"automl_spark_trials_1686631579\"):\n", - " automl_experiment.fit(X_train=train_x, y_train=train_y, **automl_settings)\n", - "```\n", - "\n", - "\n", - "\n", - "### Results\n", - "*Tune Autolog Trials on MLFlow UI*\n", - "\n", - "\n", - "![Tune Autolog Trials on MLFlow UI](Images/tune_trials.png)\n", - "\n", - "\n", - "*AutoML Autolog Trials on MLFlow UI*\n", - "\n", - "\n", - "![AutoML Autolog Trials on MLFlow UI](Images/automl_trials.png)\n", - "\n", - "\n", - "### Differences Between Auto and Manual Logging\n", - "Autologging is managed by MLFlow, while manual logging is maintained by FLAML.\n", - "\n", - "\n", - "#### Details of Manual Logging\n", - "FLAML logs general artifacts for AutoML tasks. Specifically, we log these artifacts:\n", - "\n", - "**`flaml.tune`**\n", - "\n", - "\n", - "![Manual Log Example for Tuning](Images/manual_log_tune.png)\n", - "\n", - "\n", - "- We create a parent run to log the best metric and the best configuration for the entire tuning process.\n", - "- For each trial, we create a child run to log the metric specific to the tune function and the configuration for that trial.\n", - "\n", - "**`flaml.automl`**\n", - "\n", - "\n", - "![Manual Log Example for AutoML](Images/manual_log_automl.png)\n", - "\n", - "\n", - "- We create a parent run to log the results of the experiment. This includes:\n", - " - The configuration of this model.\n", - " - The `best_validation_loss` produced by this model.\n", - " - The `best_iteration` to identify the point at which this model was found.\n", - "- For each state (a specific learner with different hyperparameters), we record the best trial for this model. 
This includes:\n", - " - The configuration of the best trial.\n", - " - The `validation_loss` the best trial produces.\n", - " - The `iter_count` to identify how many trials we have conducted for this state.\n", - " - The `pred_time`, which is the time cost of predicting test data for this model.\n", - " - The `wall_clock_time`, which is the time cost of this state.\n", - " - The `sample_size` to show how much data we sampled in this state.\n", - "Note that we also added these information to autolog AutoML run.\n", - "\n", - "\n", - "#### Details of Autologging\n", - "Autolog artifacts typically include model parameters, model files, and runtime metrics like the following:\n", - "\n", - "\n", - "![Autolog Example](Images/autolog_example.png)\n", - "\n", - "\n", - "Artifacts can differ among various machine learning libraries. More detailed information can be found [here](https://mlflow.org/docs/latest/tracking.html#automatic-logging).\n", - "\n", - "\n", - "\n", - "\n", - "## Plot Experiment Result\n", - "The `flaml.visualization` module provides utility functions for plotting the optimization process using [plotly](https://plotly.com/python/). Leveraging `plotly`, users can interactively explore experiment results. To use these plotting functions, simply provide your optimized `flaml.AutoML` or `flaml.tune.tune.ExperimentAnalysis` object as input. Optional parameters can be added using keyword arguments.\n", - "\n", - "Avaliable plotting functions:\n", - "- `plot_optimization_history`: Plot optimization history of all trials in the experiment.\n", - "- `plot_feature_importance`: Plot importance for each feature in the dataset.\n", - "- `plot_parallel_coordinate`: Plot the high-dimensional parameter relationships in the experiment.\n", - "- `plot_contour`: Plot the parameter relationship as contour plot in the experiment.\n", - "- `plot_edf`: Plot the objective value EDF (empirical distribution function) of the experiment.\n", - "- `plot_timeline`: Plot the timeline of the experiment.\n", - "- `plot_slice`: Plot the parameter relationship as slice plot in a study.\n", - "\n", - "### Figure Examples\n", - "![Plot Examples](Images/plot_samples.png)\n", - "\n", - "Check out our example [notebook](../../notebook/trident/automl_plot.ipynb) for a preview of all interactive plots.\n", - "\n", - "\n", - "\n", - "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[33mSenior_Python_Engineer\u001b[0m (to chat_manager):\n", - "\n", - "To use Spark for parallel training in FLAML, you can activate Spark as the parallel backend during parallel tuning in both AutoML and Hyperparameter Tuning, by setting the `use_spark` to `true`. FLAML will dispatch your job to the distributed Spark backend using `joblib-spark`. \n", - "\n", - "Here is an example code snippet for using parallel Spark jobs:\n", - "\n", - "```python\n", - "import flaml\n", - "automl_experiment = flaml.AutoML()\n", - "automl_settings = {\n", - " \"metric\": \"r2\",\n", - " \"task\": \"regression\",\n", - " \"use_spark\": True,\n", - " \"estimator_list\": [\n", - " \"lgbm\",\n", - " \"rf\",\n", - " \"xgboost\",\n", - " \"extra_tree\",\n", - " \"xgb_limitdepth\",\n", - " ],\n", - "}\n", - "automl_experiment.fit(X_train=train_x, y_train=train_y, **automl_settings)\n", - "```\n", - "\n", - "Note that you should not set `use_spark` to `true` when applying AutoML and Tuning for Spark Data. This is because only SparkML models will be used for Spark Data in AutoML and Tuning. 
As SparkML models run in parallel, there is no need to distribute them with `use_spark` again.\n", - "\n", - "You can also use Spark ML estimators for AutoML. FLAML integrates estimators based on Spark ML models. These models are trained in parallel using Spark, so we called them Spark estimators. To use these models, you first need to organize your data in the required format.\n", - "\n", - "Here is an example code snippet for Spark Data:\n", - "\n", - "```python\n", - "import pandas as pd\n", - "from flaml.automl.spark.utils import to_pandas_on_spark\n", - "# Creating a dictionary\n", - "data = {\"Square_Feet\": [800, 1200, 1800, 1500, 850],\n", - " \"Age_Years\": [20, 15, 10, 7, 25],\n", - " \"Price\": [100000, 200000, 300000, 240000, 120000]}\n", - "\n", - "# Creating a pandas DataFrame\n", - "dataframe = pd.DataFrame(data)\n", - "label = \"Price\"\n", - "\n", - "# Convert to pandas-on-spark dataframe\n", - "psdf = to_pandas_on_spark(dataframe)\n", - "```\n", - "\n", - "To use Spark ML models you need to format your data appropriately. Specifically, use `VectorAssembler` to merge all feature columns into a single vector column.\n", - "\n", - "Here is an example of how to use it:\n", - "```python\n", - "from pyspark.ml.feature import VectorAssembler\n", - "columns = psdf.columns\n", - "feature_cols = [col for col in columns if col != label]\n", - "featurizer = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n", - "psdf = featurizer.transform(psdf.to_spark(index_col=\"index\"))[\"index\", \"features\"]\n", - "```\n", - "\n", - "Later in conducting the experiment, use your pandas-on-spark data like non-spark data and pass them using `X_train, y_train` or `dataframe, label`.\n", - "\n", - "You can also plot the optimization process using `plotly` by providing your optimized `flaml.AutoML` or `flaml.tune.tune.ExperimentAnalysis` object as input. Optional parameters can be added using keyword arguments. Available plotting functions include `plot_optimization_history`, `plot_feature_importance`, `plot_parallel_coordinate`, `plot_contour`, `plot_edf`, `plot_timeline`, and `plot_slice`.\n", - "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[33mProduct_Manager\u001b[0m (to chat_manager):\n", - "\n", - "\n", - "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[33mCode_Reviewer\u001b[0m (to chat_manager):\n", - "\n", - "Is there anything else you need help with?\n", - "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[31m\n", - ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", - "\u001b[33mBoss_Assistant\u001b[0m (to chat_manager):\n", - "\n", - "\n", - "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[33mProduct_Manager\u001b[0m (to chat_manager):\n", - "\n", - "No, that's all. Thank you for your help!\n", - "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[33mCode_Reviewer\u001b[0m (to chat_manager):\n", - "\n", - "You're welcome! Don't hesitate to ask if you have any more questions in the future. 
Have a great day!\n", - "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[31m\n", - ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", - "\u001b[33mBoss_Assistant\u001b[0m (to chat_manager):\n", - "\n", - "\n", - "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[33mSenior_Python_Engineer\u001b[0m (to chat_manager):\n", - "\n", - "Have a great day too!\n", - "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[33mProduct_Manager\u001b[0m (to chat_manager):\n", - "\n", - "Thank you!\n", - "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[33mCode_Reviewer\u001b[0m (to chat_manager):\n", - "\n", - "You're welcome!\n", - "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[31m\n", - ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", - "\u001b[33mBoss_Assistant\u001b[0m (to chat_manager):\n", + "# Stop the SparkSession object\n", + "spark.stop()\n", + "```\n", "\n", + "Note that the `n_jobs` parameter is set to `-1` to use all available cores on the Spark cluster. You can adjust this value to control the level of parallelism. Also, the `get_output_from_log()` function can be used to extract the results from the FLAML log file. \n", "\n", + "TERMINATE\n", "\n", "--------------------------------------------------------------------------------\n" ] } ], - "source": [ - "rag_chat()\n", - "# type exit to terminate the chat" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Call RetrieveUserProxyAgent while init chat with another user proxy agent\n", - "Sometimes, there might be a need to use RetrieveUserProxyAgent in group chat without initializing the chat with it. In such scenarios, it becomes essential to create a function that wraps the RAG agents and allows them to be called from other agents." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[33mBoss\u001b[0m (to chat_manager):\n", - "\n", - "How to use spark for parallel training in FLAML? Give me sample code.\n", - "\n", - "--------------------------------------------------------------------------------\n", - "How to use spark for parallel training in FLAML? Give me sample code.\n", - "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[33mSenior_Python_Engineer\u001b[0m (to chat_manager):\n", - "\n", - "\u001b[32m***** Suggested function Call: retrieve_content *****\u001b[0m\n", - "Arguments: \n", - "{\n", - " \"message\": \"How to use spark for parallel training in FLAML?\"\n", - "}\n", - "\u001b[32m*****************************************************\u001b[0m\n", - "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[35m\n", - ">>>>>>>> EXECUTING FUNCTION retrieve_content...\u001b[0m\n", - "doc_ids: [['doc_0', 'doc_1', 'doc_4']]\n", - "\u001b[32mAdding doc_id doc_0 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_1 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_4 to context.\u001b[0m\n", - "\u001b[33mSenior_Python_Engineer\u001b[0m (to chat_manager):\n", - "\n", - "\u001b[32m***** Response from calling function \"retrieve_content\" *****\u001b[0m\n", - "You're a retrieve augmented coding assistant. 
You answer user's questions based on your own knowledge and the\n", - "context provided by the user.\n", - "If you can't answer the question with or without the current context, you should reply exactly `UPDATE CONTEXT`.\n", - "For code generation, you must obey the following rules:\n", - "Rule 1. You MUST NOT install any packages because all the packages needed are already installed.\n", - "Rule 2. You must follow the formats below to write your code:\n", - "```language\n", - "# your code\n", - "```\n", - "\n", - "User's question is: How to use spark for parallel training in FLAML?\n", - "\n", - "Context is: # Integrate - Spark\n", - "\n", - "FLAML has integrated Spark for distributed training. There are two main aspects of integration with Spark:\n", - "- Use Spark ML estimators for AutoML.\n", - "- Use Spark to run training in parallel spark jobs.\n", - "\n", - "## Spark ML Estimators\n", - "\n", - "FLAML integrates estimators based on Spark ML models. These models are trained in parallel using Spark, so we called them Spark estimators. To use these models, you first need to organize your data in the required format.\n", - "\n", - "### Data\n", - "\n", - "For Spark estimators, AutoML only consumes Spark data. FLAML provides a convenient function `to_pandas_on_spark` in the `flaml.automl.spark.utils` module to convert your data into a pandas-on-spark (`pyspark.pandas`) dataframe/series, which Spark estimators require.\n", - "\n", - "This utility function takes data in the form of a `pandas.Dataframe` or `pyspark.sql.Dataframe` and converts it into a pandas-on-spark dataframe. It also takes `pandas.Series` or `pyspark.sql.Dataframe` and converts it into a [pandas-on-spark](https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/index.html) series. If you pass in a `pyspark.pandas.Dataframe`, it will not make any changes.\n", - "\n", - "This function also accepts optional arguments `index_col` and `default_index_type`.\n", - "- `index_col` is the column name to use as the index, default is None.\n", - "- `default_index_type` is the default index type, default is \"distributed-sequence\". More info about default index type could be found on Spark official [documentation](https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/options.html#default-index-type)\n", - "\n", - "Here is an example code snippet for Spark Data:\n", - "\n", - "```python\n", - "import pandas as pd\n", - "from flaml.automl.spark.utils import to_pandas_on_spark\n", - "# Creating a dictionary\n", - "data = {\"Square_Feet\": [800, 1200, 1800, 1500, 850],\n", - " \"Age_Years\": [20, 15, 10, 7, 25],\n", - " \"Price\": [100000, 200000, 300000, 240000, 120000]}\n", - "\n", - "# Creating a pandas DataFrame\n", - "dataframe = pd.DataFrame(data)\n", - "label = \"Price\"\n", - "\n", - "# Convert to pandas-on-spark dataframe\n", - "psdf = to_pandas_on_spark(dataframe)\n", - "```\n", - "\n", - "To use Spark ML models you need to format your data appropriately. 
Specifically, use [`VectorAssembler`](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.VectorAssembler.html) to merge all feature columns into a single vector column.\n", - "\n", - "Here is an example of how to use it:\n", - "```python\n", - "from pyspark.ml.feature import VectorAssembler\n", - "columns = psdf.columns\n", - "feature_cols = [col for col in columns if col != label]\n", - "featurizer = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n", - "psdf = featurizer.transform(psdf.to_spark(index_col=\"index\"))[\"index\", \"features\"]\n", - "```\n", - "\n", - "Later in conducting the experiment, use your pandas-on-spark data like non-spark data and pass them using `X_train, y_train` or `dataframe, label`.\n", - "\n", - "### Estimators\n", - "#### Model List\n", - "- `lgbm_spark`: The class for fine-tuning Spark version LightGBM models, using [SynapseML](https://microsoft.github.io/SynapseML/docs/features/lightgbm/about/) API.\n", - "\n", - "#### Usage\n", - "First, prepare your data in the required format as described in the previous section.\n", - "\n", - "By including the models you intend to try in the `estimators_list` argument to `flaml.automl`, FLAML will start trying configurations for these models. If your input is Spark data, FLAML will also use estimators with the `_spark` postfix by default, even if you haven't specified them.\n", - "\n", - "Here is an example code snippet using SparkML models in AutoML:\n", - "\n", - "```python\n", - "import flaml\n", - "# prepare your data in pandas-on-spark format as we previously mentioned\n", - "\n", - "automl = flaml.AutoML()\n", - "settings = {\n", - " \"time_budget\": 30,\n", - " \"metric\": \"r2\",\n", - " \"estimator_list\": [\"lgbm_spark\"], # this setting is optional\n", - " \"task\": \"regression\",\n", - "}\n", - "\n", - "automl.fit(\n", - " dataframe=psdf,\n", - " label=label,\n", - " **settings,\n", - ")\n", - "```\n", - "\n", - "\n", - "[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/automl_bankrupt_synapseml.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/automl_bankrupt_synapseml.ipynb)\n", - "\n", - "## Parallel Spark Jobs\n", - "You can activate Spark as the parallel backend during parallel tuning in both [AutoML](/docs/Use-Cases/Task-Oriented-AutoML#parallel-tuning) and [Hyperparameter Tuning](/docs/Use-Cases/Tune-User-Defined-Function#parallel-tuning), by setting the `use_spark` to `true`. FLAML will dispatch your job to the distributed Spark backend using [`joblib-spark`](https://github.com/joblib/joblib-spark).\n", - "\n", - "Please note that you should not set `use_spark` to `true` when applying AutoML and Tuning for Spark Data. This is because only SparkML models will be used for Spark Data in AutoML and Tuning. As SparkML models run in parallel, there is no need to distribute them with `use_spark` again.\n", - "\n", - "All the Spark-related arguments are stated below. These arguments are available in both Hyperparameter Tuning and AutoML:\n", - "\n", - "\n", - "- `use_spark`: boolean, default=False | Whether to use spark to run the training in parallel spark jobs. This can be used to accelerate training on large models and large datasets, but will incur more overhead in time and thus slow down training in some cases. GPU training is not supported yet when use_spark is True. For Spark clusters, by default, we will launch one trial per executor. 
However, sometimes we want to launch more trials than the number of executors (e.g., local mode). In this case, we can set the environment variable `FLAML_MAX_CONCURRENT` to override the detected `num_executors`. The final number of concurrent trials will be the minimum of `n_concurrent_trials` and `num_executors`.\n", - "- `n_concurrent_trials`: int, default=1 | The number of concurrent trials. When n_concurrent_trials > 1, FLAML performes parallel tuning.\n", - "- `force_cancel`: boolean, default=False | Whether to forcely cancel Spark jobs if the search time exceeded the time budget. Spark jobs include parallel tuning jobs and Spark-based model training jobs.\n", - "\n", - "An example code snippet for using parallel Spark jobs:\n", - "```python\n", - "import flaml\n", - "automl_experiment = flaml.AutoML()\n", - "automl_settings = {\n", - " \"time_budget\": 30,\n", - " \"metric\": \"r2\",\n", - " \"task\": \"regression\",\n", - " \"n_concurrent_trials\": 2,\n", - " \"use_spark\": True,\n", - " \"force_cancel\": True, # Activating the force_cancel option can immediately halt Spark jobs once they exceed the allocated time_budget.\n", - "}\n", - "\n", - "automl.fit(\n", - " dataframe=dataframe,\n", - " label=label,\n", - " **automl_settings,\n", - ")\n", - "```\n", - "\n", - "\n", - "[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/integrate_spark.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/integrate_spark.ipynb)\n", - "\n", - "\n", - "```python\n", - "import flaml\n", - "# for flaml.tune\n", - "with mlflow.start_run(run_name=f\"spark_auto_trials_1686631558\"):\n", - " analysis = flaml.tune.run(\n", - " func_to_tune,\n", - " params,\n", - " metric=\"r2\",\n", - " mode=\"max\",\n", - " mlflow_exp_name=\"test_doc\",\n", - " use_spark=True,\n", - " )\n", - "\n", - "# for flaml.automl\n", - "automl_experiment = flaml.AutoML()\n", - "automl_settings = {\n", - " \"metric\": \"r2\",\n", - " \"task\": \"regression\",\n", - " \"use_spark\": True,\n", - " \"mlflow_exp_name\": \"test_doc\",\n", - " \"estimator_list\": [\n", - " \"lgbm\",\n", - " \"rf\",\n", - " \"xgboost\",\n", - " \"extra_tree\",\n", - " \"xgb_limitdepth\",\n", - " ], # catboost does not yet support mlflow autologging\n", - "}\n", - "with mlflow.start_run(run_name=f\"automl_spark_trials_1686631579\"):\n", - " automl_experiment.fit(X_train=train_x, y_train=train_y, **automl_settings)\n", - "```\n", - "\n", - "\n", - "\n", - "### Results\n", - "*Tune Autolog Trials on MLFlow UI*\n", - "\n", - "\n", - "![Tune Autolog Trials on MLFlow UI](Images/tune_trials.png)\n", - "\n", - "\n", - "*AutoML Autolog Trials on MLFlow UI*\n", - "\n", - "\n", - "![AutoML Autolog Trials on MLFlow UI](Images/automl_trials.png)\n", - "\n", - "\n", - "### Differences Between Auto and Manual Logging\n", - "Autologging is managed by MLFlow, while manual logging is maintained by FLAML.\n", - "\n", - "\n", - "#### Details of Manual Logging\n", - "FLAML logs general artifacts for AutoML tasks. 
Specifically, we log these artifacts:\n", - "\n", - "**`flaml.tune`**\n", - "\n", - "\n", - "![Manual Log Example for Tuning](Images/manual_log_tune.png)\n", - "\n", - "\n", - "- We create a parent run to log the best metric and the best configuration for the entire tuning process.\n", - "- For each trial, we create a child run to log the metric specific to the tune function and the configuration for that trial.\n", - "\n", - "**`flaml.automl`**\n", - "\n", - "\n", - "![Manual Log Example for AutoML](Images/manual_log_automl.png)\n", - "\n", - "\n", - "- We create a parent run to log the results of the experiment. This includes:\n", - " - The configuration of this model.\n", - " - The `best_validation_loss` produced by this model.\n", - " - The `best_iteration` to identify the point at which this model was found.\n", - "- For each state (a specific learner with different hyperparameters), we record the best trial for this model. This includes:\n", - " - The configuration of the best trial.\n", - " - The `validation_loss` the best trial produces.\n", - " - The `iter_count` to identify how many trials we have conducted for this state.\n", - " - The `pred_time`, which is the time cost of predicting test data for this model.\n", - " - The `wall_clock_time`, which is the time cost of this state.\n", - " - The `sample_size` to show how much data we sampled in this state.\n", - "Note that we also added these information to autolog AutoML run.\n", - "\n", - "\n", - "#### Details of Autologging\n", - "Autolog artifacts typically include model parameters, model files, and runtime metrics like the following:\n", - "\n", - "\n", - "![Autolog Example](Images/autolog_example.png)\n", - "\n", - "\n", - "Artifacts can differ among various machine learning libraries. More detailed information can be found [here](https://mlflow.org/docs/latest/tracking.html#automatic-logging).\n", - "\n", - "\n", - "\n", - "\n", - "## Plot Experiment Result\n", - "The `flaml.visualization` module provides utility functions for plotting the optimization process using [plotly](https://plotly.com/python/). Leveraging `plotly`, users can interactively explore experiment results. To use these plotting functions, simply provide your optimized `flaml.AutoML` or `flaml.tune.tune.ExperimentAnalysis` object as input. 
Optional parameters can be added using keyword arguments.\n", - "\n", - "Avaliable plotting functions:\n", - "- `plot_optimization_history`: Plot optimization history of all trials in the experiment.\n", - "- `plot_feature_importance`: Plot importance for each feature in the dataset.\n", - "- `plot_parallel_coordinate`: Plot the high-dimensional parameter relationships in the experiment.\n", - "- `plot_contour`: Plot the parameter relationship as contour plot in the experiment.\n", - "- `plot_edf`: Plot the objective value EDF (empirical distribution function) of the experiment.\n", - "- `plot_timeline`: Plot the timeline of the experiment.\n", - "- `plot_slice`: Plot the parameter relationship as slice plot in a study.\n", - "\n", - "### Figure Examples\n", - "![Plot Examples](Images/plot_samples.png)\n", - "\n", - "Check out our example [notebook](../../notebook/trident/automl_plot.ipynb) for a preview of all interactive plots.\n", - "\n", - "\n", - "\n", - "\u001b[32m*************************************************************\u001b[0m\n", - "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[33mProduct_Manager\u001b[0m (to chat_manager):\n", - "\n", - "To use Spark for parallel training in FLAML, you can follow these steps:\n", - "\n", - "1. Prepare your data in the required format. FLAML only consumes Spark data for Spark estimators. You can use the `to_pandas_on_spark` function from the `flaml.automl.spark.utils` module to convert your data into a pandas-on-spark dataframe. Here's an example:\n", - "\n", - "```python\n", - "import pandas as pd\n", - "from flaml.automl.spark.utils import to_pandas_on_spark\n", - "\n", - "# Create a dictionary\n", - "data = {\n", - " \"Square_Feet\": [800, 1200, 1800, 1500, 850],\n", - " \"Age_Years\": [20, 15, 10, 7, 25],\n", - " \"Price\": [100000, 200000, 300000, 240000, 120000]\n", - "}\n", - "\n", - "# Create a pandas DataFrame\n", - "dataframe = pd.DataFrame(data)\n", - "label = \"Price\"\n", - "\n", - "# Convert to pandas-on-spark dataframe\n", - "psdf = to_pandas_on_spark(dataframe)\n", - "```\n", - "\n", - "2. Use the Spark ML estimators in FLAML. FLAML integrates estimators based on Spark ML models. You can include the models you want to try in the `estimator_list` argument when creating an instance of `flaml.AutoML`. Here's an example:\n", - "\n", - "```python\n", - "import flaml\n", - "\n", - "automl = flaml.AutoML()\n", - "settings = {\n", - " \"time_budget\": 30,\n", - " \"metric\": \"r2\",\n", - " \"estimator_list\": [\"lgbm_spark\"], # Optional: specify the Spark ML estimator\n", - " \"task\": \"regression\"\n", - "}\n", - "\n", - "automl.fit(\n", - " dataframe=psdf,\n", - " label=label,\n", - " **settings\n", - ")\n", - "```\n", - "\n", - "3. Activate Spark as the parallel backend. You can set the `use_spark` parameter to `True` to activate Spark as the parallel backend during parallel tuning. FLAML will dispatch your job to the distributed Spark backend using `joblib-spark`. 
Here's an example:\n", - "\n", - "```python\n", - "import flaml\n", - "\n", - "automl_experiment = flaml.AutoML()\n", - "automl_settings = {\n", - " \"time_budget\": 30,\n", - " \"metric\": \"r2\",\n", - " \"task\": \"regression\",\n", - " \"n_concurrent_trials\": 2,\n", - " \"use_spark\": True,\n", - " \"force_cancel\": True # Optional: force cancel Spark jobs if time budget is exceeded\n", - "}\n", - "\n", - "automl_experiment.fit(\n", - " dataframe=dataframe,\n", - " label=label,\n", - " **automl_settings\n", - ")\n", - "```\n", - "\n", - "These are the steps to use Spark for parallel training in FLAML. Let me know if you need any further assistance!\n", - "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[33mCode_Reviewer\u001b[0m (to chat_manager):\n", - "\n", - "Great! You now have the steps to use Spark for parallel training in FLAML. If you have any more questions, feel free to ask.\n", - "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[31m\n", - ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", - "\u001b[33mBoss\u001b[0m (to chat_manager):\n", - "\n", - "\n", - "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[33mSenior_Python_Engineer\u001b[0m (to chat_manager):\n", - "\n", - "TERMINATE\n", - "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[31m\n", - ">>>>>>>> NO HUMAN INPUT RECEIVED.\u001b[0m\n" - ] - } - ], "source": [ "call_rag_chat()" ] diff --git a/notebook/agentchat_qdrant_RetrieveChat.ipynb b/notebook/agentchat_qdrant_RetrieveChat.ipynb index b05848c1c5d1..c47deedc2a07 100644 --- a/notebook/agentchat_qdrant_RetrieveChat.ipynb +++ b/notebook/agentchat_qdrant_RetrieveChat.ipynb @@ -29,10 +29,19 @@ "\n", "AutoGen requires `Python>=3.8`. To run this notebook example, please install the [retrievechat] option.\n", "```bash\n", - "pip install \"pyautogen[retrievechat] flaml[automl] qdrant_client[fastembed]\"\n", + "pip install \"pyautogen[retrievechat]~=0.2.0b5\" \"flaml[automl]\" \"qdrant_client[fastembed]\"\n", "```" ] }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# %pip install \"pyautogen[retrievechat]~=0.2.0b5\" \"flaml[automl]\" \"qdrant_client[fastembed]\"" + ] + }, { "attachments": {}, "cell_type": "markdown", @@ -165,7 +174,7 @@ " system_message=\"You are a helpful assistant.\",\n", " llm_config={\n", " \"timeout\": 600,\n", - " \"seed\": 42,\n", + " \"cache_seed\": 42,\n", " \"config_list\": config_list,\n", " },\n", ")\n", @@ -1224,7 +1233,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.6" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/notebook/agentchat_teaching.ipynb b/notebook/agentchat_teaching.ipynb index dcee7cd5b025..51b4a94e47d8 100644 --- a/notebook/agentchat_teaching.ipynb +++ b/notebook/agentchat_teaching.ipynb @@ -24,7 +24,7 @@ "\n", "AutoGen requires `Python>=3.8`. 
     "```bash\n",
-    "pip install pyautogen\n",
+    "pip install \"pyautogen~=0.2.0b5\"\n",
     "```"
    ]
   },
@@ -34,7 +34,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# %pip install --quiet pyautogen~=0.1.0"
+    "# %pip install --quiet \"pyautogen~=0.2.0b5\""
    ]
   },
   {
@@ -85,7 +85,7 @@
     "\n",
     "llm_config={\n",
     "    \"timeout\": 600,\n",
-    "    \"seed\": 44, # change the seed for different trials\n",
+    "    \"cache_seed\": 44, # change the seed for different trials\n",
     "    \"config_list\": autogen.config_list_from_json(\n",
     "        \"OAI_CONFIG_LIST\",\n",
     "        filter_dict={\"model\": [\"gpt-4-32k\"]},\n",
diff --git a/setup.py b/setup.py
index 4dc5d4e9dd64..ab7ab28ade10 100644
--- a/setup.py
+++ b/setup.py
@@ -46,6 +46,7 @@
         "pre-commit",
         "pytest-asyncio",
         "pytest>=6.1.1",
+        "mock",
     ],
     "blendsearch": ["flaml[blendsearch]"],
     "mathchat": ["sympy", "pydantic==1.10.9", "wolframalpha"],
diff --git a/test/agentchat/test_groupchat.py b/test/agentchat/test_groupchat.py
index 6f634fd8677f..ff83cac2e381 100644
--- a/test/agentchat/test_groupchat.py
+++ b/test/agentchat/test_groupchat.py
@@ -1,4 +1,6 @@
 import pytest
+import mock
+import builtins
 import autogen
 import json
 
@@ -8,7 +10,7 @@ def test_func_call_groupchat():
         "alice",
         human_input_mode="NEVER",
         llm_config=False,
-        default_auto_reply="This is alice sepaking.",
+        default_auto_reply="This is alice speaking.",
     )
     agent2 = autogen.ConversableAgent(
         "bob",
@@ -56,7 +58,7 @@ def test_chat_manager():
         max_consecutive_auto_reply=2,
         human_input_mode="NEVER",
         llm_config=False,
-        default_auto_reply="This is alice sepaking.",
+        default_auto_reply="This is alice speaking.",
     )
     agent2 = autogen.ConversableAgent(
         "bob",
@@ -83,6 +85,150 @@ def test_chat_manager():
     agent2.initiate_chat(group_chat_manager, message={"function_call": {"name": "func", "arguments": '{"x": 1}'}})
 
 
+def _test_selection_method(method: str):
+    agent1 = autogen.ConversableAgent(
+        "alice",
+        max_consecutive_auto_reply=10,
+        human_input_mode="NEVER",
+        llm_config=False,
+        default_auto_reply="This is alice speaking.",
+    )
+    agent2 = autogen.ConversableAgent(
+        "bob",
+        max_consecutive_auto_reply=10,
+        human_input_mode="NEVER",
+        llm_config=False,
+        default_auto_reply="This is bob speaking.",
+    )
+    agent3 = autogen.ConversableAgent(
+        "charlie",
+        max_consecutive_auto_reply=10,
+        human_input_mode="NEVER",
+        llm_config=False,
+        default_auto_reply="This is charlie speaking.",
+    )
+
+    groupchat = autogen.GroupChat(
+        agents=[agent1, agent2, agent3],
+        messages=[],
+        max_round=6,
+        speaker_selection_method=method,
+        allow_repeat_speaker=(method != "manual"),
+    )
+    group_chat_manager = autogen.GroupChatManager(groupchat=groupchat, llm_config=False)
+
+    if method == "round_robin":
+        agent1.initiate_chat(group_chat_manager, message="This is alice speaking.")
+        assert len(agent1.chat_messages[group_chat_manager]) == 6
+        assert len(groupchat.messages) == 6
+        assert [msg["content"] for msg in agent1.chat_messages[group_chat_manager]] == [
+            "This is alice speaking.",
+            "This is bob speaking.",
+            "This is charlie speaking.",
+        ] * 2
+    elif method == "auto":
+        agent1.initiate_chat(group_chat_manager, message="This is alice speaking.")
+        assert len(agent1.chat_messages[group_chat_manager]) == 6
+        assert len(groupchat.messages) == 6
+    elif method == "random":
+        agent1.initiate_chat(group_chat_manager, message="This is alice speaking.")
+        assert len(agent1.chat_messages[group_chat_manager]) == 6
+        assert len(groupchat.messages) == 6
+    elif method == "manual":
+        for user_input in ["", "q", "x", "1", "10"]:
+            with mock.patch.object(builtins, "input", lambda _: user_input):
+                group_chat_manager.reset()
+                agent1.reset()
+                agent2.reset()
+                agent3.reset()
+                agent1.initiate_chat(group_chat_manager, message="This is alice speaking.")
+                if user_input == "1":
+                    assert len(agent1.chat_messages[group_chat_manager]) == 6
+                    assert len(groupchat.messages) == 6
+                    assert [msg["content"] for msg in agent1.chat_messages[group_chat_manager]] == [
+                        "This is alice speaking.",
+                        "This is bob speaking.",
+                        "This is alice speaking.",
+                        "This is bob speaking.",
+                        "This is alice speaking.",
+                        "This is bob speaking.",
+                    ]
+                else:
+                    assert len(agent1.chat_messages[group_chat_manager]) == 6
+                    assert len(groupchat.messages) == 6
+    elif method == "wrong":
+        with pytest.raises(ValueError):
+            agent1.initiate_chat(group_chat_manager, message="This is alice speaking.")
+
+
+def test_speaker_selection_method():
+    for method in ["auto", "round_robin", "random", "manual", "wrong", "RounD_roBin"]:
+        _test_selection_method(method)
+
+
+def _test_n_agents_less_than_3(method):
+    agent1 = autogen.ConversableAgent(
+        "alice",
+        max_consecutive_auto_reply=10,
+        human_input_mode="NEVER",
+        llm_config=False,
+        default_auto_reply="This is alice speaking.",
+    )
+    agent2 = autogen.ConversableAgent(
+        "bob",
+        max_consecutive_auto_reply=10,
+        human_input_mode="NEVER",
+        llm_config=False,
+        default_auto_reply="This is bob speaking.",
+    )
+    # test two agents
+    groupchat = autogen.GroupChat(
+        agents=[agent1, agent2],
+        messages=[],
+        max_round=6,
+        speaker_selection_method=method,
+        allow_repeat_speaker=(method == "random"),
+    )
+    group_chat_manager = autogen.GroupChatManager(groupchat=groupchat, llm_config=False)
+    agent1.initiate_chat(group_chat_manager, message="This is alice speaking.")
+    assert len(agent1.chat_messages[group_chat_manager]) == 6
+    assert len(groupchat.messages) == 6
+    if method != "random":  # random order is nondeterministic; only the counts above are asserted
+        assert [msg["content"] for msg in agent1.chat_messages[group_chat_manager]] == [
+            "This is alice speaking.",
+            "This is bob speaking.",
+        ] * 3
+
+    # test one agent
+    groupchat = autogen.GroupChat(
+        agents=[agent1],
+        messages=[],
+        max_round=6,
+        speaker_selection_method="round_robin",
+        allow_repeat_speaker=False,
+    )
+    with pytest.raises(ValueError):
+        group_chat_manager = autogen.GroupChatManager(groupchat=groupchat, llm_config=False)
+        agent1.initiate_chat(group_chat_manager, message="This is alice speaking.")
+
+    # test zero agent
+    groupchat = autogen.GroupChat(
+        agents=[],
+        messages=[],
+        max_round=6,
+        speaker_selection_method="round_robin",
+        allow_repeat_speaker=False,
+    )
+    with pytest.raises(ValueError):
+        group_chat_manager = autogen.GroupChatManager(groupchat=groupchat, llm_config=False)
+        agent1.initiate_chat(group_chat_manager, message="This is alice speaking.")
+
+
+def test_n_agents_less_than_3():
+    for method in ["auto", "round_robin", "random", "RounD_roBin"]:
+        _test_n_agents_less_than_3(method)
+
+
 def test_plugin():
     # Give another Agent class ability to manage group chat
     agent1 = autogen.ConversableAgent(
@@ -90,7 +236,7 @@ def test_plugin():
         max_consecutive_auto_reply=2,
         human_input_mode="NEVER",
         llm_config=False,
-        default_auto_reply="This is alice sepaking.",
+        default_auto_reply="This is alice speaking.",
     )
     agent2 = autogen.ConversableAgent(
         "bob",
@@ -185,8 +331,10 @@ def test_agent_mentions():
 
 
 if __name__ == "__main__":
-    test_func_call_groupchat()
+    # test_func_call_groupchat()
     # test_broadcast()
-    test_chat_manager()
+    # test_chat_manager()
     # test_plugin()
+    test_speaker_selection_method()
+    test_n_agents_less_than_3()
     # test_agent_mentions()
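
For reviewers who want to try the new group-chat options outside the test suite, here is a minimal sketch that mirrors the constructor calls exercised in `test_groupchat.py` above. The agent names and canned replies are illustrative, and `llm_config=False` keeps the whole run LLM-free, exactly as the tests do.

```python
import autogen

# Two canned-reply agents; with llm_config=False no model is ever called.
alice = autogen.ConversableAgent(
    "alice",
    max_consecutive_auto_reply=10,
    human_input_mode="NEVER",
    llm_config=False,
    default_auto_reply="This is alice speaking.",
)
bob = autogen.ConversableAgent(
    "bob",
    max_consecutive_auto_reply=10,
    human_input_mode="NEVER",
    llm_config=False,
    default_auto_reply="This is bob speaking.",
)

# Rotate speakers deterministically instead of asking an LLM ("auto"),
# and forbid back-to-back turns by the same agent.
groupchat = autogen.GroupChat(
    agents=[alice, bob],
    messages=[],
    max_round=4,
    speaker_selection_method="round_robin",  # "auto" | "manual" | "random" | "round_robin"
    allow_repeat_speaker=False,
)
manager = autogen.GroupChatManager(groupchat=groupchat, llm_config=False)

# alice and bob alternate until max_round is reached.
alice.initiate_chat(manager, message="This is alice speaking.")
```

Passing an unrecognized method name raises `ValueError`, which is what the `"wrong"` case in `test_speaker_selection_method` asserts.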