From 4cb46ebe1f9e0ab32afb852dc84ce39b40940467 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Fri, 12 Jul 2024 13:56:12 +0200 Subject: [PATCH] is_predictable_without_description? (#307) --- .../tools/is_predictable.py | 100 +++++++++++++++--- .../tools/utils.py | 4 + pyproject.toml | 2 +- tests/tools/test_is_predictable.py | 57 +++++++++- 4 files changed, 148 insertions(+), 15 deletions(-) diff --git a/prediction_market_agent_tooling/tools/is_predictable.py b/prediction_market_agent_tooling/tools/is_predictable.py index 7d07f829..29e0d110 100644 --- a/prediction_market_agent_tooling/tools/is_predictable.py +++ b/prediction_market_agent_tooling/tools/is_predictable.py @@ -2,6 +2,7 @@ from prediction_market_agent_tooling.config import APIKeys from prediction_market_agent_tooling.tools.cache import persistent_inmemory_cache +from prediction_market_agent_tooling.tools.utils import LLM_SUPER_LOW_TEMPERATURE # I tried to make it return a JSON, but it didn't work well in combo with asking it to do chain of thought. QUESTION_IS_PREDICTABLE_BINARY_PROMPT = """Main signs about a fully qualified question (sometimes referred to as a "market"): @@ -28,6 +29,47 @@ Finally, write your final decision, write `decision: ` followed by either "yes it is fully qualified" or "no it isn't fully qualified" about the question. Don't write anything else after that. You must include "yes" or "no". """ +QUESTION_IS_PREDICTABLE_WITHOUT_DESCRIPTION_PROMPT = """Main signs about a fully self-contained question (sometimes referred to as a "market"): +- Description of the question can not contain any additional information required to answer the question. + +For the question: + +``` +{question} +``` + +And the description: + +``` +{description} +``` + +Description refers only to the text above and nothing else. + +Even if the question is somewhat vague, but even the description does not contain enough of extra information, it's okay and the question is fully self-contained. 
+If the question is vague and the description contains the information required to answer the question, it's not fully self-contained and the answer is "no".

Follow a chain of thought to evaluate if the question doesn't need the description to be answered.

Start by examining the question and the description in detail. Write down their parts, what they refer to and what they contain.

Continue by writing a comparison of the question and the description content. Write down what the question contains and what the description contains.

Explain why you think it does or doesn't need the description.

Description can contain additional information, but it can not contain any information required to answer the question.

Description can contain additional information about the exact resolution criteria, but the question should be answerable even without it.

As long as the question contains some time frame, it's okay if the description only specifies it in more detail.

Description usually contains the question in more detailed form, but the question on its own should be answerable.

For example, that means description can not contain date if question doesn't contain it. Description can not contain target if the question doesn't contain it, etc.

Finally, write your final decision, write `decision: ` followed by either "yes it is fully self-contained" or "no it isn't fully self-contained" about the question. Don't write anything else after that. You must include "yes" or "no". 
+""" + @persistent_inmemory_cache def is_predictable_binary( @@ -42,12 +84,12 @@ def is_predictable_binary( from langchain.prompts import ChatPromptTemplate from langchain_openai import ChatOpenAI except ImportError: - logger.info("langchain not installed, skipping is_predictable_binary") + logger.error("langchain not installed, skipping is_predictable_binary") return True llm = ChatOpenAI( model=engine, - temperature=0.0, + temperature=LLM_SUPER_LOW_TEMPERATURE, api_key=APIKeys().openai_api_key_secretstr_v1, ) @@ -55,20 +97,54 @@ def is_predictable_binary( messages = prompt.format_messages(question=question) completion = str(llm(messages, max_tokens=512).content) + return parse_decision_yes_no_completion(question, completion) + + +@persistent_inmemory_cache +def is_predictable_without_description( + question: str, + description: str, + engine: str = "gpt-4-1106-preview", + prompt_template: str = QUESTION_IS_PREDICTABLE_WITHOUT_DESCRIPTION_PROMPT, +) -> bool: + """ + Evaluate if the question is fully self-contained. 
+ """ + try: + from langchain.prompts import ChatPromptTemplate + from langchain_openai import ChatOpenAI + except ImportError: + logger.error( + "langchain not installed, skipping is_predictable_without_description" + ) + return True + + llm = ChatOpenAI( + model=engine, + temperature=LLM_SUPER_LOW_TEMPERATURE, + api_key=APIKeys().openai_api_key_secretstr_v1, + ) + + prompt = ChatPromptTemplate.from_template(template=prompt_template) + messages = prompt.format_messages( + question=question, + description=description, + ) + completion = str(llm(messages, max_tokens=512).content) + + return parse_decision_yes_no_completion(question, completion) + + +def parse_decision_yes_no_completion(question: str, completion: str) -> bool: + logger.debug(completion) try: decision = completion.lower().rsplit("decision", 1)[1] except IndexError as e: - raise ValueError( - f"Invalid completion in is_predictable for `{question}`: {completion}" - ) from e + raise ValueError(f"Invalid completion for `{question}`: {completion}") from e if "yes" in decision: - is_predictable = True + return True elif "no" in decision: - is_predictable = False + return False else: - raise ValueError( - f"Invalid completion in is_predictable for `{question}`: {completion}" - ) - - return is_predictable + raise ValueError(f"Invalid completion for `{question}`: {completion}") diff --git a/prediction_market_agent_tooling/tools/utils.py b/prediction_market_agent_tooling/tools/utils.py index 14b96dcf..2351fe70 100644 --- a/prediction_market_agent_tooling/tools/utils.py +++ b/prediction_market_agent_tooling/tools/utils.py @@ -19,6 +19,10 @@ T = TypeVar("T") +# t=0 is mathematically impossible and it's not clear how OpenAI (and others) handle it, as a result, even with t=0, gpt-4-turbo produces very different outputs, +# it seems that using a very low temperature is the best way to have as consistent outputs as possible: 
https://community.openai.com/t/why-the-api-output-is-inconsistent-even-after-the-temperature-is-set-to-0/329541/12 +LLM_SUPER_LOW_TEMPERATURE = 0.00000001 + def check_not_none( value: Optional[T], diff --git a/pyproject.toml b/pyproject.toml index 7047227f..65747e62 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "prediction-market-agent-tooling" -version = "0.42.1" +version = "0.43.0" description = "Tools to benchmark, deploy and monitor prediction market agents." authors = ["Gnosis"] readme = "README.md" diff --git a/tests/tools/test_is_predictable.py b/tests/tools/test_is_predictable.py index bf85879e..c30b7254 100644 --- a/tests/tools/test_is_predictable.py +++ b/tests/tools/test_is_predictable.py @@ -1,6 +1,9 @@ import pytest -from prediction_market_agent_tooling.tools.is_predictable import is_predictable_binary +from prediction_market_agent_tooling.tools.is_predictable import ( + is_predictable_binary, + is_predictable_without_description, +) from tests.utils import RUN_PAID_TESTS @@ -41,4 +44,54 @@ def test_is_predictable_binary(question: str, answerable: bool) -> None: assert ( is_predictable_binary(question=question) == answerable - ), f"Question is not evaluated correctly, see the completion: {is_predictable_binary}" + ), f"Question is not evaluated correctly." + + +@pytest.mark.skipif(not RUN_PAID_TESTS, reason="This test costs money to run.") +@pytest.mark.parametrize( + "question, description, answerable", + [ + ( + "Russian nuke in space?", + "Will resolve to 'True' if Russian nuke will be in space by the end of 2024.", + False, # False, because description clarifies the date 2024. + ), + ( + "Russian nuke in space in 2024?", + "Will resolve to 'True' if Russian nuke will be in space by the end of 2024.", + True, # True, because description doesn't provide any extra information. 
+ ), + ( + "Will cash withdrawals be enabled before August 1st?", + "Will Manifold officially enable cash withdrawals to user accounts (not just charities) any time before August 1st, 2024? Cash withdrawals must be an active part of the Manifold UI users could theoretically use.", + False, # False, because description provides context about Manifold. + ), + ( + "If they play, will Biden beat Trump at golf?", + "Resolves N/A if they don't play golf before the election.", + False, # False, because description provides the time frame. + ), + ( + "Will Biden be the 2024 Democratic Nominee?", + "The resolution is to the first nominee formally selected by the Democratic Party (which happens at the Democratic National Convention). If the nominee is later replaced (for example, due to dropping out of the election, death, etc) that does not change the resolution. If a candidate becomes presumptive nominee after securing a majority of pledged delegates, that is not sufficient for resolution, until formally selected as nominee.", + False, # False, because `nominee` could mean multiple things that are clarified in the description. + ), + ( + "Will Biden win the 2024 US Presidential Election?", + "Resolves to the person who wins the majority of votes for US President in the Electoral College, or selected by Congress following the contingency procedure in the Twelfth Amendment.", + True, # True, because description doesn't provide any necessary information. + ), + ( + "Will an AI get gold on any International Math Olympiad by 2025?", + "Resolves to YES if either Eliezer or Paul acknowledge that an AI has succeeded at this task.", + True, # True, because description doesn't provide any extra information. + ), + ], +) +def test_is_predictable_without_description( + question: str, description: str, answerable: bool +) -> None: + assert ( + is_predictable_without_description(question=question, description=description) + == answerable + ), f"Question is not evaluated correctly."