"""
LLM generated CLUES and REASONING
Using dataset/sign_events_data_statements.csv with columns:
signevent,q1,q2,unid,id,outcome_class ## signevent is probably signature event
Dataset size: [1640 rows x 6 columns]
in the CSV file, row 1 is the header row,
truthful data starts from row 2 and ends at row 784
deceptive data starts from row 785 and ends at row 1641
TRUTHFUL rows [:783]
DECEPTIVE rows [783:]
Adding the CLUES and REASONING columns generated by the LLM.
and calling it ['contains_clues', 'clues', 'reasoning'] ## contains_cluees si bool
"""
import json

import pandas as pd

from utils.openai_interface import init_openai, get_chat_completion_with_backoff
from utils.llm_generated_clues_reason_output import prep_output_df, save_output_df, parse_n_write_response

init_openai()

dataset_used = 'dataset/sign_events_data_statements.csv'
llm_generated_dataset = 'dataset/llm_generated_clues_reasoning_events_data_statements.csv'
temperature = 1.0

df = pd.read_csv(dataset_used)
# simple EDA
print(df.columns)
print(f'shape: {df.shape}')  # should be 1640 x 6
print(df.head())
print(df.iloc[782:784]['outcome_class'])  # note the cross-over of 'outcome_class' from 't' to 'd'
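# Quick sanity check of the truthful/deceptive split described in the module
# docstring (a minimal sketch; assumes the split boundaries hold for this CSV):
assert (df.iloc[:783]['outcome_class'] == 't').all(), 'rows 0-782 should be truthful'
assert (df.iloc[783:]['outcome_class'] == 'd').all(), 'rows 783+ should be deceptive'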
newline = '\n'
system_role_content = """
You are an expert in Linguistic Inquiry Word Count (LIWC) analysis.
Your job is to provide reasons for why a statement is classified as either TRUTHFUL or DECEPTIVE.
You will be provided with the LIWC categories that are helpful factors in deciding whether the statement is TRUTHFUL or DECEPTIVE.
"""
def create_prelude(gt, not_of_gt):
    prelude = f"""
Here are the LIWC categories and corresponding examples that help to decide whether a statement is TRUTHFUL:
ingestion - examples are: "dish", "eat", "pizza"
biological-processes - examples are: "eat", "blood", "pain"
numbers - examples are: "second", "thousand", "5", "10"
leisure - examples are: "cook", "chat", "movie"
future tense focus - examples are: "may", "will", "soon"
Here are the LIWC categories and corresponding examples that help to decide whether a statement is DECEPTIVE:
apostrophes - examples are: "haven't", "won't", "she's", "can't"
past tense focused - examples are: "ago", "did", "talked", "promised", "gotten"
reward - examples are: "congratulate", "accomplishment", "take", "prize", "benefit"
all pronouns - examples are: "I", "them", "itself"
personal-pronouns - examples are: "I", "them", "her"
exclamation-mark - example is: "!"
emotional tone - examples are: "love", "nice", "sweet"
In the PARAGRAPH section below:
First, search for words or phrases similar to the TRUTHFUL and DECEPTIVE category examples and construct them in JSON form.
Treat each sub-category within the two categories in order of decreasing importance.
List each sub-category even if it is empty.
Next, provide a short reason as to why the paragraph is classified as {gt} based on the words or phrases from BOTH the sub-categories above.
Also, you must provide reasons to support why the paragraph is not {not_of_gt}. Do not create additional key/values.
Be crisp and confident in your reasoning; do not use words such as "likely", "maybe" or "might".
Generate the response in JSON form with keys "TRUTHFUL CATEGORIES FOUND", "DECEPTIVE CATEGORIES FOUND", "REASONING", "CLASSIFICATION".
The CLASSIFICATION key can have only two values: "TRUTHFUL" or "DECEPTIVE".
Include nothing else.
"""
    return prelude + newline
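# For illustration, the prelude asks the model for JSON shaped roughly like this
# (a sketch; the actual sub-category contents depend on the paragraph):
# {
#     "TRUTHFUL CATEGORIES FOUND": {"ingestion": [], "numbers": ["thousand"], "...": []},
#     "DECEPTIVE CATEGORIES FOUND": {"apostrophes": ["can't"], "past tense focused": ["ago"], "...": []},
#     "REASONING": "short justification citing the words found above",
#     "CLASSIFICATION": "TRUTHFUL"
# }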
def create_input(row):
    """
    For now, combine question 1 and question 2.
    """
    return 'PARAGRAPH: ' + row['q1'] + '\n' + row['q2']
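# For illustration, if a row's q1 were "I bought the gift." and q2 were
# "We celebrated.", create_input would return:
#   'PARAGRAPH: I bought the gift.\nWe celebrated.'
# (hypothetical q1/q2 values; the real text comes from the CSV)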
def construct_context(row, gt, not_of_gt):
    """
    Put together the prelude and the responses to q1 and q2.
    """
    context = ''
    context += create_prelude(gt, not_of_gt)
    context += create_input(df.loc[row].copy())
    return context
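# Helper sketch (not part of the original flow): parse the JSON reply requested
# in the prelude. Assumes the model honors the requested keys and may wrap the
# payload in a Markdown code fence.
def extract_json(response_text):
    """Parse the model's JSON reply, stripping an optional ```json fence."""
    text = response_text.strip()
    if text.startswith('```'):
        # drop the opening ```json line and the trailing ``` fence
        text = text.split('\n', 1)[1].rsplit('```', 1)[0]
    return json.loads(text)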
# not limiting the 'reasoning' word count
if __name__ == "__main__":
    # NOTE: range(10, 10) is empty, so as written only the deceptive rows are
    # processed; widen the truthful range (e.g. 10, 11) to include truthful rows.
    truth_start_row, truth_end_row = 10, 10
    deceptive_start_row, deceptive_end_row = 887, 888
    rows = list(range(truth_start_row, truth_end_row))
    rows += list(range(deceptive_start_row, deceptive_end_row))
    print(f'Rows used to generate the clues + reasoning: {rows}')
    out_df = prep_output_df(dataset_used, ['contains_clues', 'clues', 'reasoning'])
    model = 'gpt-4-1106-preview'  # cheapest at the highest quality
    print(f'Model: {model}')
    for row in rows:
        print(f"{20*'-'}{row}{20*'-'}")
        ground_truth = 'TRUTHFUL' if df.loc[row]['outcome_class'] == 't' else 'DECEPTIVE'
        not_groundtruth = 'TRUTHFUL' if df.loc[row]['outcome_class'] == 'd' else 'DECEPTIVE'
        final_context = construct_context(row=row, gt=ground_truth, not_of_gt=not_groundtruth)
        print(final_context)
        print(f'ground truth (GT): {ground_truth}')
        response = get_chat_completion_with_backoff(final_context, system_role_content, model=model, temperature=temperature)
        print('Response---------------------\n')
        print(response)  # will pretty print
        # out_df = parse_n_write_response(response, out_df, row)
    # TODO save_output_df(llm_generated_dataset, out_df)