Fix lint for longwriter (#240)
* add

* refine readme for longwriter

* add tech report for longwriter

* update

* update task config

* update

* fix lint

* fix lint for openai_api
wangxingjun778 authored Dec 14, 2024
1 parent 654a9d9 commit ef3321e
Showing 5 changed files with 64 additions and 147 deletions.
18 changes: 13 additions & 5 deletions evalscope/models/api/openai_api.py
@@ -5,10 +5,12 @@
import threading
import time
from asyncio import Queue
from concurrent.futures import ThreadPoolExecutor
from modelscope.utils.logger import get_logger
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
from typing import Dict, List, Optional, Union

from evalscope.utils.logger import get_logger

logger = get_logger()


@@ -45,7 +47,7 @@ def __init__(

self.token_bucket = TokenBucket(query_per_second, verbose)

def generate_simple(self, inputs: Union[List[str]]):
def generate_simple(self, inputs: Union[List[str]], num_proc: int = 8):

def process_one(in_data: str):

@@ -97,8 +99,14 @@ def process_one(in_data: str):
else:
return resp['choices'][0]['text'].strip()

with ThreadPoolExecutor() as executor:
results = list(executor.map(process_one, inputs))
results = []
with ThreadPoolExecutor(max_workers=num_proc) as executor:
# Submit all tasks
future_to_task = {executor.submit(process_one, input_one): input_one for input_one in inputs}

# Show progress bar
for future in tqdm(as_completed(future_to_task), total=len(inputs), desc='Predicting'):
results.append(future.result())

return results

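Note: `as_completed` yields futures in whatever order they finish, so appending `future.result()` directly can return predictions in a different order than the inputs. A minimal, self-contained sketch of an order-preserving variant of the same thread-pool-plus-progress-bar pattern (illustrative only, not the repository's code):

```python
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm


def parallel_map_with_progress(fn, inputs, num_proc=8):
    """Run fn over inputs in a thread pool, show progress, keep input order."""
    results = [None] * len(inputs)
    with ThreadPoolExecutor(max_workers=num_proc) as executor:
        # Remember the position of each submitted task so results can be slotted back in order.
        future_to_idx = {executor.submit(fn, item): i for i, item in enumerate(inputs)}
        for future in tqdm(as_completed(future_to_idx), total=len(inputs), desc='Predicting'):
            results[future_to_idx[future]] = future.result()
    return results


if __name__ == '__main__':
    # Toy usage: square some numbers in parallel.
    print(parallel_map_with_progress(lambda x: x * x, list(range(10)), num_proc=4))
```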
22 changes: 11 additions & 11 deletions evalscope/third_party/longbench_write/README.md
@@ -29,26 +29,26 @@ task_cfg = dict(stage=['infer', 'eval_l', 'eval_q'],
input_data_path=None,
output_dir='./outputs',
infer_config={
'openai_api_base': 'http://127.0.0.1:8000/v1/chat/completions',
'is_chat': True,
'verbose': False,
'openai_api_base': 'http://127.0.0.1:8000/v1/chat/completions',
'is_chat': True,
'verbose': False,
'generation_kwargs': {
'max_new_tokens': 32768,
'temperature': 0.5,
'max_new_tokens': 32768,
'temperature': 0.5,
'repetition_penalty': 1.0
},
'proc_num': 16,
},
eval_config={
# No need to set OpenAI info if skipping the stage `eval_q`
'openai_api_key': None,
'openai_api_base': 'https://api.openai.com/v1/chat/completions',
'openai_gpt_model': 'gpt-4o-2024-05-13',
'openai_api_key': None,
'openai_api_base': 'https://api.openai.com/v1/chat/completions',
'openai_gpt_model': 'gpt-4o-2024-05-13',
'generation_kwargs': {
'max_new_tokens': 1024,
'temperature': 0.5,
'max_new_tokens': 1024,
'temperature': 0.5,
'stop': None
},
},
'proc_num': 8
}
)
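For context, a config like the one above is then handed to the LongWriter task entry point. A hedged sketch of that call (it assumes the entry point is a `run_task` function exported by `evalscope.third_party.longbench_write`; check your installed evalscope version for the exact name):

```python
# Hedged sketch: `run_task` is assumed to be the entry point exported by
# evalscope.third_party.longbench_write; verify against your evalscope version.
from evalscope.third_party.longbench_write import run_task

if __name__ == '__main__':
    run_task(task_cfg=task_cfg)  # task_cfg: the dict shown above
```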
52 changes: 26 additions & 26 deletions evalscope/third_party/longbench_write/eval.py
@@ -1,19 +1,16 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# Copyright (c) ZhipuAI, Inc. and its affiliates.
import multiprocessing
import os
import json
import random
import re
from concurrent.futures import ThreadPoolExecutor

import matplotlib.pyplot as plt
import numpy as np
import os
import random
import re
import requests
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

from evalscope.utils import jsonl_to_list
from evalscope.utils import get_logger
from evalscope.utils import get_logger, jsonl_to_list

logger = get_logger()

@@ -52,14 +49,16 @@ def score(x, y):
return 100 * max(0, 1. - (x / y - 1) / 2)

def eval(self, dump_res: bool = True):
# example = {"prompt": "Write an outline for a short 100-word blog post about xxx", "type": "Community Forum", "length": 100, "response_length": 103, "response": "I. Introduction A. xxx"}
# example = {"prompt": "Write an outline for a short 100-word blog post about xxx",
# "type": "Community Forum", "length": 100, "response_length": 103,
# "response": "I. Introduction A. xxx"}
predictions = [json.loads(line) for line in open(self.pred_path, encoding='utf-8')]
x, y, scores = [], [], []

for pred in tqdm(predictions, total=len(predictions), desc='[Processing eval_l]'):
x.append(pred["length"])
y.append(pred["response_length"])
scores.append(self.score(pred["length"], pred["response_length"]))
x.append(pred['length'])
y.append(pred['response_length'])
scores.append(self.score(pred['length'], pred['response_length']))

avg_score_l = np.mean(scores)
logger.info(f'Average score of length evaluation: {avg_score_l:.2f}')
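For reference, the branch of `score` visible at the top of this hunk maps the required length `x` and response length `y` onto a 0–100 value: 100 when the lengths match, decaying linearly as the response falls short of the target (the over-length case is presumably handled by a symmetric branch outside this hunk). A hedged, standalone sketch:

```python
def length_score(x: float, y: float) -> float:
    """Branch shown in the hunk above: x = required length, y = response length (y <= x)."""
    return 100 * max(0, 1. - (x / y - 1) / 2)


print(length_score(100, 100))  # 100.0 -> exact length match
print(length_score(100, 80))   # 87.5  -> response 20% shorter than requested
print(length_score(100, 30))   # 0     -> far too short, score floors at 0
```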
@@ -105,7 +104,7 @@ class EvalQuality:

EVAL_Q = 'eval_quality'
OPENAI_BASE_URL = 'https://api.openai.com/v1/chat/completions'
DIMS = ["Relevance", "Accuracy", "Coherence", "Clarity", "Breadth and Depth", "Reading Experience"]
DIMS = ['Relevance', 'Accuracy', 'Coherence', 'Clarity', 'Breadth and Depth', 'Reading Experience']

def __init__(self,
model: str,
@@ -153,17 +152,17 @@ def get_response_gpt4(self, prompt, temperature=0.5, max_new_tokens=1024, stop=N
tries += 1
try:
headers = {
'Authorization': "Bearer {}".format(self.openai_api_key),
'Authorization': 'Bearer {}'.format(self.openai_api_key),
}
messages = [
{'role': 'user', 'content': prompt},
]
resp = requests.post(self.openai_api_base, json={
"model": self.openai_gpt_model,
"messages": messages,
"temperature": temperature,
"max_tokens": max_new_tokens,
"stop": stop,
'model': self.openai_gpt_model,
'messages': messages,
'temperature': temperature,
'max_tokens': max_new_tokens,
'stop': stop,
}, headers=headers, timeout=600)
if resp.status_code != 200:
raise Exception(resp.text)
@@ -173,16 +172,16 @@ def get_response_gpt4(self, prompt, temperature=0.5, max_new_tokens=1024, stop=N
except KeyboardInterrupt as e:
raise e
except Exception as e:
if "maximum context length" in str(e):
if 'maximum context length' in str(e):
raise e
elif "triggering" in str(e):
elif 'triggering' in str(e):
return 'Trigger OpenAI\'s content management policy'
logger.error("Error Occurs: \"%s\" Retry ..." % (str(e)))
else:
logger.error("Max tries. Failed.")
return "Max tries. Failed."
logger.error('Max tries. Failed.')
return 'Max tries. Failed.'
try:
return resp["choices"][0]["message"]["content"]
return resp['choices'][0]['message']['content']
except:
return ''

@@ -196,7 +195,7 @@ def extract_info(pattern, text):

def process_data(self, item):
# for item in tqdm(items, total=len(items), desc=f'Process of eval_q: '):
prompt = self.prompt_template.replace('$INST$', item['prompt']).replace('$RESPONSE$', item["response"])
prompt = self.prompt_template.replace('$INST$', item['prompt']).replace('$RESPONSE$', item['response'])
scores = None
output = self.get_response_gpt4(prompt, **self.generation_kwargs)
try:
@@ -236,7 +235,8 @@ def eval(self):
total_score = dict()
for dim in self.DIMS:
# scores = [float(score[dim]) if dim in score else 3 for score in self.eval_scores]
scores = [float(item['scores'][dim]) if 'scores' in item and dim in item['scores'] else 3 for item in self.eval_scores]
scores = [float(item['scores'][dim]) if 'scores' in item and dim in item['scores']
else 3 for item in self.eval_scores]
total_score[dim] = ((sum(scores) / len(scores)) - 1) * 25
total_score['total'] = sum(total_score.values()) / len(total_score)
logger.info(f'Total score of quality evaluation: {total_score["total"]:.2f}')
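The aggregation shown here maps each dimension's average judge score (a 1–5 scale, defaulting to 3 when a score is missing) onto 0–100 via `(mean - 1) * 25`, then averages across dimensions. A small standalone sketch of that mapping (illustrative, not the repository's code):

```python
DIMS = ['Relevance', 'Accuracy', 'Coherence', 'Clarity', 'Breadth and Depth', 'Reading Experience']

# Toy judge outputs: the second record is missing the 'Clarity' score.
eval_scores = [
    {'scores': {'Relevance': 5, 'Accuracy': 4, 'Coherence': 4, 'Clarity': 5,
                'Breadth and Depth': 3, 'Reading Experience': 4}},
    {'scores': {'Relevance': 4, 'Accuracy': 3, 'Coherence': 5,
                'Breadth and Depth': 4, 'Reading Experience': 5}},
]

total_score = {}
for dim in DIMS:
    # Missing scores fall back to the neutral value 3, as in the diff above.
    scores = [float(item['scores'].get(dim, 3)) for item in eval_scores]
    total_score[dim] = (sum(scores) / len(scores) - 1) * 25  # 1 -> 0, 5 -> 100
total_score['total'] = sum(total_score.values()) / len(total_score)
print(total_score)
```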
114 changes: 12 additions & 102 deletions evalscope/third_party/longbench_write/infer.py
@@ -1,18 +1,15 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# Copyright (c) ZhipuAI, Inc. and its affiliates.

import os
import json
from typing import List

import torch
import numpy as np
import os
import random
from modelscope import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
import torch
from typing import List

from evalscope.third_party.longbench_write.utils import count_words
from evalscope.models.api import OpenaiApi
from evalscope.third_party.longbench_write.utils import count_words
from evalscope.utils import get_logger

logger = get_logger()
@@ -25,39 +22,6 @@
"""


def get_pred(rank, world_size, data, path, max_new_tokens, temperature, tokenizer, fout):
device = torch.device(f'cuda:{rank}')
model = AutoModelForCausalLM.from_pretrained(path, trust_remote_code=True, torch_dtype=torch.bfloat16).to(device)
model = model.eval()

for dt in tqdm(data, total=len(data), desc=f'Infer on rank-{rank}: '):
prompt = dt['prompt']
if "llama" in path.lower():
prompt = f"[INST]{prompt}[/INST]"
input = tokenizer(prompt, truncation=False, return_tensors="pt").to(device)
context_length = input.input_ids.shape[-1]
output = model.generate(
**input,
max_new_tokens=max_new_tokens,
num_beams=1,
do_sample=True,
temperature=temperature,
)[0]
response = tokenizer.decode(output[context_length:], skip_special_tokens=True)
else:
response, history = model.chat(tokenizer, prompt, history=[], max_new_tokens=max_new_tokens,
temperature=temperature)
dt["response_length"], _ = count_words(response)
dt["response"] = response

logger.info(dt)

fout.write(json.dumps(dt, ensure_ascii=False) + '\n')
fout.flush()

logger.info(f'Successfully generated predictions for {len(data)} samples.')


def seed_everything(seed):
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
@@ -68,63 +32,6 @@ def seed_everything(seed):
torch.cuda.manual_seed_all(seed)
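Only part of `seed_everything` is visible in this hunk; a typical full helper (a hedged sketch, not necessarily the repository's exact body) seeds Python's `random`, NumPy, and PyTorch together:

```python
import random

import numpy as np
import torch


def seed_everything(seed: int) -> None:
    """Hedged sketch of a typical seeding helper; the repository's exact body is partly outside this hunk."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
```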


# def run_infer(model: str,
# data_path: str,
# output_dir: str,
# generation_kwargs: dict = None,
# enable: bool = True, ):
# """
# Process inference for LongWriter model.
#
# Args:
# model: The model id of the LongWriter model on ModelScope, or local model path.
# data_path: The path to the data file.
# output_dir: The output directory for the predictions.
# generation_kwargs: The generation arguments for the model.
# Attributes: `max_new_tokens`: The maximum number of tokens to generate. `temperature`: The temperature
# enable: Whether to run infer process.
# """
# model_id_path: str = os.path.join(output_dir, model.strip(os.sep).replace(os.sep, '__'))
#
# if not enable:
# logger.warning('*** Skip `infer` stage ***')
# return f'{model_id_path}/pred.jsonl'
#
# seed_everything(42)
#
# os.makedirs(model_id_path, exist_ok=True)
# fout = open(f'{model_id_path}/pred.jsonl', 'w', encoding='utf-8')
#
# if generation_kwargs is None:
# generation_kwargs = dict({
# 'max_new_tokens': 32768,
# 'temperature': 0.5
# })
#
# tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
# world_size = torch.cuda.device_count()
#
# logger.info(f'>>Input data path: {data_path}')
# with open(data_path, encoding='utf-8') as f:
# data = [json.loads(line) for line in f]
#
# data_subsets = [data[i::world_size] for i in range(world_size)]
# processes = []
# for rank in range(world_size):
# p = mp.Process(target=get_pred,
# args=(rank, world_size, data_subsets[rank], model, generation_kwargs.get('max_new_tokens'), generation_kwargs.get('temperature'), tokenizer, fout))
# p.start()
# processes.append(p)
#
# for p in processes:
# p.join()
#
# logger.info(f'Finish generating predictions for {model}.')
# logger.info(f'Predictions are saved in {model_id_path}/pred.jsonl.')
#
# return f'{model_id_path}/pred.jsonl'


def run_infer(model: str,
data_path: str,
output_dir: str,
@@ -175,7 +82,8 @@ def run_infer(model: str,

api_client = OpenaiApi(model=model,
openai_api_key=None,
openai_api_base=api_config.get('openai_api_base', 'http://127.0.0.1:8000/v1/chat/completions'),
openai_api_base=api_config.get('openai_api_base',
'http://127.0.0.1:8000/v1/chat/completions'),
max_new_tokens=generation_kwargs.get('max_new_tokens', 4096),
temperature=generation_kwargs.get('temperature', 0.0),
repetition_penalty=generation_kwargs.get('repetition_penalty', 1.0),
@@ -184,17 +92,19 @@
)

# TODO: refine generate_simple
results: List[str] = api_client.generate_simple(inputs=[example['prompt'] for example in data_list], num_proc=proc_num)
assert len(results) == len(data_list), f'Error: The number of predictions {len(results)} is not equal to the number of inputs {len(data_list)}.'
results: List[str] = api_client.generate_simple(inputs=[example['prompt'] for example in data_list],
num_proc=proc_num)
assert len(results) == len(data_list), \
f'Error: The number of predictions {len(results)} is not equal to the number of inputs {len(data_list)}.'
logger.info(f'Finish generating predictions with {len(data_list)} samples for {model}')

# Outputs
os.makedirs(model_id_path, exist_ok=True)
output_pred_file: str = f'{model_id_path}/pred.jsonl'
with open(output_pred_file, 'w', encoding='utf-8') as f:
for dt, res in zip(data_list, results):
dt["response_length"], _ = count_words(res)
dt["response"] = res
dt['response_length'], _ = count_words(res)
dt['response'] = res
f.write(json.dumps(dt, ensure_ascii=False) + '\n')

logger.info(f'Predictions are saved in {output_pred_file}')
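Each line of the resulting `pred.jsonl` is the input example augmented with `response` and `response_length`, which is what the `eval_l`/`eval_q` stages consume. A minimal sketch of reading it back (the path below is hypothetical):

```python
import json

pred_path = './outputs/your_model/pred.jsonl'  # hypothetical path; see output_pred_file above

with open(pred_path, encoding='utf-8') as f:
    predictions = [json.loads(line) for line in f]

for pred in predictions[:3]:
    print(pred['length'], pred['response_length'], pred['response'][:80])
```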
5 changes: 2 additions & 3 deletions evalscope/third_party/longbench_write/longbench_write.py
@@ -2,10 +2,9 @@
import os
from typing import Union

from evalscope.third_party.longbench_write.infer import run_infer
from evalscope.third_party.longbench_write.eval import run_eval
from evalscope.utils import yaml_to_dict, json_to_dict
from evalscope.utils import get_logger
from evalscope.third_party.longbench_write.infer import run_infer
from evalscope.utils import get_logger, json_to_dict, yaml_to_dict

logger = get_logger()

