Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 22 additions & 1 deletion federatedscope/llm/eval/eval_for_code/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,25 @@
* uncomment the following line 59 in `human-eval/human_eval/execution.py`
* `exec(check_program, exec_globals)`
* Evaluate
* `evaluate_functional_correctness {cfg.federate.save_to}_humaneval_answer.jsonl`
* `evaluate_functional_correctness {cfg.federate.save_to}_humaneval_answer.jsonl`

# HumanEvalX Usage

* Use the trained model to generate code from the prompts, and save the results as 5 `jsonl` files, one per language (`['cpp', 'go', 'java', 'js', 'python']`).

* `python federatedscope/llm/eval/eval_for_code/humanevalx.py --cfg federatedscope/llm/baseline/llama.yaml`

* Each `jsonl` file is named `{cfg.federate.save_to}_humanevalx_{LANGUAGE}_answer.jsonl`

* Use the HumanEvalX Docker image to compute the pass@k score

* `docker pull rishubi/codegeex:latest`

* ```bash
docker run -it --mount type=bind,source=$PWD,target=/workspace/fs rishubi/codegeex:latest /bin/bash -c "cd CodeGeeX; git fetch; git pull; pip install -e .; \
bash scripts/evaluate_humaneval_x.sh ../fs/{cfg.federate.save_to}_humanevalx_cpp_answer.jsonl cpp 1; \
bash scripts/evaluate_humaneval_x.sh ../fs/{cfg.federate.save_to}_humanevalx_go_answer.jsonl go 1; \
bash scripts/evaluate_humaneval_x.sh ../fs/{cfg.federate.save_to}_humanevalx_java_answer.jsonl java 1; \
bash scripts/evaluate_humaneval_x.sh ../fs/{cfg.federate.save_to}_humanevalx_js_answer.jsonl js 1; \
bash scripts/evaluate_humaneval_x.sh ../fs/{cfg.federate.save_to}_humanevalx_python_answer.jsonl python 1; exit"
```
140 changes: 140 additions & 0 deletions federatedscope/llm/eval/eval_for_code/humanevalx.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
import os
import torch
import json
import transformers
from transformers import GenerationConfig
from tqdm import tqdm

from federatedscope.core.configs.config import global_cfg
from federatedscope.core.cmd_args import parse_args, parse_client_cfg
from federatedscope.core.auxiliaries.utils import setup_seed
from federatedscope.core.auxiliaries.logging import update_logger
from federatedscope.llm.dataloader.dataloader import load_jsonl
from federatedscope.core.data.utils import download_url
from federatedscope.llm.misc.fschat import FSChatBot

# 40 is the numeric value of the ERROR level, so warnings and informational
# messages emitted by transformers are silenced during generation.
transformers.logging.set_verbosity(40)

# When True, main() prints every cleaned completion as it is produced.
DEBUG = False

# Number of completions sampled for each benchmark problem.
NUM_ANSWERS_PER_QUESTION = 5

# Languages evaluated by main(), in the order they are processed.
LANGUAGES = ['cpp', 'go', 'java', 'js', 'python']

# Comment line that main() places in front of every prompt, keyed by language.
LANGUAGE_TAG = dict(
    cpp="// language: C++",
    python="# language: Python",
    java="// language: Java",
    js="// language: JavaScript",
    go="// language: Go",
)


def clean_answer(code, language_type=None):
    """
    Trim a generated completion down to the part that forms the solution.

    Adapted from: https://github.com/THUDM/CodeGeeX/blob/main/codegeex
    /benchmark/utils.py

    Args:
        code: Raw text produced by the model.
        language_type: ``'python'``, ``'java'``, ``'go'``, ``'cpp'`` or
            ``'js'`` (case-insensitive). For ``None`` or any other value
            only the non-breaking-space removal is applied.

    Returns:
        The cleaned code string.
    """
    # Non-breaking spaces are removed for every language.
    code = code.replace('\u00a0', '')

    # The default must not crash: without a language there are no
    # language-specific truncation rules to apply.
    if language_type is None:
        return code

    language = language_type.lower()
    if language == "python":
        # Cut at the last occurrence of each top-level construct that may
        # follow the function body; rules are applied one after another.
        end_words = ["\ndef", "\nclass", "\nif", "\n#", "\nprint", "\nassert"]
        for w in end_words:
            if w in code:
                code = code[:code.rfind(w)]
    elif language == "java":
        # Drop a generated main method and close the scope it was in.
        main_pos = code.find("public static void main")
        if main_pos != -1:
            code = code[:main_pos] + '}'
        # Discard everything after the last closing brace.
        if '}' in code:
            code = code[:code.rfind('}')] + '}'
        # Exactly one surplus '}' means only the method was closed, so add
        # the brace that closes the enclosing class as well.
        if code.count('{') + 1 == code.count('}'):
            code += "\n}"
    elif language == "go":
        end_words = ["\n//", "\nfunc main("]
        for w in end_words:
            if w in code:
                code = code[:code.rfind(w)]
        if '}' in code:
            code = code[:code.rfind('}')] + '}'
    elif language in ("cpp", "js"):
        # Discard everything after the last closing brace.
        if '}' in code:
            code = code[:code.rfind('}')] + '}'
    return code


@torch.no_grad()
def main():
    """
    Generate HumanEval-X completions with the fine-tuned model.

    For every language in ``LANGUAGES`` the benchmark file is fetched into
    ``data.root`` when it is not already there, ``NUM_ANSWERS_PER_QUESTION``
    completions are sampled per problem, each completion is passed through
    ``clean_answer`` and the results are written to
    ``{federate.save_to}_humanevalx_{lang}_answer.jsonl``.
    """
    # Configuration: defaults, then the yaml file, then command-line options.
    init_cfg = global_cfg.clone()
    args = parse_args()

    if args.cfg_file:
        init_cfg.merge_from_file(args.cfg_file)
    # Client-specific options are parsed but not needed by this script.
    cfg_opt, _ = parse_client_cfg(args.opts)
    init_cfg.merge_from_list(cfg_opt)

    update_logger(init_cfg, clear_before_add=True)
    setup_seed(init_cfg.seed)

    # load your finetuned model (saved as xxx.ckpt)
    # in yaml file federate.save_to
    chatbot = FSChatBot(init_cfg)

    for lang in LANGUAGES:
        out_file = \
            f'{init_cfg.federate.save_to}_humanevalx_{lang}_answer.jsonl'

        # Fetch the benchmark problems unless they are already on disk.
        data_path = os.path.join(init_cfg.data.root,
                                 f'humaneval_{lang}.jsonl.gz')
        if not os.path.exists(data_path):
            url = ('https://github.com/THUDM/CodeGeeX/raw'
                   '/e64e88e40a73358bb4ad60ef24114355e7141880/codegeex'
                   f'/benchmark/humaneval-x/{lang}/data/humaneval_'
                   f'{lang}.jsonl.gz')
            download_url(url, init_cfg.data.root)
        problems = load_jsonl(data_path,
                              instruction='prompt',
                              category='task_id',
                              is_gzip=True)

        records = []
        for sample in tqdm(problems):
            # The language tag goes on its own line in front of the prompt.
            input_text = LANGUAGE_TAG[lang] + '\n' + sample['instruction']
            sampling = GenerationConfig(
                temperature=0.1,
                top_k=40,
                top_p=0.75,
                do_sample=True,
                num_return_sequences=NUM_ANSWERS_PER_QUESTION,
            )
            generate_kwargs = {
                'generation_config': sampling,
                'max_new_tokens': 128,
            }
            try:
                model_completions = chatbot.generate(input_text,
                                                     generate_kwargs)
            except torch.cuda.OutOfMemoryError as error:
                # Keep going: record empty answers for this problem so the
                # output still has one row per requested completion.
                print(error)
                model_completions = [''] * NUM_ANSWERS_PER_QUESTION

            for i, completion in enumerate(model_completions):
                completion = clean_answer(completion, language_type=lang)
                records.append({
                    'task_id': sample['category'],
                    'generation': completion,
                })
                if DEBUG:
                    print(f"task_id: {sample['category']},\n"
                          f"generation {i + 1}:\n{completion}\n\n")

        # One JSON object per line, the format read by the pass@k
        # evaluation scripts.
        with open(out_file, 'w') as out_fh:
            out_fh.writelines(
                json.dumps(record) + '\n' for record in records)


# Run the generation only when executed as a script, not when imported.
if __name__ == "__main__":
    main()