This commit is contained in:
alexisxy 2023-10-20 19:36:17 -04:00
parent 7730a85191
commit f91eb5bbdf
3 changed files with 14 additions and 74 deletions

View File

@ -1,66 +0,0 @@
"""Evaluate by using the traces.zip files saved"""
import argparse
import json
import os
import sys
import tempfile
import zipfile
from playwright.sync_api import Page, sync_playwright
from evaluation_harness import evaluator_router
from evaluation_harness.helper_functions import PseudoPage
def _load_last_frame_url(trace_path: str) -> str:
    """Return the ``frameUrl`` of the last ``frame-snapshot`` event in a trace.

    Args:
        trace_path: Path to a trace ``.zip`` archive that contains a
            ``trace.trace`` file with one JSON object per line.

    Returns:
        The ``snapshot.frameUrl`` value of the last ``frame-snapshot`` event.

    Raises:
        ValueError: If the trace has no ``frame-snapshot`` event, or the last
            one carries an empty ``frameUrl``.
    """
    last_snapshot = None
    # The extracted files are only needed while scanning, so let the context
    # manager delete the directory as soon as we are done (or on error).
    with tempfile.TemporaryDirectory() as extract_dir:
        with zipfile.ZipFile(trace_path, "r") as zip_ref:
            zip_ref.extractall(extract_dir)
        with open(
            f"{extract_dir}/trace.trace", "r", encoding="utf-8"
        ) as f:
            # Stream the events instead of materializing the whole trace:
            # only the last matching event is needed.
            for line in f:
                if not line.strip():
                    # Tolerate blank lines rather than failing in json.loads.
                    continue
                step = json.loads(line)
                if step.get("type", None) == "frame-snapshot":
                    last_snapshot = step

    last_url = ""
    if last_snapshot is not None:
        last_url = last_snapshot["snapshot"]["frameUrl"]
    if not last_url:
        raise ValueError("Cannot find the last url in the trace file")
    return last_url


def eval_trace(trace_path: str, task_id: int, config_file_folder: str) -> None:
    """Evaluate a task from a saved trace archive and print the score.

    The task config ``{config_file_folder}/{task_id}.json`` is loaded, the
    last frame URL is read from the trace, and the trace is opened in the
    hosted viewer at https://trace.playwright.dev/ with headless Chromium.
    The popup opened from the viewer is wrapped in a ``PseudoPage`` together
    with the last URL and handed to the evaluator chosen by
    ``evaluator_router``.

    Args:
        trace_path: Path to the trace ``.zip`` archive.
        task_id: Task identifier; selects the config file name.
        config_file_folder: Directory holding the ``<task_id>.json`` configs.

    Raises:
        ValueError: If the config requests ``string_match`` evaluation (not
            supported here), or if no last URL can be found in the trace.
    """
    # load the config file
    config_file = f"{config_file_folder}/{task_id}.json"
    with open(config_file, "r", encoding="utf-8") as f:
        config = json.load(f)
    if "string_match" in config["eval"]["eval_types"]:
        raise ValueError(
            "string_match is not supported in this evaluation script"
        )

    # extract the last url from the trace file
    last_url = _load_last_frame_url(trace_path)

    # start the playwright; the `with` block stops the driver on exit and the
    # `finally` closes the browser even when evaluation raises
    with sync_playwright() as playwright:
        browser = playwright.chromium.launch(headless=True)
        try:
            context = browser.new_context()
            page = context.new_page()
            page.goto("https://trace.playwright.dev/")

            # upload the trace archive through the viewer's file chooser
            with page.expect_file_chooser() as fc_info:
                page.get_by_role("button", name="Select file(s)").click()
            file_chooser = fc_info.value
            file_chooser.set_files(trace_path)

            # NOTE(review): the button name is an empty string here; it may
            # have been an icon glyph originally -- confirm against the
            # viewer UI.
            with page.expect_popup() as page1_info:
                page.get_by_role("button", name="").click()
            page1 = page1_info.value

            pseudo_page = PseudoPage(page1, last_url)
            evaluator = evaluator_router(config_file)
            # The evaluator must run while the browser is still alive.
            score = evaluator(
                trajectory=[],
                config_file=config_file,
                page=pseudo_page,
                client=pseudo_page.context.new_cdp_session(pseudo_page),
            )
        finally:
            browser.close()
    print(score)

View File

@ -163,6 +163,8 @@ def llm_fuzzy_match(pred: str, reference: str, question: str) -> float:
messages=messages,
temperature=0,
max_tokens=768,
top_p=1.0,
context_length=0,
).lower()
if "partially correct" in response or "incorrect" in response:
return 0.0

20
run.py
View File

@ -423,13 +423,17 @@ if __name__ == "__main__":
test_file_list.append(f"config_files/{i}.json")
if "debug" not in args.result_dir:
test_file_list = get_unfinished(test_file_list, args.result_dir)
print(f"Total {len(test_file_list)} tasks left")
args.render = False
args.render_screenshot = True
args.save_trace_enabled = True
args.current_viewport_only = True
dump_config(args)
if len(test_file_list) == 0:
logger.info("No task left to run")
else:
print(f"Total {len(test_file_list)} tasks left")
args.render = False
args.render_screenshot = True
args.save_trace_enabled = True
agent = construct_agent(args)
test(args, agent, test_file_list)
args.current_viewport_only = True
dump_config(args)
agent = construct_agent(args)
test(args, agent, test_file_list)