mirror of
https://github.com/web-arena-x/webarena.git
synced 2026-02-06 11:16:53 +00:00
minor
This commit is contained in:
parent
7730a85191
commit
f91eb5bbdf
@ -1,66 +0,0 @@
|
||||
"""Evaluate by using the traces.zip files saved"""
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import tempfile
|
||||
import zipfile
|
||||
|
||||
from playwright.sync_api import Page, sync_playwright
|
||||
|
||||
from evaluation_harness import evaluator_router
|
||||
from evaluation_harness.helper_functions import PseudoPage
|
||||
|
||||
|
||||
def _last_frame_url(trace_path: str) -> str:
    """Return the ``frameUrl`` of the last ``frame-snapshot`` entry in a trace.

    The zip archive at *trace_path* is unpacked into a temporary directory
    (removed again before returning) and its ``trace.trace`` member is read
    as one JSON document per line.

    Returns an empty string when no ``frame-snapshot`` entry is present.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        with zipfile.ZipFile(trace_path, "r") as zip_ref:
            zip_ref.extractall(temp_dir)
        # JSON text is UTF-8; do not depend on the platform default encoding.
        with open(f"{temp_dir}/trace.trace", "r", encoding="utf-8") as f:
            # Skip blank lines so stray empty lines do not abort parsing.
            trace = [json.loads(line) for line in f if line.strip()]

    # Walk backwards: only the most recent snapshot matters.
    for step in reversed(trace):
        if step.get("type", None) == "frame-snapshot":
            return step["snapshot"]["frameUrl"]
    return ""


def eval_trace(trace_path: str, task_id: int, config_file_folder: str) -> None:
    """Evaluate one task from its saved trace archive and print the score.

    Args:
        trace_path: Path to the trace zip archive; it must contain a
            ``trace.trace`` member with one JSON object per line.
        task_id: Task identifier; the config is read from
            ``{config_file_folder}/{task_id}.json``.
        config_file_folder: Directory holding the per-task config files.

    Raises:
        ValueError: If the config lists ``string_match`` among its eval types,
            or if the trace contains no ``frame-snapshot`` entry with a
            non-empty ``frameUrl``.
    """
    # load the config file
    config_file = f"{config_file_folder}/{task_id}.json"
    with open(config_file, "r") as f:
        config = json.load(f)

    if "string_match" in config["eval"]["eval_types"]:
        raise ValueError(
            "string_match is not supported in this evaluation script"
        )

    # extract the last url from the trace file
    last_url = _last_frame_url(trace_path)
    if not last_url:
        raise ValueError("Cannot find the last url in the trace file")

    # start the playwright; the context manager stops the driver on exit,
    # including when the evaluation below raises
    with sync_playwright() as playwright:
        browser = playwright.chromium.launch(headless=True)
        try:
            context = browser.new_context()
            page = context.new_page()

            # Load the trace into the hosted trace viewer through its file
            # chooser.
            page.goto("https://trace.playwright.dev/")
            with page.expect_file_chooser() as fc_info:
                page.get_by_role("button", name="Select file(s)").click()
            file_chooser = fc_info.value
            file_chooser.set_files(trace_path)

            # NOTE(review): the accessible name below is empty as written;
            # confirm which viewer button is meant to open the popup.
            with page.expect_popup() as page1_info:
                page.get_by_role("button", name="").click()
            page1 = page1_info.value

            # Pair the popup page with the url recovered from the trace and
            # hand it to the evaluator selected for this config.
            pseudo_page = PseudoPage(page1, last_url)
            evaluator = evaluator_router(config_file)

            score = evaluator(
                trajectory=[],
                config_file=config_file,
                page=pseudo_page,
                client=pseudo_page.context.new_cdp_session(pseudo_page),
            )
        finally:
            # Always release the Chromium process.
            browser.close()

    print(score)
|
||||
@ -163,6 +163,8 @@ def llm_fuzzy_match(pred: str, reference: str, question: str) -> float:
|
||||
messages=messages,
|
||||
temperature=0,
|
||||
max_tokens=768,
|
||||
top_p=1.0,
|
||||
context_length=0,
|
||||
).lower()
|
||||
if "partially correct" in response or "incorrect" in response:
|
||||
return 0.0
|
||||
|
||||
4
run.py
4
run.py
@ -423,6 +423,10 @@ if __name__ == "__main__":
|
||||
test_file_list.append(f"config_files/{i}.json")
|
||||
if "debug" not in args.result_dir:
|
||||
test_file_list = get_unfinished(test_file_list, args.result_dir)
|
||||
|
||||
if len(test_file_list) == 0:
|
||||
logger.info("No task left to run")
|
||||
else:
|
||||
print(f"Total {len(test_file_list)} tasks left")
|
||||
args.render = False
|
||||
args.render_screenshot = True
|
||||
|
||||
Loading…
Reference in New Issue
Block a user