From f91eb5bbdff45701461e3f4af85ae2fcb5017a50 Mon Sep 17 00:00:00 2001 From: alexisxy Date: Fri, 20 Oct 2023 19:36:17 -0400 Subject: [PATCH] minor --- evaluation_harness/evaluate_by_trace.py | 66 ------------------------- evaluation_harness/helper_functions.py | 2 + run.py | 20 +++++--- 3 files changed, 14 insertions(+), 74 deletions(-) delete mode 100644 evaluation_harness/evaluate_by_trace.py diff --git a/evaluation_harness/evaluate_by_trace.py b/evaluation_harness/evaluate_by_trace.py deleted file mode 100644 index 3820789..0000000 --- a/evaluation_harness/evaluate_by_trace.py +++ /dev/null @@ -1,66 +0,0 @@ -"""Evaluate by using the traces.zip files saved""" -import argparse -import json -import os -import sys -import tempfile -import zipfile - -from playwright.sync_api import Page, sync_playwright - -from evaluation_harness import evaluator_router -from evaluation_harness.helper_functions import PseudoPage - - -def eval_trace(trace_path: str, task_id: int, config_file_folder: str): - # load the config file - config_file = f"{config_file_folder}/{task_id}.json" - with open(config_file, "r") as f: - config = json.load(f) - - if "string_match" in config["eval"]["eval_types"]: - raise ValueError( - "string_match is not supported in this evaluation script" - ) - - # extract the last url from the trace file - temp_dir = tempfile.TemporaryDirectory() - with zipfile.ZipFile(trace_path, "r") as zip_ref: - zip_ref.extractall(temp_dir.name) - with open(f"{temp_dir.name}/trace.trace", "r") as f: - trace = [] - for line in f: - trace.append(json.loads(line)) - last_url = "" - for step in trace[::-1]: - if step.get("type", None) == "frame-snapshot": - last_url = step["snapshot"]["frameUrl"] - break - if not last_url: - raise ValueError("Cannot find the last url in the trace file") - - # start the playwright - context_manager = sync_playwright() - playwright = context_manager.__enter__() - browser = playwright.chromium.launch(headless=True) - context = browser.new_context() - page = context.new_page() - page.goto("https://trace.playwright.dev/") - with page.expect_file_chooser() as fc_info: - page.get_by_role("button", name="Select file(s)").click() - file_chooser = fc_info.value - file_chooser.set_files(trace_path) - with page.expect_popup() as page1_info: - page.get_by_role("button", name="").click() - page1 = page1_info.value - - pseudo_page = PseudoPage(page1, last_url) - evaluator = evaluator_router(config_file) - - score = evaluator( - trajectory=[], - config_file=config_file, - page=pseudo_page, - client=pseudo_page.context.new_cdp_session(pseudo_page), - ) - print(score) diff --git a/evaluation_harness/helper_functions.py b/evaluation_harness/helper_functions.py index 535dfcf..5baf466 100644 --- a/evaluation_harness/helper_functions.py +++ b/evaluation_harness/helper_functions.py @@ -163,6 +163,8 @@ def llm_fuzzy_match(pred: str, reference: str, question: str) -> float: messages=messages, temperature=0, max_tokens=768, + top_p=1.0, + context_length=0, ).lower() if "partially correct" in response or "incorrect" in response: return 0.0 diff --git a/run.py b/run.py index 010bc54..cee3c98 100644 --- a/run.py +++ b/run.py @@ -423,13 +423,17 @@ if __name__ == "__main__": test_file_list.append(f"config_files/{i}.json") if "debug" not in args.result_dir: test_file_list = get_unfinished(test_file_list, args.result_dir) - print(f"Total {len(test_file_list)} tasks left") - args.render = False - args.render_screenshot = True - args.save_trace_enabled = True - args.current_viewport_only = True - dump_config(args) + if len(test_file_list) == 0: + logger.info("No task left to run") + else: + print(f"Total {len(test_file_list)} tasks left") + args.render = False + args.render_screenshot = True + args.save_trace_enabled = True - agent = construct_agent(args) - test(args, agent, test_file_list) + args.current_viewport_only = True + dump_config(args) + + agent = construct_agent(args) + test(args, agent, test_file_list)