minor

2026-02-06 11:16:53 +00:00 · 2023-10-20 19:36:17 -04:00 · 2023-10-20 19:36:17 -04:00 · f91eb5bbdf
commit f91eb5bbdf
parent 7730a85191
3 changed files with 14 additions and 74 deletions
--- a/evaluation_harness/evaluate_by_trace.py
+++ b/evaluation_harness/evaluate_by_trace.py
@@ -1,66 +0,0 @@
-"""Evaluate by using the traces.zip files saved"""
-import argparse
-import json
-import os
-import sys
-import tempfile
-import zipfile
-
-from playwright.sync_api import Page, sync_playwright
-
-from evaluation_harness import evaluator_router
-from evaluation_harness.helper_functions import PseudoPage
-
-
-def eval_trace(trace_path: str, task_id: int, config_file_folder: str):
-    # load the config file
-    config_file = f"{config_file_folder}/{task_id}.json"
-    with open(config_file, "r") as f:
-        config = json.load(f)
-
-    if "string_match" in config["eval"]["eval_types"]:
-        raise ValueError(
-            "string_match is not supported in this evaluation script"
-        )
-
-    # extract the last url from the trace file
-    temp_dir = tempfile.TemporaryDirectory()
-    with zipfile.ZipFile(trace_path, "r") as zip_ref:
-        zip_ref.extractall(temp_dir.name)
-    with open(f"{temp_dir.name}/trace.trace", "r") as f:
-        trace = []
-        for line in f:
-            trace.append(json.loads(line))
-    last_url = ""
-    for step in trace[::-1]:
-        if step.get("type", None) == "frame-snapshot":
-            last_url = step["snapshot"]["frameUrl"]
-            break
-    if not last_url:
-        raise ValueError("Cannot find the last url in the trace file")
-
-    # start the playwright
-    context_manager = sync_playwright()
-    playwright = context_manager.__enter__()
-    browser = playwright.chromium.launch(headless=True)
-    context = browser.new_context()
-    page = context.new_page()
-    page.goto("https://trace.playwright.dev/")
-    with page.expect_file_chooser() as fc_info:
-        page.get_by_role("button", name="Select file(s)").click()
-    file_chooser = fc_info.value
-    file_chooser.set_files(trace_path)
-    with page.expect_popup() as page1_info:
-        page.get_by_role("button", name="").click()
-    page1 = page1_info.value
-
-    pseudo_page = PseudoPage(page1, last_url)
-    evaluator = evaluator_router(config_file)
-
-    score = evaluator(
-        trajectory=[],
-        config_file=config_file,
-        page=pseudo_page,
-        client=pseudo_page.context.new_cdp_session(pseudo_page),
-    )
-    print(score)
--- a/evaluation_harness/helper_functions.py
+++ b/evaluation_harness/helper_functions.py
@@ -163,6 +163,8 @@ def llm_fuzzy_match(pred: str, reference: str, question: str) -> float:
        messages=messages,
        temperature=0,
        max_tokens=768,
+        top_p=1.0,
+        context_length=0,
    ).lower()
    if "partially correct" in response or "incorrect" in response:
        return 0.0
--- a/run.py
+++ b/run.py
@@ -423,6 +423,10 @@ if __name__ == "__main__":
        test_file_list.append(f"config_files/{i}.json")
    if "debug" not in args.result_dir:
        test_file_list = get_unfinished(test_file_list, args.result_dir)
+
+    if len(test_file_list) == 0:
+        logger.info("No task left to run")
+    else:
        print(f"Total {len(test_file_list)} tasks left")
        args.render = False
        args.render_screenshot = True