From f91eb5bbdff45701461e3f4af85ae2fcb5017a50 Mon Sep 17 00:00:00 2001
From: alexisxy <alexisxy0418@gmail.com>
Date: Fri, 20 Oct 2023 19:36:17 -0400
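Subject: [PATCH] minor

Delete evaluation_harness/evaluate_by_trace.py, pass explicit top_p and
context_length arguments to the generation call in llm_fuzzy_match, and
make run.py log "No task left to run" and skip config dumping, agent
construction, and testing when the task list is empty.
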
---
 evaluation_harness/evaluate_by_trace.py | 66 -------------------------
 evaluation_harness/helper_functions.py  |  2 +
 run.py                                  | 20 +++++---
 3 files changed, 14 insertions(+), 74 deletions(-)
 delete mode 100644 evaluation_harness/evaluate_by_trace.py

diff --git a/evaluation_harness/evaluate_by_trace.py b/evaluation_harness/evaluate_by_trace.py
deleted file mode 100644
index 3820789..0000000
--- a/evaluation_harness/evaluate_by_trace.py
+++ /dev/null
@@ -1,66 +0,0 @@
-"""Evaluate by using the traces.zip files saved"""
-import argparse
-import json
-import os
-import sys
-import tempfile
-import zipfile
-
-from playwright.sync_api import Page, sync_playwright
-
-from evaluation_harness import evaluator_router
-from evaluation_harness.helper_functions import PseudoPage
-
-
-def eval_trace(trace_path: str, task_id: int, config_file_folder: str):
-    # load the config file
-    config_file = f"{config_file_folder}/{task_id}.json"
-    with open(config_file, "r") as f:
-        config = json.load(f)
-
-    if "string_match" in config["eval"]["eval_types"]:
-        raise ValueError(
-            "string_match is not supported in this evaluation script"
-        )
-
-    # extract the last url from the trace file
-    temp_dir = tempfile.TemporaryDirectory()
-    with zipfile.ZipFile(trace_path, "r") as zip_ref:
-        zip_ref.extractall(temp_dir.name)
-    with open(f"{temp_dir.name}/trace.trace", "r") as f:
-        trace = []
-        for line in f:
-            trace.append(json.loads(line))
-    last_url = ""
-    for step in trace[::-1]:
-        if step.get("type", None) == "frame-snapshot":
-            last_url = step["snapshot"]["frameUrl"]
-            break
-    if not last_url:
-        raise ValueError("Cannot find the last url in the trace file")
-
-    # start the playwright
-    context_manager = sync_playwright()
-    playwright = context_manager.__enter__()
-    browser = playwright.chromium.launch(headless=True)
-    context = browser.new_context()
-    page = context.new_page()
-    page.goto("https://trace.playwright.dev/")
-    with page.expect_file_chooser() as fc_info:
-        page.get_by_role("button", name="Select file(s)").click()
-    file_chooser = fc_info.value
-    file_chooser.set_files(trace_path)
-    with page.expect_popup() as page1_info:
-        page.get_by_role("button", name="").click()
-    page1 = page1_info.value
-
-    pseudo_page = PseudoPage(page1, last_url)
-    evaluator = evaluator_router(config_file)
-
-    score = evaluator(
-        trajectory=[],
-        config_file=config_file,
-        page=pseudo_page,
-        client=pseudo_page.context.new_cdp_session(pseudo_page),
-    )
-    print(score)
diff --git a/evaluation_harness/helper_functions.py b/evaluation_harness/helper_functions.py
index 535dfcf..5baf466 100644
--- a/evaluation_harness/helper_functions.py
+++ b/evaluation_harness/helper_functions.py
@@ -163,6 +163,8 @@ def llm_fuzzy_match(pred: str, reference: str, question: str) -> float:
         messages=messages,
         temperature=0,
         max_tokens=768,
+        top_p=1.0,
+        context_length=0,
     ).lower()
     if "partially correct" in response or "incorrect" in response:
         return 0.0
diff --git a/run.py b/run.py
index 010bc54..cee3c98 100644
--- a/run.py
+++ b/run.py
@@ -423,13 +423,17 @@ if __name__ == "__main__":
         test_file_list.append(f"config_files/{i}.json")
     if "debug" not in args.result_dir:
         test_file_list = get_unfinished(test_file_list, args.result_dir)
-    print(f"Total {len(test_file_list)} tasks left")
-    args.render = False
-    args.render_screenshot = True
-    args.save_trace_enabled = True
 
-    args.current_viewport_only = True
-    dump_config(args)
+    if not test_file_list:
+        logger.info("No task left to run")
+    else:
+        print(f"Total {len(test_file_list)} tasks left")
+        args.render = False
+        args.render_screenshot = True
+        args.save_trace_enabled = True
 
-    agent = construct_agent(args)
-    test(args, agent, test_file_list)
+        args.current_viewport_only = True
+        dump_config(args)
+
+        agent = construct_agent(args)
+        test(args, agent, test_file_list)