mirror of
https://github.com/web-arena-x/webarena.git
synced 2026-02-06 11:16:53 +00:00
Merge pull request #217 from web-arena-x/feat/automated-map-backend-deployment
Add automated WebArena map backend deployment infrastructure
This commit is contained in:
commit
f62e0b99bc
11
.github/workflows/tests.yml
vendored
11
.github/workflows/tests.yml
vendored
@ -4,14 +4,6 @@ on: [push]
|
||||
jobs:
|
||||
test-all:
|
||||
runs-on: ubuntu-latest
|
||||
env:
|
||||
SHOPPING: "http://ec2-3-131-244-37.us-east-2.compute.amazonaws.com:7770"
|
||||
SHOPPING_ADMIN: "http://ec2-3-131-244-37.us-east-2.compute.amazonaws.com:7780/admin"
|
||||
REDDIT: "http://ec2-3-131-244-37.us-east-2.compute.amazonaws.com:9999"
|
||||
GITLAB: "http://ec2-3-131-244-37.us-east-2.compute.amazonaws.com:8023"
|
||||
MAP: "http://ec2-3-131-244-37.us-east-2.compute.amazonaws.com:3000"
|
||||
WIKIPEDIA: "http://ec2-3-131-244-37.us-east-2.compute.amazonaws.com:8888/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing"
|
||||
HOMEPAGE: "PASS"
|
||||
strategy:
|
||||
max-parallel: 5
|
||||
steps:
|
||||
@ -35,9 +27,6 @@ jobs:
|
||||
# Run this mypy instance against our main package.
|
||||
mypy --install-types --non-interactive .
|
||||
mypy --strict . --exclude scripts
|
||||
- name: Enviroment prepare
|
||||
run: |
|
||||
bash prepare.sh
|
||||
- name: Test with pytest
|
||||
run: |
|
||||
pytest
|
||||
|
||||
1
.gitignore
vendored
1
.gitignore
vendored
@ -157,3 +157,4 @@ config_files*/*7.json
|
||||
config_files*/*8.json
|
||||
config_files*/*9.json
|
||||
config_files*/test.json
|
||||
.openhands/
|
||||
|
||||
@ -24,7 +24,7 @@
|
||||

|
||||
|
||||
## Update on 12/5/2024
|
||||
> [!IMPORTANT]
|
||||
> [!IMPORTANT]
|
||||
> This repository hosts the *canonical* implementation of WebArena to reproduce the results reported in the paper. The web navigation infrastructure has been significantly enhanced by [AgentLab](https://github.com/ServiceNow/AgentLab/), introducing several key features: (1) support for parallel experiments using [BrowserGym](https://github.com/ServiceNow/BrowserGym), (2) integration of popular web navigation benchmarks (e.g., VisualWebArena) within a unified framework, (3) unified leaderboard reporting, and (4) improved handling of environment edge cases. We strongly recommend using this framework for your experiments.
|
||||
|
||||
## News
|
||||
|
||||
@ -1378,7 +1378,9 @@ def parse_playwright_code(code: str) -> list[ParsedPlaywrightCode]:
|
||||
if isinstance(node, ast.Call):
|
||||
function_name = node.func.id # type: ignore[attr-defined]
|
||||
arguments = [
|
||||
ast.literal_eval(arg) if isinstance(arg, ast.Str) else arg
|
||||
str(ast.literal_eval(arg))
|
||||
if isinstance(arg, ast.Str)
|
||||
else str(arg)
|
||||
for arg in node.args
|
||||
]
|
||||
keywords = {
|
||||
|
||||
@ -20,17 +20,26 @@ We provide AMI which have all the websites pre-installed. You can use the AMI to
|
||||
```
|
||||
AMI Information: find in console, EC2 - AMI Catalog
|
||||
Region: us-east-2
|
||||
Name: webarena
|
||||
ID: ami-06290d70feea35450
|
||||
Name: webarena-with-configurable-map-backend
|
||||
ID: ami-08a862bf98e3bd7aa
|
||||
```
|
||||
|
||||
1. Create a security group that allows all inbound traffic.
|
||||
1. Create a security group that allows all inbound traffic, or at minimum, create a security group with the following inbound rules:
|
||||
- SSH (port 22) from your IP
|
||||
- HTTP (port 80) from anywhere (0.0.0.0/0)
|
||||
- Custom TCP ports: 3000, 7770, 7780, 8023, 8888, 9999 from anywhere (0.0.0.0/0)
|
||||
|
||||
2. Create an instance (recommended type: t3a.xlarge, 1000GB EBS root volume) from the webarena AMI. Use the security group you just created and remember to select an SSH key pair.
|
||||
|
||||
3. Create an Elastic IP and bind to the instance to associate the instance with a static IP and hostname. Take note of the hostname, usually in the form of "ec2-xx-xx-xx-xx.us-east-2.compute.amazonaws.com". This will be used as "<your-server-hostname>" in the following commands.
|
||||
3. **Map Backend Configuration**: Add the following to your instance's user data to automatically configure the map backend:
|
||||
```
|
||||
MAP_BACKEND_IP=YOUR_MAP_BACKEND_IP
|
||||
```
|
||||
Replace `YOUR_MAP_BACKEND_IP` with your map backend server's IP address.
|
||||
|
||||
4. Log into the server and start all Docker containers with:
|
||||
4. Create an Elastic IP and bind to the instance to associate the instance with a static IP and hostname. Take note of the hostname, usually in the form of "ec2-xx-xx-xx-xx.us-east-2.compute.amazonaws.com". This will be used as "<your-server-hostname>" in the following commands.
|
||||
|
||||
5. Log into the server and start all Docker containers with:
|
||||
```bash
|
||||
docker start gitlab
|
||||
docker start shopping
|
||||
@ -43,7 +52,17 @@ docker compose start
|
||||
|
||||
:clock1: wait ~1 min for all services to start
|
||||
|
||||
5. Run
|
||||
**If services are not accessible externally**, run these iptables rules:
|
||||
```bash
|
||||
sudo iptables -t nat -A PREROUTING -p tcp --dport 7770 -j REDIRECT --to-port 7770
|
||||
sudo iptables -t nat -A PREROUTING -p tcp --dport 7780 -j REDIRECT --to-port 7780
|
||||
sudo iptables -t nat -A PREROUTING -p tcp --dport 3000 -j REDIRECT --to-port 3000
|
||||
sudo iptables -t nat -A PREROUTING -p tcp --dport 8888 -j REDIRECT --to-port 8888
|
||||
sudo iptables -t nat -A PREROUTING -p tcp --dport 9999 -j REDIRECT --to-port 9999
|
||||
sudo iptables -t nat -A PREROUTING -p tcp --dport 8023 -j REDIRECT --to-port 8023
|
||||
```
|
||||
|
||||
6. Run
|
||||
```bash
|
||||
docker exec shopping /var/www/magento2/bin/magento setup:store-config:set --base-url="http://<your-server-hostname>:7770" # no trailing /
|
||||
docker exec shopping mysql -u magentouser -pMyPassword magentodb -e 'UPDATE core_config_data SET value="http://<your-server-hostname>:7770/" WHERE path = "web/secure/base_url";'
|
||||
@ -60,6 +79,25 @@ docker exec gitlab sed -i "s|^external_url.*|external_url 'http://<your-server-h
|
||||
docker exec gitlab gitlab-ctl reconfigure
|
||||
```
|
||||
|
||||
**If GitLab shows 502 errors**, run:
|
||||
```bash
|
||||
docker exec gitlab rm -f /var/opt/gitlab/postgresql/data/postmaster.pid
|
||||
docker exec gitlab /opt/gitlab/embedded/bin/pg_resetwal -f /var/opt/gitlab/postgresql/data
|
||||
docker exec gitlab gitlab-ctl restart
|
||||
```
|
||||
|
||||
**Test all services** (should return HTTP 200):
|
||||
```bash
|
||||
HOSTNAME="<your-server-hostname>"
|
||||
curl -s -o /dev/null -w "Shopping (7770): %{http_code}\n" http://$HOSTNAME:7770
|
||||
curl -s -o /dev/null -w "Shopping Admin (7780): %{http_code}\n" http://$HOSTNAME:7780
|
||||
curl -s -o /dev/null -w "Forum (9999): %{http_code}\n" http://$HOSTNAME:9999
|
||||
curl -s -o /dev/null -w "Wikipedia (8888): %{http_code}\n" http://$HOSTNAME:8888
|
||||
curl -s -o /dev/null -w "Map (3000): %{http_code}\n" http://$HOSTNAME:3000
|
||||
curl -s -o /dev/null -w "GitLab (8023): %{http_code}\n" http://$HOSTNAME:8023
|
||||
curl -s -o /dev/null -w "Map tile: %{http_code}\n" http://$HOSTNAME:3000/tile/0/0/0.png
|
||||
```
|
||||
|
||||
You should now be able to access your environment websites, and you can stop reading here.
|
||||
However, if you are unable to use AWS AMI, read below to set up on your own machine.
|
||||
|
||||
@ -185,84 +223,28 @@ flask run --host=0.0.0.0 --port=4399
|
||||
The homepage will be available at `http://<your-server-hostname>:4399`.
|
||||
|
||||
### Map
|
||||
Please refer to the AMI setup for the map frontend setup. For most use cases this is enough.
|
||||
|
||||
If you wish to also set up all map backends, namely tile server, geocoding server and routing server, read along and please be aware of very large downloads and disk space requirements.
|
||||
The WebArena AMI automatically configures the map frontend to use your specified map backend server when you set `MAP_BACKEND_IP=YOUR_MAP_BACKEND_IP` in the user data (as shown in step 3 above). No manual configuration is required.
|
||||
|
||||
#### Tile Server
|
||||
#### Setting up your own map backend
|
||||
|
||||
First download http://metis.lti.cs.cmu.edu/map_server_data/osm_tile_server.tar and extract the docker volumes to your docker volume directory (default to `/var/lib/docker/volumes/`). Make sure that you have `osm-data` volume copied.
|
||||
If you want to run your own tile server, geocoding server, and routing server instead of using the existing AWS infrastructure:
|
||||
|
||||
Then run the tile server:
|
||||
1. **Launch Ubuntu 24.04 LTS instance** (t3a.xlarge, 1000GB storage) in us-east-2
|
||||
- [AWS EC2 Launch Tutorial](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/tutorial-launch-my-first-ec2-instance.html)
|
||||
|
||||
```bash
|
||||
docker run --volume=osm-data:/data/database/ --volume=osm-tiles:/data/tiles/ -p 8080:80 --detach=true overv/openstreetmap-tile-server run
|
||||
```
|
||||
2. **Use automated setup script** as user data during launch:
|
||||
- Copy the contents of `webarena-map-backend-boot-init.yaml` from this repository
|
||||
- Paste it into the "User data" field when launching your instance
|
||||
- [AWS User Data Documentation](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/user-data.html)
|
||||
|
||||
Now, inside the file `webarena/openstreetmap-website/vendor/assets/leaflet/leaflet.osm.js`, change `http://ogma.lti.cs.cmu.edu:8080/tile/{z}/{x}/{y}.png` to `http://<public-url-to-your-tile-server>:8080/tile/{z}/{x}/{y}.png`
|
||||
3. **Wait for setup completion** (60-90 minutes for automatic setup, ~180GB download)
|
||||
|
||||
> [!NOTE]
|
||||
> By default, the `url` in `TileLayer` and `Mapnik` is set to `"http://ogma.lti.cs.cmu.edu:8080/tile/{z}/{x}/{y}.png"`. You can replace it with `https://tile.openstreetmap.org/{z}/{x}/{y}.png` (the official link) as a way to test in case you run into issues during the setup.
|
||||
4. **Update your WebArena frontend** to point to your new backend server:
|
||||
- Set `MAP_BACKEND_IP=<your-backend-server-ip>` when launching your WebArena instances
|
||||
- The AMI will automatically configure all map services to use your backend
|
||||
|
||||
|
||||
#### Geocoding Server
|
||||
First download http://metis.lti.cs.cmu.edu/map_server_data/nominatim_volumes.tar and extract the docker volumes to your docker volume directory (default to `/var/lib/docker/volumes/`). Make sure that you have `nominatim-data` and `nominatim-flatnode` volume copied.
|
||||
|
||||
Also download http://metis.lti.cs.cmu.edu/map_server_data/osm_dump.tar and extract the OSM dump to a host directory `/path/to/osm_dump`, which will be used in the following command.
|
||||
|
||||
|
||||
Then run the geocoding server:
|
||||
```bash
|
||||
docker run --env=IMPORT_STYLE=extratags --env=PBF_PATH=/nominatim/data/us-northeast-latest.osm.pbf --env=IMPORT_WIKIPEDIA=/nominatim/data/wikimedia-importance.sql.gz --volume=/path/to/osm_dump:/nominatim/data --volume=nominatim-data:/var/lib/postgresql/14/main --volume=nominatim-flatnode:/nominatim/flatnode -p 8085:8080 mediagis/nominatim:4.2 /app/start.sh
|
||||
```
|
||||
|
||||
Now, inside the config file `webarena/openstreetmap-website/config/settings.yml`, update the value of `nominatim_url` from `"http://metis.lti.cs.cmu.edu:8085/"` to `"http://<your-geocoding-server-domain>:8085/"`
|
||||
|
||||
|
||||
> [!NOTE]
|
||||
> By default, `nominatim_url` is set to `"http://metis.lti.cs.cmu.edu:8085/"`. However, the [official openstreetmap-website default config file](https://github.com/openstreetmap/openstreetmap-website/blob/edda4af515cfb0bd4de1ed0650b47e124bfad6ed/config/settings.yml#L111) is set to `"https://nominatim.openstreetmap.org/"`. You can use that as a way to test in case you run into issues during the setup.
|
||||
|
||||
|
||||
#### Routing Server
|
||||
|
||||
First download http://metis.lti.cs.cmu.edu/map_server_data/osrm_routing.tar and extract all the directories to your local path.
|
||||
Make sure to have `/your/routing/path/<foot, car, bike>`, which will be used in 3 different routing endpoints.
|
||||
|
||||
Then run the 3 routing servers:
|
||||
```bash
|
||||
docker run --volume=/your/routing/path/car:/data -p 5000:5000 ghcr.io/project-osrm/osrm-backend osrm-routed --algorithm mld /data/us-northeast-latest.osrm
|
||||
docker run --volume=/your/routing/path/bike:/data -p 5001:5000 ghcr.io/project-osrm/osrm-backend osrm-routed --algorithm mld /data/us-northeast-latest.osrm
|
||||
docker run --volume=/your/routing/path/foot:/data -p 5002:5000 ghcr.io/project-osrm/osrm-backend osrm-routed --algorithm mld /data/us-northeast-latest.osrm
|
||||
```
|
||||
|
||||
Now, inside the config file `webarena/openstreetmap-website/config/settings.yml`, update the value of `fossgis_osrm_url` from `"http://metis.lti.cs.cmu.edu:"` to `"http://<your-routing-server-domain>:"` (the routing port for each vehicle type is appended to this value)
|
||||
|
||||
|
||||
> [!NOTE]
|
||||
> By default, `fossgis_osrm_url` is set to `"http://metis.lti.cs.cmu.edu:"`. However, the [official openstreetmap-website default config file](https://github.com/openstreetmap/openstreetmap-website/blob/edda4af515cfb0bd4de1ed0650b47e124bfad6ed/config/settings.yml#L125) is set to `"https://routing.openstreetmap.de/"`. You can use that as a way to test in case you run into issues during the setup.
|
||||
|
||||
|
||||
##### Selecting different routing ports
|
||||
|
||||
The ports 5000, 5001, 5002 are chosen respectively for car, bike and foot inside `webarena/openstreetmap-website/app/assets/javascripts/index/directions/fossgis_osrm.js`
|
||||
|
||||
The mapping looks like this:
|
||||
|
||||
```javascript
|
||||
// ...
|
||||
var vehicleTypePortMapping = {
|
||||
"car": "5000",
|
||||
"bike": "5001",
|
||||
"foot": "5002"
|
||||
}
|
||||
// ...
|
||||
```
|
||||
|
||||
If your port is different, you can update the mapping in the aforementioned file to match your own ports.
|
||||
|
||||
#### Secure header
|
||||
|
||||
The file `webarena/openstreetmap-website/config/initializers/secure_headers.rb` allows you to specify domains for secure serving of images. Specifically, in `csp_policy` > `img_src`, you can add your domain, e.g. `ogma.lti.cs.cmu.edu`. Do not include "http" or "https". You can also use the `*` operator, e.g. `*.openstreetmap.fr`.
|
||||
This automated approach handles all the complex setup including tile server, geocoding server, and routing server configuration.
|
||||
|
||||
### Documentation sites
|
||||
We are still working on dockerizing the documentation sites. Because they are read-only sites that usually don't change rapidly, it is safe to use their live sites for testing purposes for now.
|
||||
|
||||
@ -10,7 +10,7 @@ from pathlib import Path
|
||||
from typing import Any, Tuple, Union
|
||||
|
||||
from beartype import beartype
|
||||
from nltk.tokenize import word_tokenize # type: ignore
|
||||
from nltk.tokenize import word_tokenize
|
||||
from playwright.sync_api import CDPSession, Page
|
||||
|
||||
from browser_env.actions import Action
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
from text_generation import Client # type: ignore
|
||||
from text_generation import Client
|
||||
|
||||
|
||||
def generate_from_huggingface_completion(
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
from typing import Any
|
||||
|
||||
import tiktoken
|
||||
from transformers import LlamaTokenizer # type: ignore
|
||||
from transformers import LlamaTokenizer
|
||||
|
||||
|
||||
class Tokenizer(object):
|
||||
|
||||
@ -8,29 +8,46 @@ import subprocess
|
||||
import time
|
||||
|
||||
SLEEP = 1.5
|
||||
# set the URLs of each website, we use the demo sites as an example
|
||||
os.environ[
|
||||
"SHOPPING"
|
||||
] = "http://ec2-3-131-244-37.us-east-2.compute.amazonaws.com:7770"
|
||||
os.environ[
|
||||
"SHOPPING_ADMIN"
|
||||
] = "http://ec2-3-131-244-37.us-east-2.compute.amazonaws.com:7780/admin"
|
||||
os.environ[
|
||||
"REDDIT"
|
||||
] = "http://ec2-3-131-244-37.us-east-2.compute.amazonaws.com:9999"
|
||||
os.environ[
|
||||
"GITLAB"
|
||||
] = "http://ec2-3-131-244-37.us-east-2.compute.amazonaws.com:8023"
|
||||
os.environ[
|
||||
"MAP"
|
||||
] = "http://ec2-3-131-244-37.us-east-2.compute.amazonaws.com:3000"
|
||||
os.environ[
|
||||
"WIKIPEDIA"
|
||||
] = "http://ec2-3-131-244-37.us-east-2.compute.amazonaws.com:8888/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing"
|
||||
os.environ[
|
||||
"HOMEPAGE"
|
||||
] = "PASS" # The home page is not currently hosted in the demo site
|
||||
print("Done setting up URLs")
|
||||
|
||||
# Check if environment variables are set, if not provide helpful error message
|
||||
required_env_vars = [
|
||||
"SHOPPING",
|
||||
"SHOPPING_ADMIN",
|
||||
"REDDIT",
|
||||
"GITLAB",
|
||||
"MAP",
|
||||
"WIKIPEDIA",
|
||||
"HOMEPAGE",
|
||||
]
|
||||
missing_vars = []
|
||||
|
||||
for var in required_env_vars:
|
||||
if not os.environ.get(var):
|
||||
missing_vars.append(var)
|
||||
|
||||
if missing_vars:
|
||||
print(
|
||||
f"ERROR: Missing required environment variables: {', '.join(missing_vars)}"
|
||||
)
|
||||
print("\nPlease set the following environment variables before running:")
|
||||
print("export SHOPPING='http://YOUR_WEBARENA_SERVER:7770'")
|
||||
print("export SHOPPING_ADMIN='http://YOUR_WEBARENA_SERVER:7780/admin'")
|
||||
print("export REDDIT='http://YOUR_WEBARENA_SERVER:9999'")
|
||||
print("export GITLAB='http://YOUR_WEBARENA_SERVER:8023'")
|
||||
print("export MAP='http://YOUR_WEBARENA_SERVER:3000'")
|
||||
print(
|
||||
"export WIKIPEDIA='http://YOUR_WEBARENA_SERVER:8888/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing'"
|
||||
)
|
||||
print("export HOMEPAGE='PASS'")
|
||||
print(
|
||||
"\nReplace YOUR_WEBARENA_SERVER with your WebArena server's IP address."
|
||||
)
|
||||
print(
|
||||
"Note: 18.208.187.221 is the map backend server, not the WebArena frontend server."
|
||||
)
|
||||
exit(1)
|
||||
|
||||
print("Environment variables are properly configured")
|
||||
|
||||
# First, run `python scripts/generate_test_data.py` to generate the config files
|
||||
p = subprocess.run(
|
||||
|
||||
@ -35,7 +35,7 @@ def get_observation(
|
||||
sleep_after_execution=2.0,
|
||||
)
|
||||
env.reset(options={"config_file": f"scripts/tmp_storage_state.json"})
|
||||
s = f"""page.goto("http://ec2-3-131-244-37.us-east-2.compute.amazonaws.com:7780/admin/admin/dashboard/")
|
||||
s = f"""page.goto("{SHOPPING_ADMIN}/admin/dashboard/")
|
||||
page.get_by_label("", exact=True).fill("reviews")
|
||||
page.get_by_label("", exact=True).press("Enter")
|
||||
page.scroll(down)"""
|
||||
|
||||
18
setup.cfg
18
setup.cfg
@ -13,6 +13,8 @@ dev =
|
||||
nbmake
|
||||
pytest-asyncio
|
||||
types-requests
|
||||
types-setuptools
|
||||
types-flask
|
||||
|
||||
[options]
|
||||
python_requires = >=3.7, <4
|
||||
@ -23,3 +25,19 @@ packages =
|
||||
llms
|
||||
[mypy]
|
||||
strict = true
|
||||
exclude = (?x)(^environment_docker/.*$|^setup\.py$)
|
||||
|
||||
[mypy-text_generation.*]
|
||||
ignore_missing_imports = true
|
||||
|
||||
[mypy-transformers.*]
|
||||
ignore_missing_imports = true
|
||||
|
||||
[mypy-aiolimiter.*]
|
||||
ignore_missing_imports = true
|
||||
|
||||
[mypy-openai.error.*]
|
||||
ignore_missing_imports = true
|
||||
|
||||
[mypy-nltk.*]
|
||||
ignore_missing_imports = true
|
||||
|
||||
47
setup_env.sh
Executable file
47
setup_env.sh
Executable file
@ -0,0 +1,47 @@
|
||||
#!/bin/bash
|
||||
|
||||
# WebArena Environment Setup Script
|
||||
# This script sets up the required environment variables for WebArena
|
||||
#
|
||||
# Usage:
|
||||
# source setup_env.sh <your-server-hostname-or-ip>
|
||||
#
|
||||
# Example:
|
||||
# source setup_env.sh YOUR_WEBARENA_SERVER
|
||||
# source setup_env.sh ec2-xx-xx-xx-xx.us-east-2.compute.amazonaws.com
|
||||
|
||||
if [ $# -eq 0 ]; then
|
||||
echo "Usage: source setup_env.sh <your-server-hostname-or-ip>"
|
||||
echo ""
|
||||
echo "Example:"
|
||||
echo " source setup_env.sh YOUR_SERVER_IP"
|
||||
echo " source setup_env.sh ec2-xx-xx-xx-xx.us-east-2.compute.amazonaws.com"
|
||||
return 1
|
||||
fi
|
||||
|
||||
SERVER_HOST="$1"
|
||||
|
||||
# Remove any trailing slash
|
||||
SERVER_HOST="${SERVER_HOST%/}"
|
||||
|
||||
# Set up environment variables for WebArena websites
|
||||
export SHOPPING="http://${SERVER_HOST}:7770"
|
||||
export SHOPPING_ADMIN="http://${SERVER_HOST}:7780/admin"
|
||||
export REDDIT="http://${SERVER_HOST}:9999"
|
||||
export GITLAB="http://${SERVER_HOST}:8023"
|
||||
export MAP="http://${SERVER_HOST}:3000"
|
||||
export WIKIPEDIA="http://${SERVER_HOST}:8888/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing"
|
||||
export HOMEPAGE="PASS"
|
||||
|
||||
echo "WebArena environment variables set for server: ${SERVER_HOST}"
|
||||
echo ""
|
||||
echo "Environment variables:"
|
||||
echo " SHOPPING=${SHOPPING}"
|
||||
echo " SHOPPING_ADMIN=${SHOPPING_ADMIN}"
|
||||
echo " REDDIT=${REDDIT}"
|
||||
echo " GITLAB=${GITLAB}"
|
||||
echo " MAP=${MAP}"
|
||||
echo " WIKIPEDIA=${WIKIPEDIA}"
|
||||
echo " HOMEPAGE=${HOMEPAGE}"
|
||||
echo ""
|
||||
echo "You can now run WebArena scripts and evaluations."
|
||||
@ -20,13 +20,6 @@ from browser_env import (
|
||||
create_scroll_action,
|
||||
)
|
||||
from browser_env.actions import create_id_based_action
|
||||
from browser_env.env_config import (
|
||||
ACCOUNTS,
|
||||
GITLAB,
|
||||
REDDIT,
|
||||
SHOPPING,
|
||||
SHOPPING_ADMIN,
|
||||
)
|
||||
|
||||
|
||||
def test_script_browser_env(script_browser_env: ScriptBrowserEnv) -> None:
|
||||
@ -130,7 +123,7 @@ def test_parallel_script_browser_env() -> None:
|
||||
# assert is_bearable(info["page"].tolist(), list[DetachedPage])
|
||||
assert info["page"][0].url == "https://www.rfc-editor.org/rfc/rfc2606.html"
|
||||
assert info["page"][1].url == "https://www.rfc-editor.org/rfc/rfc6761.html"
|
||||
vector_env.close() # type: ignore[no-untyped-call]
|
||||
vector_env.close()
|
||||
|
||||
|
||||
def test_focus_placeholder_and_label(
|
||||
@ -191,7 +184,7 @@ def test_accessibility_tree_viewport(
|
||||
accessibility_tree_current_viewport_script_browser_env: ScriptBrowserEnv,
|
||||
) -> None:
|
||||
s1 = "combobox 'Favourite mammal'"
|
||||
s2 = "gridcell 'Canyon bat'"
|
||||
s2 = "cell 'Canyon bat'"
|
||||
s3 = "heading 'Useful links'"
|
||||
env = accessibility_tree_current_viewport_script_browser_env
|
||||
env.reset()
|
||||
@ -216,24 +209,6 @@ def test_accessibility_tree_viewport(
|
||||
assert s1 not in obs["text"] and s2 in obs["text"] and s3 in obs["text"]
|
||||
|
||||
|
||||
def test_multiple_start_url(script_browser_env: ScriptBrowserEnv) -> None:
|
||||
temp_config = tempfile.NamedTemporaryFile("w", delete=False)
|
||||
config = {
|
||||
"require_login": False,
|
||||
"start_url": f"{REDDIT} |AND| {REDDIT}/forums",
|
||||
}
|
||||
json.dump(config, temp_config)
|
||||
temp_config.close()
|
||||
|
||||
env = script_browser_env
|
||||
env.reset(options={"config_file": temp_config.name})
|
||||
assert len(env.context.pages) == 2
|
||||
assert env.context.pages[0].url == f"{REDDIT}/"
|
||||
assert env.context.pages[1].url == f"{REDDIT}/forums", env.context.pages[
|
||||
1
|
||||
].url
|
||||
|
||||
|
||||
def test_observation_tab_information(
|
||||
accessibility_tree_current_viewport_script_browser_env: ScriptBrowserEnv,
|
||||
) -> None:
|
||||
|
||||
@ -1,29 +0,0 @@
|
||||
{
|
||||
"sites": ["shopping"],
|
||||
"task_id": 0,
|
||||
"require_login": true,
|
||||
"storage_state": null,
|
||||
"start_url": null,
|
||||
"geolocation": null,
|
||||
"intent_template": "",
|
||||
"instantiation_dict": {},
|
||||
"intent": "",
|
||||
"require_reset": false,
|
||||
"eval": {
|
||||
"eval_types": ["program_html"],
|
||||
"reference_answers": [],
|
||||
"reference_url": "",
|
||||
"program_html": [
|
||||
{
|
||||
"url": "last",
|
||||
"required_contents": {"must_include": ["80"]},
|
||||
"locator": "func:shopping_get_sku_latest_review_rating('B09BCM56J7')"
|
||||
},
|
||||
{
|
||||
"url": "last",
|
||||
"required_contents": {"must_include": ["cupcakecupcake"]},
|
||||
"locator": "func:shopping_get_sku_latest_review_author('B09BCM56J7')"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
@ -1,29 +0,0 @@
|
||||
{
|
||||
"sites": ["shopping"],
|
||||
"task_id": 0,
|
||||
"require_login": true,
|
||||
"storage_state": null,
|
||||
"start_url": null,
|
||||
"geolocation": null,
|
||||
"intent_template": "",
|
||||
"instantiation_dict": {},
|
||||
"intent": "",
|
||||
"require_reset": false,
|
||||
"eval": {
|
||||
"eval_types": ["program_html"],
|
||||
"reference_answers": [],
|
||||
"reference_url": "",
|
||||
"program_html": [
|
||||
{
|
||||
"url": "last",
|
||||
"required_contents": {"must_include": ["100"]},
|
||||
"locator": "func:shopping_get_sku_latest_review_rating('B09BCM56J7')"
|
||||
},
|
||||
{
|
||||
"url": "last",
|
||||
"required_contents": {"must_include": ["cupcakecupcake"]},
|
||||
"locator": "func:shopping_get_sku_latest_review_author('B09BCM56J7')"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
@ -1,24 +0,0 @@
|
||||
{
|
||||
"sites": ["shopping"],
|
||||
"task_id": 0,
|
||||
"require_login": true,
|
||||
"storage_state": null,
|
||||
"start_url": null,
|
||||
"geolocation": null,
|
||||
"intent_template": "",
|
||||
"instantiation_dict": {},
|
||||
"intent": "",
|
||||
"require_reset": false,
|
||||
"eval": {
|
||||
"eval_types": ["program_html"],
|
||||
"reference_answers": [],
|
||||
"reference_url": "",
|
||||
"program_html": [
|
||||
{
|
||||
"url": "func:reddit_get_post_url('__last_url__')",
|
||||
"locator": "document.querySelector('.submission__inner').outerText",
|
||||
"required_contents": {"must_include": ["How will SPY close on Monday 11/28"]}
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
@ -1,33 +0,0 @@
|
||||
{
|
||||
"sites": [
|
||||
"shopping"
|
||||
],
|
||||
"task_id": 0,
|
||||
"require_login": true,
|
||||
"storage_state": "./.auth/gitlab_state.json",
|
||||
"start_url": null,
|
||||
"geolocation": null,
|
||||
"intent_template": "",
|
||||
"instantiation_dict": {},
|
||||
"intent": "",
|
||||
"require_reset": false,
|
||||
"eval": {
|
||||
"eval_types": [
|
||||
"program_html"
|
||||
],
|
||||
"reference_answers": [],
|
||||
"reference_url": "",
|
||||
"program_html": [
|
||||
{
|
||||
"url": "__GITLAB__/primer/design/-/project_members",
|
||||
"locator": "func:gitlab_get_project_memeber_role(__page__, 'byteblaze')",
|
||||
"required_contents": {"must_include": ["Developer"]}
|
||||
},
|
||||
{
|
||||
"url": "__GITLAB__/primer/design/-/project_members",
|
||||
"locator": "func:gitlab_get_project_memeber_role(__page__, 'primer')",
|
||||
"required_contents": {"must_include": ["Owner"]}
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
@ -1,29 +0,0 @@
|
||||
{
|
||||
"sites": ["gitlab"],
|
||||
"task_id": 0,
|
||||
"require_login": true,
|
||||
"storage_state": "./.auth/gitlab_state.json",
|
||||
"start_url": null,
|
||||
"geolocation": null,
|
||||
"intent_template": "",
|
||||
"instantiation_dict": {},
|
||||
"intent": "",
|
||||
"require_reset": false,
|
||||
"eval": {
|
||||
"eval_types": ["program_html"],
|
||||
"reference_answers": [],
|
||||
"reference_url": "",
|
||||
"program_html": [
|
||||
{
|
||||
"url": "last",
|
||||
"required_contents": {"must_include": ["Hello World"]},
|
||||
"locator": "document.querySelector('[id=\"form-name\"').value"
|
||||
},
|
||||
{
|
||||
"url": "last",
|
||||
"required_contents": {"must_include": ["alexisxy@hotmail.com"]},
|
||||
"locator": "document.querySelector('[id=\"form-email\"').value"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
@ -1,29 +0,0 @@
|
||||
{
|
||||
"sites": ["gitlab"],
|
||||
"task_id": 0,
|
||||
"require_login": true,
|
||||
"storage_state": "./.auth/gitlab_state.json",
|
||||
"start_url": null,
|
||||
"geolocation": null,
|
||||
"intent_template": "",
|
||||
"instantiation_dict": {},
|
||||
"intent": "",
|
||||
"require_reset": false,
|
||||
"eval": {
|
||||
"eval_types": ["program_html"],
|
||||
"reference_answers": [],
|
||||
"reference_url": "",
|
||||
"program_html": [
|
||||
{
|
||||
"url": "last",
|
||||
"required_contents": {"must_include": ["What are mammals?"]},
|
||||
"locator": ""
|
||||
},
|
||||
{
|
||||
"url": "https://www.google.com/",
|
||||
"required_contents": {"must_include": ["Google Search"]},
|
||||
"locator": ""
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
@ -1,30 +0,0 @@
|
||||
{
|
||||
"sites": ["gitlab"],
|
||||
"task_id": 0,
|
||||
"require_login": true,
|
||||
"storage_state": null,
|
||||
"start_url": null,
|
||||
"geolocation": null,
|
||||
"intent_template": "",
|
||||
"instantiation_dict": {},
|
||||
"intent": "",
|
||||
"require_reset": false,
|
||||
"eval": {
|
||||
"eval_types": ["program_html", "url_match"],
|
||||
"reference_answers": [],
|
||||
"reference_url": "https://russmaxdesign.github.io/",
|
||||
"url_note": "GOLD in PRED",
|
||||
"program_html": [
|
||||
{
|
||||
"url": "last",
|
||||
"required_contents": {"must_include": ["Hello World"]},
|
||||
"locator": "document.querySelector('[id=\"form-name\"').value"
|
||||
},
|
||||
{
|
||||
"url": "last",
|
||||
"required_contents": {"must_include": ["alexisxy@hotmail.com"]},
|
||||
"locator": "document.querySelector('[id=\"form-email\"').value"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
@ -1,20 +0,0 @@
|
||||
{
|
||||
"sites": ["reddit"],
|
||||
"task_id": 0,
|
||||
"require_login": true,
|
||||
"storage_state": "./.auth/reddit_state.json",
|
||||
"start_url": null,
|
||||
"geolocation": null,
|
||||
"intent_template": "",
|
||||
"instantiation_dict": {},
|
||||
"intent": "",
|
||||
"require_reset": false,
|
||||
"eval": {
|
||||
"eval_types": ["string_match"],
|
||||
"reference_answers": {
|
||||
"must_include": ["1985/04/18"]
|
||||
},
|
||||
"reference_url": "",
|
||||
"program_html": null
|
||||
}
|
||||
}
|
||||
@ -1,23 +0,0 @@
|
||||
{
|
||||
"sites": ["reddit"],
|
||||
"task_id": 0,
|
||||
"require_login": true,
|
||||
"storage_state": null,
|
||||
"start_url": null,
|
||||
"geolocation": null,
|
||||
"intent_template": "",
|
||||
"instantiation_dict": {},
|
||||
"intent": "",
|
||||
"require_reset": false,
|
||||
"eval": {
|
||||
"eval_types": ["url_match"],
|
||||
"reference_answers": [],
|
||||
"reference_url": "https://www.google.com/",
|
||||
"program_html": [
|
||||
{
|
||||
"url": "",
|
||||
"required_contents": []
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
@ -1,347 +0,0 @@
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
from glob import glob
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
from py import test
|
||||
|
||||
from agent import Agent, TeacherForcingAgent
|
||||
from browser_env import ActionTypes, ScriptBrowserEnv
|
||||
from browser_env.env_config import *
|
||||
from evaluation_harness import (
|
||||
HTMLContentEvaluator,
|
||||
StringEvaluator,
|
||||
URLEvaluator,
|
||||
)
|
||||
from evaluation_harness.evaluators import EvaluatorComb
|
||||
|
||||
IN_GITHUB_ACTIONS = os.getenv("GITHUB_ACTIONS") == "true"
|
||||
HEADLESS = True
|
||||
config_file_folder = "tests/test_evaluation_harness/configs"
|
||||
|
||||
|
||||
def tf_roll_out(
|
||||
agent: Agent, env: ScriptBrowserEnv, config_file: str
|
||||
) -> list[Any]:
|
||||
"""Roll out the agent using teacher forcing actions"""
|
||||
obs, state_info = env.reset(options={"config_file": config_file})
|
||||
|
||||
trajectory: list[Any] = [{"observation": obs, "info": state_info}]
|
||||
while True:
|
||||
action = agent.next_action(
|
||||
trajectory=trajectory, intent="", meta_data={}
|
||||
)
|
||||
trajectory.append(action)
|
||||
if action["action_type"] == ActionTypes.STOP:
|
||||
break
|
||||
|
||||
# preceed to next action
|
||||
obs, reward, terminated, truncated, info = env.step(action)
|
||||
state_info = {"observation": obs, "info": info}
|
||||
trajectory.append(state_info)
|
||||
|
||||
return trajectory
|
||||
|
||||
|
||||
def test_string_match_success(
    script_browser_env: ScriptBrowserEnv,
) -> None:
    """StringEvaluator scores 1.0 when the agent stops with 1985/04/18."""
    config_file = f"{config_file_folder}/string_match.json"

    teacher = TeacherForcingAgent()
    teacher.set_action_set_tag(tag="playwright")
    teacher.set_actions('page.stop("The date is 1985/04/18")')

    trajectory = tf_roll_out(teacher, script_browser_env, config_file)

    evaluator = StringEvaluator()
    score = evaluator(
        trajectory,
        config_file,
        script_browser_env.page,
        script_browser_env.get_page_client(script_browser_env.page),
    )

    assert score == 1.0
|
||||
|
||||
|
||||
def test_string_match_fail(script_browser_env: ScriptBrowserEnv) -> None:
    """StringEvaluator scores 0.0 when the agent stops with 1936/04/18."""
    config_file = f"{config_file_folder}/string_match.json"

    teacher = TeacherForcingAgent()
    teacher.set_action_set_tag(tag="playwright")
    teacher.set_actions('page.stop("The date is 1936/04/18")')

    trajectory = tf_roll_out(teacher, script_browser_env, config_file)

    evaluator = StringEvaluator()
    score = evaluator(
        trajectory,
        config_file,
        script_browser_env.page,
        script_browser_env.get_page_client(script_browser_env.page),
    )

    assert score == 0.0
|
||||
|
||||
|
||||
def test_url_exact_match_success(script_browser_env: ScriptBrowserEnv) -> None:
    """URLEvaluator scores 1.0 after navigating to https://www.google.com/."""
    config_file = f"{config_file_folder}/url_exact_match.json"

    steps = [
        'page.goto("https://www.google.com/")',
        "page.stop()",
    ]
    teacher = TeacherForcingAgent()
    teacher.set_action_set_tag(tag="playwright")
    teacher.set_actions("\n".join(steps))

    trajectory = tf_roll_out(teacher, script_browser_env, config_file)

    evaluator = URLEvaluator()
    score = evaluator(
        trajectory,
        config_file,
        script_browser_env.page,
        script_browser_env.get_page_client(script_browser_env.page),
    )
    assert score == 1.0
|
||||
|
||||
|
||||
def test_url_exact_match_fail(script_browser_env: ScriptBrowserEnv) -> None:
    """URLEvaluator scores 0.0 when the agent ends on the GITLAB site."""
    config_file = f"{config_file_folder}/url_exact_match.json"

    steps = [
        f'page.goto("{GITLAB}")',
        "page.stop()",
    ]
    teacher = TeacherForcingAgent()
    teacher.set_action_set_tag(tag="playwright")
    teacher.set_actions("\n".join(steps))

    trajectory = tf_roll_out(teacher, script_browser_env, config_file)

    evaluator = URLEvaluator()
    score = evaluator(
        trajectory,
        config_file,
        script_browser_env.page,
        script_browser_env.get_page_client(script_browser_env.page),
    )
    # Emit the final URL so it shows up in the captured test output.
    print(script_browser_env.page.url)
    assert score == 0.0
|
||||
|
||||
|
||||
def test_html_content_match_success(
    script_browser_env: ScriptBrowserEnv,
) -> None:
    """HTMLContentEvaluator scores 1.0 on the russmaxdesign exercise page."""
    config_file = f"{config_file_folder}/html_content_exact_match.json"

    steps = [
        'page.goto("https://russmaxdesign.github.io/exercise")',
        "page.stop()",
    ]
    teacher = TeacherForcingAgent()
    teacher.set_action_set_tag(tag="playwright")
    teacher.set_actions("\n".join(steps))

    trajectory = tf_roll_out(teacher, script_browser_env, config_file)

    evaluator = HTMLContentEvaluator()
    score = evaluator(
        trajectory,
        config_file,
        script_browser_env.page,
        script_browser_env.get_page_client(script_browser_env.page),
    )
    assert score == 1.0
|
||||
|
||||
|
||||
def test_html_content_match_fail(script_browser_env: ScriptBrowserEnv) -> None:
    """HTMLContentEvaluator scores 0.0 when the agent ends on google.com."""
    config_file = f"{config_file_folder}/html_content_exact_match.json"

    steps = [
        'page.goto("https://www.google.com/")',
        "page.stop()",
    ]
    teacher = TeacherForcingAgent()
    teacher.set_action_set_tag(tag="playwright")
    teacher.set_actions("\n".join(steps))

    trajectory = tf_roll_out(teacher, script_browser_env, config_file)

    evaluator = HTMLContentEvaluator()
    score = evaluator(
        trajectory,
        config_file,
        script_browser_env.page,
        script_browser_env.get_page_client(script_browser_env.page),
    )
    assert score == 0.0
|
||||
|
||||
|
||||
def test_html_content_element_match_success(
    script_browser_env: ScriptBrowserEnv,
) -> None:
    """Filling "Full name" with "Hello World" yields an element-match of 1.0."""
    config_file = f"{config_file_folder}/html_content_element_exact_match.json"

    steps = [
        'page.goto("https://russmaxdesign.github.io/exercise/")',
        'page.get_by_label("Full name").fill("Hello World")',
        'page.get_by_label("Email").click()',
        'page.get_by_label("Email").fill("alexisxy@hotmail.com")',
        "page.stop()",
    ]
    teacher = TeacherForcingAgent()
    teacher.set_action_set_tag(tag="playwright")
    teacher.set_actions("\n".join(steps))

    trajectory = tf_roll_out(teacher, script_browser_env, config_file)

    evaluator = HTMLContentEvaluator()
    score = evaluator(
        trajectory,
        config_file,
        script_browser_env.page,
        script_browser_env.get_page_client(script_browser_env.page),
    )
    assert score == 1.0
|
||||
|
||||
|
||||
def test_html_content_element_match_fail(
    script_browser_env: ScriptBrowserEnv,
) -> None:
    """Filling "Full name" with only "Hello" yields an element-match of 0.0."""
    config_file = f"{config_file_folder}/html_content_element_exact_match.json"

    steps = [
        'page.goto("https://russmaxdesign.github.io/exercise/")',
        'page.get_by_label("Full name").fill("Hello")',
        'page.get_by_label("Email").click()',
        'page.get_by_label("Email").fill("alexisxy@hotmail.com")',
        "page.stop()",
    ]
    teacher = TeacherForcingAgent()
    teacher.set_action_set_tag(tag="playwright")
    teacher.set_actions("\n".join(steps))

    trajectory = tf_roll_out(teacher, script_browser_env, config_file)

    evaluator = HTMLContentEvaluator()
    score = evaluator(
        trajectory,
        config_file,
        script_browser_env.page,
        script_browser_env.get_page_client(script_browser_env.page),
    )
    assert score == 0.0
|
||||
|
||||
|
||||
def test_html_content_url_comb_success(
    script_browser_env: ScriptBrowserEnv,
) -> None:
    """The combined URL + HTML-content evaluator scores 1.0 on the form flow."""
    config_file = f"{config_file_folder}/html_content_url_comb.json"

    steps = [
        'page.goto("https://russmaxdesign.github.io/exercise/")',
        'page.get_by_label("Full name").fill("Hello World")',
        'page.get_by_label("Email").click()',
        'page.get_by_label("Email").fill("alexisxy@hotmail.com")',
        "page.stop()",
    ]
    teacher = TeacherForcingAgent()
    teacher.set_action_set_tag(tag="playwright")
    teacher.set_actions("\n".join(steps))

    trajectory = tf_roll_out(teacher, script_browser_env, config_file)

    combined = EvaluatorComb([URLEvaluator(), HTMLContentEvaluator()])
    score = combined(
        trajectory,
        config_file,
        script_browser_env.page,
        script_browser_env.get_page_client(script_browser_env.page),
    )
    assert score == 1.0
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    IN_GITHUB_ACTIONS, reason="Won't work using the demo sites"
)
def test_func_success(
    script_browser_env: ScriptBrowserEnv,
) -> None:
    """HTMLContentEvaluator scores 1.0 for func_eval_success.json (stop only)."""
    config_file = f"{config_file_folder}/func_eval_success.json"

    teacher = TeacherForcingAgent()
    teacher.set_action_set_tag(tag="playwright")
    teacher.set_actions("page.stop()")

    trajectory = tf_roll_out(teacher, script_browser_env, config_file)

    evaluator = HTMLContentEvaluator()
    score = evaluator(
        trajectory,
        config_file,
        script_browser_env.page,
        script_browser_env.get_page_client(script_browser_env.page),
    )
    assert score == 1.0
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    IN_GITHUB_ACTIONS, reason="Won't work using the demo sites"
)
def test_func_fail(
    script_browser_env: ScriptBrowserEnv,
) -> None:
    """HTMLContentEvaluator scores 0.0 for func_eval_fail.json (stop only)."""
    config_file = f"{config_file_folder}/func_eval_fail.json"

    teacher = TeacherForcingAgent()
    teacher.set_action_set_tag(tag="playwright")
    teacher.set_actions("page.stop()")

    trajectory = tf_roll_out(teacher, script_browser_env, config_file)

    evaluator = HTMLContentEvaluator()
    score = evaluator(
        trajectory,
        config_file,
        script_browser_env.page,
        script_browser_env.get_page_client(script_browser_env.page),
    )
    assert score == 0.0
|
||||
|
||||
|
||||
def test_func_url_func_last_success(
    script_browser_env: ScriptBrowserEnv,
) -> None:
    """HTMLContentEvaluator scores 1.0 after opening a specific REDDIT comment."""
    config_file = f"{config_file_folder}/func_url_func_1.json"

    steps = [
        f'page.goto("{REDDIT}/f/wallstreetbets/50431/-/comment/676875")',
        "page.stop()",
    ]
    teacher = TeacherForcingAgent()
    teacher.set_action_set_tag(tag="playwright")
    teacher.set_actions("\n".join(steps))

    trajectory = tf_roll_out(teacher, script_browser_env, config_file)

    evaluator = HTMLContentEvaluator()
    score = evaluator(
        trajectory,
        config_file,
        script_browser_env.page,
        script_browser_env.get_page_client(script_browser_env.page),
    )
    assert score == 1.0
|
||||
|
||||
|
||||
def test_func_url_func_page_success(
    script_browser_env: ScriptBrowserEnv,
) -> None:
    """HTMLContentEvaluator scores 1.0 for func_url_func_2.json.

    The config's first two ``program_html`` URLs contain a ``__GITLAB__``
    placeholder. It is replaced with the concrete ``GITLAB`` URL and the
    result is written to a sibling ``.tmp.json`` file, which is removed even
    when the roll-out or the assertion fails.
    """
    config_file = f"{config_file_folder}/func_url_func_2.json"

    # Swap the URL placeholder for the concrete URL in both entries.
    with open(config_file, "r") as f:
        configs = json.load(f)
    for idx in (0, 1):
        entry = configs["eval"]["program_html"][idx]
        entry["url"] = entry["url"].replace("__GITLAB__", GITLAB)

    tmp_config = config_file.replace(".json", ".tmp.json")
    with open(tmp_config, "w") as f:
        json.dump(configs, f, indent=4)

    try:
        agent = TeacherForcingAgent()
        agent.set_action_set_tag(tag="playwright")
        agent.set_actions("page.stop()")

        env = script_browser_env
        trajectory = tf_roll_out(agent, env, tmp_config)

        evaluator = HTMLContentEvaluator()
        score = evaluator(
            trajectory, tmp_config, env.page, env.get_page_client(env.page)
        )
        assert score == 1.0
    finally:
        # Always clean up so a failing run does not leave the temp config behind.
        os.remove(tmp_config)
|
||||
@ -1,31 +0,0 @@
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
from browser_env import ScriptBrowserEnv
|
||||
from browser_env.env_config import *
|
||||
from evaluation_harness.helper_functions import (
|
||||
gitlab_get_project_memeber_role,
|
||||
)
|
||||
|
||||
# Headless-browser flag for this test module.
HEADLESS = True
# Directory in which the temporary config for this test is created.
config_file_folder = "tests/test_evaluation_harness/configs"


def test_gitlab_get_project_memeber_role(
    script_browser_env: ScriptBrowserEnv,
) -> None:
    """Check the roles reported on the primer/design project members page.

    A throwaway config pointing at the stored GitLab auth state is written,
    the environment is reset with it, and the helper is expected to report
    "Developer" for ``byteblaze`` and "Owner" for ``primer``. The temporary
    config is deleted even if navigation or an assertion fails.
    """
    env = script_browser_env
    config_file = f"{config_file_folder}/tmp_config.json"

    with open(config_file, "w") as f:
        json.dump({"storage_state": ".auth/gitlab_state.json"}, f)

    try:
        env.reset(options={"config_file": config_file})
        env.page.goto(f"{GITLAB}/primer/design/-/project_members")

        role1 = gitlab_get_project_memeber_role(env.page, "byteblaze")
        assert role1 == "Developer"
        role2 = gitlab_get_project_memeber_role(env.page, "primer")
        assert role2 == "Owner"
    finally:
        # Remove the temporary config file regardless of the test outcome.
        os.remove(config_file)
|
||||
313
webarena-map-backend-boot-init.yaml
Normal file
313
webarena-map-backend-boot-init.yaml
Normal file
@ -0,0 +1,313 @@
|
||||
#cloud-config
|
||||
# WebArena Map Backend Server Boot-Init Script
|
||||
# Based on successful deployment from trajectory analysis
|
||||
# This script sets up tile server, geocoding server, and routing servers
|
||||
|
||||
# Refresh the package index, but do not upgrade or reboot during first boot.
package_update: true
package_upgrade: false
package_reboot_if_required: false

# APT retry/timeout tuning. The options live in the top-level "Acquire" group
# of apt.conf (not under "APT::"), so they are spelled Acquire::... here.
# The dpkg options keep existing config files without prompting.
apt:
  conf: |
    Acquire::Retries "3";
    Acquire::http::Timeout "30";
    Acquire::https::Timeout "30";
    Dpkg::Options {
      "--force-confdef";
      "--force-confold";
    };

# Base tooling: Docker for the map services, plus download/unpack utilities.
packages:
  - docker.io
  - curl
  - wget
  - htop
  - unzip
|
||||
|
||||
# Provision swap so memory-intensive operations have headroom
bootcmd:
  - |
    # 4GB swap file for large data extractions (reduced from 8GB to save space).
    # Skipped entirely when /swapfile already exists.
    [ -f /swapfile ] || {
      fallocate -l 4G /swapfile
      chmod 600 /swapfile
      mkswap /swapfile
      swapon /swapfile
      echo '/swapfile none swap sw 0 0' >> /etc/fstab
    }
|
||||
|
||||
runcmd:
  # Wait for package locks to be released
  - while fuser /var/lib/dpkg/lock-frontend >/dev/null 2>&1; do echo "Waiting for dpkg lock..."; sleep 5; done
  - while fuser /var/lib/apt/lists/lock >/dev/null 2>&1; do echo "Waiting for apt lock..."; sleep 5; done

  # Enable and start Docker, then give the daemon a moment to come up
  - systemctl enable docker
  - systemctl start docker
  - sleep 10

  # Add ubuntu user to docker group
  - usermod -aG docker ubuntu

  # Create necessary directories
  - mkdir -p /opt/osm_dump /opt/osrm /var/lib/docker/volumes
  - mkdir -p /root/logs

  # Install AWS CLI v2 (awscli package not available in Ubuntu 24.04).
  # -f makes curl fail on HTTP errors instead of saving an error page as the zip.
  - curl -fsSL "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o /tmp/awscliv2.zip
  - unzip /tmp/awscliv2.zip -d /tmp/
  - /tmp/aws/install
  - rm -rf /tmp/awscliv2.zip /tmp/aws

  # Configure AWS CLI for S3 access (no credentials needed for public buckets)
  - mkdir -p /root/.aws
  - |
    cat > /root/.aws/config << 'EOF'
    [default]
    region = us-east-2
    output = json
    EOF

  # Create a comprehensive bootstrap script that runs in background.
  # The heredoc delimiter is quoted, so nothing is expanded while writing it.
  - |
    cat > /root/bootstrap.sh << 'EOF'
    #!/bin/bash
    # Downloads the map data archives, unpacks them, starts the tile,
    # geocoding and routing containers, and logs a health report.
    set -euo pipefail
    # The script owns its log file: all output is appended through tee.
    exec > >(tee -a /var/log/webarena-map-bootstrap.log) 2>&1

    echo "$(date): Starting WebArena map server bootstrap"
    echo "$(date): System info: $(uname -a)"
    echo "$(date): Available memory: $(free -h)"
    echo "$(date): Available disk space: $(df -h)"

    # Check if we have enough disk space (need at least 200GB free).
    # -P keeps each filesystem on one line; -k forces 1K blocks for the math.
    AVAILABLE_GB=$(df -Pk / | awk 'NR==2 {print int($4/1024/1024)}')
    echo "$(date): Available disk space: ${AVAILABLE_GB}GB"
    if [ "$AVAILABLE_GB" -lt 200 ]; then
      echo "$(date): ERROR: Insufficient disk space. Need at least 200GB, have ${AVAILABLE_GB}GB"
      exit 1
    fi

    # Retry a command up to 5 times with exponential backoff (30s, 60s, ...).
    # Returns 0 as soon as the command succeeds, 1 after the last failure.
    retry() {
      local n=1
      local max=5
      local delay=30
      until "$@"; do
        if [[ $n -ge $max ]]; then
          echo "$(date): Command failed after $n attempts: $*"
          return 1
        fi
        n=$((n + 1))
        echo "$(date): Command failed. Attempt $n/$max. Waiting ${delay}s..."
        sleep "$delay"
        delay=$((delay * 2))  # Exponential backoff
      done
    }

    # Report progress of a background job once a minute, then return its
    # exit status. $1 = PID of the job, $2 = human-readable description.
    monitor_extraction() {
      local pid=$1
      local desc=$2
      local exit_code=0
      echo "$(date): Monitoring $desc (PID: $pid)"
      while kill -0 "$pid" 2>/dev/null; do
        echo "$(date): $desc still running..."
        sleep 60
      done
      # Guard the wait: under `set -e` a bare failing `wait` would abort the
      # script here, before the failure below could be logged.
      wait "$pid" || exit_code=$?
      if [ "$exit_code" -eq 0 ]; then
        echo "$(date): ✅ $desc completed successfully"
      else
        echo "$(date): ❌ $desc failed with exit code $exit_code"
        return "$exit_code"
      fi
    }

    # Download and extract data with retries and parallel processing where safe
    echo "$(date): Starting data downloads..."

    # Download all files first (can be done in parallel)
    echo "$(date): Downloading OSM tile server data..."
    retry aws s3 cp --no-sign-request s3://webarena-map-server-data/osm_tile_server.tar /root/osm_tile_server.tar &
    DOWNLOAD_TILE_PID=$!

    echo "$(date): Downloading Nominatim data..."
    retry aws s3 cp --no-sign-request s3://webarena-map-server-data/nominatim_volumes.tar /root/nominatim_volumes.tar &
    DOWNLOAD_NOM_PID=$!

    echo "$(date): Downloading OSM dump..."
    retry aws s3 cp --no-sign-request s3://webarena-map-server-data/osm_dump.tar /root/osm_dump.tar &
    DOWNLOAD_DUMP_PID=$!

    echo "$(date): Downloading OSRM routing data..."
    retry aws s3 cp --no-sign-request s3://webarena-map-server-data/osrm_routing.tar /root/osrm_routing.tar &
    DOWNLOAD_OSRM_PID=$!

    # Wait for all downloads to complete
    echo "$(date): Waiting for downloads to complete..."
    monitor_extraction "$DOWNLOAD_TILE_PID" "OSM tile server download"
    monitor_extraction "$DOWNLOAD_NOM_PID" "Nominatim download"
    monitor_extraction "$DOWNLOAD_DUMP_PID" "OSM dump download"
    monitor_extraction "$DOWNLOAD_OSRM_PID" "OSRM routing download"

    echo "$(date): All downloads completed. Starting extractions..."

    # Extract files sequentially to avoid memory issues and clean up immediately
    echo "$(date): Extracting OSM tile server data..."
    tar -C /var/lib/docker/volumes -xf /root/osm_tile_server.tar
    rm -f /root/osm_tile_server.tar  # Clean up immediately to save space
    echo "$(date): ✅ OSM tile server data extracted and cleaned up"

    echo "$(date): Extracting Nominatim data..."
    tar -C /var/lib/docker/volumes -xf /root/nominatim_volumes.tar
    rm -f /root/nominatim_volumes.tar  # Clean up immediately to save space
    echo "$(date): ✅ Nominatim data extracted and cleaned up"

    echo "$(date): Extracting OSM dump..."
    tar -C /opt/osm_dump -xf /root/osm_dump.tar
    rm -f /root/osm_dump.tar  # Clean up immediately to save space
    echo "$(date): ✅ OSM dump extracted and cleaned up"

    echo "$(date): Extracting OSRM routing data..."
    tar -C /opt/osrm -xf /root/osrm_routing.tar
    rm -f /root/osrm_routing.tar  # Clean up immediately to save space
    echo "$(date): ✅ OSRM routing data extracted and cleaned up"

    # Verify extracted data
    echo "$(date): Verifying extracted data..."
    ls -la /var/lib/docker/volumes/ | head -20
    ls -la /opt/osm_dump/ | head -10
    ls -la /opt/osrm/ | head -10

    # Pull Docker images (retried, since registry pulls can fail transiently)
    echo "$(date): Pulling Docker images..."
    retry docker pull overv/openstreetmap-tile-server
    retry docker pull mediagis/nominatim:4.2
    retry docker pull ghcr.io/project-osrm/osrm-backend:v5.27.1

    # Start containers with restart policies and proper resource limits
    echo "$(date): Starting tile server..."
    docker run --name tile --restart unless-stopped \
      --memory=2g --memory-swap=4g \
      --volume=osm-data:/data/database/ --volume=osm-tiles:/data/tiles/ \
      -p 8080:80 -d overv/openstreetmap-tile-server run

    # Wait a bit for tile server to initialize
    sleep 30

    echo "$(date): Starting Nominatim geocoding server..."
    docker run --name nominatim --restart unless-stopped \
      --memory=4g --memory-swap=8g \
      --env=IMPORT_STYLE=extratags \
      --env=PBF_PATH=/nominatim/data/us-northeast-latest.osm.pbf \
      --env=IMPORT_WIKIPEDIA=/nominatim/data/wikimedia-importance.sql.gz \
      --volume=/opt/osm_dump:/nominatim/data \
      --volume=nominatim-data:/var/lib/postgresql/14/main \
      --volume=nominatim-flatnode:/nominatim/flatnode \
      -p 8085:8080 -d mediagis/nominatim:4.2 /app/start.sh

    # Wait for Nominatim to initialize
    sleep 60

    echo "$(date): Starting OSRM routing servers..."

    # Start OSRM car routing
    docker run --name osrm-car --restart unless-stopped \
      --memory=1g --memory-swap=2g \
      --volume=/opt/osrm/car:/data -p 5000:5000 -d \
      ghcr.io/project-osrm/osrm-backend:v5.27.1 osrm-routed --algorithm mld /data/us-northeast-latest.osrm

    # Start OSRM bike routing
    docker run --name osrm-bike --restart unless-stopped \
      --memory=1g --memory-swap=2g \
      --volume=/opt/osrm/bike:/data -p 5001:5000 -d \
      ghcr.io/project-osrm/osrm-backend:v5.27.1 osrm-routed --algorithm mld /data/us-northeast-latest.osrm

    # Start OSRM foot routing
    docker run --name osrm-foot --restart unless-stopped \
      --memory=1g --memory-swap=2g \
      --volume=/opt/osrm/foot:/data -p 5002:5000 -d \
      ghcr.io/project-osrm/osrm-backend:v5.27.1 osrm-routed --algorithm mld /data/us-northeast-latest.osrm

    echo "$(date): All services started. Waiting for initialization..."
    sleep 120

    echo "$(date): Verifying service health..."
    docker ps --format "table {{.Names}}\t{{.Image}}\t{{.Status}}\t{{.Ports}}"

    # Test service endpoints (failures are reported but do not abort the script)
    echo "$(date): Testing service endpoints..."

    # Test tile server
    if curl -f -s -o /dev/null "http://localhost:8080/tile/0/0/0.png"; then
      echo "$(date): ✅ Tile server is responding"
    else
      echo "$(date): ❌ Tile server is not responding"
    fi

    # Test Nominatim
    if curl -f -s -o /dev/null "http://localhost:8085/search?q=test&format=json&limit=1"; then
      echo "$(date): ✅ Nominatim is responding"
    else
      echo "$(date): ❌ Nominatim is not responding"
    fi

    # Test OSRM services; each entry is <profile>:<host port> as published above
    for entry in car:5000 bike:5001 foot:5002; do
      service=${entry%%:*}
      port=${entry##*:}
      if curl -f -s -o /dev/null "http://localhost:$port/route/v1/$service/-79.9959,40.4406;-79.9,40.45?overview=false"; then
        echo "$(date): ✅ OSRM $service routing is responding"
      else
        echo "$(date): ❌ OSRM $service routing is not responding"
      fi
    done

    # All tar files already cleaned up during extraction

    # Final status report
    echo "$(date): Bootstrap completed!"
    echo "$(date): Final service status:"
    docker ps
    echo "$(date): Available disk space after cleanup:"
    df -h
    echo "$(date): Memory usage:"
    free -h

    # 169.254.169.254 is the AWS Instance Metadata Service (IMDS) endpoint.
    # Look the public IP up once: request an IMDSv2 session token first and
    # fall back to a placeholder if the lookup fails or times out.
    IMDS_TOKEN=$(curl -sf --max-time 5 -X PUT \
      -H "X-aws-ec2-metadata-token-ttl-seconds: 60" \
      "http://169.254.169.254/latest/api/token" || true)
    PUBLIC_IP=$(curl -sf --max-time 5 \
      -H "X-aws-ec2-metadata-token: ${IMDS_TOKEN}" \
      "http://169.254.169.254/latest/meta-data/public-ipv4" || true)
    PUBLIC_IP="${PUBLIC_IP:-<instance-ip>}"

    echo "$(date): Services are available at:"
    echo " - Tile server: http://${PUBLIC_IP}:8080/tile/{z}/{x}/{y}.png"
    echo " - Geocoding: http://${PUBLIC_IP}:8085/"
    echo " - OSRM Car: http://${PUBLIC_IP}:5000/"
    echo " - OSRM Bike: http://${PUBLIC_IP}:5001/"
    echo " - OSRM Foot: http://${PUBLIC_IP}:5002/"

    echo "$(date): Bootstrap script completed successfully!"
    EOF

  # Make bootstrap script executable and run it in background.
  # stdout/stderr go to /dev/null because the script already tees everything
  # into /var/log/webarena-map-bootstrap.log; redirecting to that same file
  # here would record every line twice.
  - chmod +x /root/bootstrap.sh
  - nohup /root/bootstrap.sh > /dev/null 2>&1 &
|
||||
|
||||
# Write a marker file pointing at the bootstrap log.
# write_files content is written verbatim (no shell expansion, so "$(date)"
# would appear literally), and the text is static: it cannot report whether
# provisioning has actually finished, so it only says where to look.
write_files:
  - path: /root/cloud-init-completed
    content: |
      WebArena map backend user-data was applied by cloud-init.
      The bootstrap script is started in the background by runcmd.
      Check /var/log/webarena-map-bootstrap.log for progress
    permissions: '0644'
|
||||
|
||||
# Message shown when cloud-init finishes; the services themselves come up
# later, once the background bootstrap script has done its work.
final_message: |
  WebArena map server cloud-init completed.
  Bootstrap script is running in background.
  Check /var/log/webarena-map-bootstrap.log for progress.
  Services will be available at:
  - Tiles: http://<instance-ip>:8080/tile/{z}/{x}/{y}.png
  - Geocoding: http://<instance-ip>:8085/
  - Routing: http://<instance-ip>:5000 (car), :5001 (bike), :5002 (foot)
|
||||
Loading…
Reference in New Issue
Block a user