Преглед на файлове

Fix: Add missing arguments for DockerSSHBox in evaluation (#3075)

* Fix WebArena evaluation script to connect to SSH session

* Update run_infer.py

* Add missing arguments for DockerSSHBox
மனோஜ்குமார் பழனிச்சாமி преди 1 година
родител
ревизия
563ebd406d

+ 8 - 1
evaluation/agent_bench/run_infer.py

@@ -99,7 +99,14 @@ def process_instance(
     # create sandbox and run the agent
     # =============================================
 
-    sandbox = DockerSSHBox()
+    sandbox = DockerSSHBox(
+        config=config.sandbox,
+        persist_sandbox=False,
+        workspace_mount_path=config.workspace_mount_path,
+        sandbox_workspace_dir=config.workspace_mount_path_in_sandbox,
+        cache_dir=config.cache_dir,
+        run_as_devin=config.run_as_devin,
+    )
     sandbox.execute(f'cd {inst_id}')
 
     init_cmd = instance.init

+ 9 - 1
evaluation/logic_reasoning/run_infer.py

@@ -173,7 +173,15 @@ def process_instance(
 
         # use a session id for concurrent evaluation
         sid = instance['id'] + '_' + str(os.getpid())
-        sandbox = DockerSSHBox(sid=sid)
+        sandbox = DockerSSHBox(
+            config=config.sandbox,
+            persist_sandbox=False,
+            workspace_mount_path=config.workspace_mount_path,
+            sandbox_workspace_dir=config.workspace_mount_path_in_sandbox,
+            cache_dir=config.cache_dir,
+            run_as_devin=config.run_as_devin,
+            sid=sid,
+        )
         exit_code, command_output = sandbox.execute('pip install scitools-pyke')
 
         # Here's how you can run the agent (similar to the `main` function) and get the final task state

+ 9 - 1
evaluation/mint/run_infer.py

@@ -101,7 +101,15 @@ def process_instance(
 
     # use a session id for concurrent processing
     sid = instance.task_id + '_' + str(os.getpid())
-    sandbox = DockerSSHBox(sid=sid)
+    sandbox = DockerSSHBox(
+        config=config.sandbox,
+        persist_sandbox=False,
+        workspace_mount_path=config.workspace_mount_path,
+        sandbox_workspace_dir=config.workspace_mount_path_in_sandbox,
+        cache_dir=config.cache_dir,
+        run_as_devin=config.run_as_devin,
+        sid=sid,
+    )
 
     requirements_host_src = 'evaluation/mint/requirements.txt'
     requirements_sandbox_dest = '/opendevin/plugins/mint/requirements.txt'

+ 9 - 1
evaluation/ml_bench/run_infer.py

@@ -112,7 +112,15 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
 
         # Create a sandbox, using the instance ID and PID as the session ID to avoid conflicts
         sid = str(instance['id']) + '_' + str(os.getpid())
-        sandbox = DockerSSHBox(sid=sid)
+        sandbox = DockerSSHBox(
+            config=config.sandbox,
+            persist_sandbox=False,
+            workspace_mount_path=config.workspace_mount_path,
+            sandbox_workspace_dir=config.workspace_mount_path_in_sandbox,
+            cache_dir=config.cache_dir,
+            run_as_devin=config.run_as_devin,
+            sid=sid,
+        )
 
         # Set up the task environment
         sandbox.execute(f'conda activate {ID2CONDA[instance["github_id"]]}')

+ 8 - 1
evaluation/webarena/run_infer.py

@@ -34,7 +34,14 @@ docker_ssh_box: DockerSSHBox | None = None
 def get_sandbox():
     global docker_ssh_box
     if docker_ssh_box is None:
-        docker_ssh_box = DockerSSHBox()
+        docker_ssh_box = DockerSSHBox(
+            config=config.sandbox,
+            persist_sandbox=False,
+            workspace_mount_path=config.workspace_mount_path,
+            sandbox_workspace_dir=config.workspace_mount_path_in_sandbox,
+            cache_dir=config.cache_dir,
+            run_as_devin=config.run_as_devin,
+        )
     return docker_ssh_box