1 год назад · 874b4c9075
--- a/evaluation/EDA/run_infer.py
+++ b/evaluation/EDA/run_infer.py
@@ -126,6 +126,7 @@ def process_instance(
 
															         main(
														
 
															             instruction,
														
 
															             fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class),
														
 
															+            sid=instance['text'].strip(),
														
 
															         )
														
 
															     )
														
 
															     # ======= Attempt to evaluate the agent's edits =======
														
--- a/evaluation/agent_bench/run_infer.py
+++ b/evaluation/agent_bench/run_infer.py
@@ -163,6 +163,7 @@ def process_instance(
 
															             instruction,
														
 
															             fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class),
														
 
															             sandbox=sandbox,
														
 
															+            sid=inst_id,
														
 
															         )
														
 
															     )
														
--- a/evaluation/biocoder/run_infer.py
+++ b/evaluation/biocoder/run_infer.py
@@ -213,12 +213,16 @@ def process_instance(
 
															     # NOTE: You can actually set slightly different instruction for different agents
														
 
															     instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
														
 
															+    # use a session id for concurrent evaluation
														
 
															+    sid = instance.test_case_id.replace('/', '__')
														
 
															+
														
 
															     # Here's how you can run the agent (similar to the `main` function) and get the final task state
														
 
															     state: State = asyncio.run(
														
 
															         main(
														
 
															             instruction,
														
 
															             fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class),
														
 
															             sandbox=sandbox,
														
 
															+            sid=sid,
														
 
															         )
														
 
															     )
														
--- a/evaluation/bird/run_infer.py
+++ b/evaluation/bird/run_infer.py
@@ -153,13 +153,16 @@ def process_instance(
 
															     # Set up the database path
														
 
															     database_path = os.path.join(instance.db_id, f'{instance.db_id}.sqlite')
														
 
															+    # use session id for concurrent evaluation
														
 
															+    sid = instance.task_id.replace('/', '__')
														
 
															+
														
 
															     # Set up the logger properly, so you can run multi-processing to parallelize the evaluation
														
 
															     if reset_logger:
														
 
															         # Set up logger
														
 
															         log_file = os.path.join(
														
 
															             eval_output_dir,
														
 
															             'logs',
														
 
															-            f'instance_{instance.task_id.replace("/", "__")}.log',
														
 
															+            f'instance_{sid}.log',
														
 
															         )
														
 
															         # Remove all existing handlers from logger
														
 
															         for handler in logger.handlers[:]:
														
@@ -198,14 +201,12 @@ def process_instance(
 
															         result = execute_sql(db_path, sql)
														
 
															         print(result)
														
 
															     """
														
 
															-    path = os.path.join(
														
 
															-        config.workspace_mount_path, f'{instance.task_id.replace("/", "__")}.py'
														
 
															-    )
														
 
															+    path = os.path.join(config.workspace_mount_path, f'{sid}.py')
														
 
															     instruction = (
														
 
															         f'You are a SQL expert and need to complete the following text-to-SQL tasks.'
														
 
															         f'\n\n{instance.instruction}\n\n'
														
 
															         'Please write the SQL in one line without line breaks.'
														
 
															-        f'And write a new python file named {instance.task_id.replace("/", "__")}.py to call the SQL you wrote.'
														
 
															+        f'And write a new python file named {sid}.py to call the SQL you wrote.'
														
 
															         'You need to follow the code template below:'
														
 
															         f'\n\n{statements}\n\n'
														
 
															         'Environment has been set up for you to start working.'
														
@@ -222,6 +223,7 @@ def process_instance(
 
															         main(
														
 
															             instruction,
														
 
															             fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class),
														
 
															+            sid=sid,
														
 
															         )
														
 
															     )
														
--- a/evaluation/gaia/run_infer.py
+++ b/evaluation/gaia/run_infer.py
@@ -145,6 +145,7 @@ def process_instance(instance, agent_class, metadata, reset_logger: bool = True)
 
															                 fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
														
 
															                     agent_class
														
 
															                 ),
														
 
															+                sid=instance['task_id'],
														
 
															             )
														
 
															         )
														
 
															         # ======= Attempt to evaluate the agent's edits =======
														
--- a/evaluation/gorilla/run_infer.py
+++ b/evaluation/gorilla/run_infer.py
@@ -117,6 +117,7 @@ def process_instance(
 
															                 fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
														
 
															                     agent_class
														
 
															                 ),
														
 
															+                sid=question_id,
														
 
															             )
														
 
															         )
														
 
															         # ======= Attempt to evaluate the agent's edits =======
														
--- a/evaluation/gpqa/run_infer.py
+++ b/evaluation/gpqa/run_infer.py
@@ -251,6 +251,7 @@ def process_instance(
 
															                 fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
														
 
															                     agent_class
														
 
															                 ),
														
 
															+                sid=instance.instance_id,
														
 
															             )
														
 
															         )
														
--- a/evaluation/humanevalfix/run_infer.py
+++ b/evaluation/humanevalfix/run_infer.py
@@ -156,13 +156,16 @@ def process_instance(
 
															         config.workspace_base = workspace_mount_path
														
 
															         config.workspace_mount_path = workspace_mount_path
														
 
															+        # use a session id for concurrent evaluation
														
 
															+        sid = instance.task_id.replace('/', '__')
														
 
															+
														
 
															         # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
														
 
															         if reset_logger:
														
 
															             # Set up logger
														
 
															             log_file = os.path.join(
														
 
															                 eval_output_dir,
														
 
															                 'logs',
														
 
															-                f'instance_{instance.task_id.replace("/", "__")}.log',
														
 
															+                f'instance_{sid}.log',
														
 
															             )
														
 
															             # Remove all existing handlers from logger
														
 
															             for handler in logger.handlers[:]:
														
@@ -189,9 +192,7 @@ def process_instance(
 
															         problem_statement = (
														
 
															             instance.declaration + instance.buggy_solution + '\n' + instance.test
														
 
															         )
														
 
															-        path = os.path.join(
														
 
															-            workspace_mount_path, f'{instance.task_id.replace("/", "__")}.py'
														
 
															-        )
														
 
															+        path = os.path.join(workspace_mount_path, f'{sid}.py')
														
 
															         with open(path, 'w') as f:
														
 
															             f.write(problem_statement)
														
@@ -217,6 +218,7 @@ def process_instance(
 
															                 fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
														
 
															                     agent_class
														
 
															                 ),
														
 
															+                sid=sid,
														
 
															             )
														
 
															         )
														
--- a/evaluation/logic_reasoning/run_infer.py
+++ b/evaluation/logic_reasoning/run_infer.py
@@ -207,7 +207,9 @@ def process_instance(
 
															         # NOTE: You can actually set slightly different instruction for different agents
														
 
															         instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
														
 
															-        sandbox = DockerSSHBox()
														
 
															+        # use a session id for concurrent evaluation
														
 
															+        sid = instance['id'] + '_' + str(os.getpid())
														
 
															+        sandbox = DockerSSHBox(sid=sid)
														
 
															         exit_code, command_output = sandbox.execute('pip install scitools-pyke')
														
 
															         # Here's how you can run the agent (similar to the `main` function) and get the final task state
														
@@ -218,6 +220,7 @@ def process_instance(
 
															                     agent_class
														
 
															                 ),
														
 
															                 sandbox=sandbox,
														
 
															+                sid=sid,
														
 
															             )
														
 
															         )
														
 
															         # ======= Attempt to evaluate the agent's edits =======
														
--- a/evaluation/miniwob/run_infer.py
+++ b/evaluation/miniwob/run_infer.py
@@ -65,6 +65,7 @@ def process_instance(
 
															             'PLACEHOLDER_GOAL',
														
 
															             runtime_tools_config=runtime_tools_config,
														
 
															             sandbox=docker_sandbox,
														
 
															+            sid=env_id,
														
 
															         )
														
 
															     )
														
--- a/evaluation/mint/run_infer.py
+++ b/evaluation/mint/run_infer.py
@@ -116,7 +116,9 @@ def process_instance(
 
															     if not skip_workspace_mount:
														
 
															         logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
														
 
															-    sandbox = DockerSSHBox()
														
 
															+    # use a session id for concurrent processing
														
 
															+    sid = instance.task_id + '_' + str(os.getpid())
														
 
															+    sandbox = DockerSSHBox(sid=sid)
														
 
															     requirements_host_src = 'evaluation/mint/requirements.txt'
														
 
															     requirements_sandbox_dest = '/opendevin/plugins/mint/requirements.txt'
														
@@ -159,6 +161,7 @@ def process_instance(
 
															             instruction,
														
 
															             fake_user_response_fn=fake_user_response_fn,
														
 
															             sandbox=sandbox,
														
 
															+            sid=sid,
														
 
															         )
														
 
															     )
														
--- a/evaluation/ml_bench/run_infer.py
+++ b/evaluation/ml_bench/run_infer.py
@@ -143,8 +143,9 @@ def process_instance(
 
															         logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
														
 
															-        # Create a sandbox, using the instance ID as the session ID to avoid conflicts
														
 
															-        sandbox = DockerSSHBox(sid=str(instance['id']) + '_' + str(os.getpid()))
														
 
															+        # Create a sandbox, using the instance ID and PID as the session ID to avoid conflicts
														
 
															+        sid = str(instance['id']) + '_' + str(os.getpid())
														
 
															+        sandbox = DockerSSHBox(sid=sid)
														
 
															         # Set up the task environment
														
 
															         sandbox.execute(f'conda activate {ID2CONDA[instance["github_id"]]}')
														
@@ -186,6 +187,7 @@ def process_instance(
 
															                     agent_class
														
 
															                 ),
														
 
															                 sandbox=sandbox,
														
 
															+                sid=sid,
														
 
															             )
														
 
															         )
														
 
															         metrics = state.metrics.get() if state.metrics else {}
														
--- a/evaluation/swe_bench/run_infer.py
+++ b/evaluation/swe_bench/run_infer.py
@@ -310,6 +310,7 @@ IMPORTANT TIPS:
 
															             instruction,
														
 
															             fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class),
														
 
															             sandbox=sandbox,
														
 
															+            sid=instance.instance_id,
														
 
															         )
														
 
															     )
														
--- a/evaluation/swe_bench/swe_env_box.py
+++ b/evaluation/swe_bench/swe_env_box.py
@@ -38,7 +38,7 @@ class SWEBenchSSHBox(DockerSSHBox):
 
															             container_image is not None
														
 
															         ), 'container_image is required for SWEBenchSSHBox!'
														
 
															         # Need to run as root to use SWEBench container
														
 
															-        sid = f'swe_bench_{swe_instance_id}' + str(uuid.uuid4())
														
 
															+        sid = f'swe_bench_{swe_instance_id}_' + str(uuid.uuid4())
														
 
															         super().__init__(container_image, timeout, sid)
														
 
															         self.init_plugins(sandbox_plugins)
														
--- a/evaluation/toolqa/run_infer.py
+++ b/evaluation/toolqa/run_infer.py
@@ -108,6 +108,7 @@ def process_instance(task, agent_class, metadata, reset_logger: bool = True):
 
															         main(
														
 
															             instruction,
														
 
															             fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class),
														
 
															+            sid=qid,
														
 
															         )
														
 
															     )
														
 
															     # ======= Attempt to evaluate the agent's edits =======
														
--- a/evaluation/webarena/run_infer.py
+++ b/evaluation/webarena/run_infer.py
@@ -65,6 +65,7 @@ def process_instance(
 
															             'PLACEHOLDER_GOAL',
														
 
															             runtime_tools_config=runtime_tools_config,
														
 
															             sandbox=docker_sandbox,
														
 
															+            sid=env_id,
														
 
															         )
														
 
															     )
														
--- a/opendevin/core/main.py
+++ b/opendevin/core/main.py
@@ -36,6 +36,7 @@ async def main(
 
															     fake_user_response_fn: Optional[Callable[[Optional[State]], str]] = None,
														
 
															     sandbox: Optional[Sandbox] = None,
														
 
															     runtime_tools_config: Optional[dict] = None,
														
 
															+    sid: str | None = None,
														
 
															 ) -> Optional[State]:
														
 
															     """Main coroutine to run the agent controller with task input flexibility.
														
 
															     It's only used when you launch opendevin backend directly via cmdline.
														
@@ -84,7 +85,7 @@ async def main(
 
															     AgentCls: Type[Agent] = Agent.get_cls(args.agent_cls)
														
 
															     agent = AgentCls(llm=llm)
														
 
															-    event_stream = EventStream('main')
														
 
															+    event_stream = EventStream('main' + ('_' + sid if sid else ''))
														
 
															     controller = AgentController(
														
 
															         agent=agent,
														
 
															         max_iterations=args.max_iterations,