
Remove global args (#2760)

* Remove global args

* Remove global args

* Update files

* Update main

* Bug fixes

* Fix logging
Graham Neubig · 1 year ago · parent commit ffd3c7144c
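
This commit removes the module-level `args` global from `opendevin.core.config` and moves agent construction out of `opendevin.core.main.main` (renamed `run_agent_controller`). Every entry point now parses arguments itself, builds the agent once, and passes it down. A minimal before/after sketch of the pattern the hunks below repeat, assembled from the diff itself (the task string is a placeholder):

    import asyncio

    from opendevin.controller.agent import Agent
    from opendevin.core.config import config, parse_arguments
    from opendevin.core.main import run_agent_controller
    from opendevin.llm.llm import LLM

    # Before: main(task, ...) read a module-level `args` that was parsed as a
    # side effect of importing opendevin.core.config, and built the agent itself.

    # After: the caller owns both steps.
    args = parse_arguments()
    agent = Agent.get_cls(args.agent_cls)(llm=LLM(config.llm))
    state = asyncio.run(run_agent_controller(agent, 'PLACEHOLDER_TASK'))
    # run_agent_controller returns State | None, so callers check before use.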

+ 16 - 7
evaluation/EDA/run_infer.py

@@ -13,15 +13,17 @@ from datasets import load_dataset
 from tqdm import tqdm
 
 from evaluation.EDA.game import Q20Game, Q20GameCelebrity
+from opendevin.controller.agent import Agent
 
 # from evaluation.EDA.scorer import question_scorer
 from opendevin.controller.state.state import State
 from opendevin.core.config import config, get_llm_config_arg, get_parser
 from opendevin.core.logger import get_console_handler
 from opendevin.core.logger import opendevin_logger as logger
-from opendevin.core.main import main
+from opendevin.core.main import run_agent_controller
 from opendevin.events.action import MessageAction
 from opendevin.events.serialization.event import event_to_dict
+from opendevin.llm.llm import LLM
 
 game = None
 
@@ -42,6 +44,7 @@ def codeact_user_response(state: State) -> str:
             if isinstance(act, MessageAction) and act.source == 'agent':
                 model_guess = act.content
                 break
+    assert game is not None, 'Game is not initialized.'
     msg = game.generate_user_response(model_guess)
     game.curr_turn += 1
     logger.info(f'Model guess: {model_guess}')
@@ -66,7 +69,7 @@ AGENT_CLS_TO_INST_SUFFIX = {
 
 
 def process_instance(
-    instance, agent_class, metadata, openai_api_key, reset_logger: bool = True
+    agent: Agent, instance, metadata, openai_api_key, reset_logger: bool = True
 ):
     # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
     eval_output_dir = metadata['eval_output_dir']
@@ -118,14 +121,17 @@ def process_instance(
 
     # instruction += 'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
     # NOTE: You can actually set slightly different instruction for different agents
-    instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
+    instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
 
     # Here's how you can run the agent (similar to the `main` function) and get the final task state
 
-    state: State = asyncio.run(
-        main(
+    state: State | None = asyncio.run(
+        run_agent_controller(
+            agent,
             instruction,
-            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class),
+            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
+                agent.__class__.__name__
+            ],
             sid=instance['text'].strip(),
         )
     )
@@ -309,6 +315,9 @@ if __name__ == '__main__':
     num_workers = args.eval_num_workers
     logger.info(f'Using {num_workers} workers for evaluation.')
 
+    # Create the agent
+    agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
+
     try:
         with ProcessPoolExecutor(num_workers) as executor:
             futures = []
@@ -316,8 +325,8 @@ if __name__ == '__main__':
             for instance in eda_dataset:
                 future = executor.submit(
                     process_instance,
+                    agent,
                     instance,
-                    agent_class,
                     metadata,
                     args.OPENAI_API_KEY,
                     reset_logger=bool(num_workers > 1),
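
A second, subtler change repeats in most scripts below: the per-agent lookup tables are indexed with `agent.__class__.__name__`, and several switch from `.get(agent_class, '')` to a hard `[...]` lookup, so an unsupported agent class now fails fast with a `KeyError` instead of silently receiving an empty suffix. A minimal sketch with placeholder table contents:

    AGENT_CLS_TO_INST_SUFFIX = {'CodeActAgent': 'PLACEHOLDER_SUFFIX'}

    def inst_suffix(agent):
        # Raises KeyError for agents missing from the table rather than
        # appending '' silently, as the old .get(agent_class, '') did.
        return AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]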

+ 18 - 9
evaluation/agent_bench/run_infer.py

@@ -19,13 +19,15 @@ from evaluation.agent_bench.helper import (
     create_sh_file,
     try_parse_answer,
 )
+from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
-from opendevin.core.config import args, config, get_llm_config_arg
+from opendevin.core.config import config, get_llm_config_arg, parse_arguments
 from opendevin.core.logger import get_console_handler
 from opendevin.core.logger import opendevin_logger as logger
-from opendevin.core.main import main
+from opendevin.core.main import run_agent_controller
 from opendevin.events.action import CmdRunAction, MessageAction
 from opendevin.events.serialization.event import event_to_dict
+from opendevin.llm.llm import LLM
 from opendevin.runtime.docker.ssh_box import DockerSSHBox
 
 
@@ -78,8 +80,8 @@ AGENT_CLS_TO_INST_SUFFIX = {
 
 
 def process_instance(
+    agent,
     instance,
-    agent_class,
     metadata,
     eval_output_dir,
     reset_logger: bool = True,
@@ -138,7 +140,7 @@ def process_instance(
         'to you AND NEVER ASK FOR HUMAN HELP.\n'
     )
     # NOTE: You can actually set slightly different instruction for different agents
-    instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
+    instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
 
     # =============================================
     # create sandbox and run the agent
@@ -158,10 +160,13 @@ def process_instance(
         logger.info(f'Init script result: {init_res}')
 
     # Here's how you can run the agent (similar to the `main` function) and get the final task state
-    state: State = asyncio.run(
-        main(
+    state: State | None = asyncio.run(
+        run_agent_controller(
+            agent,
             instruction,
-            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class),
+            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
+                agent.__class__.__name__
+            ],
             sandbox=sandbox,
             sid=inst_id,
         )
@@ -257,10 +262,11 @@ def process_instance(
 
 
 if __name__ == '__main__':
+    args = parse_arguments()
+
     # =============================================
     # load datasets
     # =============================================
-
     dataset = load_dataset('iFurySt/AgentBench')
     agent_bench_tests = dataset['osbench'].to_pandas()
     logger.info(f'Loaded {len(agent_bench_tests)} tests.')
@@ -379,6 +385,9 @@ if __name__ == '__main__':
     num_workers = args.eval_num_workers
     logger.info(f'Using {num_workers} workers for evaluation.')
 
+    # Create the agent
+    agent = Agent.get_cls(agent_cls)(llm=LLM(config.llm))
+
     try:
         with ProcessPoolExecutor(num_workers) as executor:
             futures = []
@@ -386,8 +395,8 @@ if __name__ == '__main__':
             for inst in agent_bench_tests:
                 future = executor.submit(
                     process_instance,
+                    agent,
                     inst,
-                    agent_cls,
                     meta,
                     eval_op_dir,
                     reset_logger=bool(num_workers > 1),

+ 19 - 10
evaluation/biocoder/run_infer.py

@@ -12,15 +12,16 @@ import pandas as pd
 from datasets import load_dataset
 from tqdm import tqdm
 
-import agenthub
 from evaluation.biocoder.biocoder_env_box import BiocoderData, BiocoderSSHBox
+from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
-from opendevin.core.config import args, config, get_llm_config_arg
+from opendevin.core.config import config, get_llm_config_arg, parse_arguments
 from opendevin.core.logger import get_console_handler
 from opendevin.core.logger import opendevin_logger as logger
-from opendevin.core.main import main
+from opendevin.core.main import run_agent_controller
 from opendevin.events.action import MessageAction
 from opendevin.events.serialization.event import event_to_dict
+from opendevin.llm.llm import LLM
 
 
 def cleanup():
@@ -111,8 +112,8 @@ def get_test_result(instance, sandbox, workspace_dir_name):
 
 
 def process_instance(
+    agent: Agent,
     instance,
-    agent_class,
     metadata,
     skip_workspace_mount,
     eval_output_dir,
@@ -169,7 +170,7 @@ def process_instance(
         workspace_dir_name,
         skip_workspace_mount=False,
         workspace_mount_path=workspace_mount_path,
-        sandbox_plugins=agenthub.Agent.get_cls(agent_class).sandbox_plugins,
+        sandbox_plugins=agent.sandbox_plugins,
     )
 
     sandbox.remove_code()
@@ -211,16 +212,19 @@ def process_instance(
     # )
 
     # NOTE: You can actually set slightly different instruction for different agents
-    instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
+    instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
 
     # use a session id for concurrent evaluation
     sid = instance.test_case_id.replace('/', '__')
 
     # Here's how you can run the agent (similar to the `main` function) and get the final task state
-    state: State = asyncio.run(
-        main(
+    state: State | None = asyncio.run(
+        run_agent_controller(
+            agent,
             instruction,
-            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class),
+            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
+                agent.__class__.__name__
+            ],
             sandbox=sandbox,
             sid=sid,
         )
@@ -253,6 +257,8 @@ def process_instance(
 
 
 if __name__ == '__main__':
+    args = parse_arguments()
+
     # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
     # so we don't need to manage file uploading to OpenDevin's repo
     dataset = load_dataset('lilbillbiscuit/biocoder_public')
@@ -369,6 +375,9 @@ if __name__ == '__main__':
     skip_workspace_mount = agent_class == 'CodeActAgent'
     logger.info(f'Skipping workspace mount: {skip_workspace_mount}')
 
+    # Create the agent
+    agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
+
     try:
         with ProcessPoolExecutor(num_workers) as executor:
             futures = []
@@ -376,8 +385,8 @@ if __name__ == '__main__':
             for row_idx, instance in biocoder_tests.iterrows():
                 future = executor.submit(
                     process_instance,
+                    agent,
                     instance,
-                    agent_class,
                     metadata,
                     skip_workspace_mount,
                     eval_output_dir,

+ 17 - 8
evaluation/bird/run_infer.py

@@ -16,13 +16,15 @@ from datasets import load_dataset
 from func_timeout import FunctionTimedOut, func_timeout
 from tqdm import tqdm
 
+from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
-from opendevin.core.config import args, config, get_llm_config_arg
+from opendevin.core.config import config, get_llm_config_arg, parse_arguments
 from opendevin.core.logger import get_console_handler
 from opendevin.core.logger import opendevin_logger as logger
-from opendevin.core.main import main
+from opendevin.core.main import run_agent_controller
 from opendevin.events.action import MessageAction
 from opendevin.events.serialization.event import event_to_dict
+from opendevin.llm.llm import LLM
 
 
 def cleanup():
@@ -126,7 +128,7 @@ def get_test_result(instance, path, timeout=30):
 
 
 def process_instance(
-    instance, agent_class, metadata, skip_workspace_mount, reset_logger: bool = True
+    agent, instance, metadata, skip_workspace_mount, reset_logger: bool = True
 ):
     workspace_mount_path = os.path.join(
         config.workspace_mount_path, 'bird_eval_workspace'
@@ -217,12 +219,15 @@ def process_instance(
         'You SHOULD INCLUDE PROPER INDENTATION in your edit commands.\n'
     )
     # NOTE: You can actually set slightly different instruction for different agents
-    instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
+    instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
     # Here's how you can run the agent (similar to the `main` function) and get the final task state
-    state: State = asyncio.run(
-        main(
+    state: State | None = asyncio.run(
+        run_agent_controller(
+            agent,
             instruction,
-            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class),
+            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
+                agent.__class__.__name__
+            ],
             sid=sid,
         )
     )
@@ -381,6 +386,7 @@ def create_prompt(e, database_path):
 
 
 if __name__ == '__main__':
+    args = parse_arguments()
     # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
     # so we don't need to manage file uploading to OpenDevin's repo
     # Due to the large size of the BIRD database, it cannot be hosted on huggingface datasets, so it needs to be downloaded
@@ -492,6 +498,9 @@ if __name__ == '__main__':
     num_workers = args.eval_num_workers
     logger.info(f'Using {num_workers} workers for evaluation.')
 
+    # Create the agent
+    agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
+
     try:
         with ProcessPoolExecutor(num_workers) as executor:
             futures = []
@@ -499,8 +508,8 @@ if __name__ == '__main__':
             for row_idx, instance in bird_tests.iterrows():
                 future = executor.submit(
                     process_instance,
+                    agent,
                     instance,
-                    agent_class,
                     metadata,
                     skip_workspace_mount=False,
                     reset_logger=bool(num_workers > 1),

+ 15 - 9
evaluation/gaia/run_infer.py

@@ -15,13 +15,15 @@ from datasets import load_dataset
 from tqdm import tqdm
 
 from evaluation.gaia.scorer import question_scorer
+from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
 from opendevin.core.config import config, get_llm_config_arg, get_parser
 from opendevin.core.logger import get_console_handler
 from opendevin.core.logger import opendevin_logger as logger
-from opendevin.core.main import main
+from opendevin.core.main import run_agent_controller
 from opendevin.events.action import CmdRunAction, MessageAction
 from opendevin.events.serialization.event import event_to_dict
+from opendevin.llm.llm import LLM
 
 DATASET_CACHE_DIR = '~/.cache/open-devin/evals/gaia'
 DATASET_CACHE_DIR = os.path.expanduser(DATASET_CACHE_DIR)
@@ -72,7 +74,7 @@ AGENT_CLS_TO_INST_SUFFIX = {
 }
 
 
-def process_instance(instance, agent_class, metadata, reset_logger: bool = True):
+def process_instance(agent, instance, metadata, reset_logger: bool = True):
     # create process-specific workspace dir
     # we will create a workspace directory for EACH process
     # so that different agent don't interfere with each other.
@@ -135,16 +137,17 @@ def process_instance(instance, agent_class, metadata, reset_logger: bool = True)
             'For example: The answer to the question is <solution> 42 </solution>.\n'
         )
         # NOTE: You can actually set slightly different instruction for different agents
-        instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
+        instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent.__class__.__name__, '')
         logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'})
 
         # Here's how you can run the agent (similar to the `main` function) and get the final task state
-        state: State = asyncio.run(
-            main(
+        state: State | None = asyncio.run(
+            run_agent_controller(
+                agent,
                 instruction,
-                fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
-                    agent_class
-                ),
+                fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
+                    agent.__class__.__name__
+                ],
                 sid=instance['task_id'],
             )
         )
@@ -344,6 +347,9 @@ if __name__ == '__main__':
     num_workers = args.eval_num_workers
     logger.info(f'Using {num_workers} workers for evaluation.')
 
+    # Create the agent
+    agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
+
     try:
         with ProcessPoolExecutor(num_workers) as executor:
             futures = []
@@ -351,8 +357,8 @@ if __name__ == '__main__':
             for instance in gaia_tests:
                 future = executor.submit(
                     process_instance,
+                    agent,
                     instance,
-                    agent_class,
                     metadata,
                     reset_logger=bool(num_workers > 1),
                 )

+ 15 - 10
evaluation/gorilla/run_infer.py

@@ -9,15 +9,18 @@ import time
 from concurrent.futures import ProcessPoolExecutor
 
 from tqdm import tqdm
-from utils import encode_question, get_data
 
+from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
 from opendevin.core.config import config, get_llm_config_arg, get_parser
 from opendevin.core.logger import get_console_handler
 from opendevin.core.logger import opendevin_logger as logger
-from opendevin.core.main import main
+from opendevin.core.main import run_agent_controller
 from opendevin.events.action import MessageAction
 from opendevin.events.serialization.event import event_to_dict
+from opendevin.llm.llm import LLM
+
+from .utils import encode_question, get_data
 
 
 def cleanup():
@@ -63,9 +66,7 @@ AGENT_CLS_TO_INST_SUFFIX = {
 }
 
 
-def process_instance(
-    question_id, question, agent_class, metadata, reset_logger: bool = True
-):
+def process_instance(agent, question_id, question, metadata, reset_logger: bool = True):
     # create process-specific workspace dir
     # we will create a workspace directory for EACH process
     # so that different agent don't interfere with each other.
@@ -107,15 +108,16 @@ def process_instance(
         instruction = encode_question(question, metadata['hub'])
         instruction += 'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
         # NOTE: You can actually set slightly different instruction for different agents
-        instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
+        instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
         # logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'})
 
         # Here's how you can run the agent (similar to the `main` function) and get the final task state
-        state: State = asyncio.run(
-            main(
+        state: State | None = asyncio.run(
+            run_agent_controller(
+                agent,
                 instruction,
                 fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
-                    agent_class
+                    agent.__class__.__name__
                 ),
                 sid=question_id,
             )
@@ -295,6 +297,9 @@ if __name__ == '__main__':
             output_fp.flush()
             finished_task_ids.add(output['question_id'])
 
+        # Create the agent
+        agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
+
         # This sets the multi-processing
         num_workers = args.eval_num_workers
         logger.info(f'Using {num_workers} workers for evaluation.')
@@ -308,9 +313,9 @@ if __name__ == '__main__':
                         question = questions[i]
                         future = executor.submit(
                             process_instance,
+                            agent,
                             question_id,
                             question,
-                            agent_class,
                             metadata,
                             reset_logger=bool(num_workers > 1),
                         )
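
The gorilla script (and toolqa and mint below) also switches from a top-level `from utils import ...` to the package-relative `from .utils import ...`. A relative import only resolves when the module runs in package context, so the sketch below assumes invocation as a module, e.g. `python -m evaluation.gorilla.run_infer`:

    # Before: resolved only when the script's own directory was on sys.path.
    # from utils import encode_question, get_data

    # After: resolved against the evaluation.gorilla package.
    from .utils import encode_question, get_data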

+ 14 - 7
evaluation/gpqa/run_infer.py

@@ -33,13 +33,15 @@ import pandas as pd
 from datasets import load_dataset
 from tqdm import tqdm
 
+from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
 from opendevin.core.config import config, get_llm_config_arg, get_parser
 from opendevin.core.logger import get_console_handler
 from opendevin.core.logger import opendevin_logger as logger
-from opendevin.core.main import main
+from opendevin.core.main import run_agent_controller
 from opendevin.events.action import MessageAction
 from opendevin.events.serialization.event import event_to_dict
+from opendevin.llm.llm import LLM
 
 
 def cleanup():
@@ -154,8 +156,8 @@ def convert_instance_dict(instance):
 
 
 def process_instance(
+    agent: Agent,
     instance: dict,
-    agent_class: str,
     metadata: dict,
     skip_workspace_mount: bool,
     eval_output_dir: str,
@@ -242,18 +244,20 @@ def process_instance(
         """
 
         # NOTE: You can actually set slightly different instruction for different agents
-        instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
+        instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
 
         # Here's how you can run the agent (similar to the `main` function) and get the final task state
-        state: State = asyncio.run(
-            main(
+        state: State | None = asyncio.run(
+            run_agent_controller(
+                agent,
                 instruction,
                 fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
-                    agent_class
+                    agent.__class__.__name__
                 ),
                 sid=instance.instance_id,
             )
         )
+        assert state is not None, 'State should not be None.'
 
         # ======= Attempt to evaluate the agent's edits =======
         # get the final message from the state history (default to None if not found)
@@ -441,6 +445,9 @@ if __name__ == '__main__':
     skip_workspace_mount = agent_class == 'CodeActAgent'
     logger.info(f'Skipping workspace mount: {skip_workspace_mount}')
 
+    # Create the agent
+    agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
+
     try:
         with ProcessPoolExecutor(num_workers) as executor:
             futures = []
@@ -448,8 +455,8 @@ if __name__ == '__main__':
             for row_idx, instance in gpqa_dataset.iterrows():
                 future = executor.submit(
                     process_instance,
+                    agent,
                     instance,
-                    agent_class,
                     metadata,
                     skip_workspace_mount,
                     eval_output_dir,

+ 16 - 8
evaluation/humanevalfix/run_infer.py

@@ -24,13 +24,15 @@ from datasets import load_dataset
 from evaluate import load
 from tqdm import tqdm
 
+from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
-from opendevin.core.config import args, config, get_llm_config_arg
+from opendevin.core.config import config, get_llm_config_arg, parse_arguments
 from opendevin.core.logger import get_console_handler
 from opendevin.core.logger import opendevin_logger as logger
-from opendevin.core.main import main
+from opendevin.core.main import run_agent_controller
 from opendevin.events.action import MessageAction
 from opendevin.events.serialization.event import event_to_dict
+from opendevin.llm.llm import LLM
 
 IMPORT_HELPER = {
     'python': [
@@ -136,7 +138,7 @@ def get_test_result(instance, path, language='python', timeout=10):
 
 
 def process_instance(
-    instance, agent_class, metadata, skip_workspace_mount, reset_logger: bool = True
+    agent: Agent, instance, metadata, skip_workspace_mount, reset_logger: bool = True
 ):
     old_workspace_mount_path = config.workspace_mount_path
     old_workspace_base = config.workspace_base
@@ -209,14 +211,15 @@ def process_instance(
             'You SHOULD INCLUDE PROPER INDENTATION in your edit commands.\n'
         )
         # NOTE: You can actually set slightly different instruction for different agents
-        instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
+        instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
 
         # Here's how you can run the agent (similar to the `main` function) and get the final task state
-        state: State = asyncio.run(
-            main(
+        state: State | None = asyncio.run(
+            run_agent_controller(
+                agent,
                 instruction,
                 fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
-                    agent_class
+                    agent.__class__.__name__
                 ),
                 sid=sid,
             )
@@ -254,6 +257,8 @@ def process_instance(
 
 
 if __name__ == '__main__':
+    args = parse_arguments()
+
     # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
     # so we don't need to manage file uploading to OpenDevin's repo
     dataset = load_dataset(
@@ -366,6 +371,9 @@ if __name__ == '__main__':
     num_workers = args.eval_num_workers
     logger.info(f'Using {num_workers} workers for evaluation.')
 
+    # Create the agent
+    agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
+
     try:
         with ProcessPoolExecutor(num_workers) as executor:
             futures = []
@@ -373,8 +381,8 @@ if __name__ == '__main__':
             for row_idx, instance in hefix_tests.iterrows():
                 future = executor.submit(
                     process_instance,
+                    agent,
                     instance,
-                    agent_class,
                     metadata,
                     skip_workspace_mount=False,
                     reset_logger=bool(num_workers > 1),

+ 14 - 9
evaluation/logic_reasoning/run_infer.py

@@ -12,13 +12,15 @@ from datasets import load_dataset
 from tqdm import tqdm
 
 from evaluation.swe_bench.swe_env_box import DockerSSHBox
+from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
 from opendevin.core.config import config, get_llm_config_arg, get_parser
 from opendevin.core.logger import get_console_handler
 from opendevin.core.logger import opendevin_logger as logger
-from opendevin.core.main import main
+from opendevin.core.main import run_agent_controller
 from opendevin.events.action import MessageAction
 from opendevin.events.serialization.event import event_to_dict
+from opendevin.llm.llm import LLM
 
 
 def cleanup():
@@ -103,7 +105,7 @@ def get_choice(answer_str):
 def get_test_result(
     model_answer: str,
     ground_truth: str,
-) -> bool:
+) -> dict[str, bool]:
     gold_answer = ground_truth.replace('(', '').replace(')', '').strip()
     answer_str = model_answer if model_answer is not None else ''
     prediction = get_choice(answer_str)
@@ -128,9 +130,8 @@ def get_test_result(
 
 
 def process_instance(
+    agent,
     instance,
-    agent_class,
-    # metadata,
     dataset_name,
     skip_workspace_mount,
     eval_output_dir,
@@ -205,7 +206,7 @@ def process_instance(
         )
 
         # NOTE: You can actually set slightly different instruction for different agents
-        instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
+        instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
 
         # use a session id for concurrent evaluation
         sid = instance['id'] + '_' + str(os.getpid())
@@ -213,11 +214,12 @@ def process_instance(
         exit_code, command_output = sandbox.execute('pip install scitools-pyke')
 
         # Here's how you can run the agent (similar to the `main` function) and get the final task state
-        state: State = asyncio.run(
-            main(
+        state: State | None = asyncio.run(
+            run_agent_controller(
+                agent,
                 instruction,
                 fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
-                    agent_class
+                    agent.__class__.__name__
                 ),
                 sandbox=sandbox,
                 sid=sid,
@@ -407,6 +409,9 @@ if __name__ == '__main__':
     skip_workspace_mount = False
     logger.info(f'Skipping workspace mount: {skip_workspace_mount}')
 
+    # Create the agent
+    agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
+
     try:
         with ProcessPoolExecutor(num_workers) as executor:
             futures = []
@@ -414,8 +419,8 @@ if __name__ == '__main__':
             for instance in logic_reasoning_tests:
                 future = executor.submit(
                     process_instance,
+                    agent,
                     instance,
-                    agent_class,
                     dataset_name,
                     skip_workspace_mount,
                     eval_output_dir,

+ 13 - 4
evaluation/miniwob/run_infer.py

@@ -10,12 +10,14 @@ import browsergym.miniwob  # noqa F401 register miniwob tasks as gym environments
 import gymnasium as gym
 from tqdm import tqdm
 
+from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
-from opendevin.core.config import args, config, get_llm_config_arg
+from opendevin.core.config import config, get_llm_config_arg, parse_arguments
 from opendevin.core.logger import get_console_handler
 from opendevin.core.logger import opendevin_logger as logger
-from opendevin.core.main import main
+from opendevin.core.main import run_agent_controller
 from opendevin.events.serialization.event import event_to_dict
+from opendevin.llm.llm import LLM
 from opendevin.runtime.docker.ssh_box import DockerSSHBox
 from opendevin.runtime.tools import RuntimeTool
 
@@ -23,6 +25,7 @@ SUPPORTED_AGENT_CLS = {'BrowsingAgent'}
 
 
 def process_instance(
+    agent: Agent,
     env_id: str,
     metadata: dict,
     eval_output_dir: str,
@@ -60,8 +63,9 @@ def process_instance(
         }
     }
 
-    state: State = asyncio.run(
-        main(
+    state: State | None = asyncio.run(
+        run_agent_controller(
+            agent,
             'PLACEHOLDER_GOAL',
             runtime_tools_config=runtime_tools_config,
             sandbox=docker_sandbox,
@@ -108,6 +112,8 @@ def process_instance(
 
 
 if __name__ == '__main__':
+    args = parse_arguments()
+
     env_ids = [
         id for id in gym.envs.registry.keys() if id.startswith('browsergym/miniwob')
     ]
@@ -195,11 +201,14 @@ if __name__ == '__main__':
     )
 
     # =============================================
+    # Create the agent
+    agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
 
     docker_sandbox = DockerSSHBox()
     for env_id in tqdm(env_ids):
         try:
             output = process_instance(
+                agent=agent,
                 env_id=env_id,
                 metadata=metadata,
                 eval_output_dir=eval_output_dir,

+ 18 - 11
evaluation/mint/run_infer.py

@@ -11,21 +11,24 @@ from concurrent.futures import ProcessPoolExecutor
 from typing import Dict
 
 import tasks
-from config_variables import TASK_INFO_MAP
 from datasets import load_dataset
-from datatypes import TaskState
-from env import SimplifiedEnv
-from prompts import ToolPromptTemplate
-from tasks import Task
 from tqdm import tqdm
 
 from evaluation.swe_bench.swe_env_box import DockerSSHBox
+from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
 from opendevin.core.config import config, get_llm_config_arg, get_parser
 from opendevin.core.logger import get_console_handler
 from opendevin.core.logger import opendevin_logger as logger
-from opendevin.core.main import main
+from opendevin.core.main import run_agent_controller
 from opendevin.events.serialization.event import event_to_dict
+from opendevin.llm.llm import LLM
+
+from .config_variables import TASK_INFO_MAP
+from .datatypes import TaskState
+from .env import SimplifiedEnv
+from .prompts import ToolPromptTemplate
+from .tasks import Task
 
 
 def cleanup():
@@ -144,11 +147,11 @@ def process_instance(
     instruction += 'IMPORTANT: You should ONLY interact with the environment provided to you or provide the concise RESULT inside <solution> tag AND NEVER ASK FOR HUMAN HELP.\n'
 
     # NOTE: You can actually set slightly different instruction for different agents
-    instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
+    instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
 
     # Here's how you can run the agent (similar to the `main` function) and get the final task state
     fake_user_response_fn = functools.partial(
-        AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class),
+        AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[agent.__class__.__name__],
         task=instance,
         task_config={
             'max_iterations': metadata['max_iterations'],
@@ -156,8 +159,9 @@ def process_instance(
         },
     )
 
-    state: State = asyncio.run(
-        main(
+    state: State | None = asyncio.run(
+        run_agent_controller(
+            agent,
             instruction,
             fake_user_response_fn=fake_user_response_fn,
             sandbox=sandbox,
@@ -337,6 +341,9 @@ if __name__ == '__main__':
     skip_workspace_mount = agent_class == 'CodeActAgent'
     logger.info(f'Skipping workspace mount: {skip_workspace_mount}')
 
+    # Create the agent
+    agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
+
     try:
         with ProcessPoolExecutor(num_workers) as executor:
             futures = []
@@ -344,8 +351,8 @@ if __name__ == '__main__':
             for instance in mint_dataset:
                 future = executor.submit(
                     process_instance,
+                    agent,
                     instance,
-                    agent_class,
                     metadata,
                     skip_workspace_mount,
                     eval_output_dir,

+ 14 - 7
evaluation/ml_bench/run_infer.py

@@ -27,13 +27,15 @@ from concurrent.futures import ProcessPoolExecutor
 from datasets import load_dataset
 from tqdm import tqdm
 
+from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
 from opendevin.core.config import config, get_llm_config_arg, get_parser
 from opendevin.core.logger import get_console_handler
 from opendevin.core.logger import opendevin_logger as logger
-from opendevin.core.main import main
+from opendevin.core.main import run_agent_controller
 from opendevin.events.action import MessageAction
 from opendevin.events.serialization.event import event_to_dict
+from opendevin.llm.llm import LLM
 from opendevin.runtime.docker.ssh_box import DockerSSHBox
 
 
@@ -99,7 +101,7 @@ ID2CONDA = {
 
 
 def process_instance(
-    instance, agent_class, metadata, eval_output_dir, reset_logger: bool = True
+    agent: Agent, instance, metadata, eval_output_dir, reset_logger: bool = True
 ):
     old_workspace_mount_path = config.workspace_mount_path
     old_workspace_base = config.workspace_base
@@ -177,19 +179,21 @@ def process_instance(
             )
             + 'You should terminate the subprocess after running the task (e.g., call subprocess.Popen(args).wait()).'
         )
-        instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
+        instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
 
         # Run the agent
-        state: State = asyncio.run(
-            main(
+        state: State | None = asyncio.run(
+            run_agent_controller(
+                agent,
                 instruction,
                 fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
-                    agent_class
+                    agent.__class__.__name__
                 ),
                 sandbox=sandbox,
                 sid=sid,
             )
         )
+        assert state is not None
         metrics = state.metrics.get() if state.metrics else {}
 
         # Evaluate the agent's script
@@ -365,14 +369,17 @@ if __name__ == '__main__':
     num_workers = args.eval_num_workers
     logger.info(f'Using {num_workers} workers for evaluation.')
 
+    # Create the agent
+    agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
+
     try:
         with ProcessPoolExecutor(num_workers) as executor:
             futures = []
             for _, instance in enumerate(new_instances):
                 future = executor.submit(
                     process_instance,
+                    agent,
                     instance,
-                    agent_class,
                     metadata,
                     eval_output_dir,
                     reset_logger=bool(num_workers > 1),

+ 20 - 9
evaluation/swe_bench/run_infer.py

@@ -7,6 +7,7 @@ import pathlib
 import subprocess
 import time
 from concurrent.futures import ProcessPoolExecutor
+from typing import Any
 
 import pandas as pd
 import toml
@@ -16,13 +17,15 @@ from tqdm import tqdm
 
 import agenthub
 from evaluation.swe_bench.swe_env_box import SWEBenchSSHBox
+from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
-from opendevin.core.config import args, config, get_llm_config_arg
+from opendevin.core.config import config, get_llm_config_arg, parse_arguments
 from opendevin.core.logger import get_console_handler
 from opendevin.core.logger import opendevin_logger as logger
-from opendevin.core.main import main
+from opendevin.core.main import run_agent_controller
 from opendevin.events.action import MessageAction
 from opendevin.events.serialization.event import event_to_dict
+from opendevin.llm.llm import LLM
 
 USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false') == 'true'
 
@@ -190,8 +193,8 @@ def get_test_result(instance, sandbox, workspace_dir_name):
 
 
 def process_instance(
-    instance: dict,
-    agent_class: str,
+    agent: Agent,
+    instance: Any,
     metadata: dict,
     skip_workspace_mount: bool,
     eval_output_dir: str,
@@ -302,13 +305,16 @@ IMPORTANT TIPS:
         )
 
     # NOTE: You can actually set slightly different instruction for different agents
-    instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
+    instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
 
     # Here's how you can run the agent (similar to the `main` function) and get the final task state
-    state: State = asyncio.run(
-        main(
+    state: State | None = asyncio.run(
+        run_agent_controller(
+            agent,
             instruction,
-            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class),
+            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
+                agent.__class__.__name__
+            ],
             sandbox=sandbox,
             sid=instance.instance_id,
         )
@@ -369,6 +375,8 @@ def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame:
 
 
 if __name__ == '__main__':
+    args = parse_arguments()
+
     # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
     # so we don't need to manage file uploading to OpenDevin's repo
     dataset = load_dataset('princeton-nlp/SWE-bench_Lite')
@@ -488,6 +496,9 @@ if __name__ == '__main__':
     skip_workspace_mount = agent_class == 'CodeActAgent'
     logger.info(f'Skipping workspace mount: {skip_workspace_mount}')
 
+    # Create the agent
+    agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
+
     try:
         with ProcessPoolExecutor(num_workers) as executor:
             futures = []
@@ -495,8 +506,8 @@ if __name__ == '__main__':
             for row_idx, instance in swe_bench_tests.iterrows():
                 future = executor.submit(
                     process_instance,
+                    agent,
                     instance,
-                    agent_class,
                     metadata,
                     skip_workspace_mount,
                     eval_output_dir,

+ 17 - 8
evaluation/toolqa/run_infer.py

@@ -9,15 +9,18 @@ import time
 from concurrent.futures import ProcessPoolExecutor
 
 from tqdm import tqdm
-from utils import download_data, download_tools, encode_question, eval_answer, get_data
 
+from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
 from opendevin.core.config import config, get_llm_config_arg, get_parser
 from opendevin.core.logger import get_console_handler
 from opendevin.core.logger import opendevin_logger as logger
-from opendevin.core.main import main
+from opendevin.core.main import run_agent_controller
 from opendevin.events.action import MessageAction
 from opendevin.events.serialization.event import event_to_dict
+from opendevin.llm.llm import LLM
+
+from .utils import download_data, download_tools, encode_question, eval_answer, get_data
 
 
 def cleanup():
@@ -63,7 +66,7 @@ AGENT_CLS_TO_INST_SUFFIX = {
 }
 
 
-def process_instance(task, agent_class, metadata, reset_logger: bool = True):
+def process_instance(agent: Agent, task, metadata, reset_logger: bool = True):
     # create process-specific workspace dir
     # we will create a workspace directory for EACH process
     # so that different agent don't interfere with each other.
@@ -100,14 +103,17 @@ def process_instance(task, agent_class, metadata, reset_logger: bool = True):
     instruction = encode_question(question)
     instruction += 'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
     # NOTE: You can actually set slightly different instruction for different agents
-    instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
+    instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
     # logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'})
 
     # Here's how you can run the agent (similar to the `main` function) and get the final task state
-    state: State = asyncio.run(
-        main(
+    state: State | None = asyncio.run(
+        run_agent_controller(
+            agent,
             instruction,
-            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class),
+            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
+                agent.__class__.__name__
+            ],
             sid=qid,
         )
     )
@@ -304,6 +310,9 @@ if __name__ == '__main__':
         output_fp.flush()
         finished_task_ids.add(output['qid'])
 
+    # Create the agent
+    agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
+
     # This sets the multi-processing
     num_workers = args.eval_num_workers
     logger.info(f'Using {num_workers} workers for evaluation.')
@@ -315,8 +324,8 @@ if __name__ == '__main__':
                 try:
                     future = executor.submit(
                         process_instance,
+                        agent,
                         task,
-                        agent_class,
                         metadata,
                         reset_logger=bool(num_workers > 1),
                     )

+ 14 - 4
evaluation/webarena/run_infer.py

@@ -10,12 +10,14 @@ import browsergym.webarena  # noqa F401 register webarena tasks as gym environments
 import gymnasium as gym
 from tqdm import tqdm
 
+from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
-from opendevin.core.config import args, config, get_llm_config_arg
+from opendevin.core.config import config, get_llm_config_arg, parse_arguments
 from opendevin.core.logger import get_console_handler
 from opendevin.core.logger import opendevin_logger as logger
-from opendevin.core.main import main
+from opendevin.core.main import run_agent_controller
 from opendevin.events.serialization.event import event_to_dict
+from opendevin.llm.llm import LLM
 from opendevin.runtime.docker.ssh_box import DockerSSHBox
 from opendevin.runtime.tools import RuntimeTool
 
@@ -23,6 +25,7 @@ SUPPORTED_AGENT_CLS = {'BrowsingAgent'}
 
 
 def process_instance(
+    agent: Agent,
     env_id: str,
     metadata: dict,
     eval_output_dir: str,
@@ -60,8 +63,9 @@ def process_instance(
         }
     }
 
-    state: State = asyncio.run(
-        main(
+    state: State | None = asyncio.run(
+        run_agent_controller(
+            agent,
             'PLACEHOLDER_GOAL',
             runtime_tools_config=runtime_tools_config,
             sandbox=docker_sandbox,
@@ -108,6 +112,8 @@ def process_instance(
 
 
 if __name__ == '__main__':
+    args = parse_arguments()
+
     env_ids = [
         id for id in gym.envs.registry.keys() if id.startswith('browsergym/webarena')
     ]
@@ -196,10 +202,14 @@ if __name__ == '__main__':
 
     # =============================================
 
+    # Create the agent
+    agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
+
     docker_sandbox = DockerSSHBox()
     for env_id in tqdm(env_ids):
         try:
             output = process_instance(
+                agent=agent,
                 env_id=env_id,
                 metadata=metadata,
                 eval_output_dir=eval_output_dir,

+ 4 - 1
opendevin/controller/agent_controller.py

@@ -59,7 +59,7 @@ class AgentController:
         agent: Agent,
         event_stream: EventStream,
         sid: str = 'default',
-        max_iterations: int = MAX_ITERATIONS,
+        max_iterations: int | None = MAX_ITERATIONS,
         max_budget_per_task: float | None = MAX_BUDGET_PER_TASK,
         initial_state: State | None = None,
         is_delegate: bool = False,
@@ -86,6 +86,9 @@ class AgentController:
         )
 
         # state from the previous session, state from a parent agent, or a fresh state
+        max_iterations = (
+            max_iterations if max_iterations is not None else MAX_ITERATIONS
+        )
         self.set_initial_state(
             state=initial_state,
             max_iterations=max_iterations,
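
With `max_iterations` widened to `int | None`, intermediate callers such as `run_agent_controller` can forward an optional value and let the controller fall back to `MAX_ITERATIONS`. A minimal usage sketch, assuming an `event_stream` already constructed by the caller:

    # None now means 'use the MAX_ITERATIONS default' instead of a type error.
    controller = AgentController(
        agent=agent,
        event_stream=event_stream,  # assumed to exist in the caller
        max_iterations=None,
        max_budget_per_task=None,
    )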

+ 2 - 5
opendevin/core/config.py

@@ -476,7 +476,7 @@ def get_llm_config_arg(llm_config_arg: str):
 
 
 # Command line arguments
-def get_parser():
+def get_parser() -> argparse.ArgumentParser:
     """
     Get the parser for the command line arguments.
     """
@@ -559,7 +559,7 @@ def get_parser():
     return parser
 
 
-def parse_arguments():
+def parse_arguments() -> argparse.Namespace:
     """
     Parse the command line arguments.
     """
@@ -569,6 +569,3 @@ def parse_arguments():
         config.workspace_base = os.path.abspath(parsed_args.directory)
         print(f'Setting workspace base to {config.workspace_base}')
     return parsed_args
-
-
-args = parse_arguments()
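
The practical effect: importing `opendevin.core.config` no longer parses `sys.argv` as a side effect that every importer triggered. Each entry point opts in explicitly instead; a sketch of the new call-site shape, as used throughout the evaluation scripts above:

    from opendevin.core.config import parse_arguments

    if __name__ == '__main__':
        # Previously this ran implicitly wherever config was imported.
        args = parse_arguments()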

+ 48 - 46
opendevin/core/main.py

@@ -7,7 +7,7 @@ import agenthub  # noqa F401 (we import this to get the agents registered)
 from opendevin.controller import AgentController
 from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
-from opendevin.core.config import args, config, get_llm_config_arg
+from opendevin.core.config import config, get_llm_config_arg, parse_arguments
 from opendevin.core.logger import opendevin_logger as logger
 from opendevin.core.schema import AgentState
 from opendevin.events import EventSource, EventStream, EventStreamSubscriber
@@ -30,8 +30,11 @@ def read_task_from_stdin() -> str:
     return sys.stdin.read()
 
 
-async def main(
-    task_str: str = '',
+async def run_agent_controller(
+    agent: Agent,
+    task_str: str,
+    max_iterations: int | None = None,
+    max_budget_per_task: float | None = None,
     exit_on_message: bool = False,
     fake_user_response_fn: Callable[[State | None], str] | None = None,
     sandbox: Sandbox | None = None,
@@ -48,43 +51,10 @@ async def main(
         sandbox: An optional sandbox to run the agent in.
     """
 
-    # Determine the task source
-    if task_str:
-        task = task_str
-    elif args.file:
-        task = read_task_from_file(args.file)
-    elif args.task:
-        task = args.task
-    elif not sys.stdin.isatty():
-        task = read_task_from_stdin()
-    else:
-        raise ValueError('No task provided. Please specify a task through -t, -f.')
-
-    # only one of model_name or llm_config is required
-    if args.llm_config:
-        # --llm_config
-        # llm_config can contain any of the attributes of LLMConfig
-        llm_config = get_llm_config_arg(args.llm_config)
-
-        if llm_config is None:
-            raise ValueError(f'Invalid toml file, cannot read {args.llm_config}')
-
-        logger.info(
-            f'Running agent {args.agent_cls} (model: {llm_config.model}, llm_config: {args.llm_config}) with task: "{task}"'
-        )
-
-        # create LLM instance with the given config
-        llm = LLM(llm_config=llm_config)
-    else:
-        # --model-name model_name
-        logger.info(
-            f'Running agent {args.agent_cls} (model: {args.model_name}), with task: "{task}"'
-        )
-        llm = LLM(args.model_name)
-
-    # set up the agent
-    AgentCls: Type[Agent] = Agent.get_cls(args.agent_cls)
-    agent = AgentCls(llm=llm)
+    # Logging
+    logger.info(
+        f'Running agent {type(agent)}, model {agent.llm.model_name}, with task: "{task_str}"'
+    )
 
     # set up the event stream
     cli_session = 'main' + ('_' + sid if sid else '')
@@ -102,8 +72,8 @@ async def main(
     # init controller with this initial state
     controller = AgentController(
         agent=agent,
-        max_iterations=args.max_iterations,
-        max_budget_per_task=args.max_budget_per_task,
+        max_iterations=max_iterations,
+        max_budget_per_task=max_budget_per_task,
         event_stream=event_stream,
         initial_state=initial_state,
     )
@@ -124,8 +94,8 @@ async def main(
         with open(
             os.path.join(runtime.browser.eval_dir, 'goal.txt'), 'r', encoding='utf-8'
         ) as f:
-            task = f.read()
-            logger.info(f'Dynamic Eval task: {task}')
+            task_str = f.read()
+            logger.info(f'Dynamic Eval task: {task_str}')
 
     # start event is a MessageAction with the task, either resumed or new
     if config.enable_cli_session and initial_state is not None:
@@ -138,7 +108,7 @@ async def main(
         )
     elif initial_state is None:
         # init with the provided task
-        await event_stream.add_event(MessageAction(content=task), EventSource.USER)
+        await event_stream.add_event(MessageAction(content=task_str), EventSource.USER)
 
     async def on_event(event: Event):
         if isinstance(event, AgentStateChangedObservation):
@@ -174,4 +144,36 @@ async def main(
 
 
 if __name__ == '__main__':
-    asyncio.run(main())
+    args = parse_arguments()
+
+    # Determine the task
+    if args.file:
+        task_str = read_task_from_file(args.file)
+    elif args.task:
+        task_str = args.task
+    elif not sys.stdin.isatty():
+        task_str = read_task_from_stdin()
+    else:
+        raise ValueError('No task provided. Please specify a task through -t, -f.')
+
+    # Figure out the LLM config
+    if args.llm_config:
+        llm_config = get_llm_config_arg(args.llm_config)
+        if llm_config is None:
+            raise ValueError(f'Invalid toml file, cannot read {args.llm_config}')
+        llm = LLM(llm_config=llm_config)
+    else:
+        llm = LLM(model=args.model_name)
+
+    # Create the agent
+    AgentCls: Type[Agent] = Agent.get_cls(args.agent_cls)
+    agent = AgentCls(llm=llm)
+
+    asyncio.run(
+        run_agent_controller(
+            agent=agent,
+            task_str=task_str,
+            max_iterations=args.max_iterations,
+            max_budget_per_task=args.max_budget_per_task,
+        )
+    )
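
Two details of the new signature are worth noting: `run_agent_controller` is annotated to return `State | None` (the evaluation scripts above adjust their annotations, and some assert non-None before use), and it takes `max_iterations` and `max_budget_per_task` directly instead of reading them from a global `args`. A defensive usage sketch mirroring the `__main__` block above:

    state = asyncio.run(
        run_agent_controller(
            agent,
            task_str,
            max_iterations=args.max_iterations,
            max_budget_per_task=args.max_budget_per_task,
        )
    )
    if state is None:
        raise RuntimeError('Agent controller returned no final state.')
    metrics = state.metrics.get() if state.metrics else {}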

+ 52 - 10
tests/integration/test_agent.py

@@ -5,13 +5,16 @@ import subprocess
 
 import pytest
 
+from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
-from opendevin.core.main import main
+from opendevin.core.config import parse_arguments
+from opendevin.core.main import run_agent_controller
 from opendevin.core.schema import AgentState
 from opendevin.events.action import (
     AgentFinishAction,
     AgentRejectAction,
 )
+from opendevin.llm.llm import LLM
 
 workspace_base = os.getenv('WORKSPACE_BASE')
 workspace_mount_path = os.getenv('WORKSPACE_MOUNT_PATH')
@@ -29,7 +32,7 @@ print(f'workspace_mount_path_in_sandbox: {workspace_mount_path_in_sandbox}')
 )
 @pytest.mark.skipif(
     (os.getenv('AGENT') == 'CodeActAgent' or os.getenv('AGENT') == 'CodeActSWEAgent')
-    and os.getenv('SANDBOX_TYPE').lower() != 'ssh',
+    and os.getenv('SANDBOX_TYPE', '').lower() != 'ssh',
     reason='CodeActAgent/CodeActSWEAgent only supports ssh sandbox which is stateful',
 )
 @pytest.mark.skipif(
@@ -38,7 +41,14 @@ print(f'workspace_mount_path_in_sandbox: {workspace_mount_path_in_sandbox}')
 )
 def test_write_simple_script():
     task = "Write a shell script 'hello.sh' that prints 'hello'. Do not ask me for confirmation at any point."
-    final_state: State = asyncio.run(main(task, exit_on_message=True))
+    args = parse_arguments()
+
+    # Create the agent
+    agent = Agent.get_cls(args.agent_cls)(llm=LLM(args.model_name))
+
+    final_state: State | None = asyncio.run(
+        run_agent_controller(agent, task, exit_on_message=True)
+    )
     assert final_state.agent_state == AgentState.STOPPED
     assert final_state.last_error is None
 
@@ -61,7 +71,7 @@ def test_write_simple_script():
 )
 @pytest.mark.skipif(
     (os.getenv('AGENT') == 'CodeActAgent' or os.getenv('AGENT') == 'CodeActSWEAgent')
-    and os.getenv('SANDBOX_TYPE').lower() != 'ssh',
+    and os.getenv('SANDBOX_TYPE', '').lower() != 'ssh',
     reason='CodeActAgent/CodeActSWEAgent only supports ssh sandbox which is stateful',
 )
 @pytest.mark.skipif(
@@ -73,6 +83,7 @@ def test_write_simple_script():
     reason='local sandbox shows environment-dependent absolute path for pwd command',
 )
 def test_edits():
+    args = parse_arguments()
     # Copy workspace artifacts to workspace_base location
     source_dir = os.path.join(os.path.dirname(__file__), 'workspace/test_edits/')
     files = os.listdir(source_dir)
@@ -82,9 +93,14 @@ def test_edits():
             os.remove(dest_file)
         shutil.copy(os.path.join(source_dir, file), dest_file)
 
+    # Create the agent
+    agent = Agent.get_cls(args.agent_cls)(llm=LLM(args.model_name))
+
     # Execute the task
     task = 'Fix typos in bad.txt. Do not ask me for confirmation at any point.'
-    final_state: State = asyncio.run(main(task, exit_on_message=True))
+    final_state: State | None = asyncio.run(
+        run_agent_controller(agent, task, exit_on_message=True)
+    )
     assert final_state.agent_state == AgentState.STOPPED
     assert final_state.last_error is None
 
@@ -108,9 +124,16 @@ Enjoy!
     reason='Currently, only ssh sandbox supports stateful tasks',
 )
 def test_ipython():
+    args = parse_arguments()
+
+    # Create the agent
+    agent = Agent.get_cls(args.agent_cls)(llm=LLM(args.model_name))
+
     # Execute the task
     task = "Use Jupyter IPython to write a text file containing 'hello world' to '/workspace/test.txt'. Do not ask me for confirmation at any point."
-    final_state: State = asyncio.run(main(task, exit_on_message=True))
+    final_state: State | None = asyncio.run(
+        run_agent_controller(agent, task, exit_on_message=True)
+    )
     assert final_state.agent_state == AgentState.STOPPED
     assert final_state.last_error is None
 
@@ -135,10 +158,15 @@ def test_ipython():
     reason='FIXME: local sandbox does not capture stderr',
 )
 def test_simple_task_rejection():
+    args = parse_arguments()
+
+    # Create the agent
+    agent = Agent.get_cls(args.agent_cls)(llm=LLM(args.model_name))
+
     # Give an impossible task to do: cannot write a commit message because
     # the workspace is not a git repo
     task = 'Write a git commit message for the current staging area. Do not ask me for confirmation at any point.'
-    final_state: State = asyncio.run(main(task))
+    final_state: State | None = asyncio.run(run_agent_controller(agent, task))
     assert final_state.agent_state == AgentState.STOPPED
     assert final_state.last_error is None
     assert isinstance(final_state.history[-1][0], AgentRejectAction)
@@ -153,9 +181,16 @@ def test_simple_task_rejection():
     reason='Currently, only ssh sandbox supports stateful tasks',
 )
 def test_ipython_module():
+    args = parse_arguments()
+
+    # Create the agent
+    agent = Agent.get_cls(args.agent_cls)(llm=LLM(args.model_name))
+
     # Execute the task
     task = "Install and import pymsgbox==1.0.9 and print it's version in /workspace/test.txt. Do not ask me for confirmation at any point."
-    final_state: State = asyncio.run(main(task, exit_on_message=True))
+    final_state: State | None = asyncio.run(
+        run_agent_controller(agent, task, exit_on_message=True)
+    )
     assert final_state.agent_state == AgentState.STOPPED
     assert final_state.last_error is None
 
@@ -178,13 +213,20 @@ def test_ipython_module():
 )
 @pytest.mark.skipif(
     (os.getenv('AGENT') == 'CodeActAgent' or os.getenv('AGENT') == 'CodeActSWEAgent')
-    and os.getenv('SANDBOX_TYPE').lower() != 'ssh',
+    and os.getenv('SANDBOX_TYPE', '').lower() != 'ssh',
     reason='CodeActAgent/CodeActSWEAgent only supports ssh sandbox which is stateful',
 )
 def test_browse_internet(http_server):
+    args = parse_arguments()
+
+    # Create the agent
+    agent = Agent.get_cls(args.agent_cls)(llm=LLM(args.model_name))
+
     # Execute the task
     task = 'Browse localhost:8000, and tell me the ultimate answer to life. Do not ask me for confirmation at any point.'
-    final_state: State = asyncio.run(main(task, exit_on_message=True))
+    final_state: State | None = asyncio.run(
+        run_agent_controller(agent, task, exit_on_message=True)
+    )
     assert final_state.agent_state == AgentState.STOPPED
     assert final_state.last_error is None
     assert isinstance(final_state.history[-1][0], AgentFinishAction)
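
One small hardening repeated in the skip conditions above: `os.getenv('SANDBOX_TYPE')` returns `None` when the variable is unset, so chaining `.lower()` onto it raised `AttributeError`; the added `''` default makes the guard safe. A standalone sketch:

    import os

    # Unsafe when SANDBOX_TYPE is unset: None.lower() raises AttributeError.
    # os.getenv('SANDBOX_TYPE').lower() != 'ssh'

    # Safe: fall back to the empty string.
    if os.getenv('SANDBOX_TYPE', '').lower() != 'ssh':
        print('skipping: requires the stateful ssh sandbox')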