Browse Source

Two fixes to swe bench eval (#2831)

* Two fixes to swe bench eval

* Add error message

* Change dumping of metadata
Graham Neubig, 1 year ago
parent
commit
d0384cafdd

+ 1 - 1
evaluation/EDA/run_infer.py

@@ -163,7 +163,7 @@ def process_instance(
         'instance_id': instance['text'].strip(),
         'instance': instance,
         'instruction': instruction,
-        'metadata': metadata,
+        'metadata': metadata.model_dump(),
         'history': [
             (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
         ],

+ 1 - 1
evaluation/agent_bench/run_infer.py

@@ -189,7 +189,7 @@ def process_instance(
         'instance_id': inst_id,
         'instance': instance.to_dict(),
         'instruction': instruction,
-        'metadata': metadata,
+        'metadata': metadata.model_dump(),
         'history': histories,
         'metrics': metrics,
         'error': state.last_error if state and state.last_error else None,

+ 1 - 1
evaluation/biocoder/run_infer.py

@@ -202,7 +202,7 @@ def process_instance(
         'biocoder_instance': instance.to_dict(),
         'instruction': instruction,
         'generated': test_result['metadata']['1_copy_change_code'],
-        'metadata': metadata,
+        'metadata': metadata.model_dump(),
         'history': [
             (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
         ],

+ 1 - 1
evaluation/bird/run_infer.py

@@ -249,7 +249,7 @@ def process_instance(
     output = {
         'task_id': instance.task_id,
         'instruction': instruction,
-        'metadata': metadata,
+        'metadata': metadata.model_dump(),
         'history': [
             (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
         ],

+ 1 - 1
evaluation/gaia/run_infer.py

@@ -171,7 +171,7 @@ def process_instance(
             'instance_id': instance['task_id'],
             'instance': instance,
             'instruction': instance['Question'],
-            'metadata': metadata,
+            'metadata': metadata.model_dump(),
             'history': [
                 (event_to_dict(action), event_to_dict(obs))
                 for action, obs in state.history

+ 1 - 1
evaluation/gorilla/run_infer.py

@@ -150,7 +150,7 @@ def process_instance(agent, question_id, question, metadata, reset_logger: bool
             'hallucination': hallucination,
             'answer_id': 'None',
             'model_id': metadata['model_name'],
-            'metadata': metadata,
+            'metadata': metadata.model_dump(),
             'history': [
                 (event_to_dict(action), event_to_dict(obs))
                 for action, obs in state.history

+ 1 - 1
evaluation/gpqa/run_infer.py

@@ -236,7 +236,7 @@ def process_instance(
             'task_id': instance.task_id,
             'instance_id': instance.instance_id,
             'instruction': instruction,
-            'metadata': metadata,
+            'metadata': metadata.model_dump(),
             'history': [
                 (event_to_dict(action), event_to_dict(obs))
                 for action, obs in state.history

+ 1 - 1
evaluation/humanevalfix/run_infer.py

@@ -206,7 +206,7 @@ def process_instance(
         output = {
             'task_id': instance.task_id,
             'instruction': instruction,
-            'metadata': metadata,
+            'metadata': metadata.model_dump(),
             'history': [
                 (event_to_dict(action), event_to_dict(obs))
                 for action, obs in state.history

+ 1 - 1
evaluation/logic_reasoning/run_infer.py

@@ -222,7 +222,7 @@ def process_instance(
             'id': instance['id'],
             'instance': instance,
             'instruction': instruction,
-            # 'metadata': metadata,
+            # 'metadata': metadata.model_dump(),
             'history': [
                 (event_to_dict(action), event_to_dict(obs))
                 for action, obs in state.history

+ 1 - 1
evaluation/miniwob/run_infer.py

@@ -114,7 +114,7 @@ def process_instance(
     output = {
         'instance_id': env_id,
         'instruction': instruction,
-        'metadata': metadata,
+        'metadata': metadata.model_dump(),
         'history': [
             (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
         ],

+ 1 - 1
evaluation/mint/run_infer.py

@@ -167,7 +167,7 @@ def process_instance(
         'id': instance.task_id,
         'instance': instance.to_dict(),
         'instruction': instruction,
-        'metadata': metadata,
+        'metadata': metadata.model_dump(),
         'history': [
             (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
         ],

+ 1 - 1
evaluation/ml_bench/run_infer.py

@@ -200,7 +200,7 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
             'instance_id': instance['id'],
             'repo': repo_url,
             'instruction': instruction,
-            'metadata': metadata,
+            'metadata': metadata.model_dump(),
             'history': [
                 (event_to_dict(action), event_to_dict(obs))
                 for action, obs in state.history

+ 5 - 4
evaluation/swe_bench/run_infer.py

@@ -176,9 +176,7 @@ def process_instance(
     # Create the agent
     agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
 
-    workspace_mount_path = os.path.join(
-        metadata.config.workspace_mount_path, '_eval_workspace'
-    )
+    workspace_mount_path = os.path.join(config.workspace_mount_path, '_eval_workspace')
     # create process-specific workspace dir
     workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
     pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
@@ -318,7 +316,7 @@ IMPORTANT TIPS:
         'swe_instance': instance.to_dict(),  # SWE Bench specific
         'instruction': instruction,
         'git_patch': git_patch,  # SWE Bench specific
-        'metadata': metadata,
+        'metadata': metadata.model_dump(),
         'history': [
             (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
         ],
@@ -358,6 +356,8 @@ if __name__ == '__main__':
 
     id_column = 'instance_id'
     llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
+    if args.llm_config and llm_config is None:
+        raise ValueError(f'Could not find LLM config {args.llm_config}')
     logger.info(f'Config for evaluation: {config}')
 
     details = {}
@@ -371,6 +371,7 @@ if __name__ == '__main__':
         llm_config,
         'swe-bench-lite',
         args.agent_cls,
+        args.max_iterations,
         args.eval_note,
         args.eval_output_dir,
         details=details,

+ 1 - 1
evaluation/toolqa/run_infer.py

@@ -112,7 +112,7 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
         'correct': correct,
         'answer_id': 'None',
         'model_id': metadata.model_name,
-        'metadata': metadata,
+        'metadata': metadata.model_dump(),
         'history': [
             (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
         ],

+ 1 - 1
evaluation/webarena/run_infer.py

@@ -115,7 +115,7 @@ def process_instance(
     output = {
         'instance_id': env_id,
         'instruction': instruction,
-        'metadata': metadata,
+        'metadata': metadata.model_dump(),
         'history': [
             (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
         ],