
add cost metrics to evaluation outputs for all benchmarks (#2199)

Ryan H. Tran 1 year ago
Parent
Commit
22e8fb39b1
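
All of the diffs below repeat the same two-line change in each benchmark's process_instance: read the cost metrics accumulated in the final agent State and write them into the per-instance output record. A minimal sketch of that pattern, using a hypothetical helper name (attach_metrics) purely for illustration; the actual change inlines these lines in each script:

def attach_metrics(output: dict, state) -> dict:
    # state.metrics is the Metrics object tracked during the run, if any;
    # .get() serializes it to a plain dict so it can be written to the
    # output JSONL next to 'history', 'error', and 'test_result'.
    output['metrics'] = state.metrics.get() if state.metrics else None
    return output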

+ 2 - 0
evaluation/EDA/run_infer.py

@@ -141,6 +141,7 @@ def process_instance(instance, agent_class, metadata, reset_logger: bool = True)
 
     logger.info(f'Final message: {final_message} | Ground truth: {instance["text"]}')
     test_result = game.reward()
+    metrics = state.metrics.get() if state.metrics else None
 
     # Save the output
     output = {
@@ -151,6 +152,7 @@ def process_instance(instance, agent_class, metadata, reset_logger: bool = True)
         'history': [
             (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
         ],
+        'metrics': metrics,
         'error': state.error if state and state.error else None,
         'test_result': {
             'success': test_result,

+ 7 - 5
evaluation/agent_bench/run_infer.py

@@ -20,7 +20,7 @@ from opendevin.core.config import args, config, get_llm_config_arg
 from opendevin.core.logger import get_console_handler
 from opendevin.core.logger import opendevin_logger as logger
 from opendevin.core.main import main
-from opendevin.events.action import MessageAction, CmdRunAction
+from opendevin.events.action import CmdRunAction, MessageAction
 from opendevin.events.serialization.event import event_to_dict
 from opendevin.runtime.docker.ssh_box import DockerSSHBox
 
@@ -82,7 +82,9 @@ def process_instance(
     question = instance.description
     # create a directory for the instance's workspace
     instance_workspace = str(os.path.join(config.workspace_base, inst_id))
-    container_inst_workspace = str(os.path.join(config.workspace_mount_path_in_sandbox, inst_id))
+    container_inst_workspace = str(
+        os.path.join(config.workspace_mount_path_in_sandbox, inst_id)
+    )
     if os.path.exists(instance_workspace):
         shutil.rmtree(instance_workspace)
     os.makedirs(instance_workspace, exist_ok=True)
@@ -149,9 +151,7 @@ def process_instance(
     state: State = asyncio.run(
         main(
             instruction,
-            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
-                agent_class
-            ),
+            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class),
             sandbox=sandbox,
         )
     )
@@ -215,6 +215,7 @@ def process_instance(
     histories = [
         (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
     ]
+    metrics = state.metrics.get() if state.metrics else None
 
     # Save the output
     output = {
@@ -223,6 +224,7 @@ def process_instance(
         'instruction': instruction,
         'metadata': metadata,
         'history': histories,
+        'metrics': metrics,
         'error': state.error if state and state.error else None,
         'test_result': {
             'agent_answer': agent_answer,

+ 2 - 0
evaluation/bird/run_infer.py

@@ -232,6 +232,7 @@ def process_instance(
     # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
     if state is None:
         raise ValueError('State should not be None.')
+    metrics = state.metrics.get() if state.metrics else None
 
     # Save the output
     output = {
@@ -241,6 +242,7 @@ def process_instance(
         'history': [
             (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
         ],
+        'metrics': metrics,
         'error': state.error if state and state.error else None,
         'test_result': test_result,
     }

+ 2 - 0
evaluation/gaia/run_infer.py

@@ -177,6 +177,7 @@ def process_instance(instance, agent_class, metadata, reset_logger: bool = True)
         'model_answer': model_answer,
         'ground_truth': instance['Final answer'],
     }
+    metrics = state.metrics.get() if state.metrics else None
 
     # Save the output
     output = {
@@ -187,6 +188,7 @@ def process_instance(instance, agent_class, metadata, reset_logger: bool = True)
         'history': [
             (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
         ],
+        'metrics': metrics,
         'error': state.error if state and state.error else None,
         'test_result': test_result,
     }

+ 2 - 0
evaluation/humanevalfix/run_infer.py

@@ -221,6 +221,7 @@ def process_instance(
     # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
     if state is None:
         raise ValueError('State should not be None.')
+    metrics = state.metrics.get() if state.metrics else None
 
     # Save the output
     output = {
@@ -230,6 +231,7 @@ def process_instance(
         'history': [
             (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
         ],
+        'metrics': metrics,
         'error': state.error if state and state.error else None,
         'test_result': test_result,
     }

+ 26 - 20
evaluation/logic_reasoning/run_infer.py

@@ -204,8 +204,8 @@ def process_instance(
     instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
 
     sandbox = DockerSSHBox()
-    exit_code, command_output = sandbox.execute(f'pip install scitools-pyke')
-    
+    exit_code, command_output = sandbox.execute('pip install scitools-pyke')
+
     # Here's how you can run the agent (similar to the `main` function) and get the final task state
     state: State = asyncio.run(
         main(
@@ -230,13 +230,16 @@ def process_instance(
         if str(obs.content) in ["'A'", "'B'", "'C'"]:
             final_message = obs.content
             break
-    
+
     final_message = final_message.strip("'")
-    logger.info(f'Predicted answer: {final_message}, Ground truth: {instance["answer"]}')
+    logger.info(
+        f'Predicted answer: {final_message}, Ground truth: {instance["answer"]}'
+    )
 
     test_result = get_test_result(
         model_answer=final_message, ground_truth=instance['answer']
     )
+    metrics = state.metrics.get() if state.metrics else None
 
     # Save the output
     output = {
@@ -247,6 +250,7 @@ def process_instance(
         'history': [
             (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
         ],
+        'metrics': metrics,
         'final_message': final_message,
         'messages': messages,
         'error': state.error if state and state.error else None,
@@ -254,10 +258,10 @@ def process_instance(
     }
     config.workspace_mount_path = old_workspace_mount_path
     config.workspace_base = old_workspace_base
-    
+
     # Close the sandbox
     sandbox.close()
-    
+
     return output
 
 
@@ -272,7 +276,7 @@ if __name__ == '__main__':
     parser.add_argument(
         '--data_split',
         type=str,
-        help='data split to evaluate on {validation}', # right now we only support validation split
+        help='data split to evaluate on {validation}',  # right now we only support validation split
         default='validation',
     )
 
@@ -313,7 +317,7 @@ if __name__ == '__main__':
         'logic_reasoning',
         agent_class,
         dataset_name,
-        model_name + '_maxiter_' + str(max_iterations) + eval_note
+        model_name + '_maxiter_' + str(max_iterations) + eval_note,
     )
 
     pathlib.Path(eval_output_dir).mkdir(parents=True, exist_ok=True)
@@ -414,23 +418,25 @@ if __name__ == '__main__':
         cleanup()
 
     output_fp.close()
-    
+
     with open(output_file, 'r') as f:
-        test_result = [(json.loads(line))["test_result"]["result"] for line in f]
-            
+        test_result = [(json.loads(line))['test_result']['result'] for line in f]
+
     metadata = {
-        "Dataset": dataset_name,
-        "Data split": data_split,
-        "Number of Samples": len(test_result),
+        'Dataset': dataset_name,
+        'Data split': data_split,
+        'Number of Samples': len(test_result),
         'Agent class': agent_class,
         'Model name': model_name,
         'Start_time': start_time,
-        "End_time": time.strftime('%Y-%m-%d %H:%M:%S'),
-        "Final Accuracy": f"{sum(test_result)/len(test_result):.2f}",
-        }
-    
+        'End_time': time.strftime('%Y-%m-%d %H:%M:%S'),
+        'Final Accuracy': f'{sum(test_result)/len(test_result):.2f}',
+    }
+
     with open(os.path.join(eval_output_dir, 'metadata.json'), 'w') as f:
         json.dump(metadata, f, indent=4)
-        
+
     logger.info(f'Metadata: {json.dumps(metadata, indent=4)}')
-    logger.info(f'Evaluation finished. Metadata saved to {eval_output_dir}/metadata.json')
+    logger.info(
+        f'Evaluation finished. Metadata saved to {eval_output_dir}/metadata.json'
+    )

+ 3 - 0
evaluation/mint/run_infer.py

@@ -172,6 +172,8 @@ def process_instance(
         task_state = state.task_state
         logger.info('Task state: ' + str(task_state.to_dict()))
 
+    metrics = state.metrics.get() if state.metrics else None
+
     # Save the output
     output = {
         'id': instance.task_id,
@@ -181,6 +183,7 @@ def process_instance(
         'history': [
             (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
         ],
+        'metrics': metrics,
         'error': state.error if state and state.error else None,
         'test_result': task_state.success if task_state else False,
     }
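
A possible downstream use of the new field, sketched below: summing per-instance cost from an output.jsonl produced by one of these scripts. The 'accumulated_cost' key is an assumption about the dict returned by Metrics.get() and is not confirmed by this diff; adjust it to the keys actually emitted in your checkout.

import json

def total_cost(output_file: str) -> float:
    # Sum the cost recorded in the new 'metrics' field of each JSONL record.
    # Records without metrics (older runs, or runs without a cost tracker)
    # are skipped.
    total = 0.0
    with open(output_file) as f:
        for line in f:
            metrics = json.loads(line).get('metrics')
            if metrics and 'accumulated_cost' in metrics:
                total += metrics['accumulated_cost']
    return total

if __name__ == '__main__':
    print(f"Total LLM cost: {total_cost('output.jsonl'):.4f}")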