
add cost metrics to evaluation outputs for all benchmarks (#2199)

Ryan H. Tran 1 year ago
Parent
Commit
22e8fb39b1
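
All of the diffs below repeat the same two-line change in each benchmark's process_instance: read the cost metrics accumulated in the final agent State and write them into the per-instance output record. A minimal sketch of that pattern, using a hypothetical helper name (attach_metrics) purely for illustration; the actual change inlines these lines in each script:

def attach_metrics(output: dict, state) -> dict:
    # state.metrics is the Metrics object tracked during the run, if any;
    # .get() serializes it to a plain dict so it can be written to the
    # output JSONL next to 'history', 'error', and 'test_result'.
    output['metrics'] = state.metrics.get() if state.metrics else None
    return output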

+ 2 - 0
evaluation/EDA/run_infer.py

@@ -141,6 +141,7 @@ def process_instance(instance, agent_class, metadata, reset_logger: bool = True)
 
     logger.info(f'Final message: {final_message} | Ground truth: {instance["text"]}')
     test_result = game.reward()
+    metrics = state.metrics.get() if state.metrics else None
 
     # Save the output
     output = {
@@ -151,6 +152,7 @@ def process_instance(instance, agent_class, metadata, reset_logger: bool = True)
         'history': [
             (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
         ],
+        'metrics': metrics,
         'error': state.error if state and state.error else None,
         'test_result': {
             'success': test_result,

+ 7 - 5
evaluation/agent_bench/run_infer.py

@@ -20,7 +20,7 @@ from opendevin.core.config import args, config, get_llm_config_arg
 from opendevin.core.logger import get_console_handler
 from opendevin.core.logger import opendevin_logger as logger
 from opendevin.core.main import main
-from opendevin.events.action import MessageAction, CmdRunAction
+from opendevin.events.action import CmdRunAction, MessageAction
 from opendevin.events.serialization.event import event_to_dict
 from opendevin.runtime.docker.ssh_box import DockerSSHBox
 
@@ -82,7 +82,9 @@ def process_instance(
     question = instance.description
     # create a directory for the instance's workspace
     instance_workspace = str(os.path.join(config.workspace_base, inst_id))
-    container_inst_workspace = str(os.path.join(config.workspace_mount_path_in_sandbox, inst_id))
+    container_inst_workspace = str(
+        os.path.join(config.workspace_mount_path_in_sandbox, inst_id)
+    )
     if os.path.exists(instance_workspace):
         shutil.rmtree(instance_workspace)
     os.makedirs(instance_workspace, exist_ok=True)
@@ -149,9 +151,7 @@ def process_instance(
     state: State = asyncio.run(
         main(
             instruction,
-            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
-                agent_class
-            ),
+            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class),
             sandbox=sandbox,
         )
     )
@@ -215,6 +215,7 @@ def process_instance(
     histories = [
         (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
     ]
+    metrics = state.metrics.get() if state.metrics else None
 
     # Save the output
     output = {
@@ -223,6 +224,7 @@ def process_instance(
         'instruction': instruction,
         'metadata': metadata,
         'history': histories,
+        'metrics': metrics,
         'error': state.error if state and state.error else None,
         'test_result': {
             'agent_answer': agent_answer,

+ 2 - 0
evaluation/bird/run_infer.py

@@ -232,6 +232,7 @@ def process_instance(
     # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
     if state is None:
         raise ValueError('State should not be None.')
+    metrics = state.metrics.get() if state.metrics else None
 
     # Save the output
     output = {
@@ -241,6 +242,7 @@ def process_instance(
         'history': [
             (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
         ],
+        'metrics': metrics,
         'error': state.error if state and state.error else None,
         'test_result': test_result,
     }

+ 2 - 0
evaluation/gaia/run_infer.py

@@ -177,6 +177,7 @@ def process_instance(instance, agent_class, metadata, reset_logger: bool = True)
         'model_answer': model_answer,
         'ground_truth': instance['Final answer'],
     }
+    metrics = state.metrics.get() if state.metrics else None
 
     # Save the output
     output = {
@@ -187,6 +188,7 @@ def process_instance(instance, agent_class, metadata, reset_logger: bool = True)
         'history': [
             (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
         ],
+        'metrics': metrics,
         'error': state.error if state and state.error else None,
         'test_result': test_result,
     }

+ 2 - 0
evaluation/humanevalfix/run_infer.py

@@ -221,6 +221,7 @@ def process_instance(
     # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
     if state is None:
         raise ValueError('State should not be None.')
+    metrics = state.metrics.get() if state.metrics else None
 
     # Save the output
     output = {
@@ -230,6 +231,7 @@ def process_instance(
         'history': [
             (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
         ],
+        'metrics': metrics,
         'error': state.error if state and state.error else None,
         'test_result': test_result,
     }

+ 26 - 20
evaluation/logic_reasoning/run_infer.py

@@ -204,8 +204,8 @@ def process_instance(
     instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
 
     sandbox = DockerSSHBox()
-    exit_code, command_output = sandbox.execute(f'pip install scitools-pyke')
-    
+    exit_code, command_output = sandbox.execute('pip install scitools-pyke')
+
     # Here's how you can run the agent (similar to the `main` function) and get the final task state
     state: State = asyncio.run(
         main(
@@ -230,13 +230,16 @@ def process_instance(
         if str(obs.content) in ["'A'", "'B'", "'C'"]:
             final_message = obs.content
             break
-    
+
     final_message = final_message.strip("'")
-    logger.info(f'Predicted answer: {final_message}, Ground truth: {instance["answer"]}')
+    logger.info(
+        f'Predicted answer: {final_message}, Ground truth: {instance["answer"]}'
+    )
 
     test_result = get_test_result(
         model_answer=final_message, ground_truth=instance['answer']
     )
+    metrics = state.metrics.get() if state.metrics else None
 
     # Save the output
     output = {
@@ -247,6 +250,7 @@ def process_instance(
         'history': [
             (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
         ],
+        'metrics': metrics,
         'final_message': final_message,
         'messages': messages,
         'error': state.error if state and state.error else None,
@@ -254,10 +258,10 @@ def process_instance(
     }
     config.workspace_mount_path = old_workspace_mount_path
     config.workspace_base = old_workspace_base
-    
+
     # Close the sandbox
     sandbox.close()
-    
+
     return output
 
 
@@ -272,7 +276,7 @@ if __name__ == '__main__':
     parser.add_argument(
         '--data_split',
         type=str,
-        help='data split to evaluate on {validation}', # right now we only support validation split
+        help='data split to evaluate on {validation}',  # right now we only support validation split
         default='validation',
     )
 
@@ -313,7 +317,7 @@ if __name__ == '__main__':
         'logic_reasoning',
         agent_class,
         dataset_name,
-        model_name + '_maxiter_' + str(max_iterations) + eval_note
+        model_name + '_maxiter_' + str(max_iterations) + eval_note,
     )
 
     pathlib.Path(eval_output_dir).mkdir(parents=True, exist_ok=True)
@@ -414,23 +418,25 @@ if __name__ == '__main__':
         cleanup()
 
     output_fp.close()
-    
+
     with open(output_file, 'r') as f:
-        test_result = [(json.loads(line))["test_result"]["result"] for line in f]
-            
+        test_result = [(json.loads(line))['test_result']['result'] for line in f]
+
     metadata = {
-        "Dataset": dataset_name,
-        "Data split": data_split,
-        "Number of Samples": len(test_result),
+        'Dataset': dataset_name,
+        'Data split': data_split,
+        'Number of Samples': len(test_result),
         'Agent class': agent_class,
         'Model name': model_name,
         'Start_time': start_time,
-        "End_time": time.strftime('%Y-%m-%d %H:%M:%S'),
-        "Final Accuracy": f"{sum(test_result)/len(test_result):.2f}",
-        }
-    
+        'End_time': time.strftime('%Y-%m-%d %H:%M:%S'),
+        'Final Accuracy': f'{sum(test_result)/len(test_result):.2f}',
+    }
+
     with open(os.path.join(eval_output_dir, 'metadata.json'), 'w') as f:
         json.dump(metadata, f, indent=4)
-        
+
     logger.info(f'Metadata: {json.dumps(metadata, indent=4)}')
-    logger.info(f'Evaluation finished. Metadata saved to {eval_output_dir}/metadata.json')
+    logger.info(
+        f'Evaluation finished. Metadata saved to {eval_output_dir}/metadata.json'
+    )

+ 3 - 0
evaluation/mint/run_infer.py

@@ -172,6 +172,8 @@ def process_instance(
         task_state = state.task_state
         logger.info('Task state: ' + str(task_state.to_dict()))
 
+    metrics = state.metrics.get() if state.metrics else None
+
     # Save the output
     output = {
         'id': instance.task_id,
@@ -181,6 +183,7 @@ def process_instance(
         'history': [
             (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
         ],
+        'metrics': metrics,
         'error': state.error if state and state.error else None,
         'test_result': task_state.success if task_state else False,
     }
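
A possible downstream use of the new field, sketched below: summing per-instance cost from an output.jsonl produced by one of these scripts. The 'accumulated_cost' key is an assumption about the dict returned by Metrics.get() and is not confirmed by this diff; adjust it to the keys actually emitted in your checkout.

import json

def total_cost(output_file: str) -> float:
    # Sum the cost recorded in the new 'metrics' field of each JSONL record.
    # Records without metrics (older runs, or runs without a cost tracker)
    # are skipped.
    total = 0.0
    with open(output_file) as f:
        for line in f:
            metrics = json.loads(line).get('metrics')
            if metrics and 'accumulated_cost' in metrics:
                total += metrics['accumulated_cost']
    return total

if __name__ == '__main__':
    print(f"Total LLM cost: {total_cost('output.jsonl'):.4f}")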