|
|
@@ -125,7 +125,7 @@ def process_instance(instance, agent_class, metadata, reset_logger: bool = True)
|
|
|
)
|
|
|
)
|
|
|
# ======= Attempt to evaluate the agent's edits =======
|
|
|
- # If you are working on simplier benchmark that only evaluates the final model output (e.g., in a MessageAction)
|
|
|
+ # If you are working on a simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
|
|
|
# You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
|
|
|
|
|
|
if state is None:
|
|
|
@@ -235,7 +235,7 @@ if __name__ == '__main__':
|
|
|
'max_iterations': max_iterations,
|
|
|
'eval_output_dir': eval_output_dir,
|
|
|
'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
|
|
|
- # get the commit id of current repo for reproduciblity
|
|
|
+ # get the commit id of current repo for reproducibility
|
|
|
'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
|
|
|
.decode('utf-8')
|
|
|
.strip(),
|