# summarise_results.py
  1. import json
  2. import pprint
  3. import sys
  4. def extract_test_results(res_file_path: str) -> tuple[list[str], list[str]]:
  5. passed = []
  6. failed = []
  7. costs = []
  8. instance_ids = set()
  9. instances = []
  10. with open(res_file_path, 'r') as file:
  11. for line in file:
  12. data = json.loads(line.strip())
  13. success = data['metrics']['success']
  14. if data['instance_id'] in instance_ids:
  15. print(f'WARNING: Duplicate instance_id found: {data["instance_id"]}')
  16. continue
  17. instance_ids.add(data['instance_id'])
  18. instances.append(data)
  19. if success:
  20. passed.append(
  21. {
  22. 'instance_id': data['instance_id'],
  23. 'repo': data['repo'],
  24. 'instruction': data['instruction'],
  25. 'eval_script': data['eval_script'],
  26. 'eval_exit_code': data['eval_exit_code'],
  27. 'eval_output': data['eval_output'],
  28. 'accumulated_cost': data['metrics']['accumulated_cost'],
  29. }
  30. )
  31. else:
  32. failed.append(
  33. {
  34. 'instance_id': data['instance_id'],
  35. 'repo': data['repo'],
  36. 'instruction': data['instruction'],
  37. 'eval_script': data['eval_script'],
  38. 'eval_exit_code': data['eval_exit_code'],
  39. 'eval_output': data['eval_output'],
  40. 'accumulated_cost': data['metrics']['accumulated_cost'],
  41. }
  42. )
  43. costs.append(data['metrics']['accumulated_cost'])
  44. # sort by instance_id
  45. instances.sort(key=lambda x: x['instance_id'])
  46. with open(res_file_path, 'w') as file:
  47. for instance in instances:
  48. file.write(json.dumps(instance) + '\n')
  49. return passed, failed, costs
  50. if __name__ == '__main__':
  51. if len(sys.argv) != 2:
  52. print(
  53. 'Usage: poetry run python summarise_results.py <path_to_output_jsonl_file>'
  54. )
  55. sys.exit(1)
  56. json_file_path = sys.argv[1]
  57. passed_tests, failed_tests, costs = extract_test_results(json_file_path)
  58. success_rate = len(passed_tests) / (len(passed_tests) + len(failed_tests))
  59. print('PASSED TESTS:')
  60. pprint.pprint(passed_tests)
  61. print('FAILED TESTS:')
  62. pprint.pprint(failed_tests)
  63. print(
  64. f'\nPassed {len(passed_tests)} tests, failed {len(failed_tests)} tests, success rate = {success_rate}, average cost = {sum(costs) / len(costs)}'
  65. )