compare_patch_filename.py 1.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354
  1. """This script compares gold patches with OpenHands-generated patches and checks whether
  2. OpenHands found the right (set of) files to modify.
  3. """
  4. import argparse
  5. import json
  6. import re
  7. def extract_modified_files(patch):
  8. modified_files = set()
  9. file_pattern = re.compile(r'^diff --git a/(.*?) b/')
  10. for line in patch.split('\n'):
  11. match = file_pattern.match(line)
  12. if match:
  13. modified_files.add(match.group(1))
  14. return modified_files
  15. def process_report(oh_output_file):
  16. succ = 0
  17. fail = 0
  18. for line in open(oh_output_file):
  19. line = json.loads(line)
  20. instance_id = line['instance_id']
  21. gold_patch = line['swe_instance']['patch']
  22. generated_patch = line['git_patch']
  23. gold_modified_files = extract_modified_files(gold_patch)
  24. # swe-bench lite only: a gold patch always contains exactly one file
  25. assert len(gold_modified_files) == 1
  26. generated_modified_files = extract_modified_files(generated_patch)
  27. # Check if all files in gold_patch are also in generated_patch
  28. all_files_in_generated = gold_modified_files.issubset(generated_modified_files)
  29. if all_files_in_generated:
  30. succ += 1
  31. else:
  32. fail += 1
  33. print(
  34. f'{instance_id}: file mismatch, gold = {gold_modified_files}, generated = {generated_modified_files}'
  35. )
  36. print(
  37. f'\nSUMMARY: {succ} out of {succ + fail} instances found correct files to edit, success rate = {succ / float(succ + fail)}'
  38. )
  39. if __name__ == '__main__':
  40. parser = argparse.ArgumentParser()
  41. parser.add_argument('--oh_output_file', help='Path to the OH output file')
  42. args = parser.parse_args()
  43. process_report(args.oh_output_file)