#!/bin/bash
# eval_infer.sh
# Usage: ./eval_infer.sh <output_file> [instance_id] [dataset_name] [split]
  # ---- Argument handling -------------------------------------------------
  # $1  output file to evaluate (required)
  # $2  instance id (optional)
  # $3  dataset name (optional, defaults to princeton-nlp/SWE-bench_Lite)
  # $4  dataset split (optional, defaults to test)
  PROCESS_FILEPATH=$1
  if [ -z "$PROCESS_FILEPATH" ]; then
    echo "Error: PROCESS_FILEPATH is empty. Usage: ./eval_infer.sh <output_file> [instance_id] [dataset_name] [split]"
    exit 1
  fi

  # Quoted on purpose: an unquoted path containing spaces makes `[` fail with
  # "too many arguments", which silently skips this guard.
  if [ ! -f "$PROCESS_FILEPATH" ]; then
    echo "Error: $PROCESS_FILEPATH is not a file"
    exit 1
  fi

  # If instance_id is empty, it means we want to eval on the whole $PROCESS_FILEPATH
  # otherwise, we want to eval on the instance_id
  INSTANCE_ID=$2
  DATASET_NAME=${3:-"princeton-nlp/SWE-bench_Lite"}
  SPLIT=${4:-"test"}

  echo "INSTANCE_ID: $INSTANCE_ID"
  echo "DATASET_NAME: $DATASET_NAME"
  echo "SPLIT: $SPLIT"

  # Resolve to an absolute path once, then derive directory and file name from it.
  PROCESS_FILEPATH=$(realpath "$PROCESS_FILEPATH")
  FILE_DIR=$(dirname "$PROCESS_FILEPATH")
  FILE_NAME=$(basename "$PROCESS_FILEPATH")

  echo "Evaluating $FILE_NAME @ $FILE_DIR"
  # ================================================
  # Announce the format-detection phase: is PROCESS_FILEPATH in OH format
  # or in SWE-bench format?
  printf '%s\n' \
    "==============================================================" \
    "Detecting whether PROCESS_FILEPATH is in OH format or in SWE-bench format" \
    "=============================================================="
  #######################################
  # Report whether a predictions file is already in SWE-bench format.
  #
  # SWE-bench format is a JSONL where every line has three fields:
  # model_name_or_path, instance_id, and model_patch. Only the FIRST line of
  # the file is inspected.
  #
  # Globals:   PROCESS_FILEPATH (read; used only when no argument is given)
  # Arguments: $1 - path of the file to inspect (optional)
  # Returns:   0 if the first line has all three fields, 1 otherwise
  #            (including when the file is unreadable or jq fails)
  #######################################
  function is_swebench_format() {
    # Honour the path the caller passes; fall back to the global so that
    # argument-less calls keep working.
    local target_file="${1:-$PROCESS_FILEPATH}"
    local first_line=""

    # `read` returns non-zero when the only line lacks a trailing newline but
    # still fills the variable, so its exit status is deliberately not checked.
    IFS= read -r first_line < "$target_file"

    # jq -e maps the boolean result onto the exit status (true -> 0).
    if printf '%s\n' "$first_line" \
      | jq -e '. | has("model_name_or_path") and has("instance_id") and has("model_patch")' > /dev/null; then
      return 0 # the first line has the required fields
    fi
    return 1 # the first line does not have the required fields
  }
  # Decide which file is handed to the SWE-bench harness. If the input is
  # already in SWE-bench format it is used directly; otherwise it is converted
  # first and the converted file is used.
  is_swebench_format "$PROCESS_FILEPATH"
  IS_SWEBENCH_FORMAT=$?

  if [ "$IS_SWEBENCH_FORMAT" -eq 0 ]; then
    echo "The file IS in SWE-bench format."
    SWEBENCH_FORMAT_JSONL=$PROCESS_FILEPATH
  else
    echo "The file IS NOT in SWE-bench format."

    # ==== Convert OH format to SWE-bench format ====
    echo "Merged output file with fine-grained report will be saved to $FILE_DIR"
    # Path is quoted so that spaces or glob characters reach the converter as
    # a single argument.
    poetry run python3 evaluation/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py "$PROCESS_FILEPATH"

    # replace .jsonl with .swebench.jsonl in filename
    # NOTE(review): this rewrites the FIRST ".jsonl" found anywhere in the
    # path, not just the suffix; presumably it mirrors the converter's own
    # output naming -- confirm before tightening it.
    SWEBENCH_FORMAT_JSONL=${PROCESS_FILEPATH/.jsonl/.swebench.jsonl}
    echo "SWEBENCH_FORMAT_JSONL: $SWEBENCH_FORMAT_JSONL"

    # assert that the file exists
    if [ ! -f "$SWEBENCH_FORMAT_JSONL" ]; then
      echo "Error: $SWEBENCH_FORMAT_JSONL does not exist. There is probably an error in the conversion process."
      exit 1
    fi
    SWEBENCH_FORMAT_JSONL=$(realpath "$SWEBENCH_FORMAT_JSONL")
  fi
  # ================================================
  echo "=============================================================="
  echo "Running SWE-bench evaluation"
  echo "=============================================================="

  # The timestamp-based run id also names the harness log directory and the
  # report file that are collected below.
  RUN_ID=$(date +"%Y%m%d_%H%M%S")
  N_PROCESS=16

  # Harness invocation shared by both modes, kept in an array so every
  # argument stays a single word even when a path contains spaces.
  # Default to SWE-Bench-lite
  # change `--dataset_name` and `--split` to alter dataset
  RUN_EVALUATION_CMD=(
    poetry run python -m swebench.harness.run_evaluation
    --dataset_name "$DATASET_NAME"
    --split "$SPLIT"
    --predictions_path "$SWEBENCH_FORMAT_JSONL"
    --timeout 1800
    --cache_level instance
    --max_workers "$N_PROCESS"
    --run_id "$RUN_ID"
  )

  if [ -z "$INSTANCE_ID" ]; then
    echo "Running SWE-bench evaluation on the whole input file..."
    "${RUN_EVALUATION_CMD[@]}"

    # get the "model_name_or_path" from the first line of the SWEBENCH_FORMAT_JSONL
    MODEL_NAME_OR_PATH=$(jq -r '.model_name_or_path' "$SWEBENCH_FORMAT_JSONL" | head -n 1)
    echo "MODEL_NAME_OR_PATH: $MODEL_NAME_OR_PATH"

    RESULT_OUTPUT_DIR=$(dirname "$SWEBENCH_FORMAT_JSONL")
    echo "RESULT_OUTPUT_DIR: $RESULT_OUTPUT_DIR"

    # move the eval results to the target directory
    mkdir -p "$RESULT_OUTPUT_DIR"

    # rm eval_outputs directory if it exists
    if [ -d "$RESULT_OUTPUT_DIR/eval_outputs" ]; then
      # ${VAR:?} aborts instead of expanding to "/eval_outputs" if the
      # variable were ever empty.
      rm -rf "${RESULT_OUTPUT_DIR:?}/eval_outputs"
    fi

    mv "logs/run_evaluation/$RUN_ID/$MODEL_NAME_OR_PATH" "$RESULT_OUTPUT_DIR"
    mv "$RESULT_OUTPUT_DIR/$MODEL_NAME_OR_PATH" "$RESULT_OUTPUT_DIR/eval_outputs"
    echo "RUN_ID: $RUN_ID" > "$RESULT_OUTPUT_DIR/run_id.txt"

    # move report file
    REPORT_PATH=$MODEL_NAME_OR_PATH.$RUN_ID.json
    if [ -f "$REPORT_PATH" ]; then
      # check if $RESULT_OUTPUT_DIR/report.json exists
      if [ -f "$RESULT_OUTPUT_DIR/report.json" ]; then
        echo "Report file $RESULT_OUTPUT_DIR/report.json already exists. Overwriting..."
        # Keep exactly one backup of the previous report.
        if [ -f "$RESULT_OUTPUT_DIR/report.json.bak" ]; then
          rm "$RESULT_OUTPUT_DIR/report.json.bak"
        fi
        mv "$RESULT_OUTPUT_DIR/report.json" "$RESULT_OUTPUT_DIR/report.json.bak"
      fi
      mv "$REPORT_PATH" "$RESULT_OUTPUT_DIR/report.json"
    fi

    poetry run python evaluation/swe_bench/scripts/eval/update_output_with_eval.py "$PROCESS_FILEPATH"
  else
    echo "Running SWE-bench evaluation on the instance_id: $INSTANCE_ID"
    # INSTANCE_ID is left unquoted, as before, so a space-separated list
    # passed as one argument still expands into several ids.
    # NOTE(review): presumably intended -- confirm against how callers pass $2.
    # shellcheck disable=SC2086
    "${RUN_EVALUATION_CMD[@]}" --instance_ids $INSTANCE_ID
  fi