eval_infer.sh 5.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135
  1. #!/bin/bash
  2. PROCESS_FILEPATH=$1
  3. if [ -z "$PROCESS_FILEPATH" ]; then
  4. echo "Error: PROCESS_FILEPATH is empty. Usage: ./eval_infer.sh <output_file>"
  5. exit 1
  6. fi
  7. if [ ! -f $PROCESS_FILEPATH ]; then
  8. echo "Error: $PROCESS_FILEPATH is not a file"
  9. exit 1
  10. fi
  11. # If instance_id is empty, it means we want to eval on the whole $PROCESS_FILEPATH
  12. # otherwise, we want to eval on the instance_id
  13. INSTANCE_ID=$2
  14. echo "INSTANCE_ID: $INSTANCE_ID"
  15. PROCESS_FILEPATH=$(realpath $PROCESS_FILEPATH)
  16. FILE_DIR=$(dirname $PROCESS_FILEPATH)
  17. FILE_NAME=$(basename $PROCESS_FILEPATH)
  18. echo "Evaluating $FILE_NAME @ $FILE_DIR"
  19. DOCKERHUB_NAMESPACE="xingyaoww"
  20. SWEBENCH_TASKS=$(realpath evaluation/swe_bench/eval_workspace/eval_data/instances/swe-bench-lite-all.json)
  21. export SWEBENCH_DOCKER_FORK_DIR=$(realpath evaluation/swe_bench/eval_workspace/SWE-bench-docker)
  22. # ================================================
  23. # detect whether PROCESS_FILEPATH is in OD format or in SWE-bench format
  24. echo "=============================================================="
  25. echo "Detecting whether PROCESS_FILEPATH is in OD format or in SWE-bench format"
  26. echo "=============================================================="
  27. # SWE-bench format is a JSONL where every line has three fields: model_name_or_path, instance_id, and model_patch
  28. function is_swebench_format() {
  29. # Read the first line of the file
  30. read -r first_line < "$PROCESS_FILEPATH"
  31. # Use jq to check if the first line has the required fields
  32. echo "$first_line" | jq -e '. | has("model_name_or_path") and has("instance_id") and has("model_patch")' > /dev/null
  33. if [ $? -ne 0 ]; then
  34. return 1 # Return 1 if the first line does not have the required fields
  35. fi
  36. return 0 # Return 0 if the first line has the required fields
  37. }
  38. # Call the function with the file path
  39. is_swebench_format "$PROCESS_FILEPATH"
  40. IS_SWEBENCH_FORMAT=$?
  41. # Use the result in an if-else statement
  42. if [ $IS_SWEBENCH_FORMAT -eq 0 ]; then
  43. echo "The file IS in SWE-bench format."
  44. SWEBENCH_FORMAT_JSONL=$PROCESS_FILEPATH
  45. else
  46. echo "The file IS NOT in SWE-bench format."
  47. # ==== Convert OD format to SWE-bench format ====
  48. echo "Merged output file with fine-grained report will be saved to $FILE_DIR"
  49. poetry run python3 evaluation/swe_bench/scripts/eval/convert_od_output_to_swe_json.py $PROCESS_FILEPATH
  50. # replace .jsonl with .swebench.jsonl in filename
  51. SWEBENCH_FORMAT_JSONL=${PROCESS_FILEPATH/.jsonl/.swebench.jsonl}
  52. echo "SWEBENCH_FORMAT_JSONL: $SWEBENCH_FORMAT_JSONL"
  53. # assert that the file exists
  54. if [ ! -f $SWEBENCH_FORMAT_JSONL ]; then
  55. echo "Error: $SWEBENCH_FORMAT_JSONL does not exist. There is probably an error in the conversion process."
  56. exit 1
  57. fi
  58. SWEBENCH_FORMAT_JSONL=$(realpath $SWEBENCH_FORMAT_JSONL)
  59. fi
  60. # ================================================
  61. echo "=============================================================="
  62. echo "Running SWE-bench evaluation"
  63. echo "=============================================================="
  64. RUN_ID=$(date +"%Y%m%d_%H%M%S")
  65. N_PROCESS=16
  66. if [ -z "$INSTANCE_ID" ]; then
  67. echo "Running SWE-bench evaluation on the whole input file..."
  68. # Default to SWE-Bench-lite
  69. # change `--dataset_name` and `--split` to alter dataset
  70. poetry run python -m swebench.harness.run_evaluation \
  71. --predictions_path $SWEBENCH_FORMAT_JSONL \
  72. --timeout 1800 \
  73. --cache_level instance \
  74. --max_workers $N_PROCESS \
  75. --run_id $RUN_ID
  76. # get the "model_name_or_path" from the first line of the SWEBENCH_FORMAT_JSONL
  77. MODEL_NAME_OR_PATH=$(jq -r '.model_name_or_path' $SWEBENCH_FORMAT_JSONL | head -n 1)
  78. echo "MODEL_NAME_OR_PATH: $MODEL_NAME_OR_PATH"
  79. RESULT_OUTPUT_DIR=$(dirname $SWEBENCH_FORMAT_JSONL)
  80. echo "RESULT_OUTPUT_DIR: $RESULT_OUTPUT_DIR"
  81. # move the eval results to the target directory
  82. mkdir -p $RESULT_OUTPUT_DIR
  83. # rm eval_outputs directory if it exists
  84. if [ -d $RESULT_OUTPUT_DIR/eval_outputs ]; then
  85. rm -rf $RESULT_OUTPUT_DIR/eval_outputs
  86. fi
  87. mv run_instance_logs/$RUN_ID/$MODEL_NAME_OR_PATH $RESULT_OUTPUT_DIR
  88. mv $RESULT_OUTPUT_DIR/$MODEL_NAME_OR_PATH $RESULT_OUTPUT_DIR/eval_outputs
  89. echo "RUN_ID: $RUN_ID" > $RESULT_OUTPUT_DIR/run_id.txt
  90. # move report file
  91. REPORT_PATH=$MODEL_NAME_OR_PATH.$RUN_ID.json
  92. if [ -f $REPORT_PATH ]; then
  93. # check if $RESULT_OUTPUT_DIR/report.json exists
  94. if [ -f $RESULT_OUTPUT_DIR/report.json ]; then
  95. echo "Report file $RESULT_OUTPUT_DIR/report.json already exists. Overwriting..."
  96. if [ -f $RESULT_OUTPUT_DIR/report.json.bak ]; then
  97. rm $RESULT_OUTPUT_DIR/report.json.bak
  98. fi
  99. mv $RESULT_OUTPUT_DIR/report.json $RESULT_OUTPUT_DIR/report.json.bak
  100. fi
  101. mv $REPORT_PATH $RESULT_OUTPUT_DIR/report.json
  102. fi
  103. poetry run python evaluation/swe_bench/scripts/eval/update_output_with_eval.py $PROCESS_FILEPATH
  104. else
  105. echo "Running SWE-bench evaluation on the instance_id: $INSTANCE_ID"
  106. poetry run python -m swebench.harness.run_evaluation \
  107. --predictions_path $SWEBENCH_FORMAT_JSONL \
  108. --timeout 1800 \
  109. --instance_ids $INSTANCE_ID \
  110. --cache_level instance \
  111. --max_workers $N_PROCESS \
  112. --run_id $RUN_ID
  113. fi