#!/bin/bash
# eval_infer.sh
#
# Evaluate a JSONL output file with the SWE-bench docker evaluation scripts.
#
# Usage: ./eval_infer.sh <output_file> [instance_id]
#   <output_file>  JSONL file to evaluate (must be an existing regular file)
#   [instance_id]  optional; when empty the whole file is evaluated,
#                  otherwise only that instance is evaluated
#
# The relative paths below (evaluation/swe_bench/...) are resolved by
# realpath against the current working directory.

PROCESS_FILEPATH=$1
if [[ -z "$PROCESS_FILEPATH" ]]; then
  echo "Error: PROCESS_FILEPATH is empty. Usage: ./eval_infer.sh <output_file>" >&2
  exit 1
fi

if [[ ! -f "$PROCESS_FILEPATH" ]]; then
  echo "Error: $PROCESS_FILEPATH is not a file" >&2
  exit 1
fi

# If instance_id is empty, it means we want to eval on the whole $PROCESS_FILEPATH
# otherwise, we want to eval on the instance_id
INSTANCE_ID=$2
echo "INSTANCE_ID: $INSTANCE_ID"

# Every expansion is quoted so paths containing spaces or glob characters
# are passed through as a single word.
PROCESS_FILEPATH=$(realpath "$PROCESS_FILEPATH")
FILE_DIR=$(dirname "$PROCESS_FILEPATH")
FILE_NAME=$(basename "$PROCESS_FILEPATH")

# Logs and converted predictions live next to the input file.
mkdir -p "$FILE_DIR/logs"
mkdir -p "$FILE_DIR/swe_bench_format"

echo "Evaluating $FILE_NAME @ $FILE_DIR"

# Passed as --namespace to the evaluation scripts further down.
DOCKERHUB_NAMESPACE="xingyaoww"
SWEBENCH_TASKS=$(realpath evaluation/swe_bench/eval_workspace/eval_data/instances/swe-bench-lite-all.json)

# Assign first, export second: `export VAR=$(cmd)` would hide cmd's exit status.
SWEBENCH_DOCKER_FORK_DIR=$(realpath evaluation/swe_bench/eval_workspace/SWE-bench-docker)
export SWEBENCH_DOCKER_FORK_DIR

# ================================================
# detect whether PROCESS_FILEPATH is in OD format or in SWE-bench format
echo "=============================================================="
echo "Detecting whether PROCESS_FILEPATH is in OD format or in SWE-bench format"
echo "=============================================================="
  # SWE-bench format is a JSONL where every line has three fields:
  # model_name_or_path, instance_id, and model_patch.
  #######################################
  # Check whether a JSONL file is in SWE-bench format.
  # Only the first line of the file is inspected.
  # Globals:   PROCESS_FILEPATH (read; used when no argument is given)
  # Arguments: $1 - path of the JSONL file to inspect (optional)
  # Returns:   0 if the first line has all three required fields,
  #            1 otherwise (including an unreadable or empty file)
  #######################################
  is_swebench_format() {
    # Honor the path the caller passes; fall back to the global so a bare
    # call keeps working.
    local jsonl_path=${1:-$PROCESS_FILEPATH}
    local first_line=''

    [[ -r "$jsonl_path" ]] || return 1

    # read returns non-zero when the only line lacks a trailing newline but
    # still fills the variable, so accept any non-empty result.
    IFS= read -r first_line < "$jsonl_path" || [[ -n "$first_line" ]] || return 1

    # jq -e sets its exit status from the filter result; any non-zero status
    # (false, parse error, non-object input) is reported as "not SWE-bench".
    if jq -e 'has("model_name_or_path") and has("instance_id") and has("model_patch")' \
      <<<"$first_line" > /dev/null; then
      return 0
    fi
    return 1
  }
  # Probe the input file and remember the result:
  # 0 means it is already in SWE-bench format, non-zero means it is not.
  is_swebench_format "$PROCESS_FILEPATH"
  IS_SWEBENCH_FORMAT=$?

  if [[ "$IS_SWEBENCH_FORMAT" -eq 0 ]]; then
    echo "The file IS in SWE-bench format."
    SWEBENCH_FORMAT_JSONL=$PROCESS_FILEPATH
  else
    echo "The file IS NOT in SWE-bench format."
    # ==== Convert OD format to SWE-bench format ====
    echo "Merged output file with fine-grained report will be saved to $FILE_DIR"
    # Stop on converter failure; otherwise a stale .swebench.jsonl left by an
    # earlier run would pass the existence check below and be evaluated.
    if ! poetry run python3 evaluation/swe_bench/scripts/eval/convert_od_output_to_swe_json.py "$PROCESS_FILEPATH"; then
      echo "Error: converting $PROCESS_FILEPATH to SWE-bench format failed." >&2
      exit 1
    fi
    # replace .jsonl with .swebench.jsonl in filename
    # (${var/pat/repl} substitutes the first match only; presumably this
    # mirrors the converter's output naming -- confirm against that script)
    SWEBENCH_FORMAT_JSONL=${PROCESS_FILEPATH/.jsonl/.swebench.jsonl}
    echo "SWEBENCH_FORMAT_JSONL: $SWEBENCH_FORMAT_JSONL"
    # assert that the file exists
    if [[ ! -f "$SWEBENCH_FORMAT_JSONL" ]]; then
      echo "Error: $SWEBENCH_FORMAT_JSONL does not exist. There is probably an error in the conversion process." >&2
      exit 1
    fi
    SWEBENCH_FORMAT_JSONL=$(realpath "$SWEBENCH_FORMAT_JSONL")
  fi
  # ================================================
  echo "=============================================================="
  echo "Running SWE-bench evaluation"
  echo "=============================================================="

  # All expansions are quoted so paths or ids containing spaces or glob
  # characters reach the Python scripts as single arguments.
  if [[ -z "$INSTANCE_ID" ]]; then
    # No instance id given: evaluate every prediction in the file.
    echo "Running SWE-bench evaluation on the whole input file..."
    poetry run python "$SWEBENCH_DOCKER_FORK_DIR/run_evaluation.py" \
      --predictions_path "$SWEBENCH_FORMAT_JSONL" \
      --log_dir "$FILE_DIR/logs" \
      --swe_bench_tasks "$SWEBENCH_TASKS" \
      --namespace "$DOCKERHUB_NAMESPACE" \
      --timeout 1800
  else
    # Instance id given: evaluate only that instance.
    echo "Running SWE-bench evaluation on the instance_id: $INSTANCE_ID"
    poetry run python "$SWEBENCH_DOCKER_FORK_DIR/run_single_instance.py" \
      --predictions_path "$SWEBENCH_FORMAT_JSONL" \
      --swe_bench_tasks "$SWEBENCH_TASKS" \
      --namespace "$DOCKERHUB_NAMESPACE" \
      --instance_id "$INSTANCE_ID"
  fi

  # The report step runs after either branch, reading logs from
  # $FILE_DIR/logs and writing its output to $FILE_DIR.
  poetry run python "$SWEBENCH_DOCKER_FORK_DIR/generate_report.py" \
    --predictions_path "$SWEBENCH_FORMAT_JSONL" \
    --log_dir "$FILE_DIR/logs" \
    --output_dir "$FILE_DIR" \
    --swe_bench_tasks "$SWEBENCH_TASKS"