1 anno fa · 0144caaf1f
--- a/evaluation/swe_bench/README.md
+++ b/evaluation/swe_bench/README.md
@@ -28,7 +28,9 @@ When the `run_infer.sh` script is started, it will automatically pull the releva
 
				 
			
 
				 ```bash
			
 
				 ./evaluation/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split]
			
 
				-# e.g., ./evaluation/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 300 30 1 princeton-nlp/SWE-bench_Lite test
			
 
				+
			
 
				+# Example
			
 
				+./evaluation/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 300 30 1 princeton-nlp/SWE-bench_Lite test
			
 
				 ```
			
 
				 
			
 
				 where `model_config` is mandatory, and the rest are optional.
			
@@ -68,10 +70,11 @@ then your command would be:
 
				 This is in limited beta. Contact Xingyao over slack if you want to try this out!
			
 
				 
			
 
				 ```bash
			
 
				-# ./evaluation/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split]
			
 
				+./evaluation/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split]
			
 
				+
			
 
				+# Example - This runs evaluation on CodeActAgent for 300 instances on the "princeton-nlp/SWE-bench_Lite" test set, with a maximum of 30 iterations per instance and 16 workers running in parallel
			
 
				 ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" \
			
 
				 ./evaluation/swe_bench/scripts/run_infer.sh llm.eval HEAD CodeActAgent 300 30 16 "princeton-nlp/SWE-bench_Lite" test
			
 
				-# This example runs evaluation on CodeActAgent for 300 instances on "princeton-nlp/SWE-bench_Lite"'s test set, with max 30 iteration per instances, with 16 number of workers running in parallel
			
 
				 ```
			
 
				 
			
 
				 To clean up all existing runtimes you've already started, run:
			
@@ -133,8 +136,9 @@ NOTE, you should have already pulled the instance-level OR env-level docker imag
 
				 Then you can run the following:
			
 
				 
			
 
				 ```bash
			
 
				-# ./evaluation/swe_bench/scripts/eval_infer.sh $YOUR_OUTPUT_JSONL [instance_id] [dataset_name] [split]
			
 
				-# For example:
			
 
				+./evaluation/swe_bench/scripts/eval_infer.sh $YOUR_OUTPUT_JSONL [instance_id] [dataset_name] [split]
			
 
				+
			
 
				+# Example
			
 
				 ./evaluation/swe_bench/scripts/eval_infer.sh evaluation/evaluation_outputs/outputs/swe_bench/CodeActAgent/gpt-4-1106-preview_maxiter_50_N_v1.0/output.jsonl
			
 
				 ```
			
 
				 
			
@@ -162,10 +166,11 @@ The final results will be saved to `evaluation/evaluation_outputs/outputs/swe_be
 
				 This is in limited beta. Contact Xingyao over slack if you want to try this out!
			
 
				 
			
 
				 ```bash
			
 
				-# ./evaluation/swe_bench/scripts/eval_infer_remote.sh [output.jsonl filepath] [num_workers]
			
 
				+./evaluation/swe_bench/scripts/eval_infer_remote.sh [output.jsonl filepath] [num_workers]
			
 
				+
			
 
				+# Example - This evaluates patches generated by CodeActAgent with Llama-3.1-70B-Instruct-Turbo on the "princeton-nlp/SWE-bench_Lite" test set, with 16 workers running in parallel
			
 
				 ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" \
			
 
				-evaluation/swe_bench/scripts/eval_infer_remote.sh evaluation/evaluation_outputs/outputs/swe_bench_lite/CodeActAgent/Llama-3.1-70B-Instruct-Turbo_maxiter_30_N_v1.9-no-hint/output.jsonl 16 "princeton-nlp/SWE-bench_Lite" "test"
			
 
				-# This example evaluate patches generated by CodeActAgent on Llama-3.1-70B-Instruct-Turbo on "princeton-nlp/SWE-bench_Lite"'s test set, with 16 number of workers running in parallel
			
 
				+evaluation/swe_bench/scripts/eval_infer_remote.sh evaluation/evaluation_outputs/outputs/swe-bench-lite/CodeActAgent/Llama-3.1-70B-Instruct-Turbo_maxiter_30_N_v1.9-no-hint/output.jsonl 16 "princeton-nlp/SWE-bench_Lite" "test"
			
 
				 ```
			
 
				 
			
 
				 To clean up all existing runtimes that you've already started, run: