# push_docker_instance_images.py
  1. """You should first perform the following steps:
  2. 1. Build the docker images. Install SWE-Bench first (https://github.com/princeton-nlp/SWE-bench). Then run:
  3. ```bash
  4. export DATASET_NAME=princeton-nlp/SWE-bench_Lite
  5. export SPLIT=test
  6. export MAX_WORKERS=4
  7. export RUN_ID=some-random-ID
  8. python -m swebench.harness.run_evaluation \
  9. --dataset_name $DATASET_NAME \
  10. --split $SPLIT \
  11. --predictions_path gold \
  12. --max_workers $MAX_WORKERS \
  13. --run_id $RUN_ID \
  14. --cache_level instance
  15. ```
  16. 2. Then run this script to push the docker images to the docker hub. Some of the docker images might fail to build in the previous step - start an issue in the SWE-Bench repo for possible fixes.
  17. To push the docker images for "princeton-nlp/SWE-bench_Lite" test set to the docker hub (e.g., under `docker.io/xingyaoww/`), run:
  18. ```bash
  19. EVAL_DOCKER_IMAGE_PREFIX='docker.io/xingyaoww/' python3 evaluation/swe_bench/scripts/docker/push_docker_instance_images.py --dataset princeton-nlp/SWE-bench_Lite --split test
  20. ```
  21. """
  22. import argparse
  23. import docker
  24. from datasets import load_dataset
  25. from tqdm import tqdm
  26. from openhands.core.logger import openhands_logger as logger
  27. logger.setLevel('ERROR')
  28. from evaluation.benchmarks.swe_bench.run_infer import get_instance_docker_image # noqa
  29. parser = argparse.ArgumentParser()
  30. parser.add_argument('--dataset', type=str, default='princeton-nlp/SWE-bench_Lite')
  31. parser.add_argument('--split', type=str, default='test')
  32. args = parser.parse_args()
  33. dataset = load_dataset(args.dataset, split=args.split)
  34. client = docker.from_env()
  35. pbar = tqdm(total=len(dataset))
  36. counter = {'success': 0, 'failed': 0}
  37. failed_instances = []
  38. for instance in dataset:
  39. instance_id = instance['instance_id']
  40. image_name = f'sweb.eval.x86_64.{instance_id}'
  41. target_image_name = get_instance_docker_image(instance_id)
  42. print('-' * 100)
  43. # check if image exists
  44. try:
  45. image: docker.models.images.Image = client.images.get(image_name)
  46. image.tag(target_image_name)
  47. print(f'Image {image_name} -- tagging to --> {target_image_name}')
  48. ret_push = client.images.push(target_image_name)
  49. if isinstance(ret_push, str):
  50. print(ret_push)
  51. else:
  52. for line in ret_push:
  53. print(line)
  54. print(f'Image {image_name} -- pushed to --> {target_image_name}')
  55. counter['success'] += 1
  56. except docker.errors.ImageNotFound:
  57. print(f'ERROR: Image {image_name} does not exist')
  58. counter['failed'] += 1
  59. failed_instances.append(instance_id)
  60. finally:
  61. pbar.update(1)
  62. pbar.set_postfix(counter)
  63. print(f'Success: {counter["success"]}, Failed: {counter["failed"]}')
  64. print('Failed instances IDs:')
  65. for failed_instance in failed_instances:
  66. print(failed_instance)