| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879 |
- """You should first perform the following steps:
- 1. Build the docker images. Install SWE-Bench first (https://github.com/princeton-nlp/SWE-bench). Then run:
- ```bash
- export DATASET_NAME=princeton-nlp/SWE-bench_Lite
- export SPLIT=test
- export MAX_WORKERS=4
- export RUN_ID=some-random-ID
- python -m swebench.harness.run_evaluation \
- --dataset_name $DATASET_NAME \
- --split $SPLIT \
- --predictions_path gold \
- --max_workers $MAX_WORKERS \
- --run_id $RUN_ID \
- --cache_level instance
- ```
- 2. Then run this script to push the docker images to the docker hub. Some of the docker images might fail to build in the previous step - start an issue in the SWE-Bench repo for possible fixes.
- To push the docker images for "princeton-nlp/SWE-bench_Lite" test set to the docker hub (e.g., under `docker.io/xingyaoww/`), run:
- ```bash
- EVAL_DOCKER_IMAGE_PREFIX='docker.io/xingyaoww/' python3 evaluation/swe_bench/scripts/docker/push_docker_instance_images.py --dataset princeton-nlp/SWE-bench_Lite --split test
- ```
- """
- import argparse
- import docker
- from datasets import load_dataset
- from tqdm import tqdm
- from openhands.core.logger import openhands_logger as logger
- logger.setLevel('ERROR')
- from evaluation.benchmarks.swe_bench.run_infer import get_instance_docker_image # noqa
- parser = argparse.ArgumentParser()
- parser.add_argument('--dataset', type=str, default='princeton-nlp/SWE-bench_Lite')
- parser.add_argument('--split', type=str, default='test')
- args = parser.parse_args()
- dataset = load_dataset(args.dataset, split=args.split)
- client = docker.from_env()
- pbar = tqdm(total=len(dataset))
- counter = {'success': 0, 'failed': 0}
- failed_instances = []
- for instance in dataset:
- instance_id = instance['instance_id']
- image_name = f'sweb.eval.x86_64.{instance_id}'
- target_image_name = get_instance_docker_image(instance_id)
- print('-' * 100)
- # check if image exists
- try:
- image: docker.models.images.Image = client.images.get(image_name)
- image.tag(target_image_name)
- print(f'Image {image_name} -- tagging to --> {target_image_name}')
- ret_push = client.images.push(target_image_name)
- if isinstance(ret_push, str):
- print(ret_push)
- else:
- for line in ret_push:
- print(line)
- print(f'Image {image_name} -- pushed to --> {target_image_name}')
- counter['success'] += 1
- except docker.errors.ImageNotFound:
- print(f'ERROR: Image {image_name} does not exist')
- counter['failed'] += 1
- failed_instances.append(instance_id)
- finally:
- pbar.update(1)
- pbar.set_postfix(counter)
- print(f'Success: {counter["success"]}, Failed: {counter["failed"]}')
- print('Failed instances IDs:')
- for failed_instance in failed_instances:
- print(failed_instance)
|