| 123456789101112131415161718192021222324252627 |
import argparse
from pathlib import Path

import pandas as pd
from datasets import load_dataset
def parse_args(argv=None) -> argparse.Namespace:
    """Parse the command-line arguments of the gold-patch download script.

    Args:
        argv: Argument strings to parse. ``None`` (the default) lets argparse
            read them from ``sys.argv[1:]``.

    Returns:
        A namespace with ``output_filepath``, ``dataset_name`` and ``split``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('output_filepath', type=str, help='Path to save the output file')
    parser.add_argument(
        '--dataset_name',
        type=str,
        help='Name of the dataset to download',
        default='princeton-nlp/SWE-bench_Lite',
    )
    parser.add_argument('--split', type=str, help='Split to download', default='test')
    return parser.parse_args(argv)


def extract_gold_patches(dataset) -> list:
    """Build one output record per dataset row.

    Args:
        dataset: Iterable of rows that support ``row['instance_id']`` and
            ``row['patch']``.

    Returns:
        A list of dicts with the keys ``instance_id`` and ``model_patch``;
        ``model_patch`` holds the row's ``patch`` value.
    """
    return [
        {'instance_id': row['instance_id'], 'model_patch': row['patch']} for row in dataset
    ]


def save_patches(patches, output_filepath: str) -> None:
    """Write the patch records to ``output_filepath``, one JSON object per line.

    Args:
        patches: Records as produced by ``extract_gold_patches``.
        output_filepath: Destination path of the JSON Lines file.
    """
    # Create the target directory up front; otherwise a missing directory makes
    # to_json fail only after the whole dataset has been downloaded.
    Path(output_filepath).parent.mkdir(parents=True, exist_ok=True)
    pd.DataFrame(patches).to_json(output_filepath, lines=True, orient='records')


def main(argv=None) -> None:
    """Download a dataset split and save its gold patches as JSON Lines.

    Args:
        argv: Optional argument list forwarded to ``parse_args``; ``None``
            means "use the process command line".
    """
    args = parse_args(argv)
    output_filepath = args.output_filepath

    # Announce the download before it starts, not after it has finished.
    print(
        f'Downloading gold patches from {args.dataset_name} (split: {args.split}) to {output_filepath}'
    )
    dataset = load_dataset(args.dataset_name, split=args.split)

    patches = extract_gold_patches(dataset)
    print(f'{len(patches)} gold patches loaded')

    save_patches(patches, output_filepath)
    print(f'Patches saved to {output_filepath}')


# Guarded so that importing this module does not parse argv or hit the network.
if __name__ == '__main__':
    main()
|