|
|
@@ -158,9 +158,10 @@ def Dataset(data_list_file,
|
|
|
filter_fn = partial(filter, **filter_conf)
|
|
|
dataset = FilterIterDataPipe(dataset, fn=filter_fn)
|
|
|
|
|
|
- vocab = {'vocab': dict, 'seg_dict': seg_dict}
|
|
|
- tokenize_fn = partial(tokenize, **vocab)
|
|
|
- dataset = MapperIterDataPipe(dataset, fn=tokenize_fn)
|
|
|
+ if "text" in data_names:
|
|
|
+ vocab = {'vocab': dict, 'seg_dict': seg_dict}
|
|
|
+ tokenize_fn = partial(tokenize, **vocab)
|
|
|
+ dataset = MapperIterDataPipe(dataset, fn=tokenize_fn)
|
|
|
|
|
|
if shuffle:
|
|
|
buffer_conf = conf.get('shuffle_conf', {})
|