diff --git a/Ordinal/preprocess_dataset_raw2tfidf.py b/Ordinal/preprocess_dataset_raw2tfidf.py index 632546a..d472c3a 100644 --- a/Ordinal/preprocess_dataset_raw2tfidf.py +++ b/Ordinal/preprocess_dataset_raw2tfidf.py @@ -1,5 +1,5 @@ import quapy as qp -from Ordinal.utils import load_simple_sample_raw +from Ordinal.utils import load_simple_sample_raw, load_samples_raw from quapy.data import LabelledCollection from sklearn.feature_extraction.text import TfidfVectorizer from os.path import join @@ -19,6 +19,7 @@ datapath = './data' domain = 'Books' outname = domain + '-tfidf' + def save_preprocessing_info(transformer): with open(join(datapath, outname, 'prep-info.txt'), 'wt') as foo: foo.write(f'{str(transformer)}\n') @@ -30,11 +31,11 @@ os.makedirs(join(datapath, outname, 'app', 'dev_samples'), exist_ok=True) os.makedirs(join(datapath, outname, 'app', 'test_samples'), exist_ok=True) shutil.copyfile(join(datapath, domain, 'app', 'dev_prevalences.txt'), join(datapath, outname, 'app', 'dev_prevalences.txt')) shutil.copyfile(join(datapath, domain, 'app', 'test_prevalences.txt'), join(datapath, outname, 'app', 'test_prevalences.txt')) -os.makedirs(join(datapath, outname, 'npp'), exist_ok=True) -os.makedirs(join(datapath, outname, 'npp', 'dev_samples'), exist_ok=True) -os.makedirs(join(datapath, outname, 'npp', 'test_samples'), exist_ok=True) -shutil.copyfile(join(datapath, domain, 'npp', 'dev_prevalences.txt'), join(datapath, outname, 'npp', 'dev_prevalences.txt')) -shutil.copyfile(join(datapath, domain, 'npp', 'test_prevalences.txt'), join(datapath, outname, 'npp', 'test_prevalences.txt')) +os.makedirs(join(datapath, outname, 'real'), exist_ok=True) +os.makedirs(join(datapath, outname, 'real', 'dev_samples'), exist_ok=True) +os.makedirs(join(datapath, outname, 'real', 'test_samples'), exist_ok=True) +shutil.copyfile(join(datapath, domain, 'real', 'dev_prevalences.txt'), join(datapath, outname, 'real', 'dev_prevalences.txt')) +shutil.copyfile(join(datapath, domain, 'real', 'test_prevalences.txt'), join(datapath, outname, 'real', 'test_prevalences.txt')) tfidf = TfidfVectorizer(sublinear_tf=True, ngram_range=(1,2), min_df=5) @@ -45,16 +46,17 @@ save_preprocessing_info(tfidf) pickle.dump(train, open(join(datapath, outname, 'training_data.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL) + def transform_folder_samples(protocol, splitname): - for i, sample in tqdm(enumerate(load_simple_sample_raw(join(datapath, domain, protocol, splitname), classes=train.classes_))): + for i, sample in tqdm(enumerate(load_samples_raw(join(datapath, domain, protocol, splitname), classes=train.classes_))): sample.instances = tfidf.transform(sample.instances) pickle.dump(sample, open(join(datapath, outname, protocol, splitname, f'{i}.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL) transform_folder_samples('app', 'dev_samples') transform_folder_samples('app', 'test_samples') -transform_folder_samples('npp', 'dev_samples') -transform_folder_samples('npp', 'test_samples') +transform_folder_samples('real', 'dev_samples') +transform_folder_samples('real', 'test_samples') diff --git a/Ordinal/utils.py b/Ordinal/utils.py index fc74962..12f3887 100644 --- a/Ordinal/utils.py +++ b/Ordinal/utils.py @@ -48,8 +48,8 @@ def load_single_sample_pkl(parentdir, filename): # return load_samples_folder(path_dir, filter, load_fn=load_simple_sample_npytxt) -# def load_samples_raw(path_dir, filter=None, classes=None): -# return load_samples_folder(path_dir, filter, load_fn=load_simple_sample_raw, load_fn_kwargs={'classes': classes}) +def load_samples_raw(path_dir, filter=None, classes=None): + return load_samples_folder(path_dir, filter, load_fn=load_simple_sample_raw, classes=classes) # def load_samples_as_csv(path_dir, filter=None):