diff --git a/Ordinal/build_Amazon_datasets.py b/Ordinal/build_Amazon_datasets.py
index 533f7e4..cf29eb9 100644
--- a/Ordinal/build_Amazon_datasets.py
+++ b/Ordinal/build_Amazon_datasets.py
@@ -1,14 +1,17 @@
 import gzip
 import quapy as qp
+from Ordinal.utils import load_simple_sample_raw
 from quapy.data import LabelledCollection
 import quapy.functional as F
 import os
 from os.path import join
 from pathlib import Path
+import numpy as np
 
 datadir = '/mnt/1T/Datasets/Amazon/reviews'
 outdir = './data/'
+real_prev_path = './data/Books-real-prevalence-by-product_votes1_reviews100.csv'
 domain = 'Books'
 
 seed = 7
 
@@ -63,7 +66,6 @@ def gen_samples_APP(pool: LabelledCollection, nsamples, sample_size, outdir, pre
         write_txt_sample(sample, join(outdir, f'{i}.txt'))
         prevfile.write(f'{i},' + ','.join(f'{p:.3f}' for p in sample.prevalence()) + '\n')
 
-
 def gen_samples_NPP(pool: LabelledCollection, nsamples, sample_size, outdir, prevpath):
     os.makedirs(outdir, exist_ok=True)
     with open(prevpath, 'wt') as prevfile:
@@ -73,37 +75,69 @@ def gen_samples_NPP(pool: LabelledCollection, nsamples, sample_size, outdir, pre
             prevfile.write(f'{i},' + ','.join(f'{p:.3f}' for p in sample.prevalence()) + '\n')
 
 
+def gen_samples_real_prevalences(real_prevalences, pool: LabelledCollection, sample_size, outdir, prevpath_out):
+    os.makedirs(outdir, exist_ok=True)
+    with open(prevpath_out, 'wt') as prevfile:
+        prevfile.write('id,' + ','.join(f'{c}' for c in pool.classes_) + '\n')
+        for i, prev in enumerate(real_prevalences):
+            sample = pool.sampling(sample_size, *prev[:-1])
+            write_txt_sample(sample, join(outdir, f'{i}.txt'))
+            prevfile.write(f'{i},' + ','.join(f'{p:.3f}' for p in sample.prevalence()) + '\n')
 
-fullpath = join(datadir,domain)+'.txt.gz'
-
-data = LabelledCollection.load(fullpath, from_gz_text)
-print(len(data))
-print(data.classes_)
-print(data.prevalence())
+# fullpath = join(datadir,domain)+'.txt.gz'
+#
+# data = LabelledCollection.load(fullpath, from_gz_text)
+# print(len(data))
+# print(data.classes_)
+# print(data.prevalence())
 
 with qp.util.temp_seed(seed):
-    train, rest = data.split_stratified(train_prop=tr_size)
-
-    devel, test = rest.split_stratified(train_prop=0.5)
-    print(len(train))
-    print(len(devel))
-    print(len(test))
-
+    # train, rest = data.split_stratified(train_prop=tr_size)
+    #
+    # devel, test = rest.split_stratified(train_prop=0.5)
+    # print(len(train))
+    # print(len(devel))
+    # print(len(test))
+    #
     domaindir = join(outdir, domain)
 
-    write_txt_sample(train, join(domaindir, 'training_data.txt'))
-    write_txt_sample(devel, join(domaindir, 'development_data.txt'))
-    write_txt_sample(test, join(domaindir, 'test_data.txt'))
+    # write_txt_sample(train, join(domaindir, 'training_data.txt'))
+    # write_txt_sample(devel, join(domaindir, 'development_data.txt'))
+    # write_txt_sample(test, join(domaindir, 'test_data.txt'))
 
-    gen_samples_APP(devel, nsamples=nval, sample_size=val_size, outdir=join(domaindir, 'app', 'dev_samples'),
-                    prevpath=join(domaindir, 'app', 'dev_prevalences.txt'))
-    gen_samples_APP(test, nsamples=nte, sample_size=te_size, outdir=join(domaindir, 'app', 'test_samples'),
-                    prevpath=join(domaindir, 'app', 'test_prevalences.txt'))
+    # This part is used once the partitions have already been created, in order to avoid re-generating them
+    train = load_simple_sample_raw(domaindir, 'training_data')
+    devel = load_simple_sample_raw(domaindir, 'development_data')
+    test = load_simple_sample_raw(domaindir, 'test_data')
 
-    gen_samples_NPP(devel, nsamples=nval, sample_size=val_size, outdir=join(domaindir, 'npp', 'dev_samples'),
-                    prevpath=join(domaindir, 'npp', 'dev_prevalences.txt'))
-    gen_samples_NPP(test, nsamples=nte, sample_size=te_size, outdir=join(domaindir, 'npp', 'test_samples'),
-                    prevpath=join(domaindir, 'npp', 'test_prevalences.txt'))
+    # gen_samples_APP(devel, nsamples=nval, sample_size=val_size, outdir=join(domaindir, 'app', 'dev_samples'),
+    #                 prevpath=join(domaindir, 'app', 'dev_prevalences.txt'))
+    # gen_samples_APP(test, nsamples=nte, sample_size=te_size, outdir=join(domaindir, 'app', 'test_samples'),
+    #                 prevpath=join(domaindir, 'app', 'test_prevalences.txt'))
+
+    # gen_samples_NPP(devel, nsamples=nval, sample_size=val_size, outdir=join(domaindir, 'npp', 'dev_samples'),
+    #                 prevpath=join(domaindir, 'npp', 'dev_prevalences.txt'))
+    # gen_samples_NPP(test, nsamples=nte, sample_size=te_size, outdir=join(domaindir, 'npp', 'test_samples'),
+    #                 prevpath=join(domaindir, 'npp', 'test_prevalences.txt'))
+
+    # This part generates samples based on real prevalences (in this case, prevalences of sets of book reviews
+    # grouped by product). It loads the real prevalences (computed elsewhere), randomly extracts 5000 rows for
+    # test and 1000 for validation (disjoint), and then performs the samplings.
+
+    assert os.path.exists(real_prev_path), f'real prevalence file {real_prev_path} does not exist'
+    real_prevalences = np.genfromtxt(real_prev_path, delimiter='\t')
+
+    nrows = real_prevalences.shape[0]
+    rand_sel = np.random.permutation(nrows)
+    real_prevalences_val = real_prevalences[rand_sel[:nval]]
+    real_prevalences_te = real_prevalences[rand_sel[nval:nval+nte]]
+
+    gen_samples_real_prevalences(real_prevalences_val, devel, sample_size=val_size, outdir=join(domaindir, 'real', 'dev_samples'),
+                                 prevpath_out=join(domaindir, 'real', 'dev_prevalences.txt'))
+    gen_samples_real_prevalences(real_prevalences_te, test, sample_size=te_size, outdir=join(domaindir, 'real', 'test_samples'),
+                                 prevpath_out=join(domaindir, 'real', 'test_prevalences.txt'))
diff --git a/Ordinal/generate_bert_vectors_npytxt.py b/Ordinal/generate_bert_vectors_npytxt.py
index f58d5ae..2e83ae4 100644
--- a/Ordinal/generate_bert_vectors_npytxt.py
+++ b/Ordinal/generate_bert_vectors_npytxt.py
@@ -89,6 +89,7 @@ def get_best_checkpoint(checkpointdir):
         steps.append(step)
     assert len(steps) <= 2, 'unexpected number of steps, only two where expected (the best one and the last one)'
     choosen = f'{checkpointdir}/checkpoint-{min(steps)}'
+    print(f'chosen checkpoint is {choosen}')
     return choosen
 
 
@@ -97,7 +98,7 @@ if __name__ == '__main__':
     assert torch.cuda.is_available(), 'cuda is not available'
 
     #checkpoint='roberta-base-val-finetuned'
-    #generation_mode = 'posteriors'
+    #generation_mode = 'average'
 
     n_args = len(sys.argv)
     assert n_args==3, 'wrong arguments, expected: \n' \
@@ -114,7 +115,7 @@ if __name__ == '__main__':
 
     datapath = './data'
     domain = 'Books'
-    protocols = ['app']  # ['app', 'npp']
+    protocols = ['real']  # ['app', 'npp']
     assert generation_mode in ['last', 'average', 'posteriors'], 'unknown generation_model'
 
     outname = domain + f'-{checkpoint}-{generation_mode}'
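Note on the new 'real' protocol: gen_samples_real_prevalences relies on drawing a sample whose class proportions match a given prevalence vector (via quapy's pool.sampling(sample_size, *prev[:-1]); the last component is dropped, presumably because it is implied by the others summing to 1). Below is a minimal standalone sketch of that sampling technique in plain numpy, using mock 1-5 star labels rather than the actual review data; all names and sizes here are illustrative and not part of the patch.

    import numpy as np

    def sample_at_prevalence(labels, size, prevalence, rng):
        """Return `size` indices drawn from `labels` whose class proportions
        approximately match `prevalence` (one value per class, summing to 1)."""
        classes = np.unique(labels)
        counts = np.floor(np.asarray(prevalence) * size).astype(int)
        counts[-1] += size - counts.sum()  # absorb rounding error in the last class
        picked = [rng.choice(np.flatnonzero(labels == c), size=n, replace=True)
                  for c, n in zip(classes, counts)]
        return rng.permutation(np.concatenate(picked))

    rng = np.random.default_rng(7)
    labels = rng.integers(1, 6, size=10_000)  # mock 1..5 star ratings
    idx = sample_at_prevalence(labels, 1000, [0.1, 0.2, 0.4, 0.2, 0.1], rng)
    print(np.bincount(labels[idx], minlength=6)[1:] / len(idx))  # ~ the target prevalence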