forked from moreo/QuaPy
new dataset generated out of real prevalence values of books by products
parent 85abaf2ba2
commit 72c63fff09

@@ -1,14 +1,17 @@
import gzip
import quapy as qp
from Ordinal.utils import load_simple_sample_raw
from quapy.data import LabelledCollection
import quapy.functional as F
import os
from os.path import join
from pathlib import Path
import numpy as np


datadir = '/mnt/1T/Datasets/Amazon/reviews'
outdir = './data/'
real_prev_path = './data/Books-real-prevalence-by-product_votes1_reviews100.csv'
domain = 'Books'
seed = 7
@@ -63,7 +66,6 @@ def gen_samples_APP(pool: LabelledCollection, nsamples, sample_size, outdir, pre
        write_txt_sample(sample, join(outdir, f'{i}.txt'))
        prevfile.write(f'{i},' + ','.join(f'{p:.3f}' for p in sample.prevalence()) + '\n')


def gen_samples_NPP(pool: LabelledCollection, nsamples, sample_size, outdir, prevpath):
    os.makedirs(outdir, exist_ok=True)
    with open(prevpath, 'wt') as prevfile:
@@ -73,37 +75,69 @@ def gen_samples_NPP(pool: LabelledCollection, nsamples, sample_size, outdir, pre
        prevfile.write(f'{i},' + ','.join(f'{p:.3f}' for p in sample.prevalence()) + '\n')


def gen_samples_real_prevalences(real_prevalences, pool: LabelledCollection, sample_size, outdir, prevpath_out):
    os.makedirs(outdir, exist_ok=True)
    with open(prevpath_out, 'wt') as prevfile:
        prevfile.write('id,' + ','.join(f'{c}' for c in pool.classes_) + '\n')
        for i, prev in enumerate(real_prevalences):
            # the last prevalence value is omitted: prevalences sum to 1, so it is implied
            sample = pool.sampling(sample_size, *prev[:-1])
            write_txt_sample(sample, join(outdir, f'{i}.txt'))
            prevfile.write(f'{i},' + ','.join(f'{p:.3f}' for p in sample.prevalence()) + '\n')

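For reference, a minimal usage sketch of the new helper on toy data; everything below is illustrative and not part of this commit (note that pool.sampling accepts n-1 prevalence values, the last one being inferred):

toy_pool = LabelledCollection(['bad', 'poor', 'meh', 'so-so', 'good', 'great'],
                              [1, 1, 2, 2, 3, 3])
toy_prevs = np.asarray([[0.50, 0.25, 0.25],
                        [0.25, 0.25, 0.50]])  # one row per sample; rows sum to 1
gen_samples_real_prevalences(toy_prevs, toy_pool, sample_size=4,
                             outdir='./toy_samples', prevpath_out='./toy_prevalences.csv')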
fullpath = join(datadir, domain) + '.txt.gz'

data = LabelledCollection.load(fullpath, from_gz_text)
print(len(data))
print(data.classes_)
print(data.prevalence())
# fullpath = join(datadir, domain) + '.txt.gz'
#
# data = LabelledCollection.load(fullpath, from_gz_text)
# print(len(data))
# print(data.classes_)
# print(data.prevalence())

with qp.util.temp_seed(seed):
    train, rest = data.split_stratified(train_prop=tr_size)

    devel, test = rest.split_stratified(train_prop=0.5)
    print(len(train))
    print(len(devel))
    print(len(test))

    # train, rest = data.split_stratified(train_prop=tr_size)
    #
    # devel, test = rest.split_stratified(train_prop=0.5)
    # print(len(train))
    # print(len(devel))
    # print(len(test))
    #
    domaindir = join(outdir, domain)

    write_txt_sample(train, join(domaindir, 'training_data.txt'))
    write_txt_sample(devel, join(domaindir, 'development_data.txt'))
    write_txt_sample(test, join(domaindir, 'test_data.txt'))
    # write_txt_sample(train, join(domaindir, 'training_data.txt'))
    # write_txt_sample(devel, join(domaindir, 'development_data.txt'))
    # write_txt_sample(test, join(domaindir, 'test_data.txt'))

    gen_samples_APP(devel, nsamples=nval, sample_size=val_size, outdir=join(domaindir, 'app', 'dev_samples'),
                    prevpath=join(domaindir, 'app', 'dev_prevalences.txt'))
    gen_samples_APP(test, nsamples=nte, sample_size=te_size, outdir=join(domaindir, 'app', 'test_samples'),
                    prevpath=join(domaindir, 'app', 'test_prevalences.txt'))
    # this part is to be used when the partitions have already been created, in order to avoid re-generating them
    train = load_simple_sample_raw(domaindir, 'training_data')
    devel = load_simple_sample_raw(domaindir, 'development_data')
    test = load_simple_sample_raw(domaindir, 'test_data')

    gen_samples_NPP(devel, nsamples=nval, sample_size=val_size, outdir=join(domaindir, 'npp', 'dev_samples'),
                    prevpath=join(domaindir, 'npp', 'dev_prevalences.txt'))
    gen_samples_NPP(test, nsamples=nte, sample_size=te_size, outdir=join(domaindir, 'npp', 'test_samples'),
                    prevpath=join(domaindir, 'npp', 'test_prevalences.txt'))
    # gen_samples_APP(devel, nsamples=nval, sample_size=val_size, outdir=join(domaindir, 'app', 'dev_samples'),
    #                 prevpath=join(domaindir, 'app', 'dev_prevalences.txt'))
    # gen_samples_APP(test, nsamples=nte, sample_size=te_size, outdir=join(domaindir, 'app', 'test_samples'),
    #                 prevpath=join(domaindir, 'app', 'test_prevalences.txt'))

    # gen_samples_NPP(devel, nsamples=nval, sample_size=val_size, outdir=join(domaindir, 'npp', 'dev_samples'),
    #                 prevpath=join(domaindir, 'npp', 'dev_prevalences.txt'))
    # gen_samples_NPP(test, nsamples=nte, sample_size=te_size, outdir=join(domaindir, 'npp', 'test_samples'),
    #                 prevpath=join(domaindir, 'npp', 'test_prevalences.txt'))


    # this part generates samples based on real prevalences (in this case, prevalences of sets of book reviews
    # grouped by product). It loads the real prevalences (computed elsewhere), randomly extracts 5000 rows for
    # test and 1000 for val (disjoint), and then carries out the samplings

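To make the idea concrete: each product contributes one prevalence vector computed from its own reviews. Assuming the usual 1-5 star labels of these Amazon reviews (an assumption; the label set is not shown in this hunk), a product whose reviews are rated [5, 5, 4, 3, 5] would contribute the vector derived below:

stars = np.array([5, 5, 4, 3, 5])               # ratings of one product's reviews (illustrative)
prev = np.array([(stars == s).mean() for s in range(1, 6)])  # -> [0., 0., 0.2, 0.2, 0.6]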
    assert os.path.exists(real_prev_path), 'real prevalence file does not seem to exist...'
    real_prevalences = np.genfromtxt(real_prev_path, delimiter='\t')  # note: tab-separated, despite the .csv extension

    nrows = real_prevalences.shape[0]
    rand_sel = np.random.permutation(nrows)  # two non-overlapping slices of one permutation keep val and test disjoint
    real_prevalences_val = real_prevalences[rand_sel[:nval]]
    real_prevalences_te = real_prevalences[rand_sel[nval:nval + nte]]

    gen_samples_real_prevalences(real_prevalences_val, devel, sample_size=val_size, outdir=join(domaindir, 'real', 'dev_samples'),
                                 prevpath_out=join(domaindir, 'real', 'dev_prevalences.txt'))
    gen_samples_real_prevalences(real_prevalences_te, test, sample_size=te_size, outdir=join(domaindir, 'real', 'test_samples'),
                                 prevpath_out=join(domaindir, 'real', 'test_prevalences.txt'))
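If the generated prevalence files need to be read back later, a sketch like the following would do (the variable name is illustrative; the slice drops the leading id column written by gen_samples_real_prevalences):

dev_prevs = np.genfromtxt(join(domaindir, 'real', 'dev_prevalences.txt'),
                          delimiter=',', skip_header=1)[:, 1:]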

@@ -89,6 +89,7 @@ def get_best_checkpoint(checkpointdir):
        steps.append(step)
    assert len(steps) <= 2, 'unexpected number of steps, only two were expected (the best one and the last one)'
    # min(steps) picks the earlier checkpoint, i.e., the best one rather than the last one
    chosen = f'{checkpointdir}/checkpoint-{min(steps)}'
    print(f'chosen checkpoint is {chosen}')
    return chosen

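The loop that fills steps lies outside this hunk; with HuggingFace-style checkpoint folders (checkpoint-<step>) it would plausibly look like the sketch below, a hypothetical reconstruction rather than code from this commit:

for dirname in os.listdir(checkpointdir):
    if dirname.startswith('checkpoint-'):
        steps.append(int(dirname.split('-')[1]))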
@@ -97,7 +98,7 @@ if __name__ == '__main__':
    assert torch.cuda.is_available(), 'cuda is not available'

    # checkpoint = 'roberta-base-val-finetuned'
    # generation_mode = 'posteriors'
    # generation_mode = 'ave'

    n_args = len(sys.argv)
    assert n_args == 3, 'wrong arguments, expected: <checkpoint> <generation-mode>\n' \
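Given the assert above, the script expects exactly two arguments; an illustrative invocation (the script name is hypothetical, the checkpoint name is taken from the commented-out line, and 'posteriors' is one of the modes accepted below):

python generate.py roberta-base-val-finetuned posteriors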
@@ -114,7 +115,7 @@ if __name__ == '__main__':
    datapath = './data'
    domain = 'Books'
    protocols = ['app']  # ['app', 'npp']
    protocols = ['real']  # ['app', 'npp']

    assert generation_mode in ['last', 'average', 'posteriors'], 'unknown generation_mode'
    outname = domain + f'-{checkpoint}-{generation_mode}'