forked from moreo/QuaPy
script for genearting the datasets
This commit is contained in:
parent
5e1d59687f
commit
c63325e364
|
@ -0,0 +1,116 @@
|
|||
import gzip
|
||||
import quapy as qp
|
||||
from quapy.data import LabelledCollection
|
||||
import quapy.functional as F
|
||||
import os
|
||||
from os.path import join
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
datadir = '/mnt/1T/Datasets/Amazon/reviews'
|
||||
outdir = './data/'
|
||||
domain = 'Books'
|
||||
seed = 7
|
||||
|
||||
tr_size = 20000
|
||||
val_size = 1000
|
||||
te_size = 1000
|
||||
nval = 1000
|
||||
nte = 5000
|
||||
|
||||
# domain = 'Gift_Cards'
|
||||
# tr_size = 200
|
||||
# val_size = 100
|
||||
# te_size = 100
|
||||
# nval = 20
|
||||
# nte = 40
|
||||
|
||||
|
||||
def from_gz_text(path, encoding='utf-8', class2int=True):
|
||||
"""
|
||||
Reads a labelled colletion of documents.
|
||||
File fomart <0-4>\t<document>\n
|
||||
|
||||
:param path: path to the labelled collection
|
||||
:param encoding: the text encoding used to open the file
|
||||
:return: a list of sentences, and a list of labels
|
||||
"""
|
||||
all_sentences, all_labels = [], []
|
||||
file = gzip.open(path, 'rt', encoding=encoding).readlines()
|
||||
for line in file:
|
||||
line = line.strip()
|
||||
if line:
|
||||
try:
|
||||
label, sentence = line.split('\t')
|
||||
sentence = sentence.strip()
|
||||
if class2int:
|
||||
label = int(label) - 1
|
||||
if label >= 0:
|
||||
if sentence:
|
||||
all_sentences.append(sentence)
|
||||
all_labels.append(label)
|
||||
except ValueError:
|
||||
print(f'format error in {line}')
|
||||
return all_sentences, all_labels
|
||||
|
||||
|
||||
def write_txt_sample(sample: LabelledCollection, path):
|
||||
os.makedirs(Path(path).parent, exist_ok=True)
|
||||
with open(path, 'wt') as foo:
|
||||
for document, label in zip(*sample.Xy):
|
||||
foo.write(f'{label}\t{document}\n')
|
||||
|
||||
|
||||
def gen_samples_APP(pool: LabelledCollection, nsamples, sample_size, outdir, prevpath):
|
||||
os.makedirs(outdir, exist_ok=True)
|
||||
with open(prevpath, 'wt') as prevfile:
|
||||
prevfile.write('id,' + ','.join(f'{c}' for c in pool.classes_) + '\n')
|
||||
for i, prev in enumerate(F.uniform_simplex_sampling(n_classes=pool.n_classes, size=nsamples)):
|
||||
sample = pool.sampling(sample_size, *prev)
|
||||
write_txt_sample(sample, join(outdir, f'{i}.txt'))
|
||||
prevfile.write(f'{i},' + ','.join(f'{p:.3f}' for p in sample.prevalence()) + '\n')
|
||||
|
||||
|
||||
def gen_samples_NPP(pool: LabelledCollection, nsamples, sample_size, outdir, prevpath):
|
||||
os.makedirs(outdir, exist_ok=True)
|
||||
with open(prevpath, 'wt') as prevfile:
|
||||
prevfile.write('id,' + ','.join(f'{c}' for c in pool.classes_) + '\n')
|
||||
for i, sample in enumerate(pool.natural_sampling_generator(sample_size, repeats=nsamples)):
|
||||
write_txt_sample(sample, join(outdir, f'{i}.txt'))
|
||||
prevfile.write(f'{i},' + ','.join(f'{p:.3f}' for p in sample.prevalence()) + '\n')
|
||||
|
||||
|
||||
|
||||
fullpath = join(datadir,domain)+'.txt.gz'
|
||||
|
||||
data = LabelledCollection.load(fullpath, from_gz_text)
|
||||
print(len(data))
|
||||
print(data.classes_)
|
||||
print(data.prevalence())
|
||||
|
||||
with qp.util.temp_seed(seed):
|
||||
train, rest = data.split_stratified(train_prop=tr_size)
|
||||
|
||||
devel, test = rest.split_stratified(train_prop=0.5)
|
||||
print(len(train))
|
||||
print(len(devel))
|
||||
print(len(test))
|
||||
|
||||
domaindir = join(outdir, domain)
|
||||
|
||||
write_txt_sample(train, join(domaindir, 'training_data.txt'))
|
||||
write_txt_sample(devel, join(domaindir, 'development_data.txt'))
|
||||
write_txt_sample(test, join(domaindir, 'test_data.txt'))
|
||||
|
||||
gen_samples_APP(devel, nsamples=nval, sample_size=val_size, outdir=join(domaindir, 'app', 'dev_samples'),
|
||||
prevpath=join(domaindir, 'app', 'dev_prevalences.txt'))
|
||||
gen_samples_APP(test, nsamples=nte, sample_size=te_size, outdir=join(domaindir, 'app', 'test_samples'),
|
||||
prevpath=join(domaindir, 'app', 'test_prevalences.txt'))
|
||||
|
||||
gen_samples_NPP(devel, nsamples=nval, sample_size=val_size, outdir=join(domaindir, 'npp', 'dev_samples'),
|
||||
prevpath=join(domaindir, 'npp', 'dev_prevalences.txt'))
|
||||
gen_samples_NPP(test, nsamples=nte, sample_size=te_size, outdir=join(domaindir, 'npp', 'test_samples'),
|
||||
prevpath=join(domaindir, 'npp', 'test_prevalences.txt'))
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue