import quapy as qp
from quapy.method.aggregative import PACC
from quapy.data import LabelledCollection, Dataset
from quapy.protocol import ArtificialPrevalenceProtocol
import quapy.functional as F
import os
from os.path import join

# While quapy comes with ready-to-use datasets for experimental purposes, you may prefer to run experiments using
# your own data. Most of quapy's functionality relies on an internal class called LabelledCollection, which allows
# for fast indexing and sampling, so this example provides guidance on how to convert your own datasets into a
# LabelledCollection so that all this functionality becomes available. This includes procedures for tuning the
# hyperparameters of your methods, evaluating performance using high-level sampling protocols, etc.

# Let us assume that we have a binary sentiment dataset of opinions in natural language. We will use the "IMDb"
# dataset of reviews, which can be downloaded as follows:
URL_TRAIN = 'https://zenodo.org/record/4117827/files/imdb_train.txt'
URL_TEST = 'https://zenodo.org/record/4117827/files/imdb_test.txt'
os.makedirs('./reviews', exist_ok=True)
train_path = join('reviews', 'imdb_train.txt')
test_path = join('reviews', 'imdb_test.txt')
qp.util.download_file_if_not_exists(URL_TRAIN, train_path)
qp.util.download_file_if_not_exists(URL_TEST, test_path)

# These files contain two columns separated by a \t:
# the first one is a binary label (0=negative, 1=positive), and the second one is the text.
# All we need to do is implement a function returning the instances and the labels, as follows
def my_data_loader(path):
    with open(path, 'rt') as fin:
        # split on the first \t only, in case any text contains further tabs; skip empty lines
        labels, texts = zip(*[line.strip().split('\t', maxsplit=1) for line in fin if line.strip()])
        labels = list(map(int, labels))  # convert label strings to int
    return texts, labels

# check that our function is working properly...
train_texts, train_labels = my_data_loader(train_path)
for i, (text, label) in enumerate(zip(train_texts, train_labels)):
    print(f'#{i}: {label=}\t{text=}')
    if i >= 5:
        print('...')
        break

# We can now instantiate a LabelledCollection simply as
train_lc = LabelledCollection(instances=train_texts, labels=train_labels)
print('my training collection:', train_lc)

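# Once wrapped in a LabelledCollection, the data can be inspected through a number of attributes and
# methods; a quick illustrative check (these are standard LabelledCollection utilities):
print('classes:', train_lc.classes_)         # the array of class labels, e.g., [0 1]
print('size:', len(train_lc))                # total number of instances
print('counts:', train_lc.counts())          # number of instances per class
print('prevalence:', train_lc.prevalence())  # class proportions (summing up to 1)
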
# Alternatively, we can instantiate a LabelledCollection directly using the data loader function,
# without having to load the data ourselves:
train_lc = LabelledCollection.load(train_path, loader_func=my_data_loader)
print('my training collection:', train_lc)

# We can do the same for the test set, or we can instead directly instantiate a Dataset object (which is,
# by and large, simply a tuple of training and test LabelledCollections) as follows:
my_data = Dataset.load(train_path, test_path, loader_func=my_data_loader)
print('my dataset:', my_data)

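# A quick check (illustrative only): the training and test members of a Dataset are themselves
# LabelledCollections, so everything shown above applies to each of them
print('training set:', my_data.training)
print('test set:', my_data.test)
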
# However, since this is a textual dataset, we must vectorize it prior to training any quantification algorithm.
# We can do this in several ways in quapy. For example, manually...
# from sklearn.feature_extraction.text import TfidfVectorizer
# tfidf = TfidfVectorizer(min_df=5)
# Xtr = tfidf.fit_transform(my_data.training.instances)
# Xte = tfidf.transform(my_data.test.instances)
# ... or using quapy's preprocessing functionality (recommended):
my_data_tfidf = qp.data.preprocessing.text2tfidf(my_data, min_df=5)

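# After vectorization, the instances are no longer raw strings but rows of a (sparse) tf-idf matrix;
# a quick sanity check (illustrative only):
print('training matrix shape:', my_data_tfidf.training.instances.shape)
print('test matrix shape:', my_data_tfidf.test.instances.shape)
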
training, test = my_data_tfidf.train_test

# Once you have loaded your training and test data, you have access to a series of quapy's utilities, e.g.:
print(f'the training prevalence is {F.strprev(training.prevalence())}')
print(f'the test prevalence is {F.strprev(test.prevalence())}')
print('let us generate a small balanced training sample:')
desired_size = 200
desired_prevalence = [0.5, 0.5]
small_training_balanced = training.sampling(desired_size, *desired_prevalence, shuffle=True, random_state=0)
print(small_training_balanced)
print(f'or generate train/val splits, such as: {training.split_stratified(train_prop=0.7)}')

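# If the same sample must be drawn from different representations of the data (e.g., to compare two
# methods on exactly the same documents), the sampling can be split in two steps: first generate the
# indexes, then materialize the sample from them. A minimal sketch, assuming the index-based API:
sample_index = training.sampling_index(desired_size, *desired_prevalence, random_state=0)
same_sample = training.sampling_from_index(sample_index)
print(same_sample)
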
# training
print('let us train a simple quantifier...')
Xtr, ytr = training.Xy
quantifier = PACC()
quantifier.fit(Xtr, ytr)  # or, equivalently: quantifier.fit(*training.Xy)

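# The introduction mentioned hyperparameter tuning. quapy offers GridSearchQ (in qp.model_selection),
# which explores a parameter grid by evaluating each configuration with a sampling protocol over a
# held-out validation split. The following is only a sketch: the grid values are arbitrary, and the
# exact fit signature may differ across quapy versions (older versions expect a LabelledCollection):
from sklearn.linear_model import LogisticRegression

train_split, val_split = training.split_stratified(train_prop=0.7)
tuned = qp.model_selection.GridSearchQ(
    model=PACC(classifier=LogisticRegression()),
    param_grid={'classifier__C': [0.01, 0.1, 1, 10, 100]},  # hyperparameters of the underlying classifier
    protocol=ArtificialPrevalenceProtocol(val_split, sample_size=200, random_state=0),
    error=qp.error.mae
)
tuned.fit(*train_split.Xy)
print(f'best hyperparameters: {tuned.best_params_}')
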
# test
print("and use quapy's evaluation functions")
evaluation_protocol = ArtificialPrevalenceProtocol(
    data=test,
    sample_size=200,
    random_state=0
)

report = qp.evaluation.evaluation_report(quantifier, protocol=evaluation_protocol, error_metrics=['ae'])
print(report)
print(f'mean absolute error across {len(report)} experiments: {report["ae"].mean():.4f}')

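# If a single aggregate score suffices instead of a full report, qp.evaluation also provides an
# evaluate function; a minimal sketch, assuming the error metric can be passed by name:
mae = qp.evaluation.evaluate(quantifier, protocol=evaluation_protocol, error_metric='mae')
print(f'MAE = {mae:.4f}')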