forked from moreo/QuaPy
exploring multilabel quantification
This commit is contained in:
parent
ce908573e7
commit
1b20bf14ea
|
@ -0,0 +1,223 @@
|
||||||
|
from copy import deepcopy
|
||||||
|
|
||||||
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
|
from sklearn.linear_model import LogisticRegression
|
||||||
|
from sklearn.metrics import f1_score
|
||||||
|
from sklearn.multiclass import OneVsRestClassifier
|
||||||
|
from sklearn.svm import LinearSVC
|
||||||
|
|
||||||
|
import quapy as qp
|
||||||
|
from functional import artificial_prevalence_sampling
|
||||||
|
from method.aggregative import PACC, CC, EMQ
|
||||||
|
from method.base import BaseQuantifier
|
||||||
|
from quapy.data import from_rcv2_lang_file, LabelledCollection, MultilingualLabelledCollection
|
||||||
|
from sklearn.model_selection import train_test_split
|
||||||
|
from sklearn.preprocessing import MultiLabelBinarizer
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
class MultilabelledCollection:
    """
    A collection of instances in which each instance may belong to several categories.

    Labels are held as a 2-dimensional matrix with one row per instance and one column
    per category; `classes_` enumerates the column positions (0 .. n_categories-1).
    Every category is treated as a binary problem (column value 0 = negative, non-zero
    counted as positive by `prevalence` and `counts`).
    """

    def __init__(self, instances, labels):
        """
        :param instances: indexable container of instances exposing `.shape[0]`
        :param labels: 2-dimensional label matrix aligned row-wise with `instances`
        """
        assert labels.ndim==2, 'data does not seem to be multilabel'
        self.instances = instances
        self.labels = labels
        self.classes_ = np.arange(labels.shape[1])

    @classmethod
    def load(cls, path: str, loader_func: callable):
        """
        Builds a collection from whatever `loader_func(path)` returns, which must unpack
        into the `(instances, labels)` arguments of the constructor.
        """
        # instantiate through `cls` so that subclasses get instances of their own type
        return cls(*loader_func(path))

    def __len__(self):
        """Number of instances in the collection."""
        return self.instances.shape[0]

    def prevalence(self):
        """
        Per-category prevalence.

        :return: array of shape `(n_classes, 2)`; column 0 is the negative prevalence and
            column 1 the positive prevalence of each category
        """
        pos = self.labels.mean(axis=0)
        neg = 1-pos
        return np.asarray([neg, pos]).T

    def counts(self):
        """Column-wise sum of the label matrix (number of positives per category)."""
        return self.labels.sum(axis=0)

    @property
    def n_classes(self):
        """Number of categories (columns of the label matrix)."""
        return len(self.classes_)

    @property
    def binary(self):
        """Always False: a multilabel collection is never regarded as a binary one."""
        return False

    def __gen_index(self):
        """Positional index 0 .. len(self)-1 of the instances."""
        return np.arange(len(self))

    def sampling_multi_index(self, size, cat, prev=None):
        """
        Draws an index of `size` positions, optionally controlling the prevalence of one category.

        :param size: number of positions to draw
        :param cat: column of the label matrix whose prevalence is to be controlled
        :param prev: desired positive prevalence for `cat`; if None, positions are drawn
            uniformly at random (with replacement only when `size` exceeds the collection)
        :return: array of positions usable with `sampling_from_index`
        """
        if prev is None:  # no prevalence was indicated; returns an index for uniform sampling
            return np.random.choice(len(self), size, replace=size>len(self))
        # The auxiliary single-label collection pairs every position with the *label* of
        # category `cat` (not a feature column of the instances), so that the index drawn
        # by LabelledCollection.sampling_index is stratified on that category.
        aux = LabelledCollection(self.__gen_index(), self.labels[:,cat])
        return aux.sampling_index(size, *[1-prev, prev])

    def uniform_sampling_multi_index(self, size):
        """Draws `size` positions uniformly at random (with replacement only if `size > len(self)`)."""
        return np.random.choice(len(self), size, replace=size>len(self))

    def uniform_sampling(self, size):
        """Returns a new collection of `size` instances drawn uniformly at random."""
        unif_index = self.uniform_sampling_multi_index(size)
        return self.sampling_from_index(unif_index)

    def sampling(self, size, category, prev=None):
        """
        Returns a new collection of `size` instances; see `sampling_multi_index` for the
        meaning of `category` and `prev`.
        """
        prev_index = self.sampling_multi_index(size, category, prev)
        return self.sampling_from_index(prev_index)

    def sampling_from_index(self, index):
        """Returns the sub-collection made of the instances (and label rows) selected by `index`."""
        documents = self.instances[index]
        labels = self.labels[index, :]
        return MultilabelledCollection(documents, labels)

    def train_test_split(self, train_prop=0.6, random_state=None):
        """
        Splits the collection in two by delegating to sklearn's `train_test_split`.

        :param train_prop: forwarded as `train_size`
        :param random_state: forwarded as `random_state`
        :return: tuple `(training collection, test collection)`
        """
        # inside the method body the bare name resolves to the module-level sklearn function
        tr_docs, te_docs, tr_labels, te_labels = \
            train_test_split(self.instances, self.labels, train_size=train_prop, random_state=random_state)
        return MultilabelledCollection(tr_docs, tr_labels), MultilabelledCollection(te_docs, te_labels)

    def artificial_sampling_generator(self, sample_size, category, n_prevalences=101, repeats=1):
        """
        Yields one sample of `sample_size` instances for every prevalence vector produced by
        `artificial_prevalence_sampling(2, n_prevalences, repeats)`; the second component of
        each vector is used as the positive prevalence requested for `category`.
        """
        dimensions = 2  # each category is a binary (negative/positive) problem
        for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats):
            yield self.sampling(sample_size, category, prevs[1])

    def artificial_sampling_index_generator(self, sample_size, category, n_prevalences=101, repeats=1):
        """Same as `artificial_sampling_generator`, but yields the sampling indexes instead of the samples."""
        dimensions = 2  # each category is a binary (negative/positive) problem
        for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats):
            yield self.sampling_multi_index(sample_size, category, prevs[1])

    def natural_sampling_generator(self, sample_size, repeats=100):
        """Yields `repeats` samples of `sample_size` instances, each drawn uniformly at random."""
        for _ in range(repeats):
            yield self.uniform_sampling(sample_size)

    def natural_sampling_index_generator(self, sample_size, repeats=100):
        """Yields `repeats` uniformly drawn indexes of `sample_size` positions."""
        for _ in range(repeats):
            yield self.uniform_sampling_multi_index(sample_size)

    def asLabelledCollection(self, category):
        """Returns a single-label `LabelledCollection` whose labels are the column `category`."""
        return LabelledCollection(self.instances, self.labels[:,category])

    def genLabelledCollections(self):
        """Yields, for every category in `classes_`, its single-label view (see `asLabelledCollection`)."""
        for c in self.classes_:
            yield self.asLabelledCollection(c)

    @property
    def Xy(self):
        """The pair `(instances, labels)`."""
        return self.instances, self.labels
|
||||||
|
|
||||||
|
|
||||||
|
class MultilabelQuantifier:
    """
    Multilabel quantification by independent binary quantifiers: one copy of a base
    quantifier is trained per category, and each copy estimates the positive prevalence
    of its own category.
    """

    def __init__(self, q:BaseQuantifier):
        """
        :param q: the base quantifier; it is deep-copied once per category when fitting
        """
        self.q = q
        self.estimators = {}

    def fit(self, data:MultilabelledCollection):
        """
        Trains one copy of the base quantifier on each single-label view generated by
        `data.genLabelledCollections()`.

        :param data: the multilabel training collection
        :return: self
        """
        self.classes_ = data.classes_
        for position, binary_view in enumerate(data.genLabelledCollections()):
            quantifier = deepcopy(self.q)
            # keep whatever `fit` returns, keyed by the position of the category
            self.estimators[position] = quantifier.fit(binary_view)
        return self

    def quantify(self, instances):
        """
        Estimates the prevalence of every category for the given instances.

        :param instances: passed unchanged to the `quantify` method of each per-category estimator
        :return: array of shape `(n_categories, 2)` with the negative prevalence in column 0
            and the positive prevalence in column 1
        """
        positives = np.zeros(len(self.classes_), dtype=float)
        for label in self.classes_:
            estimate = self.estimators[label].quantify(instances)
            positives[label] = estimate[1]  # component 1 is taken as the positive prevalence
        return np.column_stack((1 - positives, positives))
|
||||||
|
|
||||||
|
|
||||||
|
class MultilabelCC:
    """
    Classify & Count for multilabel data followed by a linear correction.

    A per-category CC quantifier (LinearSVC as the classifier) is trained on one part of
    the data; on samples drawn from the held-out part, a matrix `W` is then learnt that
    maps the vector of estimated positive prevalences onto the vector of true positive
    prevalences, so that the estimate of one category can be corrected using the
    estimates of all the others.
    """

    def __init__(self):
        self.estimator = MultilabelQuantifier(CC(LinearSVC()))

    def fit(self, data: "MultilabelledCollection"):
        """
        Trains the per-category quantifiers and the correction matrix `W`.

        :param data: the multilabel training collection; it is split with its own
            `train_test_split()` (default proportions) into a part used to train the
            quantifiers and a part used to learn the correction
        :return: self
        """
        self.classes_ = data.classes_
        tr, te = data.train_test_split()
        self.estimator.fit(tr)

        Xs = []  # estimated positive prevalences, one row per sample
        ys = []  # true positive prevalences, one row per sample
        for sample in te.natural_sampling_generator(sample_size=200, repeats=100):
            ys.append(sample.prevalence()[:,1])
            Xs.append(self.estimator.quantify(sample.instances)[:,1])
        Xs = np.asarray(Xs)
        ys = np.asarray(ys)

        # Xs has one row per sample and one column per category, so it is not square in
        # general and the system Xs·W = ys is over-determined: np.linalg.solve would raise
        # LinAlgError here; the least-squares solution is what is needed.
        self.W, *_ = np.linalg.lstsq(Xs, ys, rcond=None)
        return self

    def quantify(self, instances):
        """
        Estimates the prevalence of every category and applies the linear correction.

        :param instances: passed unchanged to the underlying per-category quantifier
        :return: array of shape `(n_categories, 2)` with the negative prevalence in column 0
            and the corrected positive prevalence in column 1
        """
        pred = self.estimator.quantify(instances)[:,1].reshape(1,-1)
        adjusted = pred.dot(self.W)
        adjusted = adjusted.flatten()
        # NOTE: the linear correction is not constrained, so values are not clipped to [0,1]
        neg_prevs = 1-adjusted
        return np.asarray([neg_prevs, adjusted]).T
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# Experiment script: trains the multilabel quantifier on the 10 most populated RCV
# categories and reports the mean absolute error of the estimated prevalences.

# read documents
path = './crosslingual_data/rcv12/en.small.txt'
docs, cats = from_rcv2_lang_file(path)

# split train-test
tr_docs, te_docs, tr_cats, te_cats = train_test_split(docs, cats, test_size=0.2, random_state=42)

# generate Y matrices (the categories of each document are a space-separated string)
mlb = MultiLabelBinarizer()
ytr = mlb.fit_transform([doc_cats.split(' ') for doc_cats in tr_cats])
yte = mlb.transform([doc_cats.split(' ') for doc_cats in te_cats])

# retain 10 most populated categories (as counted in the training set)
most_populated = np.argsort(ytr.sum(axis=0))[-10:]
ytr = ytr[:,most_populated]
yte = yte[:,most_populated]

# the vocabulary is learnt from the training documents only
tfidf = TfidfVectorizer(min_df=5)
Xtr = tfidf.fit_transform(tr_docs)
Xte = tfidf.transform(te_docs)

train = MultilabelledCollection(Xtr, ytr)
test = MultilabelledCollection(Xte, yte)

# Alternative experiments, kept for reference (uncomment one block to run it):
#
# print(train.counts())
# print(train.prevalence())
#
# model = MultilabelQuantifier(PACC(LogisticRegression()))
# model.fit(train)
# estim_prevs = model.quantify(test.instances)
# true_prevs = test.prevalence()
# print('PACC:')
# print(estim_prevs)
# print(true_prevs)
#
# model = MultilabelQuantifier(CC(LogisticRegression()))
# model.fit(train)
# estim_prevs = model.quantify(test.instances)
# true_prevs = test.prevalence()
# print('CC:')
# print(estim_prevs)
# print(true_prevs)
#
# model = MultilabelQuantifier(EMQ(LogisticRegression()))
# model.fit(train)
# estim_prevs = model.quantify(test.instances)
# true_prevs = test.prevalence()
# print('EMQ:')
# print(estim_prevs)
# print(true_prevs)

model = MultilabelCC()
model.fit(train)
estim_prevs = model.quantify(test.instances)
true_prevs = test.prevalence()
# the banner names the model actually evaluated (it used to say 'EMQ:')
print('MultilabelCC:')
print(estim_prevs)
print(true_prevs)

qp.environ['SAMPLE_SIZE']=100
mae = qp.error.mae(true_prevs, estim_prevs)
print(mae)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -3,6 +3,13 @@ from scipy.sparse import dok_matrix
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
|
||||||
|
def from_rcv2_lang_file(path, encoding='utf-8'):
    """
    Reads a tab-separated file and returns the documents and their categories.

    Each line is split on tabs; the second field is taken as the document and the third
    field as its categories. Fields are returned exactly as split (no stripping), so a
    third field that is the last one of its line keeps the trailing newline.

    :param path: path to the file
    :param encoding: encoding used to decode the file
    :return: tuple `(docs, cats)` of two equally long tuples of strings; two empty tuples
        if the file contains no lines
    :raises IndexError: if some line has fewer than three tab-separated fields
    """
    # the context manager guarantees the file handle is closed, also on error
    with open(path, 'rt', encoding=encoding) as fin:
        rows = [line.split('\t') for line in fin]
    if not rows:
        # zip(*[]) yields nothing to unpack; return an explicit empty result instead
        return (), ()
    docs, cats = zip(*((row[1], row[2]) for row in rows))
    return docs, cats
|
||||||
|
|
||||||
|
|
||||||
def from_text(path, encoding='utf-8'):
|
def from_text(path, encoding='utf-8'):
|
||||||
"""
|
"""
|
||||||
Reads a labelled collection of documents.
|
Reads a labelled collection of documents.
|
||||||
|
|
|
@ -227,7 +227,7 @@ def _delayed_new_instance(args):
|
||||||
if val_split is not None:
|
if val_split is not None:
|
||||||
if isinstance(val_split, float):
|
if isinstance(val_split, float):
|
||||||
assert 0 < val_split < 1, 'val_split should be in (0,1)'
|
assert 0 < val_split < 1, 'val_split should be in (0,1)'
|
||||||
data, val_split = data.split_stratified(train_prop=1 - val_split)
|
data, val_split = data.train_test_split(train_prop=1 - val_split)
|
||||||
|
|
||||||
sample_index = data.sampling_index(sample_size, *prev)
|
sample_index = data.sampling_index(sample_size, *prev)
|
||||||
sample = data.sampling_from_index(sample_index)
|
sample = data.sampling_from_index(sample_index)
|
||||||
|
|
|
@ -73,7 +73,7 @@ class QuaNetTrainer(BaseQuantifier):
|
||||||
|
|
||||||
if fit_learner:
|
if fit_learner:
|
||||||
classifier_data, unused_data = data.split_stratified(0.4)
|
classifier_data, unused_data = data.split_stratified(0.4)
|
||||||
train_data, valid_data = unused_data.split_stratified(0.66) # 0.66 split of 60% makes 40% and 20%
|
train_data, valid_data = unused_data.train_test_split(0.66) # 0.66 split of 60% makes 40% and 20%
|
||||||
self.learner.fit(*classifier_data.Xy)
|
self.learner.fit(*classifier_data.Xy)
|
||||||
else:
|
else:
|
||||||
classifier_data = None
|
classifier_data = None
|
||||||
|
|
|
@ -97,7 +97,7 @@ class GridSearchQ(BaseQuantifier):
|
||||||
return training, validation
|
return training, validation
|
||||||
elif isinstance(validation, float):
|
elif isinstance(validation, float):
|
||||||
assert 0. < validation < 1., 'validation proportion should be in (0,1)'
|
assert 0. < validation < 1., 'validation proportion should be in (0,1)'
|
||||||
training, validation = training.split_stratified(train_prop=1 - validation)
|
training, validation = training.train_test_split(train_prop=1 - validation)
|
||||||
return training, validation
|
return training, validation
|
||||||
else:
|
else:
|
||||||
raise ValueError(f'"validation" must either be a LabelledCollection or a float in (0,1) indicating the'
|
raise ValueError(f'"validation" must either be a LabelledCollection or a float in (0,1) indicating the'
|
||||||
|
|
Loading…
Reference in New Issue