From 1b20bf14ea44d114617035d9ff0b2104d507313e Mon Sep 17 00:00:00 2001
From: Alex Moreo
Date: Fri, 2 Jul 2021 17:33:05 +0200
Subject: [PATCH] exploring multilabel quantification

---
 multi_label.py           | 223 +++++++++++++++++++++++++++++++++++++++
 quapy/data/reader.py     |   7 ++
 quapy/method/meta.py     |   2 +-
 quapy/method/neural.py   |   2 +-
 quapy/model_selection.py |   2 +-
 5 files changed, 233 insertions(+), 3 deletions(-)
 create mode 100644 multi_label.py

diff --git a/multi_label.py b/multi_label.py
new file mode 100644
index 0000000..34a3469
--- /dev/null
+++ b/multi_label.py
@@ -0,0 +1,223 @@
+from copy import deepcopy
+
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.linear_model import LogisticRegression
+from sklearn.svm import LinearSVC
+
+import quapy as qp
+from quapy.functional import artificial_prevalence_sampling
+from quapy.method.aggregative import PACC, CC, EMQ
+from quapy.method.base import BaseQuantifier
+from quapy.data import from_rcv2_lang_file, LabelledCollection
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import MultiLabelBinarizer
+import numpy as np
+
+
+class MultilabelledCollection:
+    def __init__(self, instances, labels):
+        assert labels.ndim == 2, 'data does not seem to be multilabel'
+        self.instances = instances
+        self.labels = labels
+        self.classes_ = np.arange(labels.shape[1])
+
+    @classmethod
+    def load(cls, path: str, loader_func: callable):
+        return MultilabelledCollection(*loader_func(path))
+
+    def __len__(self):
+        return self.instances.shape[0]
+
+    def prevalence(self):
+        # returns an (n_classes, 2) array with the negative and positive prevalence of each category
+        pos = self.labels.mean(axis=0)
+        neg = 1 - pos
+        return np.asarray([neg, pos]).T
+
+    def counts(self):
+        return self.labels.sum(axis=0)
+
+    @property
+    def n_classes(self):
+        return len(self.classes_)
+
+    @property
+    def binary(self):
+        return False
+
+    def __gen_index(self):
+        return np.arange(len(self))
+
+    def sampling_multi_index(self, size, cat, prev=None):
+        if prev is None:  # no prevalence was indicated; returns an index for uniform sampling
+            return np.random.choice(len(self), size, replace=size > len(self))
+        # sample indices conditioned on the binary labels of category cat
+        aux = LabelledCollection(self.__gen_index(), self.labels[:, cat])
+        return aux.sampling_index(size, 1 - prev, prev)
+
+    def uniform_sampling_multi_index(self, size):
+        return np.random.choice(len(self), size, replace=size > len(self))
+
+    def uniform_sampling(self, size):
+        unif_index = self.uniform_sampling_multi_index(size)
+        return self.sampling_from_index(unif_index)
+
+    def sampling(self, size, category, prev=None):
+        prev_index = self.sampling_multi_index(size, category, prev)
+        return self.sampling_from_index(prev_index)
+
+    def sampling_from_index(self, index):
+        documents = self.instances[index]
+        labels = self.labels[index, :]
+        return MultilabelledCollection(documents, labels)
+
+    def train_test_split(self, train_prop=0.6, random_state=None):
+        tr_docs, te_docs, tr_labels, te_labels = \
+            train_test_split(self.instances, self.labels, train_size=train_prop, random_state=random_state)
+        return MultilabelledCollection(tr_docs, tr_labels), MultilabelledCollection(te_docs, te_labels)
+
+    def artificial_sampling_generator(self, sample_size, category, n_prevalences=101, repeats=1):
+        dimensions = 2
+        for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats):
+            yield self.sampling(sample_size, category, prevs[1])
+
+    def artificial_sampling_index_generator(self, sample_size, category, n_prevalences=101, repeats=1):
+        dimensions = 2
+        for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats):
+            yield self.sampling_multi_index(sample_size, category, prevs[1])
+
+    def natural_sampling_generator(self, sample_size, repeats=100):
+        for _ in range(repeats):
+            yield self.uniform_sampling(sample_size)
+
+    def natural_sampling_index_generator(self, sample_size, repeats=100):
+        for _ in range(repeats):
+            yield self.uniform_sampling_multi_index(sample_size)
+
+    def asLabelledCollection(self, category):
+        return LabelledCollection(self.instances, self.labels[:, category])
+
+    def genLabelledCollections(self):
+        for c in self.classes_:
+            yield self.asLabelledCollection(c)
+
+    @property
+    def Xy(self):
+        return self.instances, self.labels
+
+
+class MultilabelQuantifier:
+    def __init__(self, q: BaseQuantifier):
+        self.q = q
+        self.estimators = {}
+
+    def fit(self, data: MultilabelledCollection):
+        # fits one independent binary quantifier per category
+        self.classes_ = data.classes_
+        for cat, lc in enumerate(data.genLabelledCollections()):
+            self.estimators[cat] = deepcopy(self.q).fit(lc)
+        return self
+
+    def quantify(self, instances):
+        pos_prevs = np.zeros(len(self.classes_), dtype=float)
+        for c in self.classes_:
+            pos_prevs[c] = self.estimators[c].quantify(instances)[1]
+        neg_prevs = 1 - pos_prevs
+        return np.asarray([neg_prevs, pos_prevs]).T
+
+
+class MultilabelCC:
+    def __init__(self):
+        self.estimator = MultilabelQuantifier(CC(LinearSVC()))
+
+    def fit(self, data: MultilabelledCollection):
+        self.classes_ = data.classes_
+        tr, te = data.train_test_split()
+        self.estimator.fit(tr)
+        # collect (raw CC estimate, true prevalence) pairs over many held-out samples
+        Xs = []
+        ys = []
+        for sample in te.natural_sampling_generator(sample_size=200, repeats=100):
+            ys.append(sample.prevalence()[:, 1])
+            Xs.append(self.estimator.quantify(sample.instances)[:, 1])
+        Xs = np.asarray(Xs)
+        ys = np.asarray(ys)
+        # least-squares fit of the correction matrix; the system is overdetermined
+        # (more samples than categories), so lstsq rather than solve
+        self.W, *_ = np.linalg.lstsq(Xs, ys, rcond=None)
+        return self
+
+    def quantify(self, instances):
+        pred = self.estimator.quantify(instances)[:, 1].reshape(1, -1)
+        adjusted = pred.dot(self.W)
+        adjusted = adjusted.flatten()
+        neg_prevs = 1 - adjusted
+        return np.asarray([neg_prevs, adjusted]).T
+
+
+# read documents
+path = './crosslingual_data/rcv12/en.small.txt'
+docs, cats = from_rcv2_lang_file(path)
+
+# split train-test
+tr_docs, te_docs, tr_cats, te_cats = train_test_split(docs, cats, test_size=0.2, random_state=42)
+
+# generate Y matrices
+mlb = MultiLabelBinarizer()
+ytr = mlb.fit_transform([c.split(' ') for c in tr_cats])
+yte = mlb.transform([c.split(' ') for c in te_cats])
+# retain the 10 most populated categories
+most_populated = np.argsort(ytr.sum(axis=0))[-10:]
+ytr = ytr[:, most_populated]
+yte = yte[:, most_populated]
+
+tfidf = TfidfVectorizer(min_df=5)
+Xtr = tfidf.fit_transform(tr_docs)
+Xte = tfidf.transform(te_docs)
+
+train = MultilabelledCollection(Xtr, ytr)
+test = MultilabelledCollection(Xte, yte)
+
+# print(train.counts())
+# print(train.prevalence())
+#
+# model = MultilabelQuantifier(PACC(LogisticRegression()))
+# model.fit(train)
+# estim_prevs = model.quantify(test.instances)
+# true_prevs = test.prevalence()
+# print('PACC:')
+# print(estim_prevs)
+# print(true_prevs)
+#
+#
+# model = MultilabelQuantifier(CC(LogisticRegression()))
+# model.fit(train)
+# estim_prevs = model.quantify(test.instances)
+# true_prevs = test.prevalence()
+# print('CC:')
+# print(estim_prevs)
+# print(true_prevs)
+#
+#
+# model = MultilabelQuantifier(EMQ(LogisticRegression()))
+# model.fit(train)
+# estim_prevs = model.quantify(test.instances)
+# true_prevs = test.prevalence()
+# print('EMQ:')
+# print(estim_prevs)
+# print(true_prevs)
+
+model = MultilabelCC()
+model.fit(train)
+estim_prevs = model.quantify(test.instances)
+true_prevs = test.prevalence()
+print('MultilabelCC:')
+print(estim_prevs)
+print(true_prevs)
+
+qp.environ['SAMPLE_SIZE'] = 100
+mae = qp.error.mae(true_prevs, estim_prevs)
+print(mae)
diff --git a/quapy/data/reader.py b/quapy/data/reader.py
index 5b4d115..4e44fbb 100644
--- a/quapy/data/reader.py
+++ b/quapy/data/reader.py
@@ -3,6 +3,13 @@ from scipy.sparse import dok_matrix
 from tqdm import tqdm
 
 
+def from_rcv2_lang_file(path, encoding='utf-8'):
+    # lines are tab-separated; the second and third fields hold the document text
+    # and its space-separated category codes, respectively
+    with open(path, 'rt', encoding=encoding) as fin:
+        parts = [line.split('\t') for line in fin]
+    docs, cats = zip(*[(parts_i[1], parts_i[2]) for parts_i in parts])
+    return docs, cats
+
+
 def from_text(path, encoding='utf-8'):
     """
     Reads a labelled collection of documents.
diff --git a/quapy/method/meta.py b/quapy/method/meta.py
index fc3efe3..e164f75 100644
--- a/quapy/method/meta.py
+++ b/quapy/method/meta.py
@@ -227,7 +227,7 @@ def _delayed_new_instance(args):
     if val_split is not None:
         if isinstance(val_split, float):
             assert 0 < val_split < 1, 'val_split should be in (0,1)'
-            data, val_split = data.split_stratified(train_prop=1 - val_split)
+            data, val_split = data.train_test_split(train_prop=1 - val_split)
 
     sample_index = data.sampling_index(sample_size, *prev)
     sample = data.sampling_from_index(sample_index)
diff --git a/quapy/method/neural.py b/quapy/method/neural.py
index 4decc74..2e28571 100644
--- a/quapy/method/neural.py
+++ b/quapy/method/neural.py
@@ -73,7 +73,7 @@ class QuaNetTrainer(BaseQuantifier):
 
         if fit_learner:
             classifier_data, unused_data = data.split_stratified(0.4)
-            train_data, valid_data = unused_data.split_stratified(0.66)  # 0.66 split of 60% makes 40% and 20%
+            train_data, valid_data = unused_data.train_test_split(0.66)  # 0.66 split of 60% makes 40% and 20%
             self.learner.fit(*classifier_data.Xy)
         else:
             classifier_data = None
diff --git a/quapy/model_selection.py b/quapy/model_selection.py
index 1080db0..f05b249 100644
--- a/quapy/model_selection.py
+++ b/quapy/model_selection.py
@@ -97,7 +97,7 @@ class GridSearchQ(BaseQuantifier):
             return training, validation
         elif isinstance(validation, float):
             assert 0. < validation < 1., 'validation proportion should be in (0,1)'
-            training, validation = training.split_stratified(train_prop=1 - validation)
+            training, validation = training.train_test_split(train_prop=1 - validation)
             return training, validation
         else:
             raise ValueError(f'"validation" must either be a LabelledCollection or a float in (0,1) indicating the'
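
Note: the adjustment learned in MultilabelCC.fit amounts to an ordinary least-squares
regression from raw classify-and-count estimates to true prevalences, applied to each
new estimate at inference time. A minimal self-contained sketch of just that step,
on synthetic data (all names and values below are illustrative, not part of the patch):

    import numpy as np

    rng = np.random.default_rng(0)
    n_samples, n_cats = 100, 10

    # synthetic "true" positive prevalences of each category in each validation sample
    true_prevs = rng.uniform(size=(n_samples, n_cats))
    # synthetic raw CC estimates: the truth distorted by a systematic per-category bias
    cc_estims = np.clip(true_prevs + rng.uniform(-0.2, 0.2, size=n_cats), 0, 1)

    # least-squares fit of the correction matrix W such that cc_estims @ W ~ true_prevs
    # (100 equations, 10 unknowns per column: overdetermined, hence lstsq)
    W, *_ = np.linalg.lstsq(cc_estims, true_prevs, rcond=None)

    # at inference time, a new vector of raw CC estimates is corrected by W
    new_estim = np.clip(true_prevs[0] + rng.uniform(-0.2, 0.2, size=n_cats), 0, 1)
    adjusted = new_estim.reshape(1, -1).dot(W).flatten()
    print(adjusted.round(3))

As in the patch, the adjusted values are not re-clipped or re-normalized, so they can
fall slightly outside [0, 1]; whether that warrants a projection step is left open here.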