finalizing experiments and fixing a bug in the KLD error computation

Alejandro Moreo Fernandez 2024-05-08 11:31:28 +02:00
parent 1007257280
commit 366020d45c
8 changed files with 432 additions and 454 deletions

View File

@@ -3,9 +3,7 @@ import numpy as np
 from glob import glob
 from os.path import join
-from quapy.data import LabelledCollection
-from quapy.protocol import AbstractProtocol
-import json
+import quapy.functional as F


 def load_sample(path, class_name):
@@ -23,67 +21,86 @@ def load_sample(path, class_name):
     return text, labels


-def get_text_label_score(df, class_name, vectorizer=None, filter_classes=None):
-    text = df.text.values
-    labels = df[class_name].values
-    rel_score = df.score.values
-    if filter_classes is not None:
-        idx = np.isin(labels, filter_classes)
-        text = text[idx]
-        labels = labels[idx]
-        rel_score = rel_score[idx]
-    if vectorizer is not None:
-        text = vectorizer.transform(text)
-    order = np.argsort(-rel_score)
-    return text[order], labels[order], rel_score[order]
+def binarize_labels(labels, positive_class=None):
+    if positive_class is not None:
+        protected_labels = labels==positive_class
+        labels[protected_labels] = 1
+        labels[~protected_labels] = 0
+        labels = labels.astype(int)
+    return labels


 class RetrievedSamples:

     def __init__(self,
                  class_home: str,
                  test_rankings_path: str,
                  test_query_prevs_path: str,
                  vectorizer,
                  class_name,
-                 classes=None
+                 positive_class=None,
+                 classes=None,
                  ):
         self.class_home = class_home
         self.test_rankings_df = pd.read_json(test_rankings_path)
         self.test_query_prevs_df = pd.read_json(test_query_prevs_path)
         self.vectorizer = vectorizer
         self.class_name = class_name
-        self.classes=classes
+        self.positive_class = positive_class
+        self.classes = classes

+    def get_text_label_score(self, df):
+        class_name = self.class_name
+        vectorizer = self.vectorizer
+        filter_classes = self.classes
+        text = df.text.values
+        labels = df[class_name].values
+        rel_score = df.score.values
+
+        labels = binarize_labels(labels, self.positive_class)
+
+        if filter_classes is not None:
+            idx = np.isin(labels, filter_classes)
+            text = text[idx]
+            labels = labels[idx]
+            rel_score = rel_score[idx]
+
+        if vectorizer is not None:
+            text = vectorizer.transform(text)
+
+        order = np.argsort(-rel_score)
+        return text[order], labels[order], rel_score[order]

     def __call__(self):
         tests_df = self.test_rankings_df
         class_name = self.class_name
-        vectorizer = self.vectorizer

         for file in self._list_queries():
-            # print(file)

             # loads the training sample
             train_df = pd.read_json(file)
             if len(train_df) == 0:
                 print('empty dataframe: ', file)
             else:
-                Xtr, ytr, score_tr = get_text_label_score(train_df, class_name, vectorizer, filter_classes=self.classes)
+                Xtr, ytr, score_tr = self.get_text_label_score(train_df)

                 # loads the test sample
                 query_id = self._get_query_id_from_path(file)
                 sel_df = tests_df[tests_df.qid == query_id]
-                Xte, yte, score_te = get_text_label_score(sel_df, class_name, vectorizer, filter_classes=self.classes)
+                Xte, yte, score_te = self.get_text_label_score(sel_df)

                 # gets the prevalence of all judged relevant documents for the query
                 df = self.test_query_prevs_df
                 q_rel_prevs = df.loc[df.id == query_id][class_name+'_proportions'].values[0]

+                if self.positive_class is not None:
+                    if self.positive_class not in q_rel_prevs:
+                        print(f'positive class {self.positive_class} not found in the query; skipping')
+                        continue
+                    q_rel_prevs = F.as_binary_prevalence(q_rel_prevs[self.positive_class])
+                else:
+                    q_rel_prevs = np.asarray([q_rel_prevs.get(class_i, 0.) for class_i in self.classes])

                 yield (Xtr, ytr, score_tr), (Xte, yte, score_te), q_rel_prevs

     def _list_queries(self):
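
A minimal usage sketch of the two helpers above (not part of the commit): it assumes binarize_labels from this hunk is in scope and that quapy.functional.as_binary_prevalence(p) returns the two-class vector [1-p, p]; the label values are illustrative.

import numpy as np

# labels of a protected attribute; 'Africa' plays the role of positive_class
labels = np.asarray(['Africa', 'Europe', 'Africa', 'Asia'])

# binarize_labels maps the positive class to 1 and every other value to 0
# (it modifies the array it receives, hence the copy)
binary = binarize_labels(labels.copy(), positive_class='Africa')
print(binary)                            # [1 0 1 0]
print((labels == 'Africa').astype(int))  # equivalent one-liner

# the query prevalence keeps only the proportion of the positive class and
# expands it into [1-p, p] (assumed behaviour of F.as_binary_prevalence)
p = 0.2
print(np.asarray([1 - p, p]))            # [0.8 0.2]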

View File

@@ -6,16 +6,20 @@ from pathlib import Path
 import numpy as np
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.linear_model import LogisticRegression
-from sklearn.model_selection import GridSearchCV
+from sklearn.metrics import confusion_matrix
+from sklearn.model_selection import GridSearchCV, cross_val_predict
+from sklearn.base import clone
 from sklearn.svm import LinearSVC
 from scipy.special import rel_entr as KLD

 import quapy as qp
 import quapy.functional as F
-from Retrieval.commons import RetrievedSamples, load_sample
+from Retrieval.commons import RetrievedSamples, load_sample, binarize_labels
+from Retrieval.methods import M3rND_ModelB, M3rND_ModelD, AbstractM3rND
 from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive
 from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
 from quapy.data.base import LabelledCollection
+from scipy.sparse import vstack
 from os.path import join
 from tqdm import tqdm
@@ -50,21 +54,20 @@ To evaluate our approach, I have executed the queries on the test split. You can
 """


-def methods(classifier, class_name):
+def methods(classifier, class_name, binarize=False):

     kde_param = {
         'continent': 0.01,
-        'gender': 0.005,
+        'gender': 0.03,
         'years_category':0.03
     }

-    #yield ('Naive', Naive())
-    #yield ('NaiveQuery', Naive())
+    yield ('Naive', Naive())
+    yield ('NaiveQuery', Naive())
     yield ('CC', ClassifyAndCount(classifier))
     # yield ('PCC', PCC(classifier))
     # yield ('ACC', ACC(classifier, val_split=5, n_jobs=-1))
-    #yield ('PACC', PACC(classifier, val_split=5, n_jobs=-1))
-    # yield ('PACC-s', PACC(classifier, val_split=5, n_jobs=-1))
+    yield ('PACC', PACC(classifier, val_split=5, n_jobs=-1))
     # yield ('EMQ', EMQ(classifier, exact_train_prev=True))
     # yield ('EMQ-Platt', EMQ(classifier, exact_train_prev=True, recalib='platt'))
     # yield ('EMQh', EMQ(classifier, exact_train_prev=False))
@@ -72,26 +75,16 @@ def methods(classifier, class_name):
     # yield ('EMQ-TS', EMQ(classifier, exact_train_prev=False, recalib='ts'))
     # yield ('EMQ-NBVS', EMQ(classifier, exact_train_prev=False, recalib='nbvs'))
     # yield ('EMQ-VS', EMQ(classifier, exact_train_prev=False, recalib='vs'))
-    # yield ('KDE001', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.001))
-    # yield ('KDE005', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.005))
+    yield ('KDEy-ML', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=kde_param[class_name]))
+    # <-- wow!
     # yield ('KDE01', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.01))
-    # yield ('KDE02', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.02))
-    # yield ('KDE03', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.03))
-    # yield ('KDE-silver', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth='silverman'))
-    # yield ('KDE-scott', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth='scott'))
-    # yield ('KDEy-ML', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=kde_param[class_name]))
-    # yield ('KDE005', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.005))
-    # yield ('KDE01', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.01))
-    # yield ('KDE01-s', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.01))
-    # yield ('KDE02', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.02))
-    # yield ('KDE03', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.03))
-    # yield ('KDE04', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.04))
-    # yield ('KDE05', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.05))
-    # yield ('KDE07', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.07))
-    # yield ('KDE10', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.10))
+    if binarize:
+        yield ('M3b', M3rND_ModelB(classifier))
+        yield ('M3b+', M3rND_ModelB(classifier))
+        yield ('M3d', M3rND_ModelD(classifier))
+        yield ('M3d+', M3rND_ModelD(classifier))


-def train_classifier(train_path):
+def train_classifier_fn(train_path):
     """
     Trains a classifier. To do so, it loads the training set, transforms it into a tfidf representation.
     The classifier is Logistic Regression, with hyperparameters C (range [0.001, 0.01, ..., 1000]) and
@@ -101,28 +94,36 @@ def train_classifier(train_path):
     """
     texts, labels = load_sample(train_path, class_name=class_name)

+    if BINARIZE:
+        labels = binarize_labels(labels, positive_class=protected_group[class_name])
+
     tfidf = TfidfVectorizer(sublinear_tf=True, min_df=3)
     Xtr = tfidf.fit_transform(texts)
     print(f'Xtr shape={Xtr.shape}')

     print('training classifier...', end='')
     classifier = LogisticRegression(max_iter=5000)
-    classifier = GridSearchCV(
+    modsel = GridSearchCV(
         classifier,
         param_grid={'C': np.logspace(-4, 4, 9), 'class_weight': ['balanced', None]},
         n_jobs=-1,
         cv=5
     )
-    classifier.fit(Xtr, labels)
-    classifier = classifier.best_estimator_
-    classifier_acc = classifier.best_score_
-    print(f'[done] best-params={classifier.best_params_} got {classifier_acc:.4f} score')
+    modsel.fit(Xtr, labels)
+    classifier = modsel.best_estimator_
+    classifier_acc = modsel.best_score_
+    best_params = modsel.best_params_
+    print(f'[done] best-params={best_params} got {classifier_acc:.4f} score')
+
+    print('generating cross-val predictions for M3')
+    predictions = cross_val_predict(clone(classifier), Xtr, labels, cv=10, n_jobs=-1, verbose=10)
+    conf_matrix = confusion_matrix(labels, predictions, labels=classifier.classes_)

     training = LabelledCollection(Xtr, labels)
     print('training classes:', training.classes_)
     print('training prevalence:', training.prevalence())

-    return tfidf, classifier
+    return tfidf, classifier, conf_matrix


 def reduceAtK(data: LabelledCollection, k):
@@ -140,12 +141,12 @@ def benchmark_name(class_name, k):
 def run_experiment():

     results = {
         'mae': {k: [] for k in Ks},
         'mrae': {k: [] for k in Ks},
-        'Dkl_estim': [],
-        'Dkl_true': [],
-        'Dkl_error': []
+        'rKL_error': [],
+        'rND_error': []
     }

     pbar = tqdm(experiment_prot(), total=experiment_prot.total())
@@ -153,163 +154,159 @@ def run_experiment():
         Xtr, ytr, score_tr = train
         Xte, yte, score_te = test

-        if HALF and not method_name.endswith('-s'):
-            n = len(ytr) // 2
-            train_col = LabelledCollection(Xtr[:n], ytr[:n], classes=classifier_trained.classes_)
-        else:
-            train_col = LabelledCollection(Xtr, ytr, classes=classifier_trained.classes_)
+        n = len(ytr) // 2
+        train_col = LabelledCollection(Xtr[:n], ytr[:n], classes=classifier.classes_)

-        class_order = train_col.classes_
-        q_rel_prevs = np.asarray([q_rel_prevs.get(k, 0.) for k in class_order])
-        # idx, max_score_round_robin = get_idx_score_matrix_per_class(train_col, score_tr)
-
-        if method_name not in ['Naive', 'NaiveQuery'] and not method_name.endswith('-s'):
-            quantifier.fit(train_col, val_split=train_col, fit_classifier=False)
+        if method_name not in ['Naive', 'NaiveQuery', 'M3b', 'M3b+', 'M3d', 'M3d+']:
+            method.fit(train_col, val_split=train_col, fit_classifier=False)
         elif method_name == 'Naive':
-            quantifier.fit(train_col)
+            method.fit(train_col)

-        test_col = LabelledCollection(Xte, yte, classes=classifier_trained.classes_)
-        Dkl_estim = []
-        Dkl_true = []
+        test_col = LabelledCollection(Xte, yte, classes=classifier.classes_)
+        rKL_estim, rKL_true = [], []
+        rND_estim, rND_true = [], []
         for k in Ks:
             test_k = reduceAtK(test_col, k)
             if method_name == 'NaiveQuery':
                 train_k = reduceAtK(train_col, k)
-                quantifier.fit(train_k)
-            # elif method_name.endswith('-s'):
-            #     test_min_score = score_te[k] if k < len(score_te) else score_te[-1]
-            #     train_k = reduce_train_at_score(train_col, idx, max_score_round_robin, test_min_score)
-            #     print(f'{k=}, {test_min_score=} {len(train_k)=}')
-            #     quantifier.fit(train_k, val_split=train_k, fit_classifier=False)
+                method.fit(train_k)

-            estim_prev = quantifier.quantify(test_k.instances)
+            estim_prev = method.quantify(test_k.instances)

-            eps=(1. / (2 * k))
-            mae = qp.error.mae(test_k.prevalence(), estim_prev)
-            mrae = qp.error.mrae(test_k.prevalence(), estim_prev, eps=eps)
-            Dkl_at_k_estim = qp.error.kld(estim_prev, q_rel_prevs, eps=eps)
-            Dkl_at_k_true = qp.error.kld(test_k.prevalence(), q_rel_prevs, eps=eps)
+            # epsilon value for prevalence smoothing
+            eps=(1. / (2. * k))
+
+            # error metrics
+            test_k_prev = test_k.prevalence()
+            mae = qp.error.mae(test_k_prev, estim_prev)
+            mrae = qp.error.mrae(test_k_prev, estim_prev, eps=eps)
+            rKL_at_k_estim = qp.error.kld(estim_prev, q_rel_prevs, eps=eps)
+            rKL_at_k_true = qp.error.kld(test_k_prev, q_rel_prevs, eps=eps)
+
+            if BINARIZE:
+                # [1] is the index of the minority or historically disadvantaged group
+                rND_at_k_estim = np.abs(estim_prev[1] - q_rel_prevs[1])
+                rND_at_k_true = np.abs(test_k_prev[1] - q_rel_prevs[1])
+
+            # collect results
             results['mae'][k].append(mae)
             results['mrae'][k].append(mrae)
-            Dkl_estim.append(Dkl_at_k_estim)
-            Dkl_true.append(Dkl_at_k_true)
+            rKL_estim.append(rKL_at_k_estim)
+            rKL_true.append(rKL_at_k_true)
+            if BINARIZE:
+                rND_estim.append(rND_at_k_estim)
+                rND_true.append(rND_at_k_true)

-        Z = 1
-        Dkl_estim = (1/Z) * sum((1./np.log2(k)) * v for v in Dkl_estim)
-        Dkl_true = (1/Z) * sum((1./np.log2(k)) * v for v in Dkl_true)
-        Dkl_error = np.abs(Dkl_true-Dkl_estim)
-        #print(f'{Dkl_estim=}\t{Dkl_true=}\t{Dkl_error=}')
-        results['Dkl_estim'].append(Dkl_estim)
-        results['Dkl_true'].append(Dkl_true)
-        results['Dkl_error'].append(Dkl_error)
+        # aggregate fairness metrics
+        def aggregate(rMs, Ks, Z=1):
+            return (1 / Z) * sum((1. / np.log2(k)) * v for v, k in zip(rMs, Ks))
+
+        Z = sum((1. / np.log2(k)) for k in Ks)
+        rKL_estim = aggregate(rKL_estim, Ks, Z)
+        rKL_true = aggregate(rKL_true, Ks, Z)
+        rKL_error = np.abs(rKL_true-rKL_estim)
+        results['rKL_error'].append(rKL_error)
+
+        if BINARIZE:
+            rND_estim = aggregate(rND_estim, Ks, Z)
+            rND_true = aggregate(rND_true, Ks, Z)
+            if isinstance(method, AbstractM3rND):
+                if method_name.endswith('+'):
+                    conf_matrix_ = method.get_confusion_matrix(*train_col.Xy)
+                else:
+                    conf_matrix_ = conf_matrix.copy()
+                rND_estim = method.fair_measure_correction(rND_estim, conf_matrix_)
+            rND_error = np.abs(rND_true - rND_estim)
+            results['rND_error'].append(rND_error)

         pbar.set_description(f'{method_name}')

     return results


-def get_idx_score_matrix_per_class(train, score_tr):
-    classes = train.classes_
-    num_classes = len(classes)
-    num_docs = len(train)
-    scores = np.zeros(shape=(num_docs, num_classes), dtype=float)
-    idx = np.full(shape=(num_docs, num_classes), fill_value=-1, dtype=int)
-    X, y = train.Xy
-    for i, class_i in enumerate(classes):
-        class_i_scores = score_tr[y == class_i]
-        rank_i = np.argwhere(y == class_i).flatten()
-        scores[:len(class_i_scores), i] = class_i_scores
-        idx[:len(class_i_scores), i] = rank_i
-    max_score_round_robin = scores.max(axis=1)
-    return idx, max_score_round_robin
-
-
-def reduce_train_at_score(train, idx, max_score_round_robin, score_te_at_k, min_docs_per_class=5):
-    min_index = np.min(np.argwhere(max_score_round_robin<score_te_at_k).flatten())
-    min_index = max(min_docs_per_class, min_index)
-    choosen_idx = idx[:min_index,:].flatten()
-    choosen_idx = choosen_idx[choosen_idx!=-1]
-    choosen_data = LabelledCollection(train.X[choosen_idx], train.y[choosen_idx], classes=train.classes_)
-    return choosen_data
-
-
-Ks = [5, 10, 25, 50, 75, 100, 250, 500, 750, 1000]
-CLASS_NAMES = ['gender', 'continent', 'years_category'] # 'relative_pageviews_category', 'num_sitelinks_category']:
+# Ks = [5, 10, 25, 50, 75, 100, 250, 500, 750, 1000]
+Ks = [50, 100, 500, 1000]
+CLASS_NAMES = ['years_category', 'continent', 'gender'] # ['relative_pageviews_category', 'num_sitelinks_category']:
 DATA_SIZES = ['10K', '50K', '100K', '500K', '1M', 'FULL']
+
+data_home = 'data'
+
+protected_group = {
+    'gender': 'Female',
+    'continent': 'Africa',
+    'years_category': 'Pre-1900s',
+}


 if __name__ == '__main__':
-    data_home = 'data'
-
-    HALF=True
-    exp_posfix = '_half'
-
-    method_names = [name for name, *other in methods(None, 'continent')]
-
-    for class_name in CLASS_NAMES:
-        tables_mae, tables_mrae = [], []
-
-        table_DKL = Table(name=f'Dkl-{class_name}', benchmarks=[benchmark_name(class_name, s) for s in DATA_SIZES], methods=method_names)
-
-        benchmarks = [benchmark_name(class_name, k) for k in Ks]
-
-        for data_size in DATA_SIZES:
-
-            table_mae = Table(name=f'{class_name}-{data_size}-mae', benchmarks=benchmarks, methods=method_names)
-            table_mrae = Table(name=f'{class_name}-{data_size}-mrae', benchmarks=benchmarks, methods=method_names)
-            table_mae.format.mean_prec = 5
-            table_mae.format.remove_zero = True
-            table_mae.format.color_mode = 'global'
-
-            tables_mae.append(table_mae)
-            tables_mrae.append(table_mrae)
-
-            class_home = join(data_home, class_name, data_size)
-            # train_data_path = join(class_home, 'classifier_training.json')
-            # classifier_path = join('classifiers', data_size, f'classifier_{class_name}.pkl')
-            train_data_path = join(data_home, class_name, 'FULL', 'classifier_training.json') # <-------- fixed classifier
-            classifier_path = join('classifiers', 'FULL', f'classifier_{class_name}.pkl') # <------------ fixed classifier
-            test_rankings_path = join(data_home, 'testRanking_Results.json')
-            test_query_prevs_path = join(data_home, 'prevelance_vectors_judged_docs.json')
-            results_home = join('results'+exp_posfix, class_name, data_size)
-
-            tfidf, classifier_trained = qp.util.pickled_resource(classifier_path, train_classifier, train_data_path)
-
-            experiment_prot = RetrievedSamples(
-                class_home,
-                test_rankings_path,
-                test_query_prevs_path,
-                vectorizer=tfidf,
-                class_name=class_name,
-                classes=classifier_trained.classes_
-            )
-            for method_name, quantifier in methods(classifier_trained, class_name):
-
-                results_path = join(results_home, method_name + '.pkl')
-                # if the result pickle exists, loads and returns it
-                if os.path.exists(results_path):
-                    print(f'Method {method_name=} already computed')
-                    results = pickle.load(open(results_path, 'rb'))
-                # otherwie, computes the results, dumps a pickle, and returns it
-                else:
-                    results = run_experiment()
-                    os.makedirs(Path(results_path).parent, exist_ok=True)
-                    pickle.dump(results, open(results_path, 'wb'), pickle.HIGHEST_PROTOCOL)
-
-                print(results_path)
-                print(results)
-
-                # compose the tables
-                for k in Ks:
-                    table_mae.add(benchmark=benchmark_name(class_name, k), method=method_name, v=results['mae'][k])
-                    table_mrae.add(benchmark=benchmark_name(class_name, k), method=method_name, v=results['mrae'][k])
-                table_DKL.add(benchmark=benchmark_name(class_name, data_size), method=method_name, v=results['Dkl_error'])
-
-        Table.LatexPDF(f'./latex{exp_posfix}/{class_name}{exp_posfix}.pdf', tables=[table_DKL] + tables_mrae)
+    tables_RND, tables_DKL = [], []
+    for class_mode in ['binary', 'multiclass']:
+        BINARIZE = (class_mode=='binary')
+        method_names = [name for name, *other in methods(None, 'continent', BINARIZE)]
+
+        for class_name in CLASS_NAMES:
+            tables_mae, tables_mrae = [], []
+
+            benchmarks_size =[benchmark_name(class_name, s) for s in DATA_SIZES]
+            table_DKL = Table(name=f'rKL-{class_name}', benchmarks=benchmarks_size, methods=method_names)
+            table_RND = Table(name=f'rND-{class_name}', benchmarks=benchmarks_size, methods=method_names)
+
+            for data_size in DATA_SIZES:
+                print(class_name, class_mode, data_size)
+
+                benchmarks_k = [benchmark_name(class_name, k) for k in Ks]
+                # table_mae = Table(name=f'{class_name}-{data_size}-mae', benchmarks=benchmarks_k, methods=method_names)
+                table_mrae = Table(name=f'{class_name}-{data_size}-mrae', benchmarks=benchmarks_k, methods=method_names)
+                # tables_mae.append(table_mae)
+                tables_mrae.append(table_mrae)

+                # sets all paths
+                class_home = join(data_home, class_name, data_size)
+                train_data_path = join(data_home, class_name, 'FULL', 'classifier_training.json')  # <----- fixed classifier
+                classifier_path = join('classifiers', 'FULL', f'classifier_{class_name}_{class_mode}.pkl')
+                test_rankings_path = join(data_home, 'testRanking_Results.json')
+                test_query_prevs_path = join(data_home, 'prevelance_vectors_judged_docs.json')
+                results_home = join('results', class_name, class_mode, data_size)
+                positive_class = protected_group[class_name] if BINARIZE else None
+
+                # instantiates the classifier (trains it the first time, loads it in the subsequent executions)
+                tfidf, classifier, conf_matrix \
+                    = qp.util.pickled_resource(classifier_path, train_classifier_fn, train_data_path)
+
+                experiment_prot = RetrievedSamples(
+                    class_home,
+                    test_rankings_path,
+                    test_query_prevs_path,
+                    vectorizer=tfidf,
+                    class_name=class_name,
+                    positive_class=positive_class,
+                    classes=classifier.classes_
+                )
+                for method_name, method in methods(classifier, class_name, BINARIZE):
+
+                    results_path = join(results_home, method_name + '.pkl')
+                    results = qp.util.pickled_resource(results_path, run_experiment)
+
+                    # compose the tables
+                    for k in Ks:
+                        # table_mae.add(benchmark=benchmark_name(class_name, k), method=method_name, v=results['mae'][k])
+                        table_mrae.add(benchmark=benchmark_name(class_name, k), method=method_name, v=results['mrae'][k])
+                    table_DKL.add(benchmark=benchmark_name(class_name, data_size), method=method_name, v=results['rKL_error'])
+                    if BINARIZE:
+                        table_RND.add(benchmark=benchmark_name(class_name, data_size), method=method_name, v=results['rND_error'])
+
+            tables = ([table_RND] + tables_mrae) if BINARIZE else ([table_DKL] + tables_mrae)
+            Table.LatexPDF(f'./latex/{class_mode}/{class_name}.pdf', tables=tables)
+
+            if BINARIZE:
+                tables_RND.append(table_RND)
+            else:
+                tables_DKL.append(table_DKL)
+
+    Table.LatexPDF(f'./latex/global/main.pdf', tables=tables_RND+tables_DKL, dedicated_pages=False)
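
The per-query fairness scores are now a weighted average over the cut-offs k rather than the previous unnormalized sum (Z was hard-coded to 1). A small self-contained sketch of what aggregate computes, with hypothetical per-cutoff values:

import numpy as np

Ks = [50, 100, 500, 1000]
rKL_at_k = [0.12, 0.10, 0.06, 0.05]   # hypothetical per-cutoff divergences

# weights 1/log2(k), normalized by Z so that they sum to 1
Z = sum(1. / np.log2(k) for k in Ks)
rKL = (1. / Z) * sum((1. / np.log2(k)) * v for v, k in zip(rKL_at_k, Ks))
print(round(rKL, 4))   # a convex combination that weighs the smallest cut-offs the most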

View File

@@ -14,6 +14,7 @@ from Retrieval.commons import RetrievedSamples, load_sample
 from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive
 from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
 from quapy.data.base import LabelledCollection
+from experiments import benchmark_name, reduceAtK, run_experiment
 from os.path import join
 from tqdm import tqdm
@@ -22,81 +23,19 @@ from result_table.src.table import Table

-def methods(classifier, class_name):
-    yield ('KDE001', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.001))
-    yield ('KDE005', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.005))
-    yield ('KDE01', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.01))
-    yield ('KDE02', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.02))
-    yield ('KDE03', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.03))
-    yield ('KDE04', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.04))
-    yield ('KDE05', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.05))
-    yield ('KDE07', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.07))
-    yield ('KDE10', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.10))
-
-
-def reduceAtK(data: LabelledCollection, k):
-    # if k > len(data):
-    #     print(f'[warning] {k=}>{len(data)=}')
-    X, y = data.Xy
-    X = X[:k]
-    y = y[:k]
-    return LabelledCollection(X, y, classes=data.classes_)
-
-
-def run_experiment():
-    results = {
-        'mae': {k: [] for k in Ks},
-        'mrae': {k: [] for k in Ks}
-    }
-
-    pbar = tqdm(experiment_prot(), total=experiment_prot.total())
-    for train, test in pbar:
-        Xtr, ytr, score_tr = train
-        Xte, yte, score_te = test
-
-        if HALF:
-            n = len(ytr) // 2
-            train_col = LabelledCollection(Xtr[:n], ytr[:n], classes=classifier_trained.classes_)
-        else:
-            train_col = LabelledCollection(Xtr, ytr, classes=classifier_trained.classes_)
-
-        if method_name not in ['Naive', 'NaiveQuery']:
-            quantifier.fit(train_col, val_split=train_col, fit_classifier=False)
-        elif method_name == 'Naive':
-            quantifier.fit(train_col)
-
-        test_col = LabelledCollection(Xte, yte, classes=classifier_trained.classes_)
-        for k in Ks:
-            test_k = reduceAtK(test_col, k)
-            if method_name == 'NaiveQuery':
-                train_k = reduceAtK(train_col, k)
-                quantifier.fit(train_k)
-
-            estim_prev = quantifier.quantify(test_k.instances)
-
-            mae = qp.error.mae(test_k.prevalence(), estim_prev)
-            mrae = qp.error.mrae(test_k.prevalence(), estim_prev, eps=(1. / (2 * k)))
-
-            results['mae'][k].append(mae)
-            results['mrae'][k].append(mrae)
-
-        pbar.set_description(f'{method_name}')
-
-    return results
-
-
-def benchmark_name(class_name, k):
-    scape_class_name = class_name.replace('_', '\_')
-    return f'{scape_class_name}@{k}'
+def methods(classifier):
+    for i, bandwidth in enumerate(np.linspace(0.01, 0.1, 10)):
+        yield (f'KDE{str(i).zfill(2)}', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=bandwidth))


 if __name__ == '__main__':
     data_home = 'data-modsel'
-    HALF=True
-    exp_posfix = '_half_modsel'

     Ks = [5, 10, 25, 50, 75, 100, 250, 500, 750, 1000]
-    method_names = [m for m, *_ in methods(None, None)]
+    method_names = [m for m, *_ in methods(None)]
+    class_mode = 'multiclass'

     dir_names={
         'gender': '100K_GENDER_TREC21_QUERIES/100K-NEW-QUERIES',
@@ -104,54 +43,42 @@ if __name__ == '__main__':
         'years_category': '100K_YEARS_TREC21_QUERIES/100K-NEW-QUERIES'
     }

-    for class_name in ['gender', 'continent', 'years_category']: # 'relative_pageviews_category', 'num_sitelinks_category']:
-        tables_mae, tables_mrae = [], []
+    for class_name in ['gender', 'continent', 'years_category']:
+        tables_mrae = []

         benchmarks = [benchmark_name(class_name, k) for k in Ks]

         for data_size in ['100K']:
-            table_mae = Table(name=f'{class_name}-{data_size}-mae', benchmarks=benchmarks, methods=method_names)
             table_mrae = Table(name=f'{class_name}-{data_size}-mrae', benchmarks=benchmarks, methods=method_names)
-            table_mae.format.mean_prec = 5
-            table_mae.format.remove_zero = True
-            table_mae.format.color_mode = 'global'
-
-            tables_mae.append(table_mae)
             tables_mrae.append(table_mrae)

             class_home = join(data_home, dir_names[class_name])
-            classifier_path = join('classifiers', 'FULL', f'classifier_{class_name}.pkl') # <------------ fixed classifier
+            classifier_path = join('classifiers', 'FULL', f'classifier_{class_name}_{class_mode}.pkl')
             test_rankings_path = join(data_home, 'testRanking-TREC21-Queries_Results.json')
-            results_home = join('results'+exp_posfix, class_name, data_size)
+            test_query_prevs_path = join('data', 'prevelance_vectors_judged_docs.json')
+            results_home = join('results', 'modsel', class_name, data_size)

-            tfidf, classifier_trained = pickle.load(open(classifier_path, 'rb'))
+            tfidf, classifier, conf_matrix = pickle.load(open(classifier_path, 'rb'))

             experiment_prot = RetrievedSamples(
                 class_home,
                 test_rankings_path,
+                test_query_prevs_path,
                 vectorizer=tfidf,
                 class_name=class_name,
-                classes=classifier_trained.classes_
+                classes=classifier.classes_
             )
-            for method_name, quantifier in methods(classifier_trained, class_name):
+            for method_name, quantifier in methods(classifier):

                 results_path = join(results_home, method_name + '.pkl')
-                if os.path.exists(results_path):
-                    print(f'Method {method_name=} already computed')
-                    results = pickle.load(open(results_path, 'rb'))
-                else:
-                    results = run_experiment()
-                    os.makedirs(Path(results_path).parent, exist_ok=True)
-                    pickle.dump(results, open(results_path, 'wb'), pickle.HIGHEST_PROTOCOL)
+                results = qp.util.pickled_resource(results_path, run_experiment)

                 for k in Ks:
-                    table_mae.add(benchmark=benchmark_name(class_name, k), method=method_name, v=results['mae'][k])
                     table_mrae.add(benchmark=benchmark_name(class_name, k), method=method_name, v=results['mrae'][k])

-            # Table.LatexPDF(f'./latex{exp_posfix}/{class_name}{exp_posfix}.pdf', tables=tables_mae+tables_mrae)
-            Table.LatexPDF(f'./latex{exp_posfix}/{class_name}{exp_posfix}.pdf', tables=tables_mrae)
+        Table.LatexPDF(f'./latex/modsel/{class_name}.pdf', tables=tables_mrae)
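
Both experiment scripts now delegate result caching to qp.util.pickled_resource instead of the manual os.path.exists / pickle.dump logic that was removed. A rough sketch of the pattern, assuming pickled_resource(path, func, *args) returns the unpickled object if the file exists and otherwise calls func(*args), stores the result at path (creating missing folders, as the removed os.makedirs call did), and returns it; the function and path below are hypothetical.

import quapy as qp

def run_dummy_experiment():
    # hypothetical stand-in for run_experiment()
    return {'mrae': {100: [0.21, 0.18]}}

# first call computes and stores the pickle; later calls just load it
results = qp.util.pickled_resource('results/demo/dummy.pkl', run_dummy_experiment)
print(results['mrae'][100])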

View File

@ -1,77 +0,0 @@
import itertools
import os.path
import pickle
from collections import defaultdict
from pathlib import Path
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
import quapy as qp
from Retrieval.commons import RetrievedSamples, load_sample
from quapy.protocol import UPP
from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive
from quapy.model_selection import GridSearchQ
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
from quapy.data.base import LabelledCollection
from os.path import join
from tqdm import tqdm
from result_table.src.table import Table
"""
"""
data_home = 'data'
datasets = ['continent', 'gender', 'years_category'] #, 'relative_pageviews_category', 'num_sitelinks_category']
for class_name in datasets:
train_data_path = join(data_home, class_name, 'FULL', 'classifier_training.json') # <-------- fixed classifier
texts, labels = load_sample(train_data_path, class_name=class_name)
classifier_path = join('classifiers', 'FULL', f'classifier_{class_name}.pkl')
tfidf, classifier_trained = pickle.load(open(classifier_path, 'rb'))
classifier_hyper = classifier_trained.get_params()
print(f'{classifier_hyper=}')
X = tfidf.transform(texts)
print(f'Xtr shape={X.shape}')
pool = LabelledCollection(X, labels)
train, val = pool.split_stratified(train_prop=0.5, random_state=0)
q = KDEyML(LogisticRegression())
classifier_hyper = {'classifier__C':[classifier_hyper['C'], 0.00000001], 'classifier__class_weight':[classifier_hyper['class_weight']]}
quantifier_hyper = {'bandwidth': np.linspace(0.01, 0.2, 20)}
hyper = {**classifier_hyper, **quantifier_hyper}
qp.environ['SAMPLE_SIZE'] = 100
modsel = GridSearchQ(
model=q,
param_grid=hyper,
protocol=UPP(val, sample_size=100),
n_jobs=-1,
error='mrae',
verbose=True
)
modsel.fit(train)
print(class_name)
print(f'{modsel.best_params_}')
print(f'{modsel.best_score_}')

View File

@@ -0,0 +1,125 @@
+import os.path
+import pickle
+from collections import defaultdict
+from pathlib import Path
+
+import numpy as np
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import GridSearchCV
+from sklearn.svm import LinearSVC
+
+import quapy as qp
+from Retrieval.commons import RetrievedSamples, load_sample
+from Retrieval.experiments import methods, benchmark_name
+from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive
+from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
+from quapy.data.base import LabelledCollection
+from os.path import join
+from tqdm import tqdm
+from result_table.src.table import Table
+import matplotlib.pyplot as plt
+
+
+data_home = 'data'
+
+class_mode = 'multiclass'
+
+method_names = [name for name, *other in methods(None, 'continent')]
+
+# Ks = [5, 10, 25, 50, 75, 100, 250, 500, 750, 1000]
+Ks = [50, 100, 500, 1000]
+
+DATA_SIZE = ['10K', '50K', '100K', '500K', '1M', 'FULL']
+
+CLASS_NAME = ['gender', 'continent', 'years_category']
+
+all_results = {}
+
+
+# loads all MRAE results, and returns a dictionary containing the values, which is indexed by:
+# class_name -> data_size -> method_name -> k -> stat -> float
+# where stat is "mean", "std", "max"
+def load_all_results():
+    for class_name in CLASS_NAME:
+
+        all_results[class_name] = {}
+
+        for data_size in DATA_SIZE:
+
+            all_results[class_name][data_size] = {}
+
+            results_home = join('results', class_name, class_mode, data_size)
+
+            all_results[class_name][data_size] = {}
+
+            for method_name in method_names:
+
+                results_path = join(results_home, method_name + '.pkl')
+                try:
+                    results = pickle.load(open(results_path, 'rb'))
+                except Exception as e:
+                    print(f'missing result {results}', e)
+
+                all_results[class_name][data_size][method_name] = {}
+                for k in Ks:
+                    all_results[class_name][data_size][method_name][k] = {}
+                    values = results['mrae']
+                    all_results[class_name][data_size][method_name][k]['mean'] = np.mean(values[k])
+                    all_results[class_name][data_size][method_name][k]['std'] = np.std(values[k])
+                    all_results[class_name][data_size][method_name][k]['max'] = np.max(values[k])
+
+    return all_results
+
+
+results = load_all_results()
+
+# generates the class-independent, size-independent plots for y-axis=MRAE in which:
+# - the x-axis displays the Ks
+
+for class_name in CLASS_NAME:
+    for data_size in DATA_SIZE:
+
+        fig, ax = plt.subplots()
+
+        max_means = []
+        for method_name in method_names:
+            # class_name -> data_size -> method_name -> k -> stat -> float
+            means = [
+                results[class_name][data_size][method_name][k]['mean'] for k in Ks
+            ]
+            stds = [
+                results[class_name][data_size][method_name][k]['std'] for k in Ks
+            ]
+            # max_mean = np.max([
+            #     results[class_name][data_size][method_name][k]['max'] for k in Ks
+            # ])
+            max_means.append(max(means))
+
+            means = np.asarray(means)
+            stds = np.asarray(stds)
+
+            line = ax.plot(Ks, means, 'o-', label=method_name, color=None)
+            color = line[-1].get_color()
+            # ax.fill_between(Ks, means - stds, means + stds, alpha=0.3, color=color)
+
+        ax.set_xlabel('k')
+        ax.set_ylabel('RAE')
+        ax.set_title(f'{class_name} from {data_size}')
+        ax.set_ylim([0, max(max_means)*1.05])
+
+        ax.legend()
+
+        os.makedirs(f'plots/var_k/{class_name}', exist_ok=True)
+        plotpath = f'plots/var_k/{class_name}/{data_size}_mrae.pdf'
+        print(f'saving plot in {plotpath}')
+        plt.savefig(plotpath, bbox_inches='tight')
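
A short access sketch for the nested dictionary returned by load_all_results() (the method name and the printed values are illustrative; it assumes the corresponding result pickles exist):

# class_name -> data_size -> method_name -> k -> stat
results = load_all_results()
mean_mrae = results['gender']['100K']['CC'][100]['mean']
std_mrae = results['gender']['100K']['CC'][100]['std']
print(f"CC on gender@100 (100K pool): {mean_mrae:.4f} +- {std_mrae:.4f}")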

View File

@@ -0,0 +1,91 @@
+import os.path
+import pickle
+from collections import defaultdict
+from pathlib import Path
+
+import numpy as np
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import GridSearchCV
+from sklearn.svm import LinearSVC
+
+import quapy as qp
+from Retrieval.commons import RetrievedSamples, load_sample
+from Retrieval.experiments import methods, benchmark_name
+from Retrieval.plot_mrae_xaxis_k import load_all_results
+from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive
+from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
+from quapy.data.base import LabelledCollection
+from os.path import join
+from tqdm import tqdm
+from result_table.src.table import Table
+import matplotlib.pyplot as plt
+
+
+data_home = 'data'
+
+class_mode = 'multiclass'
+
+method_names = [name for name, *other in methods(None, 'continent')]
+
+Ks = [5, 10, 25, 50, 75, 100, 250, 500, 750, 1000]
+
+DATA_SIZE = ['10K', '50K', '100K', '500K', '1M', 'FULL']
+
+CLASS_NAME = ['gender', 'continent', 'years_category']
+
+all_results = {}
+
+
+# loads all MRAE results, and returns a dictionary containing the values, which is indexed by:
+# class_name -> data_size -> method_name -> k -> stat -> float
+results = load_all_results()
+
+# generates the class-independent, size-independent plots for y-axis=MRAE in which:
+# - the x-axis displays the Ks
+
+for class_name in CLASS_NAME:
+    for k in Ks:
+
+        fig, ax = plt.subplots()
+
+        max_means = []
+        for method_name in method_names:
+            # class_name -> data_size -> method_name -> k -> stat -> float
+            means = [
+                results[class_name][data_size][method_name][k]['mean'] for data_size in DATA_SIZE
+            ]
+            stds = [
+                results[class_name][data_size][method_name][k]['std'] for data_size in DATA_SIZE
+            ]
+            # max_mean = np.max([
+            #     results[class_name][data_size][method_name][k]['max'] for data_size in DATA_SIZE
+            # ])
+            max_means.append(max(means))
+
+            style = 'o-' if method_name != 'CC' else '--'
+            line = ax.plot(DATA_SIZE, means, style, label=method_name, color=None)
+            color = line[-1].get_color()
+            # ax.fill_between(Ks, means - stds, means + stds, alpha=0.3, color=color)
+
+        ax.set_xlabel('training pool size')
+        ax.set_ylabel('RAE')
+        ax.set_title(f'{class_name} from {k=}')
+        ax.set_ylim([0, max(max_means)*1.05])
+
+        ax.legend()
+
+        os.makedirs(f'plots/var_size/{class_name}', exist_ok=True)
+        plotpath = f'plots/var_size/{class_name}/{k}_mrae.pdf'
+        print(f'saving plot in {plotpath}')
+        plt.savefig(plotpath, bbox_inches='tight')

View File

@ -1,102 +0,0 @@
import os.path
import pickle
from collections import defaultdict
from pathlib import Path
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
import quapy as qp
from Retrieval.commons import RetrievedSamples, load_sample
from Retrieval.experiments import methods
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
from quapy.data.base import LabelledCollection
from os.path import join
from tqdm import tqdm
from result_table.src.table import Table
import matplotlib.pyplot as plt
def benchmark_name(class_name, k):
scape_class_name = class_name.replace('_', '\_')
return f'{scape_class_name}@{k}'
data_home = 'data'
HALF=True
exp_posfix = '_half'
method_names = [name for name, *other in methods(None, 'continent')]
Ks = [5, 10, 25, 50, 75, 100, 250, 500, 750, 1000]
for class_name in ['gender', 'continent', 'years_category']: # 'relative_pageviews_category', 'num_sitelinks_category']:
benchmarks = [benchmark_name(class_name, k) for k in Ks]
for data_size in ['10K', '50K', '100K', '500K', '1M', 'FULL']:
fig, ax = plt.subplots()
class_home = join(data_home, class_name, data_size)
test_rankings_path = join(data_home, 'testRanking_Results.json')
results_home = join('results'+exp_posfix, class_name, data_size)
max_mean = None
for method_name in method_names:
results_path = join(results_home, method_name + '.pkl')
try:
results = pickle.load(open(results_path, 'rb'))
except Exception as e:
print(f'missing result {results}', e)
for err in ['mrae']:
means, stds = [], []
for k in Ks:
values = results[err][k]
means.append(np.mean(values))
stds.append(np.std(values))
means = np.asarray(means)
stds = np.asarray(stds) #/ np.sqrt(len(stds))
if max_mean is None:
max_mean = np.max(means)
else:
max_mean = max(max_mean, np.max(means))
line = ax.plot(Ks, means, 'o-', label=method_name, color=None)
color = line[-1].get_color()
# ax.fill_between(Ks, means - stds, means + stds, alpha=0.3, color=color)
ax.set_xlabel('k')
ax.set_ylabel(err.upper())
ax.set_title(f'{class_name} from {data_size}')
ax.set_ylim([0, max_mean])
ax.legend()
# plt.show()
os.makedirs(f'plots/results/{class_name}', exist_ok=True)
plotpath = f'plots/results/{class_name}/{data_size}_{err}.pdf'
print(f'saving plot in {plotpath}')
plt.savefig(plotpath)

View File

@@ -158,8 +158,8 @@ def kld(prevs, prevs_hat, eps=None)
     :return: Kullback-Leibler divergence between the two distributions
     """
     eps = __check_eps(eps)
-    smooth_prevs = prevs + eps
-    smooth_prevs_hat = prevs_hat + eps
+    smooth_prevs = smooth(prevs, eps)
+    smooth_prevs_hat = smooth(prevs_hat, eps)
     return (smooth_prevs*np.log(smooth_prevs/smooth_prevs_hat)).sum(axis=-1)
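
This is the bug fix referred to in the commit message. A sketch of why it matters, assuming smooth(p, eps) performs additive smoothing followed by renormalization, roughly (p + eps) / (1 + eps * n) for n classes: adding eps without renormalizing leaves vectors that sum to 1 + n*eps, which scales the divergence by exactly that factor, and since the experiments set eps = 1/(2k) the scaling changes with the cut-off k. The prevalence vectors and eps below are illustrative.

import numpy as np

def smooth(prevs, eps):
    # assumed behaviour: additive smoothing + renormalization
    n = prevs.shape[-1]
    return (prevs + eps) / (eps * n + 1)

p = np.asarray([0.7, 0.3])   # true prevalence (illustrative)
q = np.asarray([0.5, 0.5])   # estimated prevalence (illustrative)
eps = 1. / (2 * 50)          # the eps used at cut-off k=50

old = ((p + eps) * np.log((p + eps) / (q + eps))).sum()                 # previous code
new = (smooth(p, eps) * np.log(smooth(p, eps) / smooth(q, eps))).sum()  # fixed code

print(old / new, 1 + len(p) * eps)   # ratio equals 1 + n*eps = 1.02, a k-dependent inflation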