QuaPy/Retrieval/experiments.py

import os.path
import pickle
from collections import defaultdict
from pathlib import Path
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.base import clone
from sklearn.svm import LinearSVC
from scipy.special import rel_entr as KLD
import quapy as qp
import quapy.functional as F
from Retrieval.commons import RetrievedSamples, load_sample, binarize_labels
from Retrieval.methods import M3rND_ModelB, M3rND_ModelD, AbstractM3rND
from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
from quapy.data.base import LabelledCollection
from scipy.sparse import vstack
from os.path import join
from tqdm import tqdm
from result_table.src.table import Table
"""
In this sixth experiment, we have a collection C of >6M documents.
We split C in two equally-sized pools TrPool, TePool
I have randomly split the collection in 50% train and 50% split. In each split we have approx. 3.25 million documents.
We have 5 categories we can evaluate over: Continent, Years_Category, Num_Site_Links, Relative Pageviews and Gender.
From the training set I have created smaller subsets for each category:
100K, 500K, 1M and FULL (3.25M)
For each category and subset, I have created a training set called: "classifier_training.json". This is the "base" training set for the classifier. In this set we have 500 documents per group in a category. (For example: Male 500, Female 500, Unknown 500). Let me know if you think we need more.
To "bias" the quantifier towards a query, I have executed the queries (97) on the different training sets and retrieved the 200 most relevant documents per group.
For example: (Male 200, Female 200, Unknown 200)
Sometimes this is infeasible, we should probably discuss this at some point.
You can find the results for every query in a file named:
"training_Query-[QID]Sample-200SPLIT.json"
Test:
To evaluate our approach, I have executed the queries on the test split. You can find the results for all 97 queries up till k=1000 in this file.
testRanking_Results.json
"""
def methods(classifier, class_name, binarize=False):
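    """
    Generator of (method name, quantifier) pairs to be evaluated; entries that are commented out are
    excluded from the current run. The KDEy-ML bandwidth is set per category (kde_param). When
    binarize=True, the M3rND baselines (ModelB, ModelD, and their '+' variants, which differ only in how
    their confusion matrix is obtained, see run_experiment) are also yielded.
    """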
    kde_param = {
        'continent': 0.01,
        'gender': 0.03,
        'years_category': 0.03
    }

    yield ('Naive', Naive())
    yield ('NaiveQuery', Naive())
    yield ('CC', ClassifyAndCount(classifier))
    # yield ('PCC', PCC(classifier))
    # yield ('ACC', ACC(classifier, val_split=5, n_jobs=-1))
    yield ('PACC', PACC(classifier, val_split=5, n_jobs=-1))
    # yield ('EMQ', EMQ(classifier, exact_train_prev=True))
    # yield ('EMQ-Platt', EMQ(classifier, exact_train_prev=True, recalib='platt'))
    # yield ('EMQh', EMQ(classifier, exact_train_prev=False))
    # yield ('EMQ-BCTS', EMQ(classifier, exact_train_prev=True, recalib='bcts'))
    # yield ('EMQ-TS', EMQ(classifier, exact_train_prev=False, recalib='ts'))
    # yield ('EMQ-NBVS', EMQ(classifier, exact_train_prev=False, recalib='nbvs'))
    # yield ('EMQ-VS', EMQ(classifier, exact_train_prev=False, recalib='vs'))
    yield ('KDEy-ML', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=kde_param[class_name]))
    # yield ('KDE01', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.01))
    if binarize:
        yield ('M3b', M3rND_ModelB(classifier))
        yield ('M3b+', M3rND_ModelB(classifier))
        yield ('M3d', M3rND_ModelD(classifier))
        yield ('M3d+', M3rND_ModelD(classifier))

def train_classifier_fn(train_path):
"""
Trains a classifier. To do so, it loads the training set, transforms it into a tfidf representation.
The classifier is Logistic Regression, with hyperparameters C (range [0.001, 0.01, ..., 1000]) and
class_weight (range {'balanced', None}) optimized via 5FCV.
:return: the tfidf-vectorizer and the classifier trained
"""
    texts, labels = load_sample(train_path, class_name=class_name)

    if BINARIZE:
        labels = binarize_labels(labels, positive_class=protected_group[class_name])

    tfidf = TfidfVectorizer(sublinear_tf=True, min_df=3)
    Xtr = tfidf.fit_transform(texts)
    print(f'Xtr shape={Xtr.shape}')

    print('training classifier...', end='')
    classifier = LogisticRegression(max_iter=5000)
    modsel = GridSearchCV(
        classifier,
        param_grid={'C': np.logspace(-4, 4, 9), 'class_weight': ['balanced', None]},
        n_jobs=-1,
        cv=5
    )
    modsel.fit(Xtr, labels)
    classifier = modsel.best_estimator_
    classifier_acc = modsel.best_score_
    best_params = modsel.best_params_
    print(f'[done] best-params={best_params} got {classifier_acc:.4f} score')

    print('generating cross-val predictions for M3')
    predictions = cross_val_predict(clone(classifier), Xtr, labels, cv=10, n_jobs=-1, verbose=10)
    conf_matrix = confusion_matrix(labels, predictions, labels=classifier.classes_)

    training = LabelledCollection(Xtr, labels)
    print('training classes:', training.classes_)
    print('training prevalence:', training.prevalence())

    return tfidf, classifier, conf_matrix

def reduceAtK(data: LabelledCollection, k):
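    """
    Keeps only the first k instances of the collection; assuming the instances preserve the ranking order,
    this corresponds to the top-k retrieved documents.
    """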
    # if k > len(data):
    #     print(f'[warning] {k=}>{len(data)=}')
    X, y = data.Xy
    X = X[:k]
    y = y[:k]
    return LabelledCollection(X, y, classes=data.classes_)

def benchmark_name(class_name, k):
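    # underscores are escaped so the name can be safely used as a benchmark label in the LaTeX tables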
    scape_class_name = class_name.replace('_', r'\_')
    return f'{scape_class_name}@{k}'

def run_experiment():
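    """
    Evaluates the current (class_name, data_size, method) configuration over all queries: for each query,
    the quantifier is fit on the query-specific training sample (where applicable), class prevalences of the
    top-k retrieved documents are estimated for every k in Ks, and MAE/MRAE at each k are recorded together
    with the aggregated rKL (and, in binary mode, rND) estimation errors. Relies on module-level names
    (experiment_prot, classifier, conf_matrix, method, method_name, Ks, BINARIZE) set in the __main__ block.
    """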
    results = {
        'mae': {k: [] for k in Ks},
        'mrae': {k: [] for k in Ks},
        'rKL_error': [],
        'rND_error': []
    }

    pbar = tqdm(experiment_prot(), total=experiment_prot.total())
    for train, test, q_rel_prevs in pbar:
        Xtr, ytr, score_tr = train
        Xte, yte, score_te = test

        n = len(ytr) // 2
        train_col = LabelledCollection(Xtr[:n], ytr[:n], classes=classifier.classes_)

        if method_name not in ['Naive', 'NaiveQuery', 'M3b', 'M3b+', 'M3d', 'M3d+']:
            method.fit(train_col, val_split=train_col, fit_classifier=False)
        elif method_name == 'Naive':
            method.fit(train_col)

        test_col = LabelledCollection(Xte, yte, classes=classifier.classes_)
        rKL_estim, rKL_true = [], []
        rND_estim, rND_true = [], []
        for k in Ks:
            test_k = reduceAtK(test_col, k)
            if method_name == 'NaiveQuery':
                train_k = reduceAtK(train_col, k)
                method.fit(train_k)

            estim_prev = method.quantify(test_k.instances)

            # epsilon value for prevalence smoothing
            eps = (1. / (2. * k))

            # error metrics
            test_k_prev = test_k.prevalence()
            mae = qp.error.mae(test_k_prev, estim_prev)
            mrae = qp.error.mrae(test_k_prev, estim_prev, eps=eps)
            rKL_at_k_estim = qp.error.kld(estim_prev, q_rel_prevs, eps=eps)
            rKL_at_k_true = qp.error.kld(test_k_prev, q_rel_prevs, eps=eps)

            if BINARIZE:
                # [1] is the index of the minority or historically disadvantaged group
                rND_at_k_estim = np.abs(estim_prev[1] - q_rel_prevs[1])
                rND_at_k_true = np.abs(test_k_prev[1] - q_rel_prevs[1])

            # collect results
            results['mae'][k].append(mae)
            results['mrae'][k].append(mrae)
            rKL_estim.append(rKL_at_k_estim)
            rKL_true.append(rKL_at_k_true)
            if BINARIZE:
                rND_estim.append(rND_at_k_estim)
                rND_true.append(rND_at_k_true)

        # aggregate fairness metrics
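        # rKL and rND are aggregated over the cut-points with a logarithmic discount:
        #   rM = (1/Z) * sum_{k in Ks} rM@k / log2(k),   with   Z = sum_{k in Ks} 1 / log2(k)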
        def aggregate(rMs, Ks, Z=1):
            return (1 / Z) * sum((1. / np.log2(k)) * v for v, k in zip(rMs, Ks))

        Z = sum((1. / np.log2(k)) for k in Ks)
        rKL_estim = aggregate(rKL_estim, Ks, Z)
        rKL_true = aggregate(rKL_true, Ks, Z)
        rKL_error = np.abs(rKL_true - rKL_estim)
        results['rKL_error'].append(rKL_error)

        if BINARIZE:
            rND_estim = aggregate(rND_estim, Ks, Z)
            rND_true = aggregate(rND_true, Ks, Z)
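            # the '+' variants recompute the confusion matrix on the query-specific training sample,
            # whereas the plain variants reuse the global cross-validated confusion matrix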
            if isinstance(method, AbstractM3rND):
                if method_name.endswith('+'):
                    conf_matrix_ = method.get_confusion_matrix(*train_col.Xy)
                else:
                    conf_matrix_ = conf_matrix.copy()
                rND_estim = method.fair_measure_correction(rND_estim, conf_matrix_)
            rND_error = np.abs(rND_true - rND_estim)
            results['rND_error'].append(rND_error)

        pbar.set_description(f'{method_name}')

    return results

# Ks = [5, 10, 25, 50, 75, 100, 250, 500, 750, 1000]
Ks = [50, 100, 500, 1000]
CLASS_NAMES = ['years_category', 'continent', 'gender'] # ['relative_pageviews_category', 'num_sitelinks_category']:
DATA_SIZES = ['10K', '50K', '100K', '500K', '1M', 'FULL']
data_home = 'data'
protected_group = {
    'gender': 'Female',
    'continent': 'Africa',
    'years_category': 'Pre-1900s',
}

if __name__ == '__main__':
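    # Experimental grid: for each class mode (binary vs. multiclass), category, and training-set size,
    # the classifier is trained (or loaded from cache), every quantification method is run over all
    # queries (results are cached too), and the outcomes are compiled into LaTeX tables.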
    tables_RND, tables_DKL = [], []
    for class_mode in ['binary', 'multiclass']:
        BINARIZE = (class_mode == 'binary')
        method_names = [name for name, *other in methods(None, 'continent', BINARIZE)]

        for class_name in CLASS_NAMES:
            tables_mae, tables_mrae = [], []

            benchmarks_size = [benchmark_name(class_name, s) for s in DATA_SIZES]
            table_DKL = Table(name=f'rKL-{class_name}', benchmarks=benchmarks_size, methods=method_names)
            table_RND = Table(name=f'rND-{class_name}', benchmarks=benchmarks_size, methods=method_names)

            for data_size in DATA_SIZES:
                print(class_name, class_mode, data_size)

                benchmarks_k = [benchmark_name(class_name, k) for k in Ks]
                # table_mae = Table(name=f'{class_name}-{data_size}-mae', benchmarks=benchmarks_k, methods=method_names)
                table_mrae = Table(name=f'{class_name}-{data_size}-mrae', benchmarks=benchmarks_k, methods=method_names)
                # tables_mae.append(table_mae)
                tables_mrae.append(table_mrae)

                # sets all paths
                class_home = join(data_home, class_name, data_size)
                train_data_path = join(data_home, class_name, 'FULL', 'classifier_training.json')  # <----- fixed classifier
                classifier_path = join('classifiers', 'FULL', f'classifier_{class_name}_{class_mode}.pkl')
                test_rankings_path = join(data_home, 'testRanking_Results.json')
                test_query_prevs_path = join(data_home, 'prevelance_vectors_judged_docs.json')
                results_home = join('results', class_name, class_mode, data_size)
                positive_class = protected_group[class_name] if BINARIZE else None

                # instantiates the classifier (trains it the first time, loads it in the subsequent executions)
                tfidf, classifier, conf_matrix \
                    = qp.util.pickled_resource(classifier_path, train_classifier_fn, train_data_path)

                experiment_prot = RetrievedSamples(
                    class_home,
                    test_rankings_path,
                    test_query_prevs_path,
                    vectorizer=tfidf,
                    class_name=class_name,
                    positive_class=positive_class,
                    classes=classifier.classes_
                )

                for method_name, method in methods(classifier, class_name, BINARIZE):
                    results_path = join(results_home, method_name + '.pkl')
                    results = qp.util.pickled_resource(results_path, run_experiment)

                    # compose the tables
                    for k in Ks:
                        # table_mae.add(benchmark=benchmark_name(class_name, k), method=method_name, v=results['mae'][k])
                        table_mrae.add(benchmark=benchmark_name(class_name, k), method=method_name, v=results['mrae'][k])
                    table_DKL.add(benchmark=benchmark_name(class_name, data_size), method=method_name, v=results['rKL_error'])
                    if BINARIZE:
                        table_RND.add(benchmark=benchmark_name(class_name, data_size), method=method_name, v=results['rND_error'])

            tables = ([table_RND] + tables_mrae) if BINARIZE else ([table_DKL] + tables_mrae)
            Table.LatexPDF(f'./latex/{class_mode}/{class_name}.pdf', tables=tables)

            if BINARIZE:
                tables_RND.append(table_RND)
            else:
                tables_DKL.append(table_DKL)

    Table.LatexPDF(f'./latex/global/main.pdf', tables=tables_RND + tables_DKL, dedicated_pages=False)