Compare commits
26 Commits
@@ -143,8 +143,7 @@ LeQua2022
 MultiLabel
 NewMethods
 Ordinal
-Retrieval
-eDiscovery
+Archived/eDiscovery
 poster-cikm
 slides-cikm
 slides-short-cikm

@@ -153,9 +152,4 @@ svm_perf_quantification/svm_struct
 svm_perf_quantification/svm_light
 TweetSentQuant
 *.png
@@ -1,3 +1,9 @@
+Change Log 0.1.9
+----------------
+
+<...>
+
+
 Change Log 0.1.8
 ----------------
 
@@ -0,0 +1,84 @@
import itertools
import os.path
import pickle
from collections import defaultdict
from pathlib import Path

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC

import quapy as qp
from Retrieval.commons import RetrievedSamples, load_sample
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
from quapy.data.base import LabelledCollection

from os.path import join
from tqdm import tqdm

from result_table.src.table import Table

"""

"""

data_home = 'data'

datasets = ['continent', 'gender', 'years_category', 'relative_pageviews_category', 'num_sitelinks_category']

param_grid = {'C': np.logspace(-4, 4, 9), 'class_weight': ['balanced', None]}

classifiers = [
    ('LR', LogisticRegression(max_iter=5000), param_grid),
    ('SVM', LinearSVC(), param_grid)
]


def benchmark_name(class_name):
    return class_name.replace('_', '\\_')


table = Table(name='accuracy', benchmarks=[benchmark_name(d) for d in datasets])
table.format.show_std = False
table.format.stat_test = None
table.format.lower_is_better = False
table.format.color = False
table.format.remove_zero = True
table.format.style = 'rules'

for class_name, (cls_name, cls, grid) in itertools.product(datasets, classifiers):

    train_data_path = join(data_home, class_name, 'FULL', 'classifier_training.json')  # <-------- fixed classifier

    texts, labels = load_sample(train_data_path, class_name=class_name)

    tfidf = TfidfVectorizer(sublinear_tf=True, min_df=3)
    Xtr = tfidf.fit_transform(texts)
    print(f'Xtr shape={Xtr.shape}')

    print('training classifier...', end='')
    classifier = GridSearchCV(
        cls,
        param_grid=grid,
        n_jobs=-1,
        cv=5,
        verbose=10
    )
    classifier.fit(Xtr, labels)
    classifier_acc = classifier.best_score_
    # fix: 'mean_test_score' is already the cross-fold mean (a scalar); the per-fold
    # scores of the best configuration live in the 'split{i}_test_score' entries
    classifier_acc_per_fold = np.asarray(
        [classifier.cv_results_[f'split{i}_test_score'][classifier.best_index_] for i in range(5)]
    )

    print(f'[done] best-params={classifier.best_params_} got {classifier_acc:.4f} score, per fold {classifier_acc_per_fold}')

    table.add(benchmark=benchmark_name(class_name), method=cls_name, v=classifier_acc_per_fold)

Table.LatexPDF('./latex/classifier_Acc.pdf', tables=[table])

@@ -0,0 +1,153 @@
import pandas as pd
import numpy as np
from glob import glob
from os.path import join

import quapy.functional as F


Ks = [50, 100, 500, 1000]

CLASS_NAMES = ['continent', 'gender', 'years_category']  # ['relative_pageviews_category', 'num_sitelinks_category']

DATA_SIZES = ['10K', '50K', '100K', '500K', '1M', 'FULL']

protected_group = {
    'gender': 'Female',
    'continent': 'Africa',
    'years_category': 'Pre-1900s',
}


def load_sample(path, class_name):
    """
    Loads a sample json as a dataframe and returns text and labels for
    the given class_name

    :param path: path to a json file
    :param class_name: string representing the target class
    :return: texts, labels for class_name
    """
    df = pd.read_json(path)
    text = df.text.values
    labels = df[class_name].values
    return text, labels


def binarize_labels(labels, positive_class=None):
    if positive_class is not None:
        protected_labels = labels == positive_class
        labels[protected_labels] = 1
        labels[~protected_labels] = 0
        labels = labels.astype(int)
    return labels
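
# note: binarize_labels mutates `labels` in place and expects a numpy array (as
# returned by df[class_name].values); the protected class maps to 1, all others to 0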


class RetrievedSamples:

    def __init__(self,
                 class_home: str,
                 test_rankings_path: str,
                 test_query_prevs_path: str,
                 vectorizer,
                 class_name,
                 positive_class=None,
                 classes=None,
                 ):
        self.class_home = class_home
        self.test_rankings_df = pd.read_json(test_rankings_path)
        self.test_query_prevs_df = pd.read_json(test_query_prevs_path)
        self.vectorizer = vectorizer
        self.class_name = class_name
        self.positive_class = positive_class
        self.classes = classes

    def get_text_label_score(self, df, filter_rank=1000):
        df = df[df['rank'] < filter_rank]

        class_name = self.class_name
        vectorizer = self.vectorizer
        filter_classes = self.classes

        text = df.text.values
        labels = df[class_name].values
        rel_score = df.score.values

        labels = binarize_labels(labels, self.positive_class)

        if filter_classes is not None:
            idx = np.isin(labels, filter_classes)
            text = text[idx]
            labels = labels[idx]
            rel_score = rel_score[idx]

        if vectorizer is not None:
            text = vectorizer.transform(text)

        order = np.argsort(-rel_score)
        return text[order], labels[order], rel_score[order]

    def __call__(self):
        tests_df = self.test_rankings_df
        class_name = self.class_name

        for file in self._list_queries():

            # loads the training sample
            train_df = pd.read_json(file)
            if len(train_df) == 0:
                print('empty dataframe: ', file)
            else:
                Xtr, ytr, score_tr = self.get_text_label_score(train_df)

                # loads the test sample
                query_id = self._get_query_id_from_path(file)
                sel_df = tests_df[tests_df.qid == query_id]
                Xte, yte, score_te = self.get_text_label_score(sel_df)

                # gets the prevalence of all judged relevant documents for the query
                df = self.test_query_prevs_df
                q_rel_prevs = df.loc[df.id == query_id][class_name + '_proportions'].values[0]

                if self.positive_class is not None:
                    if self.positive_class not in q_rel_prevs:
                        print(f'positive class {self.positive_class} not found in the query; skipping')
                        continue
                    q_rel_prevs = F.as_binary_prevalence(q_rel_prevs[self.positive_class])
                else:
                    q_rel_prevs = np.asarray([q_rel_prevs.get(class_i, 0.) for class_i in self.classes])

                yield (Xtr, ytr, score_tr), (Xte, yte, score_te), q_rel_prevs

    def _list_queries(self):
        return sorted(glob(join(self.class_home, 'training_Query*200SPLIT.json')))

    # def _get_test_sample(self, query_id, max_lines=-1):
    #     df = self.test_rankings_df
    #     sel_df = df[df.qid==int(query_id)]
    #     return get_text_label_score(sel_df)
    #     texts = sel_df.text.values
    #     try:
    #         labels = sel_df[self.class_name].values
    #     except KeyError as e:
    #         print(f'error: key {self.class_name} not found in test rankings')
    #         raise e
    #     if max_lines > 0 and len(texts) > max_lines:
    #         ranks = sel_df.rank.values
    #         idx = np.argsort(ranks)[:max_lines]
    #         texts = np.asarray(texts)[idx]
    #         labels = np.asarray(labels)[idx]
    #     return texts, labels

    def total(self):
        return len(self._list_queries())

    def _get_query_id_from_path(self, path):
        prefix = 'training_Query-'
        posfix = 'Sample-200SPLIT'
        qid = path
        qid = qid[:qid.index(posfix)]
        qid = qid[qid.index(prefix) + len(prefix):]
        qid = int(qid)
        return qid
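
# A minimal usage sketch (hypothetical paths, matching how experiments.py calls it):
#
#   prot = RetrievedSamples(
#       class_home='data/gender/100K',
#       test_rankings_path='data/testRanking_Results.json',
#       test_query_prevs_path='data/prevelance_vectors_judged_docs.json',
#       vectorizer=tfidf, class_name='gender', classes=classifier.classes_)
#   for (Xtr, ytr, score_tr), (Xte, yte, score_te), q_rel_prevs in prot():
#       ...  # one (train sample, test sample, judged-relevant prevalence) triple per query
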
@@ -0,0 +1,182 @@
from collections import defaultdict

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC

import quapy as qp
import quapy.functional as F
from Retrieval.commons import RetrievedSamples, load_txt_sample, load_json_sample
from Retrieval.tabular import Table
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
from quapy.protocol import AbstractProtocol
from quapy.data.base import LabelledCollection

from glob import glob
from os.path import join
from tqdm import tqdm

"""
In this fifth experiment, we have pairs of (Li,Ui) with Li a training set and Ui a test set, as
in the fourth experiment, and the fairness groups are defined upon geographic info as in the fourth case.
As in the fourth, the data Li and Ui have been drawn by retrieving query-related documents from
a pool of the same size. Unlike the fourth experiment, here the training queries are

For now, 1000 documents in training and 100 in test.
It seems there is now very little shift.
"""


def cls(classifier_trained=None):
    if classifier_trained is None:
        # return LinearSVC()
        return LogisticRegression()
    else:
        return classifier_trained


def methods(classifier_trained=None):
    yield ('CC', ClassifyAndCount(cls(classifier_trained)))
    yield ('PCC', PCC(cls(classifier_trained)))
    yield ('ACC', ACC(cls(classifier_trained), val_split=5, n_jobs=-1))
    yield ('PACC', PACC(cls(classifier_trained), val_split=5, n_jobs=-1))
    yield ('EMQ', EMQ(cls(classifier_trained), exact_train_prev=True))
    yield ('EMQh', EMQ(cls(classifier_trained), exact_train_prev=False))
    # yield ('EMQ-BCTS', EMQ(cls(classifier_trained), exact_train_prev=False, recalib='bcts'))
    # yield ('EMQ-TS', EMQ(cls(classifier_trained), exact_train_prev=False, recalib='ts'))
    # yield ('EMQ-NBVS', EMQ(cls(classifier_trained), exact_train_prev=False, recalib='nbvs'))
    # yield ('EMQ-VS', EMQ(cls(classifier_trained), exact_train_prev=False, recalib='vs'))
    # yield ('KDE001', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.001))
    # yield ('KDE005', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.005))  # <-- wow!
    # yield ('KDE01', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.01))
    # yield ('KDE02', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.02))
    # yield ('KDE03', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.03))
    # yield ('KDE05', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.05))
    yield ('KDE07', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.07))
    # yield ('KDE10', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.10))
    yield ('MLPE', MaximumLikelihoodPrevalenceEstimation())


def train_classifier():
    tfidf = TfidfVectorizer(sublinear_tf=True, min_df=10)
    training = LabelledCollection.load(train_path, loader_func=load_json_sample, class_name=CLASS_NAME)

    if REDUCE_TR > 0 and len(training) > REDUCE_TR:
        print('Reducing the number of documents in the training to', REDUCE_TR)
        training = training.sampling(REDUCE_TR, *training.prevalence())

    Xtr, ytr = training.Xy
    Xtr = tfidf.fit_transform(Xtr)
    print('L orig shape = ', Xtr.shape)

    training = LabelledCollection(Xtr, ytr)

    print('training classifier')
    classifier_trained = LogisticRegression()
    classifier_trained = GridSearchCV(classifier_trained,
                                      param_grid={'C': np.logspace(-3, 3, 7), 'class_weight': ['balanced', None]},
                                      n_jobs=-1, cv=5)
    classifier_trained.fit(Xtr, ytr)
    classifier_trained = classifier_trained.best_estimator_
    trained = True
    print('[Done!]')

    classes = training.classes_

    print('training classes:', classes)
    print('training prevalence:', training.prevalence())

    return tfidf, classifier_trained


def reduceAtK(data: LabelledCollection, k):
    X, y = data.Xy
    X = X[:k]
    y = y[:k]
    return LabelledCollection(X, y, classes=data.classes_)


RANK_AT_K = -1
REDUCE_TR = 50000
qp.environ['SAMPLE_SIZE'] = RANK_AT_K


def scape_latex(string):
    return string.replace('_', '\\_')


Ks = [10, 50, 100, 250, 500, 1000, 2000]
# Ks = [500]

for CLASS_NAME in ['gender_category']:  # 'years_category']: #['continent', 'first_letter_category']: #, 'gender', 'gender_category', 'occupations', 'source_countries', 'source_subcont_regions', 'years_category', 'relative_pageviews_category']:

    data_path = './' + CLASS_NAME

    if CLASS_NAME in ['years_category', 'continent', 'gender_category']:
        train_path = join(data_path, 'train500PerGroup.json')
    else:
        train_path = join(data_path, 'train3000samples.json')

    tfidf, classifier_trained = qp.util.pickled_resource(f'classifier_{CLASS_NAME}.pkl', train_classifier)
    trained = True

    experiment_prot = RetrievedSamples(data_path,
                                       load_fn=load_json_sample,
                                       vectorizer=tfidf,
                                       max_train_lines=None,
                                       max_test_lines=RANK_AT_K, classes=classifier_trained.classes_, class_name=CLASS_NAME)

    method_names = [name for name, *other in methods()]
    benchmarks = [f'{scape_latex(CLASS_NAME)}@{k}' for k in Ks]
    table_mae = Table(benchmarks, method_names, color_mode='global')
    table_mrae = Table(benchmarks, method_names, color_mode='global')

    for method_name, quantifier in methods(classifier_trained):
        # print('Starting with method=', method_name)

        mae_errors = {k: [] for k in Ks}
        mrae_errors = {k: [] for k in Ks}

        pbar = tqdm(experiment_prot(), total=49)
        for train, test in pbar:
            if train is not None:
                try:
                    if trained and method_name != 'MLPE':
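                        # the pre-trained classifier is reused (fit_classifier=False);
                        # only the aggregation step is re-estimated, using the
                        # query-specific training sample itself as validation data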
                        quantifier.fit(train, val_split=train, fit_classifier=False)
                    else:
                        quantifier.fit(train)

                    for k in Ks:
                        test_k = reduceAtK(test, k)
                        estim_prev = quantifier.quantify(test_k.instances)

                        mae_errors[k].append(qp.error.mae(test_k.prevalence(), estim_prev))
                        mrae_errors[k].append(qp.error.mrae(test_k.prevalence(), estim_prev, eps=(1. / (2 * k))))

                except Exception as e:
                    print(f'wow, something happened here! skipping; {e}')
            else:
                print('skipping one!')

            # pbar.set_description(f'{method_name}\tmae={np.mean(mae_errors):.4f}\tmrae={np.mean(mrae_errors):.4f}')
            pbar.set_description(f'{method_name}')

        for k in Ks:
            table_mae.add(benchmark=f'{scape_latex(CLASS_NAME)}@{k}', method=method_name, values=mae_errors[k])
            table_mrae.add(benchmark=f'{scape_latex(CLASS_NAME)}@{k}', method=method_name, values=mrae_errors[k])

    table_mae.latexPDF('./latex', f'table_{CLASS_NAME}_mae.tex')
    table_mrae.latexPDF('./latex', f'table_{CLASS_NAME}_mrae.tex')

@@ -0,0 +1,161 @@
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC

import quapy as qp
import quapy.functional as F
from Retrieval.commons import RetrievedSamples, load_txt_sample
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
from quapy.protocol import AbstractProtocol
from quapy.data.base import LabelledCollection

from glob import glob
from os.path import join
from tqdm import tqdm

"""
In this fourth experiment, we have pairs of (Li,Ui) with Li a training set and Ui a test set, as
in the third experiment, and the fairness groups are defined upon geographic info as in the third case.
The difference here is that the data Li and Ui have been drawn by retrieving query-related documents from
a pool of the same size.

For now, 1000 documents in training and 100 in test.
It seems there is now very little shift.
"""

def cls(classifier_trained=None):
    if classifier_trained is None:
        # return LinearSVC()
        return LogisticRegression()
    else:
        return classifier_trained


def methods(classifier_trained=None):
    yield ('CC', ClassifyAndCount(cls(classifier_trained)))
    yield ('PACC', PACC(cls(classifier_trained), val_split=5, n_jobs=-1))
    yield ('EMQ', EMQ(cls(classifier_trained), exact_train_prev=True))
    yield ('EMQh', EMQ(cls(classifier_trained), exact_train_prev=False))
    yield ('EMQ-BCTS', EMQ(cls(classifier_trained), exact_train_prev=False, recalib='bcts'))
    yield ('EMQ-TS', EMQ(cls(classifier_trained), exact_train_prev=False, recalib='ts'))
    yield ('EMQ-NBVS', EMQ(cls(classifier_trained), exact_train_prev=False, recalib='nbvs'))
    # yield ('EMQ-VS', EMQ(cls(classifier_trained), exact_train_prev=False, recalib='vs'))
    yield ('PCC', PCC(cls(classifier_trained)))
    yield ('ACC', ACC(cls(classifier_trained), val_split=5, n_jobs=-1))
    yield ('KDE001', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.001))
    yield ('KDE005', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.005))  # <-- wow!
    yield ('KDE01', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.01))
    yield ('KDE02', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.02))
    yield ('KDE03', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.03))
    yield ('KDE05', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.05))
    yield ('KDE07', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.07))
    yield ('KDE10', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.10))
    yield ('MLPE', MaximumLikelihoodPrevalenceEstimation())


def train_classifier():
    tfidf = TfidfVectorizer(sublinear_tf=True, min_df=10)
    training = LabelledCollection.load(train_path, loader_func=load_txt_sample, verbose=True, parse_columns=False)

    if REDUCE_TR > 0:
        print('Reducing the number of documents in the training to', REDUCE_TR)
        training = training.sampling(REDUCE_TR, *training.prevalence())

    Xtr, ytr = training.Xy
    Xtr = tfidf.fit_transform(Xtr)
    print('L orig shape = ', Xtr.shape)

    training = LabelledCollection(Xtr, ytr)

    print('training classifier')
    classifier_trained = LogisticRegression()
    classifier_trained = GridSearchCV(classifier_trained,
                                      param_grid={'C': np.logspace(-3, 3, 7), 'class_weight': ['balanced', None]},
                                      n_jobs=-1, cv=5)
    classifier_trained.fit(Xtr, ytr)
    classifier_trained = classifier_trained.best_estimator_
    trained = True
    print('[Done!]')

    classes = training.classes_

    print('training classes:', classes)
    print('training prevalence:', training.prevalence())

    return tfidf, classifier_trained


RANK_AT_K = 1000
REDUCE_TR = 50000
qp.environ['SAMPLE_SIZE'] = RANK_AT_K

data_path = './50_50_split_trec'
train_path = join(data_path, 'train_50_50_continent.txt')

tfidf, classifier_trained = qp.util.pickled_resource('classifier.pkl', train_classifier)
trained = True

experiment_prot = RetrievedSamples(data_path,
                                   load_fn=load_txt_sample,
                                   vectorizer=tfidf,
                                   max_train_lines=None,
                                   max_test_lines=RANK_AT_K, classes=classifier_trained.classes_)

result_mae_dict = {}
result_mrae_dict = {}
for method_name, quantifier in methods(classifier_trained):
    # print('Starting with method=', method_name)

    mae_errors = []
    mrae_errors = []
    pbar = tqdm(experiment_prot(), total=49)
    for train, test in pbar:
        if train is not None:
            try:

                # print(train.prevalence())
                # print(test.prevalence())
                if trained and method_name != 'MLPE':
                    quantifier.fit(train, val_split=train, fit_classifier=False)
                else:
                    quantifier.fit(train)
                estim_prev = quantifier.quantify(test.instances)

                mae = qp.error.mae(test.prevalence(), estim_prev)
                mae_errors.append(mae)

                mrae = qp.error.mrae(test.prevalence(), estim_prev)
                mrae_errors.append(mrae)

                # print()
                # print('Training prevalence:', F.strprev(train.prevalence()), 'shape', train.X.shape)
                # print('Test prevalence:', F.strprev(test.prevalence()), 'shape', test.X.shape)
                # print('Estim prevalence:', F.strprev(estim_prev))

            except Exception as e:
                print(f'wow, something happened here! skipping; {e}')
        else:
            print('skipping one!')

        pbar.set_description(f'{method_name}\tmae={np.mean(mae_errors):.4f}\tmrae={np.mean(mrae_errors):.4f}')
    print()
    result_mae_dict[method_name] = np.mean(mae_errors)
    result_mrae_dict[method_name] = np.mean(mrae_errors)

print('Results\n' + ('-' * 100))
for method_name in result_mae_dict.keys():
    MAE = result_mae_dict[method_name]
    MRAE = result_mrae_dict[method_name]
    print(f'{method_name}\t{MAE=:.5f}\t{MRAE=:.5f}')

@@ -0,0 +1,98 @@
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

import quapy as qp
import quapy.functional as F
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC
from quapy.protocol import AbstractProtocol
from quapy.data.base import LabelledCollection

from glob import glob
from os.path import join

"""
This was the very first experiment: one big training set and many test rankings produced according to some queries.
The quantification methods did not seem to work: the more sophisticated the method, the worse it performed.
This is a clear indication that the PPS assumptions do not hold.
Actually, while the training set could be some iid sample from a distribution L and every test set
an iid sample from a distribution U, it is pretty clear that P(X|Y) is different, since the test sets
are biased towards a query term whereas the training set is not.
"""


def methods():
    yield ('MLPE', MaximumLikelihoodPrevalenceEstimation())
    yield ('CC', ClassifyAndCount(LogisticRegression(n_jobs=-1)))
    yield ('ACC', ACC(LogisticRegression(n_jobs=-1)))
    yield ('PCC', PCC(LogisticRegression(n_jobs=-1)))
    yield ('PACC', PACC(LogisticRegression(n_jobs=-1)))
    yield ('EMQ', EMQ(LogisticRegression(n_jobs=-1)))


def load_txt_sample(path, verbose=False):
    if verbose:
        print(f'loading {path}...', end='')
    df = pd.read_csv(path, sep='\t')
    if verbose:
        print('[done]')
    X = df['text']
    y = df['first_letter_category']

    return X, y


class RetrievedSamples(AbstractProtocol):

    def __init__(self, path_dir: str, load_fn, vectorizer, classes):
        self.path_dir = path_dir
        self.load_fn = load_fn
        self.vectorizer = vectorizer
        self.classes = classes

    def __call__(self):
        for file in glob(join(self.path_dir, 'test_data_*.txt')):
            X, y = self.load_fn(file)
            if len(X) != qp.environ['SAMPLE_SIZE']:
                print(f'[warning]: file {file} contains {len(X)} instances (expected: {qp.environ["SAMPLE_SIZE"]})')
            # assert len(X) == qp.environ['SAMPLE_SIZE'], f'unexpected sample size for file {file}, found {len(X)}'
            X = self.vectorizer.transform(X)
            sample = LabelledCollection(X, y, classes=self.classes)
            yield sample.Xp
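            # note: sample.Xp packs the pair (instances, prevalence), the format
            # consumed by qp.evaluation protocols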


qp.environ['SAMPLE_SIZE'] = 100

data_path = './data'
train_path = join(data_path, 'train_data.txt')


tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5)

training = LabelledCollection.load(train_path, loader_func=load_txt_sample, verbose=True)

# training = training.sampling(1000)

Xtr, ytr = training.Xy
Xtr = tfidf.fit_transform(Xtr)
print('Xtr shape = ', Xtr.shape)

training = LabelledCollection(Xtr, ytr)
classes = training.classes_

test_prot = RetrievedSamples(data_path, load_fn=load_txt_sample, vectorizer=tfidf, classes=classes)

print('Training prevalence:', F.strprev(training.prevalence()))
for X, p in test_prot():
    print('Test prevalence:', F.strprev(p))

for method_name, quantifier in methods():
    print('training ', method_name)
    quantifier.fit(training)
    print('[done]')

    report = qp.evaluation.evaluation_report(quantifier, test_prot, error_metrics=['mae', 'mrae'], verbose=True)

    print(report.mean())

@@ -0,0 +1,131 @@
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

import quapy as qp
import quapy.functional as F
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC
from quapy.protocol import AbstractProtocol
from quapy.data.base import LabelledCollection

from glob import glob
from os.path import join
from tqdm import tqdm

"""
In this second experiment, we have pairs of (Li,Ui) with Li a training set and Ui a test set.
Both elements in the pair are *retrieved according to the same query*. This is a way to impose
on the training set the same type of bias that was present in the test. Let's see...
"""

def methods():
    yield ('PACC', PACC(LogisticRegression(), val_split=5, n_jobs=-1))
    yield ('CC', ClassifyAndCount(LogisticRegression()))
    yield ('EMQ', EMQ(LogisticRegression()))
    yield ('PCC', PCC(LogisticRegression()))
    yield ('ACC', ACC(LogisticRegression(), val_split=5, n_jobs=-1))
    yield ('MLPE', MaximumLikelihoodPrevalenceEstimation())


def load_txt_sample(path, parse_columns, verbose=False, max_lines=None):
    if verbose:
        print(f'loading {path}...', end='')
    df = pd.read_csv(path, sep='\t')
    if verbose:
        print('[done]')
    X = df['text'].values
    y = df['first_letter_category'].values

    if parse_columns:
        rank = df['rank'].values
        scores = df['score'].values
        order = np.argsort(rank)
        X = X[order]
        y = y[order]
        rank = rank[order]
        scores = scores[order]

    if max_lines is not None:
        X = X[:max_lines]
        y = y[:max_lines]

    return X, y


class RetrievedSamples(AbstractProtocol):

    def __init__(self, path_dir: str, load_fn, vectorizer, classes, max_train_lines=None, max_test_lines=None):
        self.path_dir = path_dir
        self.load_fn = load_fn
        self.vectorizer = vectorizer
        self.classes = classes
        self.max_train_lines = max_train_lines
        self.max_test_lines = max_test_lines

    def __call__(self):
        for file in glob(join(self.path_dir, 'test_rankings_*.txt')):

            X, y = self.load_fn(file.replace('test_', 'training_'), parse_columns=True, max_lines=self.max_train_lines)
            X = self.vectorizer.transform(X)
            train_sample = LabelledCollection(X, y, classes=self.classes)

            X, y = self.load_fn(file, parse_columns=True, max_lines=self.max_test_lines)
            if len(X) != qp.environ['SAMPLE_SIZE']:
                print(f'[warning]: file {file} contains {len(X)} instances (expected: {qp.environ["SAMPLE_SIZE"]})')
            # assert len(X) == qp.environ['SAMPLE_SIZE'], f'unexpected sample size for file {file}, found {len(X)}'
            X = self.vectorizer.transform(X)
            test_sample = LabelledCollection(X, y, classes=self.classes)

            yield train_sample, test_sample


RANK_AT_K = 500
REDUCE_TR = 50000
qp.environ['SAMPLE_SIZE'] = RANK_AT_K

data_path = './newCollection'
train_path = join(data_path, 'train_data.txt')

tfidf = TfidfVectorizer(sublinear_tf=True, min_df=10)

training = LabelledCollection.load(train_path, loader_func=load_txt_sample, verbose=True, parse_columns=False)
if REDUCE_TR > 0:
    print('Reducing the number of documents in the training to', REDUCE_TR)
    training = training.sampling(REDUCE_TR)

Xtr, ytr = training.Xy
Xtr = tfidf.fit_transform(Xtr)
print('L orig shape = ', Xtr.shape)

training = LabelledCollection(Xtr, ytr)
classes = training.classes_

experiment_prot = RetrievedSamples(data_path,
                                   load_fn=load_txt_sample,
                                   vectorizer=tfidf,
                                   classes=classes,
                                   max_train_lines=RANK_AT_K,
                                   max_test_lines=RANK_AT_K)

for method_name, quantifier in methods():
    print('Starting with method=', method_name)

    errors = []
    pbar = tqdm(experiment_prot(), total=49)
    for train, test in pbar:
        # print('Training prevalence:', F.strprev(training.prevalence()), 'shape', train.X.shape)
        # print('Test prevalence:', F.strprev(test.prevalence()), 'shape', test.X.shape)

        quantifier.fit(train)
        estim_prev = quantifier.quantify(test.instances)
        mae = qp.error.mae(test.prevalence(), estim_prev)
        errors.append(mae)

        pbar.set_description(f'mae={np.mean(errors):.4f}')
    print()

@@ -0,0 +1,155 @@
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

import quapy as qp
import quapy.functional as F
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC
from quapy.protocol import AbstractProtocol
from quapy.data.base import LabelledCollection

from glob import glob
from os.path import join
from tqdm import tqdm

"""
In this third experiment, we have pairs of (Li,Ui) with Li a training set and Ui a test set, as
in the second experiment, but in this case the fairness groups are defined upon geographic info.
"""

def methods():
    yield ('CC', ClassifyAndCount(LogisticRegression()))
    yield ('PACC', PACC(LogisticRegression(), val_split=5, n_jobs=-1))
    yield ('EMQ', EMQ(LogisticRegression()))
    yield ('PCC', PCC(LogisticRegression()))
    yield ('ACC', ACC(LogisticRegression(), val_split=5, n_jobs=-1))
    yield ('MLPE', MaximumLikelihoodPrevalenceEstimation())


def load_txt_sample(path, parse_columns, verbose=False, max_lines=None):
    # print('reading', path)
    if verbose:
        print(f'loading {path}...', end='')
    df = pd.read_csv(path, sep='\t')
    if verbose:
        print('[done]')
    X = df['text'].values
    y = df['continent'].values

    if parse_columns:
        rank = df['rank'].values
        scores = df['score'].values
        rank = rank[y != 'Antarctica']
        scores = scores[y != 'Antarctica']

    X = X[y != 'Antarctica']
    y = y[y != 'Antarctica']

    if parse_columns:
        order = np.argsort(rank)
        X = X[order]
        y = y[order]
        rank = rank[order]
        scores = scores[order]

    if max_lines is not None:
        X = X[:max_lines]
        y = y[:max_lines]

    return X, y


class RetrievedSamples(AbstractProtocol):

    def __init__(self, path_dir: str, load_fn, vectorizer, max_train_lines=None, max_test_lines=None):
        self.path_dir = path_dir
        self.load_fn = load_fn
        self.vectorizer = vectorizer
        self.max_train_lines = max_train_lines
        self.max_test_lines = max_test_lines

    def __call__(self):
        for file in glob(join(self.path_dir, 'test_rankings_*.txt')):

            X, y = self.load_fn(file.replace('test_', 'training_'), parse_columns=True, max_lines=self.max_train_lines)
            X = self.vectorizer.transform(X)
            train_sample = LabelledCollection(X, y)

            X, y = self.load_fn(file, parse_columns=True, max_lines=self.max_test_lines)
            if len(X) != qp.environ['SAMPLE_SIZE']:
                print(f'[warning]: file {file} contains {len(X)} instances (expected: {qp.environ["SAMPLE_SIZE"]})')
            # assert len(X) == qp.environ['SAMPLE_SIZE'], f'unexpected sample size for file {file}, found {len(X)}'
            X = self.vectorizer.transform(X)
            try:
                test_sample = LabelledCollection(X, y, classes=train_sample.classes_)
            except ValueError as e:
                print(f'file {file} caused error {e}')
                yield None, None
                continue  # fix: without this, execution fell through to the final yield with test_sample unbound

            # print('train #classes:', train_sample.n_classes, train_sample.prevalence())
            # print('test #classes:', test_sample.n_classes, test_sample.prevalence())

            yield train_sample, test_sample


RANK_AT_K = 100
REDUCE_TR = 50000
qp.environ['SAMPLE_SIZE'] = RANK_AT_K

data_path = './newCollectionGeo'
train_path = join(data_path, 'train_data_continent.txt')

tfidf = TfidfVectorizer(sublinear_tf=True, min_df=10)

training = LabelledCollection.load(train_path, loader_func=load_txt_sample, verbose=True, parse_columns=False)

if REDUCE_TR > 0:
    print('Reducing the number of documents in the training to', REDUCE_TR)
    training = training.sampling(REDUCE_TR)

Xtr, ytr = training.Xy
Xtr = tfidf.fit_transform(Xtr)
print('L orig shape = ', Xtr.shape)

training = LabelledCollection(Xtr, ytr)
classes = training.classes_

print('training classes:', classes)
print('training prevalence:', training.prevalence())

experiment_prot = RetrievedSamples(data_path,
                                   load_fn=load_txt_sample,
                                   vectorizer=tfidf,
                                   max_train_lines=None,
                                   max_test_lines=RANK_AT_K)

for method_name, quantifier in methods():
    print('Starting with method=', method_name)

    errors = []
    pbar = tqdm(experiment_prot(), total=49)
    for train, test in pbar:
        if train is not None:
            try:
                # print('Training prevalence:', F.strprev(training.prevalence()), 'shape', train.X.shape)
                # print('Test prevalence:', F.strprev(test.prevalence()), 'shape', test.X.shape)

                # print(train.prevalence())
                # print(test.prevalence())
                quantifier.fit(train)
                estim_prev = quantifier.quantify(test.instances)
                mae = qp.error.mae(test.prevalence(), estim_prev)
                errors.append(mae)
            except Exception as e:
                print(f'wow, something happened here! skipping; {e}')
        else:
            print('skipping one!')

        pbar.set_description(f'mae={np.mean(errors):.4f}')
    print()

@@ -0,0 +1,299 @@
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.base import clone

import quapy as qp
from Retrieval.commons import *   # provides np, Ks, CLASS_NAMES, DATA_SIZES, protected_group, load_sample, binarize_labels, RetrievedSamples
from Retrieval.methods import *   # provides confusion_matrix, AbstractM3rND and the M3rND_Model* variants
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
from quapy.data.base import LabelledCollection

from os.path import join
from tqdm import tqdm

from result_table.src.table import Table

"""
In this sixth experiment, we have a collection C of >6M documents.
We split C in two equally-sized pools TrPool, TePool.

I have randomly split the collection in 50% train and 50% test. In each split we have approx. 3.25 million documents.

We have 5 categories we can evaluate over: Continent, Years_Category, Num_Site_Links, Relative Pageviews and Gender.

From the training set I have created smaller subsets for each category:
100K, 500K, 1M and FULL (3.25M)

For each category and subset, I have created a training set called "classifier_training.json". This is the "base"
training set for the classifier. In this set we have 500 documents per group in a category
(for example: Male 500, Female 500, Unknown 500). Let me know if you think we need more.

To "bias" the quantifier towards a query, I have executed the queries (97) on the different training sets and
retrieved the 200 most relevant documents per group.
For example: (Male 200, Female 200, Unknown 200)
Sometimes this is infeasible; we should probably discuss this at some point.

You can find the results for every query in a file named:

"training_Query-[QID]Sample-200SPLIT.json"

Test:
To evaluate our approach, I have executed the queries on the test split. You can find the results for all 97 queries
up to k=1000 in this file:
testRanking_Results.json
"""


def methods(classifier, class_name=None, binarize=False):

    kde_param = {
        'continent': 0.01,
        'gender': 0.03,
        'years_category': 0.03
    }

    yield ('NaiveQuery', Naive())
    yield ('CC', ClassifyAndCount(classifier))
    yield ('PACC', PACC(classifier, val_split=5, n_jobs=-1))
    yield ('KDEy-ML', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=kde_param.get(class_name, 0.01)))
    if binarize:
        yield ('M3b', M3rND_ModelB(classifier))
        yield ('M3b+', M3rND_ModelB(classifier))
        yield ('M3d', M3rND_ModelD(classifier))
        yield ('M3d+', M3rND_ModelD(classifier))


def train_classifier_fn(train_path):
    """
    Trains a classifier. To do so, it loads the training set and transforms it into a tfidf representation.
    The classifier is Logistic Regression, with hyperparameters C (range [0.0001, 0.001, ..., 10000]) and
    class_weight (range {'balanced', None}) optimized via 5FCV.

    :return: the tfidf-vectorizer and the classifier trained
    """
    texts, labels = load_sample(train_path, class_name=class_name)

    if BINARIZE:
        labels = binarize_labels(labels, positive_class=protected_group[class_name])

    tfidf = TfidfVectorizer(sublinear_tf=True, min_df=3)
    Xtr = tfidf.fit_transform(texts)
    print(f'Xtr shape={Xtr.shape}')

    print('training classifier...', end='')
    classifier = LogisticRegression(max_iter=5000)
    modsel = GridSearchCV(
        classifier,
        param_grid={'C': np.logspace(-4, 4, 9), 'class_weight': ['balanced', None]},
        n_jobs=-1,
        cv=5
    )
    modsel.fit(Xtr, labels)
    classifier = modsel.best_estimator_
    classifier_acc = modsel.best_score_
    best_params = modsel.best_params_
    print(f'[done] best-params={best_params} got {classifier_acc:.4f} score')

    print('generating cross-val predictions for M3')
    predictions = cross_val_predict(clone(classifier), Xtr, labels, cv=10, n_jobs=-1, verbose=10)
    conf_matrix = confusion_matrix(labels, predictions, labels=classifier.classes_)

    training = LabelledCollection(Xtr, labels)
    print('training classes:', training.classes_)
    print('training prevalence:', training.prevalence())

    return tfidf, classifier, conf_matrix


def reduceAtK(data: LabelledCollection, k):
    # if k > len(data):
    #     print(f'[warning] {k=}>{len(data)=}')
    X, y = data.Xy
    X = X[:k]
    y = y[:k]
    return LabelledCollection(X, y, classes=data.classes_)


def benchmark_name(class_name, k=None):
    scape_class_name = class_name.replace('_', '\\_')
    if k is None:
        return scape_class_name
    else:
        return f'{scape_class_name}@{k}'


def run_experiment():
    # note: relies on module-level globals set in __main__ (method, method_name,
    # classifier, conf_matrix, experiment_prot, BINARIZE), so that it can be re-run
    # via qp.util.pickled_resource without arguments

    results = {
        'mae': {k: [] for k in Ks},
        'mrae': {k: [] for k in Ks},
        'rKL_error': [],
        'rND_error': []
    }

    pbar = tqdm(experiment_prot(), total=experiment_prot.total())
    for train, test, q_rel_prevs in pbar:
        Xtr, ytr, score_tr = train
        Xte, yte, score_te = test

        train_col = LabelledCollection(Xtr, ytr, classes=classifier.classes_)

        if not method_name.startswith('Naive') and not method_name.startswith('M3'):
            method.fit(train_col, val_split=train_col, fit_classifier=False)
        elif method_name == 'Naive':
            method.fit(train_col)

        test_col = LabelledCollection(Xte, yte, classes=classifier.classes_)
        rKL_estim, rKL_true = [], []
        rND_estim, rND_true = [], []
        for k in Ks:
            test_k = reduceAtK(test_col, k)
            if method_name == 'NaiveQuery':
                train_k = reduceAtK(train_col, k)
                method.fit(train_k)

            estim_prev = method.quantify(test_k.instances)

            # epsilon value for prevalence smoothing
            eps = (1. / (2. * k))

            # error metrics
            test_k_prev = test_k.prevalence()
            mae = qp.error.mae(test_k_prev, estim_prev)
            mrae = qp.error.mrae(test_k_prev, estim_prev, eps=eps)
            rKL_at_k_estim = qp.error.kld(estim_prev, q_rel_prevs, eps=eps)
            rKL_at_k_true = qp.error.kld(test_k_prev, q_rel_prevs, eps=eps)

            if BINARIZE:
                # [1] is the index of the minority or historically disadvantaged group
                rND_at_k_estim = np.abs(estim_prev[1] - q_rel_prevs[1])
                rND_at_k_true = np.abs(test_k_prev[1] - q_rel_prevs[1])

            # collect results
            results['mae'][k].append(mae)
            results['mrae'][k].append(mrae)
            rKL_estim.append(rKL_at_k_estim)
            rKL_true.append(rKL_at_k_true)
            if BINARIZE:
                rND_estim.append(rND_at_k_estim)
                rND_true.append(rND_at_k_true)

        # aggregate fairness metrics
        def aggregate(rMs, Ks, Z=1):
            return (1 / Z) * sum((1. / np.log2(k)) * v for v, k in zip(rMs, Ks))

        Z = sum((1. / np.log2(k)) for k in Ks)
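        # i.e., rM = (1/Z) * sum_{k in Ks} rM@k / log2(k), with Z = sum_{k in Ks} 1/log2(k):
        # a log-discounted average over prefix lengths, as in the rKL/rND fairness measures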
        rKL_estim = aggregate(rKL_estim, Ks, Z)
        rKL_true = aggregate(rKL_true, Ks, Z)
        rKL_error = np.abs(rKL_true - rKL_estim)
        results['rKL_error'].append(rKL_error)

        if BINARIZE:
            rND_estim = aggregate(rND_estim, Ks, Z)
            rND_true = aggregate(rND_true, Ks, Z)

            if isinstance(method, AbstractM3rND):
                if method_name.endswith('+'):
                    # learns the correction parameters from the query-specific training data
                    conf_matrix_ = method.get_confusion_matrix(*train_col.Xy)
                else:
                    # learns the correction parameters from the training data used to train the classifier
                    conf_matrix_ = conf_matrix.copy()
                rND_estim = method.fair_measure_correction(rND_estim, conf_matrix_)

            rND_error = np.abs(rND_true - rND_estim)
            results['rND_error'].append(rND_error)

        pbar.set_description(f'{method_name}')

    return results


data_home = 'data'

if __name__ == '__main__':

    # final tables only contain the information for the data size 10K; each row is a class name and each column
    # the corresponding rND (for binary) or rKL (for multiclass) score
    tables_RND, tables_DKL = [], []
    tables_final = []
    for class_mode in ['multiclass', 'binary']:
        BINARIZE = (class_mode == 'binary')
        method_names = [name for name, *other in methods(None, binarize=BINARIZE)]

        table_final = Table(name='rND' if BINARIZE else 'rKL', benchmarks=[benchmark_name(c) for c in CLASS_NAMES], methods=method_names)
        table_final.format.mean_macro = False
        tables_final.append(table_final)
        for class_name in CLASS_NAMES:
            tables_mae, tables_mrae = [], []

            benchmarks_size = [benchmark_name(class_name, s) for s in DATA_SIZES]
            table_DKL = Table(name=f'rKL-{class_name}', benchmarks=benchmarks_size, methods=method_names)
            table_RND = Table(name=f'rND-{class_name}', benchmarks=benchmarks_size, methods=method_names)

            for data_size in DATA_SIZES:
                print(class_name, class_mode, data_size)
                benchmarks_k = [benchmark_name(class_name, k) for k in Ks]
                # table_mae = Table(name=f'{class_name}-{data_size}-mae', benchmarks=benchmarks_k, methods=method_names)
                table_mrae = Table(name=f'{class_name}-{data_size}-mrae', benchmarks=benchmarks_k, methods=method_names)

                # tables_mae.append(table_mae)
                tables_mrae.append(table_mrae)

                # sets all paths
                class_home = join(data_home, class_name, data_size)
                train_data_path = join(data_home, class_name, 'FULL', 'classifier_training.json')  # <----- fixed classifier
                classifier_path = join('classifiers', 'FULL', f'classifier_{class_name}_{class_mode}.pkl')
                test_rankings_path = join(data_home, 'testRanking_Results.json')
                test_query_prevs_path = join(data_home, 'prevelance_vectors_judged_docs.json')
                results_home = join('results', class_name, class_mode, data_size)
                positive_class = protected_group[class_name] if BINARIZE else None

                # instantiates the classifier (trains it the first time, loads it in the subsequent executions)
                tfidf, classifier, conf_matrix \
                    = qp.util.pickled_resource(classifier_path, train_classifier_fn, train_data_path)

                experiment_prot = RetrievedSamples(
                    class_home,
                    test_rankings_path,
                    test_query_prevs_path,
                    vectorizer=tfidf,
                    class_name=class_name,
                    positive_class=positive_class,
                    classes=classifier.classes_
                )

                for method_name, method in methods(classifier, class_name, BINARIZE):

                    results_path = join(results_home, method_name + '.pkl')
                    results = qp.util.pickled_resource(results_path, run_experiment)

                    # compose the tables
                    for k in Ks:
                        # table_mae.add(benchmark=benchmark_name(class_name, k), method=method_name, v=results['mae'][k])
                        table_mrae.add(benchmark=benchmark_name(class_name, k), method=method_name, v=results['mrae'][k])
                    table_DKL.add(benchmark=benchmark_name(class_name, data_size), method=method_name, v=results['rKL_error'])
                    if BINARIZE:
                        table_RND.add(benchmark=benchmark_name(class_name, data_size), method=method_name, v=results['rND_error'])

                    if data_size == '10K':
                        value = results['rND_error'] if BINARIZE else results['rKL_error']
                        table_final.add(benchmark=benchmark_name(class_name), method=method_name, v=value)

                tables = ([table_RND] + tables_mrae) if BINARIZE else ([table_DKL] + tables_mrae)
                Table.LatexPDF(f'./latex/{class_mode}/{class_name}.pdf', tables=tables)

            if BINARIZE:
                tables_RND.append(table_RND)
            else:
                tables_DKL.append(table_DKL)

    Table.LatexPDF('./latex/global/main.pdf', tables=tables_RND + tables_DKL, dedicated_pages=False)
    Table.LatexPDF('./latex/final/main.pdf', tables=tables_final, dedicated_pages=False)

@@ -0,0 +1,88 @@
import os.path
import pickle
from collections import defaultdict
from pathlib import Path

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC

import quapy as qp
from Retrieval.commons import RetrievedSamples, load_sample
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
from quapy.data.base import LabelledCollection
from experiments import benchmark_name, reduceAtK, run_experiment

from os.path import join
from tqdm import tqdm

from result_table.src.table import Table


def methods(classifier):
    for i, bandwidth in enumerate(np.linspace(0.01, 0.1, 10)):
        yield (f'KDE{str(i).zfill(2)}', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=bandwidth))
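        # zero-padded names (KDE00..KDE09) keep the bandwidth grid lexicographically ordered in the tables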
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
data_home = 'data-modsel'
|
||||||
|
|
||||||
|
Ks = [5, 10, 25, 50, 75, 100, 250, 500, 750, 1000]
|
||||||
|
|
||||||
|
method_names = [m for m, *_ in methods(None)]
|
||||||
|
|
||||||
|
class_mode = 'multiclass'
|
||||||
|
|
||||||
|
dir_names={
|
||||||
|
'gender': '100K_GENDER_TREC21_QUERIES/100K-NEW-QUERIES',
|
||||||
|
'continent': '100K_CONT_TREC21_QUERIES/100K-NEW-QUERIES',
|
||||||
|
'years_category': '100K_YEARS_TREC21_QUERIES/100K-NEW-QUERIES'
|
||||||
|
}
|
||||||
|
|
||||||
|
for class_name in ['gender', 'continent', 'years_category']:
|
||||||
|
|
||||||
|
tables_mrae = []
|
||||||
|
|
||||||
|
benchmarks = [benchmark_name(class_name, k) for k in Ks]
|
||||||
|
|
||||||
|
for data_size in ['100K']:
|
||||||
|
|
||||||
|
table_mrae = Table(name=f'{class_name}-{data_size}-mrae', benchmarks=benchmarks, methods=method_names)
|
||||||
|
tables_mrae.append(table_mrae)
|
||||||
|
|
||||||
|
class_home = join(data_home, dir_names[class_name])
|
||||||
|
classifier_path = join('classifiers', 'FULL', f'classifier_{class_name}_{class_mode}.pkl')
|
||||||
|
test_rankings_path = join(data_home, 'testRanking-TREC21-Queries_Results.json')
|
||||||
|
test_query_prevs_path = join('data', 'prevelance_vectors_judged_docs.json')
|
||||||
|
results_home = join('results', 'modsel', class_name, data_size)
|
||||||
|
|
||||||
|
tfidf, classifier, conf_matrix = pickle.load(open(classifier_path, 'rb'))
|
||||||
|
|
||||||
|
experiment_prot = RetrievedSamples(
|
||||||
|
class_home,
|
||||||
|
test_rankings_path,
|
||||||
|
test_query_prevs_path,
|
||||||
|
vectorizer=tfidf,
|
||||||
|
class_name=class_name,
|
||||||
|
classes=classifier.classes_
|
||||||
|
)
|
||||||
|
for method_name, quantifier in methods(classifier):
|
||||||
|
|
||||||
|
results_path = join(results_home, method_name + '.pkl')
|
||||||
|
results = qp.util.pickled_resource(results_path, run_experiment)
|
||||||
|
|
||||||
|
for k in Ks:
|
||||||
|
table_mrae.add(benchmark=benchmark_name(class_name, k), method=method_name, v=results['mrae'][k])
|
||||||
|
|
||||||
|
Table.LatexPDF(f'./latex/modsel/{class_name}.pdf', tables=tables_mrae)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
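The grid above sweeps only the KDE bandwidth. For reference, a minimal sketch of training and using one such KDEyML configuration on toy data (the calls reflect quapy's 0.1.x fit/quantify API as we understand it, so treat them as assumptions):

import numpy as np
from sklearn.linear_model import LogisticRegression
from quapy.data.base import LabelledCollection
from quapy.method.aggregative import KDEyML

# toy binary data with 2-dimensional covariates
X = np.random.randn(1000, 2)
y = (X[:, 0] + 0.5 * np.random.randn(1000) > 0).astype(int)
train, test = LabelledCollection(X, y).split_stratified(train_prop=0.7)

quantifier = KDEyML(LogisticRegression(), val_split=5, bandwidth=0.05)
quantifier.fit(train)
print(quantifier.quantify(test.X))  # estimated class prevalence of the test set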
@@ -0,0 +1,88 @@
"""
This file implements some of the methods presented in the FAccT'22 paper by
Ghazimatin, Kleindessner, Russell, Abedjan, and Golebiowski,
Measuring Fairness of Rankings under Noisy Sensitive Information.

In particular, it implements two variants of a method relying on M3=rND:
one in which the assumed graphical model is P(Â,A,S) = P(Â|A)*P(S|A) (called "b")
and another in which the assumed graphical model is P(Â,A,S) = P(Â|A)*P(S|Â) (called "d")
"""

import numpy as np
from abc import ABC, abstractmethod
from sklearn.metrics import confusion_matrix

from quapy.method.aggregative import CC


class AbstractM3rND(ABC):
    def __init__(self, classifier):
        self.quantifier = CC(classifier)

    def proxy_labels(self, instances):
        return self.quantifier.classify(instances)

    def quantify(self, instances):
        return self.quantifier.quantify(instances)

    @abstractmethod
    def fair_measure_correction(self, rND_estim: float, conf_matrix: np.ndarray):
        ...

    def get_confusion_matrix(self, X, y, additive_smoothing=0.5):
        """
        Some confusion matrices may contain 0 values for certain classes, and this causes
        instabilities in the correction. If requested, applies additive smoothing. Default
        is adding half a count.

        :param X: array-like with the covariates
        :param y: array-like with the true labels
        :param additive_smoothing: float, default 0.5
        :return: the confusion matrix C with entries Cij=P(Y=i,Ŷ=j)
        """
        proxy_labels = self.proxy_labels(X)
        true_labels = y
        labels = self.quantifier.classes_
        conf_matrix = confusion_matrix(true_labels, proxy_labels, labels=labels)
        if additive_smoothing > 0:
            conf_matrix = conf_matrix.astype(float) + additive_smoothing
        return conf_matrix


class M3rND_ModelB(AbstractM3rND):
    def __init__(self, classifier):
        super().__init__(classifier)

    def fair_measure_correction(self, rND_estim: float, conf_matrix: np.ndarray):
        # conf_matrix contains values Cij=P(Y=i,Ŷ=j)
        # truecond_matrix contains values Cij=P(Ŷ=j|Y=i) (truecond stands for "conditioned on true labels")
        truecond_matrix = conf_matrix / conf_matrix.sum(axis=1, keepdims=True)
        p = truecond_matrix[0, 1]  # P(hat{A}=1|A=0)
        q = truecond_matrix[1, 0]  # P(hat{A}=0|A=1)
        den = (1 - p - q)
        if den != 0:
            corr = 1./den
            rND_estim = rND_estim * corr
        return rND_estim


class M3rND_ModelD(AbstractM3rND):
    def __init__(self, classifier):
        super().__init__(classifier)

    def fair_measure_correction(self, rND_estim: float, conf_matrix: np.ndarray):
        # conf_matrix contains values Cij=P(Y=i,Ŷ=j)
        # truecond_matrix contains values Cij=P(Ŷ=j|Y=i) (truecond stands for "conditioned on true labels")
        truecond_matrix = conf_matrix / conf_matrix.sum(axis=1, keepdims=True)
        prev_A = conf_matrix.sum(axis=1)
        beta = prev_A[1]  # P(A)
        p = truecond_matrix[0, 1]  # P(hat{A}=1|A=0)
        q = truecond_matrix[1, 0]  # P(hat{A}=0|A=1)
        x = (1 - q) * beta + p * (1 - beta)
        y = q * beta + (1 - p) * (1 - beta)
        if x != 0 and y != 0:
            corr = ((((1 - q) * beta) / x) - (q * beta / y))
            rND_estim = rND_estim * corr
        return rND_estim
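For illustration, a minimal usage sketch of the Model-B correction above (toy data; it assumes a pre-fitted classifier can be wrapped without calling fit on the quantifier, and the raw rND value is hypothetical):

import numpy as np
from sklearn.linear_model import LogisticRegression

# toy binary sensitive attribute A predicted from 2-d covariates
X = np.random.randn(2000, 2)
A = (X[:, 1] + 0.5 * np.random.randn(2000) > 0).astype(int)

clf = LogisticRegression().fit(X[:1000], A[:1000])

corrector = M3rND_ModelB(clf)
# noise rates of the proxy attribute, estimated on held-out data (smoothed counts)
conf = corrector.get_confusion_matrix(X[1000:], A[1000:])
conf = conf / conf.sum()  # normalize counts into the joint distribution P(A, Â)

raw_rND = 0.12  # hypothetical rND computed from the proxy labels
print(corrector.fair_measure_correction(raw_rND, conf))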
@@ -0,0 +1,124 @@
import itertools
import os.path
import pickle

import numpy as np
from Retrieval.experiments import methods
from Retrieval.commons import CLASS_NAMES, Ks, DATA_SIZES
from os.path import join
import matplotlib.pyplot as plt


data_home = 'data'
class_mode = 'multiclass'

method_names = [name for name, *other in methods(None, 'continent')]

all_results = {}

class_name_label = {
    'continent': 'Geographic Location',
    'gender': 'Gender',
    'years_category': 'Age of Topic'
}


# loads all MRAE results, and returns a dictionary containing the values, which is indexed by:
# class_name -> data_size -> method_name -> k -> stat -> float
# where stat is "mean", "std", "max"
def load_all_results():

    for class_name in CLASS_NAMES:

        all_results[class_name] = {}

        for data_size in DATA_SIZES:

            results_home = join('results', class_name, class_mode, data_size)

            all_results[class_name][data_size] = {}

            for method_name in method_names:
                results_path = join(results_home, method_name + '.pkl')
                try:
                    results = pickle.load(open(results_path, 'rb'))
                except Exception as e:
                    print(f'missing result {results_path}', e)
                    continue

                all_results[class_name][data_size][method_name] = {}
                for k in Ks:
                    all_results[class_name][data_size][method_name][k] = {}
                    values = results['mrae']
                    all_results[class_name][data_size][method_name][k]['mean'] = np.mean(values[k])
                    all_results[class_name][data_size][method_name][k]['std'] = np.std(values[k])
                    all_results[class_name][data_size][method_name][k]['max'] = np.max(values[k])

    return all_results


results = load_all_results()

# generates the class-independent, size-independent plots for y-axis=MRAE in which:
# - the x-axis displays the Ks

for class_name in CLASS_NAMES:
    for data_size in DATA_SIZES[:1]:

        log = class_name == 'gender'

        fig, ax = plt.subplots()

        max_means = []
        markers = itertools.cycle(['o', 's', '^', 'D', 'v', '*', '+'])
        for method_name in method_names:
            # class_name -> data_size -> method_name -> k -> stat -> float
            means = [
                results[class_name][data_size][method_name][k]['mean'] for k in Ks
            ]
            stds = [
                results[class_name][data_size][method_name][k]['std'] for k in Ks
            ]
            # max_mean = np.max([
            #     results[class_name][data_size][method_name][k]['max'] for k in Ks
            # ])
            max_means.append(max(means))

            means = np.asarray(means)
            stds = np.asarray(stds)

            method_name = method_name.replace('NaiveQuery', 'Naive@$k$')
            method_name = method_name.replace('KDEy-ML', 'KDEy')
            marker = next(markers)
            line = ax.plot(Ks, means, 'o-', label=method_name, color=None, linewidth=3, markersize=10, marker=marker)
            color = line[-1].get_color()
            if log:
                ax.set_yscale('log')
            # ax.fill_between(Ks, means - stds, means + stds, alpha=0.3, color=color)

        ax.grid(True, which='both', axis='y', color='gray', linestyle='--', linewidth=0.3)
        ax.set_xlabel('k')
        ax.set_ylabel('RAE' + (' (log scale)' if log else ''))
        data_size_label = r'$\mathcal{L}_{10\mathrm{K}}$'
        ax.set_title(f'{class_name_label[class_name]} from {data_size_label}')
        ax.set_ylim([0, max(max_means)*1.05])

        if class_name == 'years_category':
            ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

        os.makedirs(f'plots/var_k/{class_name}', exist_ok=True)
        plotpath = f'plots/var_k/{class_name}/{data_size}_mrae.pdf'
        print(f'saving plot in {plotpath}')
        plt.savefig(plotpath, bbox_inches='tight')
@@ -0,0 +1,88 @@
import itertools
import os.path
from Retrieval.experiments import methods
from Retrieval.commons import CLASS_NAMES, Ks, DATA_SIZES
import matplotlib.pyplot as plt

from Retrieval.plot_mrae_xaxis_k import load_all_results

data_home = 'data'
class_mode = 'multiclass'

method_names = [name for name, *other in methods(None)]

all_results = {}

class_name_label = {
    'continent': 'Geographic Location',
    'gender': 'Gender',
    'years_category': 'Age of Topic'
}

# loads all MRAE results, and returns a dictionary containing the values, which is indexed by:
# class_name -> data_size -> method_name -> k -> stat -> float
results = load_all_results()

# generates the class-independent, size-independent plots for y-axis=MRAE in which:
# - the x-axis displays the training pool sizes

# X_DATA_SIZES = [int(x.replace('K', '000').replace('M', '000000').replace('FULL', '3250000')) for x in DATA_SIZES]
X_DATA_SIZES = [x.replace('FULL', '3.25M') for x in DATA_SIZES]

for class_name in CLASS_NAMES:
    for k in [100]:  # Ks:

        log = class_name == 'gender'

        fig, ax = plt.subplots()

        max_means = []
        markers = itertools.cycle(['o', 's', '^', 'D', 'v', '*', '+'])
        for method_name in method_names:
            # class_name -> data_size -> method_name -> k -> stat -> float
            means = [
                results[class_name][data_size][method_name][k]['mean'] for data_size in DATA_SIZES
            ]
            stds = [
                results[class_name][data_size][method_name][k]['std'] for data_size in DATA_SIZES
            ]
            # max_mean = np.max([
            #     results[class_name][data_size][method_name][k]['max'] for data_size in DATA_SIZES
            # ])

            max_means.append(max(means))

            style = 'o-' if method_name != 'CC' else '--'
            method_name = method_name.replace('NaiveQuery', 'Naive@$k$')
            method_name = method_name.replace('KDEy-ML', 'KDEy')
            marker = next(markers)
            line = ax.plot(X_DATA_SIZES, means, style, label=method_name, color=None, linewidth=3, markersize=10, marker=marker)
            color = line[-1].get_color()
            if log:
                ax.set_yscale('log')
            # ax.fill_between(Ks, means - stds, means + stds, alpha=0.3, color=color)

        ax.grid(True, which='both', axis='y', color='gray', linestyle='--', linewidth=0.3)
        ax.set_xlabel('training pool size')
        ax.set_ylabel('RAE' + (' (log scale)' if log else ''))
        ax.set_title(f'{class_name_label[class_name]} at exposure {k=}')
        ax.set_ylim([0, max(max_means)*1.05])

        if class_name == 'years_category':
            ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

        os.makedirs(f'plots/var_size/{class_name}', exist_ok=True)
        plotpath = f'plots/var_size/{class_name}/{k}_mrae.pdf'
        print(f'saving plot in {plotpath}')
        plt.savefig(plotpath, bbox_inches='tight')
@@ -0,0 +1,93 @@
import os.path
import pickle
from itertools import zip_longest
from commons import RetrievedSamples, load_sample, DATA_SIZES
from os.path import join
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt

"""
Plots the distribution of (predicted) relevance score for the test samples and for the training samples wrt:
- training pool size (10K, 50K, 100K, 500K, 1M, FULL)
- rank
"""

data_home = 'data'

up_to = 250

for class_name in ['continent']:  # 'num_sitelinks_category', 'relative_pageviews_category', 'years_category', 'continent', 'gender']:
    test_added = False
    Mtrs, Mtes, source = [], [], []
    for data_size in DATA_SIZES:

        class_home = join(data_home, class_name, data_size)
        classifier_path = join('classifiers', 'FULL', f'classifier_{class_name}.pkl')
        test_rankings_path = join(data_home, 'testRanking_Results.json')
        test_query_prevs_path = join(data_home, 'prevelance_vectors_judged_docs.json')

        _, classifier = pickle.load(open(classifier_path, 'rb'))

        experiment_prot = RetrievedSamples(
            class_home,
            test_rankings_path,
            test_query_prevs_path,
            vectorizer=None,
            class_name=class_name,
            classes=classifier.classes_
        )

        Mtr = []
        Mte = []
        pbar = tqdm(experiment_prot(), total=experiment_prot.total())
        for train, test, *_ in pbar:
            Xtr, ytr, score_tr = train
            Xte, yte, score_te = test
            if len(score_tr) >= up_to:
                Mtr.append(score_tr)
                Mte.append(score_te)

        Mtrs.append(Mtr)
        if not test_added:
            Mtes.append(Mte)
            test_added = True
        source.append(data_size)

    fig, ax = plt.subplots()
    # train_source = ['train-'+s for s in source]
    train_source = ['$L_{'+s.replace('FULL', '3.25M').replace('K', r'\mathrm{K}').replace('M', r'\mathrm{M}')+'}$' for s in source]
    # Ms = list(zip(Mtrs, train_source))+list(zip(Mtes, ['test']))
    Ms = list(zip(Mtrs, train_source)) + list(zip(Mtes, [r'$U_{(3.25\mathrm{M})}$']))

    for M, source in Ms:
        M = np.asarray(list(zip_longest(*M, fillvalue=np.nan))).T

        num_rep, num_docs = M.shape

        mean_values = np.nanmean(M, axis=0)
        n_filled = np.count_nonzero(~np.isnan(M), axis=0)
        std_errors = np.nanstd(M, axis=0) / np.sqrt(n_filled)

        line = ax.plot(range(num_docs), mean_values, '-', label=source, color=None)
        color = line[-1].get_color()
        ax.fill_between(range(num_docs), mean_values - std_errors, mean_values + std_errors, alpha=0.3, color=color)

    ax.set_xlabel('rank ($k$)')
    ax.set_ylabel('predicted relevance score')
    ax.set_title(class_name.replace('continent', 'Geographic Location'))
    ax.set_xlim((0, up_to))

    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

    # plt.show()
    os.makedirs('plots', exist_ok=True)
    plotpath = f'plots/{class_name}_rel_distrbution_2.pdf'
    print(f'saving plot in {plotpath}')
    plt.savefig(plotpath, bbox_inches='tight')
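The zip_longest step above averages rankings of unequal length by NaN-padding the shorter ones; a tiny self-contained illustration:

from itertools import zip_longest
import numpy as np

rankings = [[.9, .8, .7], [.95, .6], [.85, .8, .75, .7]]  # per-query score lists of unequal length
M = np.asarray(list(zip_longest(*rankings, fillvalue=np.nan))).T
print(M.shape)                # (3, 4): one NaN-padded row per query
print(np.nanmean(M, axis=0))  # mean score at each rank, ignoring missing entries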
@@ -0,0 +1,16 @@
import pandas as pd

from os.path import join

from quapy.data import LabelledCollection

data_home = 'data'
CLASS_NAME = 'continent'
datasize = '100K'

file_path = join(data_home, 'prevelance_vectors_judged_docs.json')

df = pd.read_json(file_path)

pd.set_option('display.max_columns', None)
print(df)
@@ -11,7 +11,7 @@ from . import util
 from . import model_selection
 from . import classification

-__version__ = '0.1.8'
+__version__ = '0.1.9'

 environ = {
     'SAMPLE_SIZE': None,
@@ -158,8 +158,8 @@ def kld(prevs, prevs_hat, eps=None):
     :return: Kullback-Leibler divergence between the two distributions
     """
     eps = __check_eps(eps)
-    smooth_prevs = prevs + eps
-    smooth_prevs_hat = prevs_hat + eps
+    smooth_prevs = smooth(prevs, eps)
+    smooth_prevs_hat = smooth(prevs_hat, eps)
     return (smooth_prevs*np.log(smooth_prevs/smooth_prevs_hat)).sum(axis=-1)
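The change delegates smoothing to the library's smooth helper. As a rough sketch of what that helper computes (an assumption to be checked against quapy's error module), additive smoothing followed by re-normalization keeps the vector a proper distribution, unlike the bare `prevs + eps` it replaces:

import numpy as np

def smooth(prevs, eps):
    # additive smoothing followed by re-normalization: the result still sums to 1
    n_classes = prevs.shape[-1]
    return (prevs + eps) / (eps * n_classes + 1)

print(smooth(np.array([1.0, 0.0]), eps=0.01))  # -> [0.9902 0.0098]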
@@ -87,7 +87,6 @@ def evaluation_report(model: BaseQuantifier,
     Generates a report (a pandas' DataFrame) containing information of the evaluation of the model as according
     to a specific protocol and in terms of one or more evaluation metrics (errors).

-
     :param model: a quantifier, instance of :class:`quapy.method.base.BaseQuantifier`
     :param protocol: :class:`quapy.protocol.AbstractProtocol`; if this object is also instance of
         :class:`quapy.protocol.OnLabelledCollectionProtocol`, then the aggregation speed-up can be run. This is the protocol
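For context, a minimal sketch of producing such a report on toy data (parameter names as we recall quapy's API; treat them as assumptions):

import numpy as np
import quapy as qp
from quapy.data.base import LabelledCollection
from quapy.method.aggregative import PACC
from quapy.protocol import APP
from sklearn.linear_model import LogisticRegression

X = np.random.randn(2000, 2)
y = (X[:, 0] > 0).astype(int)
train, test = LabelledCollection(X, y).split_stratified(train_prop=0.7)

qp.environ['SAMPLE_SIZE'] = 100
model = PACC(LogisticRegression()).fit(train)
report = qp.evaluation.evaluation_report(model, protocol=APP(test), error_metrics=['mae', 'mrae'])
print(report[['mae', 'mrae']].mean())  # average errors across the protocol's samples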
@@ -141,6 +141,19 @@ def uniform_prevalence_sampling(n_classes, size=1):
     return u


+def uniform_prevalence(n_classes):
+    """
+    Returns a vector representing the uniform distribution for `n_classes`
+
+    :param n_classes: number of classes
+    :return: np.ndarray with all values 1/n_classes
+    """
+    assert isinstance(n_classes, int) and n_classes > 0, \
+        (f'param {n_classes} not understood; must be a positive integer representing the '
+         f'number of classes')
+    return np.full(shape=n_classes, fill_value=1./n_classes)
+
+
 uniform_simplex_sampling = uniform_prevalence_sampling
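A quick check of the new helper (assuming it is exposed via quapy.functional once merged):

import quapy.functional as F

print(F.uniform_prevalence(4))  # -> [0.25 0.25 0.25 0.25]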
@@ -52,7 +52,7 @@ class KDEBase:
         """
         return np.exp(kde.score_samples(X))

-    def get_mixture_components(self, X, y, n_classes, bandwidth):
+    def get_mixture_components(self, X, y, classes, bandwidth):
         """
         Returns an array containing the mixture components, i.e., the KDE functions for each class.

@@ -62,7 +62,13 @@ class KDEBase:
         :param bandwidth: float, the bandwidth of the kernel
         :return: a list of KernelDensity objects, each fitted with the corresponding class-specific covariates
         """
-        return [self.get_kde_function(X[y == cat], bandwidth) for cat in range(n_classes)]
+        class_cond_X = []
+        for cat in classes:
+            selX = X[y == cat]
+            if selX.size == 0:
+                selX = [F.uniform_prevalence(len(classes))]
+            class_cond_X.append(np.asarray(selX))
+        return [self.get_kde_function(X_cond_yi, bandwidth) for X_cond_yi in class_cond_X]
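The new loop guards against classes with no validation examples. A self-contained sketch of that fallback in plain sklearn (not quapy's exact code; the covariates here are posterior probabilities, so the pseudo-point sits at the barycenter of the simplex):

import numpy as np
from sklearn.neighbors import KernelDensity

X = np.asarray([[.9, .1], [.8, .2], [.7, .3]])  # posteriors of class-0 examples only
y = np.asarray([0, 0, 0])                       # class 1 has no examples
classes = [0, 1]

kdes = []
for cat in classes:
    selX = X[y == cat]
    if selX.size == 0:
        selX = np.asarray([np.full(len(classes), 1/len(classes))])  # uniform pseudo-point
    kdes.append(KernelDensity(bandwidth=0.05).fit(selX))

print([np.exp(k.score_samples([[0.5, 0.5]])) for k in kdes])  # density of each class KDE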
@@ -114,7 +120,7 @@ class KDEyML(AggregativeSoftQuantifier, KDEBase):
         self.random_state=random_state

     def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
-        self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.n_classes, self.bandwidth)
+        self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.classes_, self.bandwidth)
         return self

     def aggregate(self, posteriors: np.ndarray):

@@ -196,7 +202,7 @@ class KDEyHD(AggregativeSoftQuantifier, KDEBase):
         self.montecarlo_trials = montecarlo_trials

     def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
-        self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.n_classes, self.bandwidth)
+        self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.classes_, self.bandwidth)

         N = self.montecarlo_trials
         rs = self.random_state
@@ -2,7 +2,7 @@ from abc import ABC, abstractmethod
 from copy import deepcopy
 from typing import Callable, Union
 import numpy as np
-from abstention.calibration import NoBiasVectorScaling, TempScaling, VectorScaling
+from abstention.calibration import NoBiasVectorScaling, TempScaling, VectorScaling, PlattScaling
 from scipy import optimize
 from sklearn.base import BaseEstimator
 from sklearn.calibration import CalibratedClassifierCV

@@ -636,18 +636,35 @@ class EMQ(AggregativeSoftQuantifier):
             calibrator = TempScaling()
         elif self.recalib == 'vs':
             calibrator = VectorScaling()
+        elif self.recalib == 'platt':
+            calibrator = CalibratedClassifierCV(estimator=self.classifier, cv='prefit')
         else:
             raise ValueError('invalid param argument for recalibration method; available ones are '
-                             '"nbvs", "bcts", "ts", and "vs".')
+                             '"nbvs", "bcts", "ts", "vs", and "platt".')
+
+        if not np.issubdtype(y.dtype, np.number):
+            y = np.searchsorted(data.classes_, y)

-        self.calibration_function = calibrator(P, np.eye(data.n_classes)[y], posterior_supplied=True)
+        if self.recalib == 'platt':
+            self.classifier = calibrator.fit(*data.Xy)
+        else:
+            print(classif_predictions.prevalence())
+            try:
+                self.calibration_function = calibrator(P, np.eye(data.n_classes)[y], posterior_supplied=True)
+            except RuntimeError as e:
+                print(e)
+                print('defaults to I')
+                self.calibration_function = lambda P: P

         if self.exact_train_prev:
             self.train_prevalence = data.prevalence()
         else:
             train_posteriors = classif_predictions.X
             if self.recalib is not None:
-                train_posteriors = self.calibration_function(train_posteriors)
+                if self.recalib == 'platt':
+                    train_posteriors = self.classifier.predict_proba(train_posteriors)
+                else:
+                    train_posteriors = self.calibration_function(train_posteriors)
             self.train_prevalence = F.prevalence_from_probabilities(train_posteriors)

     def aggregate(self, classif_posteriors, epsilon=EPSILON):

@@ -681,6 +698,11 @@ class EMQ(AggregativeSoftQuantifier):
         """
         Px = posterior_probabilities
         Ptr = np.copy(tr_prev)
+
+        if np.product(Ptr) == 0:  # some entry is 0; we should smooth the values to avoid 0 division
+            Ptr += epsilon
+            Ptr /= Ptr.sum()
+
         qs = np.copy(Ptr)  # qs (the running estimate) is initialized as the training prevalence

         s, converged = 0, False
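The block added above smooths zero entries of the training prevalence before the EM loop starts. For context, a minimal standalone sketch of that loop (the Saerens-Latinne-Decaestecker EM; variable names are ours, not quapy's exact code):

import numpy as np

def em_adjust(posteriors, train_prev, epsilon=1e-4, max_iter=1000, tol=1e-6):
    Ptr = np.copy(train_prev)
    if np.prod(Ptr) == 0:  # smooth away zero entries, as in the diff above
        Ptr += epsilon
        Ptr /= Ptr.sum()
    qs = np.copy(Ptr)  # running estimate of the test prevalence
    for _ in range(max_iter):
        ps = posteriors * (qs / Ptr)         # E-step: re-weight posteriors by the prior ratio
        ps /= ps.sum(axis=1, keepdims=True)  # re-normalize per instance
        qs_new = ps.mean(axis=0)             # M-step: new prevalence estimate
        if np.abs(qs_new - qs).max() < tol:
            return qs_new
        qs = qs_new
    return qs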
@@ -1,5 +1,6 @@
 from typing import Union, Callable
 import numpy as np
+from sklearn.feature_extraction.text import CountVectorizer

 from quapy.functional import get_divergence
 from quapy.data import LabelledCollection

@@ -146,6 +147,53 @@ class DMx(BaseQuantifier):
         return F.argmin_prevalence(loss, n_classes, method=self.search)


+class ReadMe(BaseQuantifier):
+
+    def __init__(self, bootstrap_trials=100, bootstrap_range=100, bagging_trials=100, bagging_range=25, **vectorizer_kwargs):
+        self.bootstrap_trials = bootstrap_trials
+        self.bootstrap_range = bootstrap_range
+        self.bagging_trials = bagging_trials
+        self.bagging_range = bagging_range
+        self.vectorizer_kwargs = vectorizer_kwargs
+
+    def fit(self, data: LabelledCollection):
+        X, y = data.Xy
+        self.vectorizer = CountVectorizer(binary=True, **self.vectorizer_kwargs)
+        X = self.vectorizer.fit_transform(X)
+        self.class_conditional_X = {i: X[y == i] for i in data.classes_}
+        return self
+
+    def quantify(self, instances):
+        X = self.vectorizer.transform(instances)
+
+        # number of features
+        num_docs, num_feats = X.shape
+
+        # bootstrap
+        p_boots = []
+        for _ in range(self.bootstrap_trials):
+            docs_idx = np.random.choice(num_docs, size=self.bootstrap_range, replace=False)
+            class_conditional_X = {i: X[docs_idx] for i, X in self.class_conditional_X.items()}
+            Xboot = X[docs_idx]
+
+            # bagging
+            p_bags = []
+            for _ in range(self.bagging_trials):
+                feat_idx = np.random.choice(num_feats, size=self.bagging_range, replace=False)
+                class_conditional_Xbag = {i: X[:, feat_idx] for i, X in class_conditional_X.items()}
+                Xbag = Xboot[:, feat_idx]
+                p = self.std_constrained_linear_ls(Xbag, class_conditional_Xbag)
+                p_bags.append(p)
+            p_boots.append(np.mean(p_bags, axis=0))
+
+        p_mean = np.mean(p_boots, axis=0)
+        p_std = np.std(p_boots, axis=0)
+
+        return p_mean
+
+    def std_constrained_linear_ls(self, X, class_cond_X: dict):
+        pass
+
+
 def _get_features_range(X):
     feat_ranges = []
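The std_constrained_linear_ls step above is left as a stub. One plausible reading (our assumption, not the PR's code) is a least-squares fit of the bag's mean feature profile against the class-conditional mean profiles, constrained to the probability simplex:

import numpy as np
from scipy.optimize import minimize

def simplex_constrained_ls(Xbag, class_cond_Xbag: dict):
    # solve  min_p || mean(Xbag) - sum_i p_i * mean(X_i) ||^2   s.t.  p >= 0, sum(p) = 1
    target = np.asarray(Xbag.mean(axis=0)).ravel()       # mean feature profile of the bag
    A = np.vstack([np.asarray(Xi.mean(axis=0)).ravel()   # one mean profile per class
                   for Xi in class_cond_Xbag.values()])
    n_classes = A.shape[0]
    p0 = np.full(n_classes, 1. / n_classes)
    res = minimize(
        lambda p: np.sum((target - p @ A) ** 2),
        p0,
        bounds=[(0., 1.)] * n_classes,
        constraints={'type': 'eq', 'fun': lambda p: p.sum() - 1},
        method='SLSQP',
    )
    return res.x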
@@ -56,6 +56,7 @@ def parallel(func, args, n_jobs, seed=None, asarray=True, backend='loky'):
     :param seed: the numeric seed
     :param asarray: set to True to return a np.ndarray instead of a list
     :param backend: indicates the backend used for handling parallel works
+    :param open_args: if True, then the delayed function is called on *args_i, instead of on args_i
     """
     def func_dec(environ, seed, *args):
         qp.environ = environ.copy()

@@ -74,6 +75,40 @@ def parallel(func, args, n_jobs, seed=None, asarray=True, backend='loky'):
     return out


+def parallel_unpack(func, args, n_jobs, seed=None, asarray=True, backend='loky'):
+    """
+    A wrapper of multiprocessing:
+
+    >>> Parallel(n_jobs=n_jobs)(
+    >>>     delayed(func)(*args_i) for args_i in args
+    >>> )
+
+    that takes the `quapy.environ` variable as input silently.
+    Seeds the child processes to ensure reproducibility when n_jobs>1.
+
+    :param func: callable
+    :param args: args of func
+    :param seed: the numeric seed
+    :param asarray: set to True to return a np.ndarray instead of a list
+    :param backend: indicates the backend used for handling parallel works
+    """
+    def func_dec(environ, seed, *args):
+        qp.environ = environ.copy()
+        qp.environ['N_JOBS'] = 1
+        # set a context with a temporary seed to ensure results are reproducible in parallel
+        with ExitStack() as stack:
+            if seed is not None:
+                stack.enter_context(qp.util.temp_seed(seed))
+            return func(*args)
+
+    out = Parallel(n_jobs=n_jobs, backend=backend)(
+        delayed(func_dec)(qp.environ, None if seed is None else seed + i, *args_i) for i, args_i in enumerate(args)
+    )
+    if asarray:
+        out = np.asarray(out)
+    return out
+
+
 @contextlib.contextmanager
 def temp_seed(random_state):
     """
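A quick usage sketch of the new helper (toy function; assumes the merged quapy exposes it as qp.util.parallel_unpack):

import quapy as qp

def add(a, b):
    return a + b

# each tuple in args is unpacked into add(a, b), run across 2 workers
print(qp.util.parallel_unpack(add, args=[(1, 2), (3, 4), (5, 6)], n_jobs=2, seed=0))
# -> [ 3  7 11]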