From 4150f4351fcf3d12998e9fbf480ad0e392dce22c Mon Sep 17 00:00:00 2001
From: Alejandro Moreo <alejandro.moreo@isti.cnr.it>
Date: Fri, 15 Mar 2024 16:57:45 +0100
Subject: [PATCH] starting 5th approach

---
 Retrieval/fifth.py | 161 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 161 insertions(+)
 create mode 100644 Retrieval/fifth.py

diff --git a/Retrieval/fifth.py b/Retrieval/fifth.py
new file mode 100644
index 0000000..52cf49c
--- /dev/null
+++ b/Retrieval/fifth.py
@@ -0,0 +1,161 @@
+import numpy as np
+import pandas as pd
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import GridSearchCV
+from sklearn.svm import LinearSVC
+
+import quapy as qp
+import quapy.functional as F
+from Retrieval.commons import RetrievedSamples, load_txt_sample
+from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation
+from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
+from quapy.protocol import AbstractProtocol
+from quapy.data.base import LabelledCollection
+
+from glob import glob
+from os.path import join
+from tqdm import tqdm
+
+"""
+In this fifth experiment, we have pairs (Li, Ui), with Li a training set and Ui a test set, as
+in the fourth experiment, and the fairness groups are defined upon geographic info, as in the
+fourth case. As in the fourth, the data Li and Ui have been drawn by retrieving query-related
+documents from a pool of the same size. Unlike the fourth experiment, here the training queries are
+
+For now, 1000 in training and 100 in test.
+It seems there is now very little shift.
+"""
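+# A minimal sketch of how the protocol below is consumed (it mirrors the main loop at
+# the bottom of this script; `train` can be None when a pair could not be retrieved):
+#
+#     for train, test in experiment_prot():
+#         if train is not None:
+#             quantifier.fit(train)
+#             estim_prev = quantifier.quantify(test.instances)
+#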
+
+
+def cls(classifier_trained=None):
+    if classifier_trained is None:
+        # return LinearSVC()
+        return LogisticRegression()
+    else:
+        return classifier_trained
+
+
+def methods(classifier_trained=None):
+    yield ('CC', ClassifyAndCount(cls(classifier_trained)))
+    yield ('PACC', PACC(cls(classifier_trained), val_split=5, n_jobs=-1))
+    yield ('EMQ', EMQ(cls(classifier_trained), exact_train_prev=True))
+    yield ('EMQh', EMQ(cls(classifier_trained), exact_train_prev=False))
+    yield ('EMQ-BCTS', EMQ(cls(classifier_trained), exact_train_prev=False, recalib='bcts'))
+    yield ('EMQ-TS', EMQ(cls(classifier_trained), exact_train_prev=False, recalib='ts'))
+    yield ('EMQ-NBVS', EMQ(cls(classifier_trained), exact_train_prev=False, recalib='nbvs'))
+    # yield ('EMQ-VS', EMQ(cls(classifier_trained), exact_train_prev=False, recalib='vs'))
+    yield ('PCC', PCC(cls(classifier_trained)))
+    yield ('ACC', ACC(cls(classifier_trained), val_split=5, n_jobs=-1))
+    yield ('KDE001', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.001))
+    yield ('KDE005', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.005))  # <-- wow!
+    yield ('KDE01', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.01))
+    yield ('KDE02', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.02))
+    yield ('KDE03', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.03))
+    yield ('KDE05', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.05))
+    yield ('KDE07', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.07))
+    yield ('KDE10', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.10))
+    yield ('MLPE', MaximumLikelihoodPrevalenceEstimation())
+
+
+def train_classifier():
+    tfidf = TfidfVectorizer(sublinear_tf=True, min_df=10)
+    training = LabelledCollection.load(train_path, loader_func=load_txt_sample, verbose=True, parse_columns=False)
+
+    if REDUCE_TR > 0:
+        print('Reducing the number of documents in the training set to', REDUCE_TR)
+        training = training.sampling(REDUCE_TR, *training.prevalence())
+
+    Xtr, ytr = training.Xy
+    Xtr = tfidf.fit_transform(Xtr)
+    print('L orig shape = ', Xtr.shape)
+
+    training = LabelledCollection(Xtr, ytr)
+
+    print('training classifier')
+    classifier_trained = LogisticRegression()
+    classifier_trained = GridSearchCV(classifier_trained,
+                                      param_grid={'C': np.logspace(-3, 3, 7), 'class_weight': ['balanced', None]},
+                                      n_jobs=-1, cv=5)
+    classifier_trained.fit(Xtr, ytr)
+    classifier_trained = classifier_trained.best_estimator_
+    print('[Done!]')
+
+    classes = training.classes_
+
+    print('training classes:', classes)
+    print('training prevalence:', training.prevalence())
+
+    return tfidf, classifier_trained
+
+
+RANK_AT_K = 1000
+REDUCE_TR = 50000
+qp.environ['SAMPLE_SIZE'] = RANK_AT_K
+
+data_path = './50_50_split_trec'
+train_path = join(data_path, 'train_50_50_continent.txt')
+
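+# qp.util.pickled_resource caches the output of train_classifier() in 'classifier.pkl':
+# the classifier is trained only on the first run and simply unpickled on subsequent runs.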
+tfidf, classifier_trained = qp.util.pickled_resource('classifier.pkl', train_classifier)
+trained = True
+
+experiment_prot = RetrievedSamples(data_path,
+                                   load_fn=load_txt_sample,
+                                   vectorizer=tfidf,
+                                   max_train_lines=None,
+                                   max_test_lines=RANK_AT_K, classes=classifier_trained.classes_)
+
+result_mae_dict = {}
+result_mrae_dict = {}
+for method_name, quantifier in methods(classifier_trained):
+    # print('Starting with method=', method_name)
+
+    mae_errors = []
+    mrae_errors = []
+    pbar = tqdm(experiment_prot(), total=49)  # 49 = expected number of (train, test) pairs
+    for train, test in pbar:
+        if train is not None:
+            try:
+                # print(train.prevalence())
+                # print(test.prevalence())
+                if trained and method_name != 'MLPE':
+                    # reuse the pre-trained classifier; only the aggregation function is fitted
+                    quantifier.fit(train, val_split=train, fit_classifier=False)
+                else:
+                    quantifier.fit(train)
+                estim_prev = quantifier.quantify(test.instances)
+
+                mae = qp.error.mae(test.prevalence(), estim_prev)
+                mae_errors.append(mae)
+
+                mrae = qp.error.mrae(test.prevalence(), estim_prev)
+                mrae_errors.append(mrae)
+
+                # print()
+                # print('Training prevalence:', F.strprev(train.prevalence()), 'shape', train.X.shape)
+                # print('Test prevalence:', F.strprev(test.prevalence()), 'shape', test.X.shape)
+                # print('Estim prevalence:', F.strprev(estim_prev))
+
+            except Exception as e:
+                print(f'wow, something happened here! skipping; {e}')
+        else:
+            print('skipping one!')
+
+        pbar.set_description(f'{method_name}\tmae={np.mean(mae_errors):.4f}\tmrae={np.mean(mrae_errors):.4f}')
+    print()
+    result_mae_dict[method_name] = np.mean(mae_errors)
+    result_mrae_dict[method_name] = np.mean(mrae_errors)
+
+print('Results\n' + ('-' * 100))
+for method_name in result_mae_dict.keys():
+    MAE = result_mae_dict[method_name]
+    MRAE = result_mrae_dict[method_name]
+    print(f'{method_name}\t{MAE=:.5f}\t{MRAE=:.5f}')
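+
+# Illustrative output of the report above (the `{MAE=:.5f}` f-string syntax prints the
+# variable name together with its value). The numbers below are made up, for illustration only:
+#
+#   CC      MAE=0.04210     MRAE=1.52344
+#   PACC    MAE=0.03127     MRAE=1.20981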