diff --git a/Retrieval/commons.py b/Retrieval/commons.py
new file mode 100644
index 0000000..9ef4423
--- /dev/null
+++ b/Retrieval/commons.py
@@ -0,0 +1,74 @@
+import pandas as pd
+import numpy as np
+from glob import glob
+from os.path import join
+
+from quapy.data import LabelledCollection
+from quapy.protocol import AbstractProtocol
+
+
+def load_txt_sample(path, parse_columns, verbose=False, max_lines=None):
+    if verbose:
+        print(f'loading {path}...', end='')
+    df = pd.read_csv(path, sep='\t')
+    if verbose:
+        print('[done]')
+    X = df['text'].values
+    y = df['continent'].values
+
+    # documents labelled 'Antarctica' are discarded from the samples
+    if parse_columns:
+        rank = df['rank'].values
+        scores = df['score'].values
+        rank = rank[y != 'Antarctica']
+        scores = scores[y != 'Antarctica']
+
+    X = X[y != 'Antarctica']
+    y = y[y != 'Antarctica']
+
+    # sort by rank, so that max_lines retains the top-ranked documents
+    if parse_columns:
+        order = np.argsort(rank)
+        X = X[order]
+        y = y[order]
+        rank = rank[order]
+        scores = scores[order]
+
+    if max_lines is not None:
+        X = X[:max_lines]
+        y = y[:max_lines]
+
+    return X, y
+
+
+class RetrievedSamples(AbstractProtocol):
+
+    def __init__(self, path_dir: str, load_fn, vectorizer, max_train_lines=None, max_test_lines=None, classes=None):
+        self.path_dir = path_dir
+        self.load_fn = load_fn
+        self.vectorizer = vectorizer
+        self.max_train_lines = max_train_lines
+        self.max_test_lines = max_test_lines
+        self.classes = classes
+
+    def __call__(self):
+        for file in glob(join(self.path_dir, 'test_rankings', 'test_rankingstraining_rankings_*.txt')):
+
+            X, y = self.load_fn(file.replace('test_', 'training_'), parse_columns=True, max_lines=self.max_train_lines)
+            X = self.vectorizer.transform(X)
+            train_sample = LabelledCollection(X, y, classes=self.classes)
+
+            X, y = self.load_fn(file, parse_columns=True, max_lines=self.max_test_lines)
+            # if len(X)!=qp.environ['SAMPLE_SIZE']:
+            #     print(f'[warning]: file {file} contains {len(X)} instances (expected: {qp.environ["SAMPLE_SIZE"]})')
+            # assert len(X) == qp.environ['SAMPLE_SIZE'], f'unexpected sample size for file {file}, found {len(X)}'
+            X = self.vectorizer.transform(X)
+            try:
+                test_sample = LabelledCollection(X, y, classes=train_sample.classes_)
+            except ValueError as e:
+                print(f'file {file} caused error {e}')
+                yield None, None
+                continue  # without this, the yield below would hit an unbound test_sample
+
+            # print('train #classes:', train_sample.n_classes, train_sample.prevalence())
+            # print('test #classes:', test_sample.n_classes, test_sample.prevalence())
+
+            yield train_sample, test_sample
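+
+
+if __name__ == '__main__':
+    # Minimal smoke test (a sketch, not used by the experiments): it assumes the
+    # data layout of fourth.py ('./50_50_split_trec' and its training file) and
+    # simply counts how many well-formed (train, test) pairs the protocol yields.
+    from sklearn.feature_extraction.text import TfidfVectorizer
+
+    tfidf = TfidfVectorizer(sublinear_tf=True, min_df=10)
+    Xtr, ytr = load_txt_sample(join('./50_50_split_trec', 'train_50_50_continent.txt'), parse_columns=False)
+    tfidf.fit(Xtr)
+
+    prot = RetrievedSamples('./50_50_split_trec', load_fn=load_txt_sample, vectorizer=tfidf)
+    n_pairs = sum(1 for train, test in prot() if train is not None)
+    print(f'protocol produced {n_pairs} valid (train, test) pairs')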
diff --git a/Retrieval/fourth.py b/Retrieval/fourth.py
new file mode 100644
index 0000000..62b340b
--- /dev/null
+++ b/Retrieval/fourth.py
@@ -0,0 +1,161 @@
+import numpy as np
+import pandas as pd
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import GridSearchCV
+from sklearn.svm import LinearSVC
+
+import quapy as qp
+import quapy.functional as F
+from Retrieval.commons import RetrievedSamples, load_txt_sample
+from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation
+from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
+from quapy.protocol import AbstractProtocol
+from quapy.data.base import LabelledCollection
+
+from glob import glob
+from os.path import join
+from tqdm import tqdm
+
+"""
+In this fourth experiment, we have pairs (Li, Ui), with Li a training set and Ui a test set, as
+in the third experiment, and the fairness groups are defined upon geographic info, as in the third case.
+The difference here is that the data Li and Ui have been drawn by retrieving query-related documents from
+a pool of the same size.
+
+For now: 1000 documents in training and 100 in test.
+There now seems to be very little shift.
+"""
+
+def cls(classifier_trained=None):
+    if classifier_trained is None:
+        # return LinearSVC()
+        return LogisticRegression()
+    else:
+        return classifier_trained
+
+
+def methods(classifier_trained=None):
+    yield ('CC', ClassifyAndCount(cls(classifier_trained)))
+    yield ('PACC', PACC(cls(classifier_trained), val_split=5, n_jobs=-1))
+    yield ('EMQ', EMQ(cls(classifier_trained), exact_train_prev=True))
+    yield ('EMQh', EMQ(cls(classifier_trained), exact_train_prev=False))
+    yield ('EMQ-BCTS', EMQ(cls(classifier_trained), exact_train_prev=False, recalib='bcts'))
+    yield ('EMQ-TS', EMQ(cls(classifier_trained), exact_train_prev=False, recalib='ts'))
+    yield ('EMQ-NBVS', EMQ(cls(classifier_trained), exact_train_prev=False, recalib='nbvs'))
+    # yield ('EMQ-VS', EMQ(cls(classifier_trained), exact_train_prev=False, recalib='vs'))
+    yield ('PCC', PCC(cls(classifier_trained)))
+    yield ('ACC', ACC(cls(classifier_trained), val_split=5, n_jobs=-1))
+    yield ('KDE001', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.001))
+    yield ('KDE005', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.005))  # <-- wow!
+    yield ('KDE01', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.01))
+    yield ('KDE02', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.02))
+    yield ('KDE03', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.03))
+    yield ('KDE05', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.05))
+    yield ('KDE07', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.07))
+    yield ('KDE10', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.10))
+    yield ('MLPE', MaximumLikelihoodPrevalenceEstimation())
+
+
+def train_classifier():
+    tfidf = TfidfVectorizer(sublinear_tf=True, min_df=10)
+    training = LabelledCollection.load(train_path, loader_func=load_txt_sample, verbose=True, parse_columns=False)
+
+    if REDUCE_TR > 0:
+        print('Reducing the number of documents in the training to', REDUCE_TR)
+        training = training.sampling(REDUCE_TR, *training.prevalence())
+
+    Xtr, ytr = training.Xy
+    Xtr = tfidf.fit_transform(Xtr)
+    print('L orig shape = ', Xtr.shape)
+
+    training = LabelledCollection(Xtr, ytr)
+
+    print('training classifier')
+    classifier_trained = LogisticRegression()
+    classifier_trained = GridSearchCV(classifier_trained,
+                                      param_grid={'C': np.logspace(-3, 3, 7), 'class_weight': ['balanced', None]},
+                                      n_jobs=-1, cv=5)
+    classifier_trained.fit(Xtr, ytr)
+    classifier_trained = classifier_trained.best_estimator_
+    print('[Done!]')
+
+    classes = training.classes_
+
+    print('training classes:', classes)
+    print('training prevalence:', training.prevalence())
+
+    return tfidf, classifier_trained
+
+
+RANK_AT_K = 1000
+REDUCE_TR = 50000
+qp.environ['SAMPLE_SIZE'] = RANK_AT_K
+
+data_path = './50_50_split_trec'
+train_path = join(data_path, 'train_50_50_continent.txt')
+
+tfidf, classifier_trained = qp.util.pickled_resource('classifier.pkl', train_classifier)
+trained = True
+
+experiment_prot = RetrievedSamples(data_path,
+                                   load_fn=load_txt_sample,
+                                   vectorizer=tfidf,
+                                   max_train_lines=None,
+                                   max_test_lines=RANK_AT_K, classes=classifier_trained.classes_)
+
+result_mae_dict = {}
+result_mrae_dict = {}
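+
+# Optional sanity check (a sketch, commented out by default): peek at the first
+# (train, test) pair produced by the protocol to verify shapes and prevalences
+# before launching the full grid of methods.
+# for _train, _test in experiment_prot():
+#     if _train is not None:
+#         print('L:', _train.X.shape, F.strprev(_train.prevalence()))
+#         print('U:', _test.X.shape, F.strprev(_test.prevalence()))
+#     break
+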
+for method_name, quantifier in methods(classifier_trained):
+    # print('Starting with method=', method_name)
+
+    mae_errors = []
+    mrae_errors = []
+    pbar = tqdm(experiment_prot(), total=49)
+    for train, test in pbar:
+        if train is not None:
+            try:
+                # print(train.prevalence())
+                # print(test.prevalence())
+                if trained and method_name != 'MLPE':
+                    quantifier.fit(train, val_split=train, fit_classifier=False)
+                else:
+                    quantifier.fit(train)
+                estim_prev = quantifier.quantify(test.instances)
+
+                mae = qp.error.mae(test.prevalence(), estim_prev)
+                mae_errors.append(mae)
+
+                mrae = qp.error.mrae(test.prevalence(), estim_prev)
+                mrae_errors.append(mrae)
+
+                # print()
+                # print('Training prevalence:', F.strprev(train.prevalence()), 'shape', train.X.shape)
+                # print('Test prevalence:', F.strprev(test.prevalence()), 'shape', test.X.shape)
+                # print('Estim prevalence:', F.strprev(estim_prev))
+
+            except Exception as e:
+                print(f'something went wrong; skipping: {e}')
+        else:
+            print('skipping one!')
+
+        pbar.set_description(f'{method_name}\tmae={np.mean(mae_errors):.4f}\tmrae={np.mean(mrae_errors):.4f}')
+    print()
+    result_mae_dict[method_name] = np.mean(mae_errors)
+    result_mrae_dict[method_name] = np.mean(mrae_errors)
+
+print('Results\n' + ('-' * 100))
+for method_name in result_mae_dict.keys():
+    MAE = result_mae_dict[method_name]
+    MRAE = result_mrae_dict[method_name]
+    print(f'{method_name}\t{MAE=:.5f}\t{MRAE=:.5f}')
diff --git a/Retrieval/previous/preliminary_.py b/Retrieval/previous/preliminary_.py
new file mode 100644
index 0000000..959d70a
--- /dev/null
+++ b/Retrieval/previous/preliminary_.py
@@ -0,0 +1,98 @@
+import pandas as pd
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.linear_model import LogisticRegression
+
+import quapy as qp
+import quapy.functional as F
+from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation
+from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC
+from quapy.protocol import AbstractProtocol
+from quapy.data.base import LabelledCollection
+
+from glob import glob
+from os.path import join
+
+"""
+This was the very first experiment: one big training set and many test rankings produced according to some queries.
+The quantification methods did not seem to work; the more sophisticated the method, the worse it performed.
+This is a clear indication that the PPS assumptions do not hold.
+Actually, while the training set could be some iid sample from a distribution L and every test set
+an iid sample from a distribution U, it is pretty clear that P(X|Y) differs, since the test sets
+are biased towards a query term whereas the training set is not.
+"""
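+
+# A possible diagnostic for the P(X|Y) shift conjectured above (a sketch, not part
+# of the original experiment): under PPS only the priors P(Y) change, so the
+# per-class recalls of a fixed classifier should carry over from L to any U; a
+# marked drop on retrieved samples supports the claim that the query biases P(X|Y).
+def diagnose_shift(classifier, held_out_L, sample_U):
+    from sklearn.metrics import recall_score
+    rec_L = recall_score(held_out_L.labels, classifier.predict(held_out_L.instances), average='macro')
+    rec_U = recall_score(sample_U.labels, classifier.predict(sample_U.instances), average='macro')
+    print(f'macro-recall on L={rec_L:.3f} vs. on U={rec_U:.3f}')
+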
+""" + +def methods(): + yield ('MLPE', MaximumLikelihoodPrevalenceEstimation()) + yield ('CC', ClassifyAndCount(LogisticRegression(n_jobs=-1))) + yield ('ACC', ACC(LogisticRegression(n_jobs=-1))) + yield ('PCC', PCC(LogisticRegression(n_jobs=-1))) + yield ('PACC', PACC(LogisticRegression(n_jobs=-1))) + yield ('EMQ', EMQ(LogisticRegression(n_jobs=-1))) + + +def load_txt_sample(path, verbose=False): + if verbose: + print(f'loading {path}...', end='') + df = pd.read_csv(path, sep='\t') + if verbose: + print('[done]') + X = df['text'] + y = df['first_letter_category'] + + return X, y + +class RetrievedSamples(AbstractProtocol): + + def __init__(self, path_dir: str, load_fn, vectorizer, classes): + self.path_dir = path_dir + self.load_fn = load_fn + self.vectorizer = vectorizer + self.classes = classes + + def __call__(self): + for file in glob(join(self.path_dir, 'test_data_*.txt')): + X, y = self.load_fn(file) + if len(X)!=qp.environ['SAMPLE_SIZE']: + print(f'[warning]: file {file} contains {len(X)} instances (expected: {qp.environ["SAMPLE_SIZE"]})') + # assert len(X) == qp.environ['SAMPLE_SIZE'], f'unexpected sample size for file {file}, found {len(X)}' + X = self.vectorizer.transform(X) + sample = LabelledCollection(X, y, classes=self.classes) + yield sample.Xp + + +qp.environ['SAMPLE_SIZE']=100 + +data_path = './data' +train_path = join(data_path, 'train_data.txt') + + +tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5) + +training = LabelledCollection.load(train_path, loader_func=load_txt_sample, verbose=True) + +# training = training.sampling(1000) + +Xtr, ytr = training.Xy +Xtr = tfidf.fit_transform(Xtr) +print('Xtr shape = ', Xtr.shape) + +training = LabelledCollection(Xtr, ytr) +classes = training.classes_ + +test_prot = RetrievedSamples(data_path, load_fn=load_txt_sample, vectorizer=tfidf, classes=classes) + +print('Training prevalence:', F.strprev(training.prevalence())) +for X, p in test_prot(): + print('Test prevalence:', F.strprev(p)) + +for method_name, quantifier in methods(): + print('training ', method_name) + quantifier.fit(training) + print('[done]') + + report = qp.evaluation.evaluation_report(quantifier, test_prot, error_metrics=['mae', 'mrae'], verbose=True) + + print(report.mean()) + + + diff --git a/Retrieval/previous/second.py b/Retrieval/previous/second.py new file mode 100644 index 0000000..5a28e8b --- /dev/null +++ b/Retrieval/previous/second.py @@ -0,0 +1,131 @@ +import numpy as np +import pandas as pd +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.linear_model import LogisticRegression + +import quapy as qp +import quapy.functional as F +from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation +from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC +from quapy.protocol import AbstractProtocol +from quapy.data.base import LabelledCollection + +from glob import glob +from os.path import join +from tqdm import tqdm + +""" +In this second experiment, we have pairs of (Li,Ui) with Li a training set and Ui a test set. +Both elements in the pair are *retrieved according to the same query*. This is a way to impose +the same type of bias that was present in the test, to the training set. Let's see... 
+""" + +def methods(): + yield ('PACC', PACC(LogisticRegression(), val_split=5, n_jobs=-1)) + yield ('CC', ClassifyAndCount(LogisticRegression())) + yield ('EMQ', EMQ(LogisticRegression())) + yield ('PCC', PCC(LogisticRegression())) + yield ('ACC', ACC(LogisticRegression(), val_split=5, n_jobs=-1)) + yield ('MLPE', MaximumLikelihoodPrevalenceEstimation()) + + +def load_txt_sample(path, parse_columns, verbose=False, max_lines=None): + if verbose: + print(f'loading {path}...', end='') + df = pd.read_csv(path, sep='\t') + if verbose: + print('[done]') + X = df['text'].values + y = df['first_letter_category'].values + + if parse_columns: + rank = df['rank'].values + scores = df['score'].values + order = np.argsort(rank) + X = X[order] + y = y[order] + rank = rank[order] + scores = scores[order] + + if max_lines is not None: + X = X[:max_lines] + y = y[:max_lines] + + return X, y + + +class RetrievedSamples(AbstractProtocol): + + def __init__(self, path_dir: str, load_fn, vectorizer, classes, max_train_lines=None, max_test_lines=None): + self.path_dir = path_dir + self.load_fn = load_fn + self.vectorizer = vectorizer + self.classes = classes + self.max_train_lines = max_train_lines + self.max_test_lines = max_test_lines + + def __call__(self): + for file in glob(join(self.path_dir, 'test_rankings_*.txt')): + + X, y = self.load_fn(file.replace('test_', 'training_'), parse_columns=True, max_lines=self.max_train_lines) + X = self.vectorizer.transform(X) + train_sample = LabelledCollection(X, y, classes=self.classes) + + X, y = self.load_fn(file, parse_columns=True, max_lines=self.max_test_lines) + if len(X)!=qp.environ['SAMPLE_SIZE']: + print(f'[warning]: file {file} contains {len(X)} instances (expected: {qp.environ["SAMPLE_SIZE"]})') + # assert len(X) == qp.environ['SAMPLE_SIZE'], f'unexpected sample size for file {file}, found {len(X)}' + X = self.vectorizer.transform(X) + test_sample = LabelledCollection(X, y, classes=self.classes) + + yield train_sample, test_sample + + +RANK_AT_K = 500 +REDUCE_TR = 50000 +qp.environ['SAMPLE_SIZE'] = RANK_AT_K + +data_path = './newCollection' +train_path = join(data_path, 'train_data.txt') + +tfidf = TfidfVectorizer(sublinear_tf=True, min_df=10) + +training = LabelledCollection.load(train_path, loader_func=load_txt_sample, verbose=True, parse_columns=False) +if REDUCE_TR>0: + print('Reducing the number of documents in the training to', REDUCE_TR) + training = training.sampling(REDUCE_TR) + +Xtr, ytr = training.Xy +Xtr = tfidf.fit_transform(Xtr) +print('L orig shape = ', Xtr.shape) + +training = LabelledCollection(Xtr, ytr) +classes = training.classes_ + +experiment_prot = RetrievedSamples(data_path, + load_fn=load_txt_sample, + vectorizer=tfidf, + classes=classes, + max_train_lines=RANK_AT_K, + max_test_lines=RANK_AT_K) + +for method_name, quantifier in methods(): + print('Starting with method=', method_name) + + errors = [] + pbar = tqdm(experiment_prot(), total=49) + for train, test in pbar: + # print('Training prevalence:', F.strprev(training.prevalence()), 'shape', train.X.shape) + # print('Test prevalence:', F.strprev(test.prevalence()), 'shape', test.X.shape) + + quantifier.fit(train) + estim_prev = quantifier.quantify(test.instances) + mae = qp.error.mae(test.prevalence(), estim_prev) + errors.append(mae) + + pbar.set_description(f'mae={np.mean(errors):.4f}') + print() + + + + diff --git a/Retrieval/previous/third.py b/Retrieval/previous/third.py new file mode 100644 index 0000000..ab3a649 --- /dev/null +++ b/Retrieval/previous/third.py @@ -0,0 +1,155 
+import numpy as np
+import pandas as pd
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.linear_model import LogisticRegression
+
+import quapy as qp
+import quapy.functional as F
+from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation
+from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC
+from quapy.protocol import AbstractProtocol
+from quapy.data.base import LabelledCollection
+
+from glob import glob
+from os.path import join
+from tqdm import tqdm
+
+"""
+In this third experiment, we have pairs (Li, Ui), with Li a training set and Ui a test set, as
+in the second experiment, but in this case the fairness groups are defined upon geographic info.
+"""
+
+def methods():
+    yield ('CC', ClassifyAndCount(LogisticRegression()))
+    yield ('PACC', PACC(LogisticRegression(), val_split=5, n_jobs=-1))
+    yield ('EMQ', EMQ(LogisticRegression()))
+    yield ('PCC', PCC(LogisticRegression()))
+    yield ('ACC', ACC(LogisticRegression(), val_split=5, n_jobs=-1))
+    yield ('MLPE', MaximumLikelihoodPrevalenceEstimation())
+
+
+def load_txt_sample(path, parse_columns, verbose=False, max_lines=None):
+    if verbose:
+        print(f'loading {path}...', end='')
+    df = pd.read_csv(path, sep='\t')
+    if verbose:
+        print('[done]')
+    X = df['text'].values
+    y = df['continent'].values
+
+    # documents labelled 'Antarctica' are discarded from the samples
+    if parse_columns:
+        rank = df['rank'].values
+        scores = df['score'].values
+        rank = rank[y != 'Antarctica']
+        scores = scores[y != 'Antarctica']
+
+    X = X[y != 'Antarctica']
+    y = y[y != 'Antarctica']
+
+    # sort by rank, so that max_lines retains the top-ranked documents
+    if parse_columns:
+        order = np.argsort(rank)
+        X = X[order]
+        y = y[order]
+        rank = rank[order]
+        scores = scores[order]
+
+    if max_lines is not None:
+        X = X[:max_lines]
+        y = y[:max_lines]
+
+    return X, y
+
+
+class RetrievedSamples(AbstractProtocol):
+
+    def __init__(self, path_dir: str, load_fn, vectorizer, max_train_lines=None, max_test_lines=None):
+        self.path_dir = path_dir
+        self.load_fn = load_fn
+        self.vectorizer = vectorizer
+        self.max_train_lines = max_train_lines
+        self.max_test_lines = max_test_lines
+
+    def __call__(self):
+        for file in glob(join(self.path_dir, 'test_rankings_*.txt')):
+
+            X, y = self.load_fn(file.replace('test_', 'training_'), parse_columns=True, max_lines=self.max_train_lines)
+            X = self.vectorizer.transform(X)
+            train_sample = LabelledCollection(X, y)
+
+            X, y = self.load_fn(file, parse_columns=True, max_lines=self.max_test_lines)
+            if len(X) != qp.environ['SAMPLE_SIZE']:
+                print(f'[warning]: file {file} contains {len(X)} instances (expected: {qp.environ["SAMPLE_SIZE"]})')
+            # assert len(X) == qp.environ['SAMPLE_SIZE'], f'unexpected sample size for file {file}, found {len(X)}'
+            X = self.vectorizer.transform(X)
+            try:
+                test_sample = LabelledCollection(X, y, classes=train_sample.classes_)
+            except ValueError as e:
+                print(f'file {file} caused error {e}')
+                yield None, None
+                continue  # without this, the yield below would hit an unbound test_sample
+
+            # print('train #classes:', train_sample.n_classes, train_sample.prevalence())
+            # print('test #classes:', test_sample.n_classes, test_sample.prevalence())
+
+            yield train_sample, test_sample
+
+
+RANK_AT_K = 100
+REDUCE_TR = 50000
+qp.environ['SAMPLE_SIZE'] = RANK_AT_K
+
+data_path = './newCollectionGeo'
+train_path = join(data_path, 'train_data_continent.txt')
+
+tfidf = TfidfVectorizer(sublinear_tf=True, min_df=10)
+
+training = LabelledCollection.load(train_path, loader_func=load_txt_sample, verbose=True, parse_columns=False)
+
+if REDUCE_TR > 0:
+    print('Reducing the number of documents in the training to', REDUCE_TR)
+    training = training.sampling(REDUCE_TR)
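+    # A variant used in fourth.py keeps the subsample's class distribution aligned
+    # with the full training set by passing the prevalence explicitly:
+    #   training = training.sampling(REDUCE_TR, *training.prevalence())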
+
+Xtr, ytr = training.Xy
+Xtr = tfidf.fit_transform(Xtr)
+print('L orig shape = ', Xtr.shape)
+
+training = LabelledCollection(Xtr, ytr)
+classes = training.classes_
+
+print('training classes:', classes)
+print('training prevalence:', training.prevalence())
+
+experiment_prot = RetrievedSamples(data_path,
+                                   load_fn=load_txt_sample,
+                                   vectorizer=tfidf,
+                                   max_train_lines=None,
+                                   max_test_lines=RANK_AT_K)
+
+for method_name, quantifier in methods():
+    print('Starting with method=', method_name)
+
+    errors = []
+    pbar = tqdm(experiment_prot(), total=49)
+    for train, test in pbar:
+        if train is not None:
+            try:
+                # print('Training prevalence:', F.strprev(training.prevalence()), 'shape', train.X.shape)
+                # print('Test prevalence:', F.strprev(test.prevalence()), 'shape', test.X.shape)
+
+                # print(train.prevalence())
+                # print(test.prevalence())
+                quantifier.fit(train)
+                estim_prev = quantifier.quantify(test.instances)
+                mae = qp.error.mae(test.prevalence(), estim_prev)
+                errors.append(mae)
+            except Exception as e:
+                print(f'something went wrong; skipping: {e}')
+        else:
+            print('skipping one!')
+
+        pbar.set_description(f'mae={np.mean(errors):.4f}')
+    print()
diff --git a/Retrieval/understand_classif_scheme.py b/Retrieval/understand_classif_scheme.py
new file mode 100644
index 0000000..19314ef
--- /dev/null
+++ b/Retrieval/understand_classif_scheme.py
@@ -0,0 +1,66 @@
+import numpy as np
+import pandas as pd
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
+from sklearn.metrics import make_scorer, f1_score
+from sklearn.svm import LinearSVC
+
+from quapy.data.base import LabelledCollection
+from sklearn.model_selection import cross_val_score, GridSearchCV
+
+from os.path import join
+
+"""
+In this experiment, I simply try to understand whether the learning task can be learned or not.
+The problem is that we are quantifying the categories based on the alphabetical order (of what?).
+"""
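+
+# A reference point for the F1 scores computed below (a sketch, defined but not
+# called): a majority-class baseline; if LogisticRegression does not clearly beat
+# it, the task is arguably not being learned.
+def baseline_macro_f1(train, test):
+    from sklearn.dummy import DummyClassifier
+    from sklearn.metrics import f1_score
+    dummy = DummyClassifier(strategy='most_frequent').fit(*train.Xy)
+    return f1_score(test.labels, dummy.predict(test.instances), average='macro')
+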
+""" + +def load_txt_sample(path, parse_columns, verbose=False, max_lines=None): + if verbose: + print(f'loading {path}...', end='') + df = pd.read_csv(path, sep='\t') + if verbose: + print('[done]') + X = df['text'].values + y = df['continent'].values + + if parse_columns: + rank = df['rank'].values + scores = df['score'].values + order = np.argsort(rank) + X = X[order] + y = y[order] + rank = rank[order] + scores = scores[order] + + if max_lines is not None: + X = X[:max_lines] + y = y[:max_lines] + + return X, y + +data_path = './50_50_split_trec' +train_path = join(data_path, 'train_50_50_continent.txt') + +tfidf = TfidfVectorizer(sublinear_tf=True, min_df=10) +data = LabelledCollection.load(train_path, loader_func=load_txt_sample, verbose=True, parse_columns=False) +data = data.sampling(20000) +train, test = data.split_stratified() +train.instances = tfidf.fit_transform(train.instances) +test.instances = tfidf.transform(test.instances) + +# svm = LinearSVC() +# cls = GridSearchCV(svm, param_grid={'C':np.logspace(-3,3,7), 'class_weight':['balanced', None]}) +cls = LogisticRegression() +cls.fit(*train.Xy) + +# score = cross_val_score(LogisticRegressionCV(), *data.Xy, scoring=make_scorer(f1_score, average='macro'), n_jobs=-1, cv=5) +# print(score) +# print(np.mean(score)) + +y_pred = cls.predict(test.instances) +macrof1 = f1_score(y_true=test.labels, y_pred=y_pred, average='macro') +microf1 = f1_score(y_true=test.labels, y_pred=y_pred, average='micro') + +print('macro', macrof1) +print('micro', microf1)