import numpy as np import pandas as pd from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.linear_model import LogisticRegression, LogisticRegressionCV from sklearn.metrics import make_scorer, f1_score from sklearn.svm import LinearSVC from quapy.data.base import LabelledCollection from sklearn.model_selection import cross_val_score, GridSearchCV from os.path import join """ In this experiment, I simply try to understand whether the learning task can be learned or not. The problem is that we are quantifying the categories based on the alphabetical order (of what?). """ def load_txt_sample(path, parse_columns, verbose=False, max_lines=None): if verbose: print(f'loading {path}...', end='') df = pd.read_csv(path, sep='\t') if verbose: print('[done]') X = df['text'].values y = df['continent'].values if parse_columns: rank = df['rank'].values scores = df['score'].values order = np.argsort(rank) X = X[order] y = y[order] rank = rank[order] scores = scores[order] if max_lines is not None: X = X[:max_lines] y = y[:max_lines] return X, y data_path = './50_50_split_trec' train_path = join(data_path, 'train_50_50_continent.txt') tfidf = TfidfVectorizer(sublinear_tf=True, min_df=10) data = LabelledCollection.load(train_path, loader_func=load_txt_sample, verbose=True, parse_columns=False) data = data.sampling(20000) train, test = data.split_stratified() train.instances = tfidf.fit_transform(train.instances) test.instances = tfidf.transform(test.instances) # svm = LinearSVC() # cls = GridSearchCV(svm, param_grid={'C':np.logspace(-3,3,7), 'class_weight':['balanced', None]}) cls = LogisticRegression() cls.fit(*train.Xy) # score = cross_val_score(LogisticRegressionCV(), *data.Xy, scoring=make_scorer(f1_score, average='macro'), n_jobs=-1, cv=5) # print(score) # print(np.mean(score)) y_pred = cls.predict(test.instances) macrof1 = f1_score(y_true=test.labels, y_pred=y_pred, average='macro') microf1 = f1_score(y_true=test.labels, y_pred=y_pred, average='micro') print('macro', macrof1) print('micro', microf1)