diff --git a/Ordinal/gen_tables.py b/Ordinal/gen_tables.py index 50adb8e..e57517b 100644 --- a/Ordinal/gen_tables.py +++ b/Ordinal/gen_tables.py @@ -14,13 +14,9 @@ outpath = f'./tables/{domain}/{prot}/results.tex' resultpath = join('./results', domain, prot) methods = [qname for qname, *_ in quantifiers()] -methods += [m+'-r' for m in methods] +# methods += [m+'-r' for m in methods] -table = Table(benchmarks=['low', 'mid', 'high'], - methods=methods, - prec_mean=4, - show_std=True, - prec_std=4) +table = Table(benchmarks=['low', 'mid', 'high', 'all'], methods=methods, prec_mean=4, show_std=True, prec_std=4) for resultfile in glob(f'{resultpath}/*.csv'): @@ -29,6 +25,7 @@ for resultfile in glob(f'{resultpath}/*.csv'): resultname = Path(resultfile).name method, drift, *other = resultname.replace('.csv', '').split('.') if other: + continue method += '-r' table.add(drift, method, nmd) @@ -37,9 +34,9 @@ os.makedirs(Path(outpath).parent, exist_ok=True) tabular = """ \\resizebox{\\textwidth}{!}{% - \\begin{tabular}{|c||""" + ('c|' * (table.nbenchmarks+1)) + """} \hline + \\begin{tabular}{|c||""" + ('c|' * (table.nbenchmarks)) + """} \hline """ -tabular += table.latexTabularT() +tabular += table.latexTabularT(average=False) tabular += """ \end{tabular}% }""" diff --git a/Ordinal/main.py b/Ordinal/main.py index 1659c2a..6fd2e5a 100644 --- a/Ordinal/main.py +++ b/Ordinal/main.py @@ -1,11 +1,10 @@ -import itertools - from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.linear_model import LogisticRegression import quapy as qp import numpy as np -from Ordinal.model import OrderedLogisticRegression, StackedClassifier, RegressionQuantification, RegressorClassifier +from Ordinal.model import OrderedLogisticRegression, StackedClassifier, RegressionQuantification, RegressorClassifier, \ + LogisticAT from quapy.method.aggregative import PACC, CC, EMQ, PCC, ACC, SLD, HDy from quapy.data import LabelledCollection from os.path import join @@ -18,6 +17,14 @@ from tqdm import tqdm import mord +#TODO: +# Ordinal LR, LAD -> balance sample_weight +# use BERT to extract features +# other domains? Kitchen, Electronics... +# try with the inverse of the distance +# add drift='all' + + def load_test_samples(): ids = np.load(join(datapath, domain, protocol, f'{drift}drift.test.id.npy')) ids = set(ids) @@ -34,22 +41,11 @@ def load_dev_samples(): yield sample.instances, sample.prevalence() -class LAD(mord.LAD): - def fit(self, X, y): - self.classes_ = sorted(np.unique(y)) - return super().fit(X, y) - - -class OrdinalRidge(mord.OrdinalRidge): - def fit(self, X, y): - self.classes_ = sorted(np.unique(y)) - return super().fit(X, y) - - def quantifiers(): params_LR = {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']} - params_OLR = {'alpha':np.logspace(-3, 3, 7)} - params_SVR = {'C': np.logspace(-3,3,7)} + # params_OLR = {'alpha':np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced']} + params_OLR = {'alpha': np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced']} + params_SVR = {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']} # params_SVR = {'C': np.logspace(0, 1, 2)} # baselines @@ -62,12 +58,12 @@ def quantifiers(): # with order-aware classifiers # threshold-based ordinal regression (see https://pythonhosted.org/mord/) - yield 'CC(OLR-AT)', CC(mord.LogisticAT()), params_OLR - yield 'PCC(OLR-AT)', PCC(mord.LogisticAT()), params_OLR - yield 'ACC(OLR-AT)', ACC(mord.LogisticAT()), params_OLR - yield 'PACC(OLR-AT)', PACC(mord.LogisticAT()), params_OLR + yield 'CC(OLR-AT)', CC(LogisticAT()), params_OLR + yield 'PCC(OLR-AT)', PCC(LogisticAT()), params_OLR + yield 'ACC(OLR-AT)', ACC(LogisticAT()), params_OLR + yield 'PACC(OLR-AT)', PACC(LogisticAT()), params_OLR #yield 'HDy(OLR-AT)', HDy(mord.LogisticAT()), params_OLR - yield 'SLD(OLR-AT)', EMQ(mord.LogisticAT()), params_OLR + yield 'SLD(OLR-AT)', EMQ(LogisticAT()), params_OLR # other options include mord.LogisticIT(alpha=1.), mord.LogisticSE(alpha=1.) # regression-based ordinal regression (see https://pythonhosted.org/mord/) @@ -75,6 +71,7 @@ def quantifiers(): # the other implementation has OrdinalRidge(alpha=1.0) and LAD(C=1.0) with my wrapper classes for having the nclasses_; those do # not implement predict_proba nor decision_score yield 'CC(SVR)', CC(RegressorClassifier()), params_SVR + yield 'CC-bal(SVR)', CC(RegressorClassifier()), params_SVR # yield 'PCC(SVR)', PCC(RegressorClassifier()), params_SVR # yield 'PCC-cal(SVR)', PCC(RegressorClassifier()), params_SVR # yield 'ACC(SVR)', ACC(RegressorClassifier()), params_SVR @@ -137,7 +134,7 @@ if __name__ == '__main__': train = pickle.load(open(join(datapath, domain, 'training_data.pkl'), 'rb')) with open(join(resultpath, 'hyper.txt'), 'at') as foo: - for drift in ['low', 'mid', 'high']: + for drift in ['low', 'mid', 'high', 'all']: params = [(*qs, drift) for qs in quantifiers()] hypers = qp.util.parallel(run_experiment, params, n_jobs=-2) for h in hypers: diff --git a/Ordinal/model.py b/Ordinal/model.py index 0c63123..71fbd70 100644 --- a/Ordinal/model.py +++ b/Ordinal/model.py @@ -10,6 +10,8 @@ from sklearn.multioutput import MultiOutputRegressor from sklearn.preprocessing import StandardScaler from sklearn.svm import LinearSVR, SVR from statsmodels.miscmodels.ordinal_model import OrderedModel +import mord +from sklearn.utils.class_weight import compute_class_weight class OrderedLogisticRegression: @@ -134,15 +136,20 @@ class RegressionQuantification: class RegressorClassifier(BaseEstimator, ClassifierMixin): - def __init__(self, C=1.0): + def __init__(self, C=1.0, class_weight=None): self.C = C + self.class_weight = class_weight - def fit(self, X, y): + def fit(self, X, y, sample_weight=None): self.regressor = LinearSVR(C=self.C) # self.regressor = SVR() # self.regressor = Ridge(normalize=True) - self.nclasses = len(np.unique(y)) - self.regressor.fit(X, y) + classes = sorted(np.unique(y)) + self.nclasses = len(classes) + if self.class_weight == 'balanced': + class_weight = compute_class_weight('balanced', classes=classes, y=y) + sample_weight = class_weight[y] + self.regressor.fit(X, y, sample_weight=sample_weight) return self def predict(self, X): @@ -179,3 +186,28 @@ class RegressorClassifier(BaseEstimator, ClassifierMixin): self.C = params['C'] +class LogisticAT(mord.LogisticAT): + def __init__(self, alpha=1.0, class_weight=None): + assert class_weight in [None, 'balanced'], 'unexpected value for class_weight' + self.class_weight = class_weight + super(LogisticAT, self).__init__(alpha=alpha) + + def fit(self, X, y, sample_weight=None): + if self.class_weight == 'balanced': + classes = sorted(np.unique(y)) + class_weight = compute_class_weight('balanced', classes=classes, y=y) + sample_weight = class_weight[y] + return super(LogisticAT, self).fit(X, y, sample_weight=sample_weight) + + +class LAD(mord.LAD): + def fit(self, X, y): + self.classes_ = sorted(np.unique(y)) + return super().fit(X, y) + + +class OrdinalRidge(mord.OrdinalRidge): + def fit(self, X, y): + self.classes_ = sorted(np.unique(y)) + return super().fit(X, y) + diff --git a/Ordinal/partition_dataset_by_shift.py b/Ordinal/partition_dataset_by_shift.py index 8f2ae54..fea213d 100644 --- a/Ordinal/partition_dataset_by_shift.py +++ b/Ordinal/partition_dataset_by_shift.py @@ -19,15 +19,19 @@ def partition_by_drift(split, training_prevalence): order = np.argsort(drifts) nD = len(order) low_drift, mid_drift, high_drift = order[:nD // 3], order[nD // 3:2 * nD // 3], order[2 * nD // 3:] + all_drift = np.arange(nD) np.save(join(datapath, domain, 'app', f'lowdrift.{split}.id.npy'), low_drift) np.save(join(datapath, domain, 'app', f'middrift.{split}.id.npy'), mid_drift) np.save(join(datapath, domain, 'app', f'highdrift.{split}.id.npy'), high_drift) + np.save(join(datapath, domain, 'app', f'alldrift.{split}.id.npy'), all_drift) lows = drifts[low_drift] mids = drifts[mid_drift] highs = drifts[high_drift] + all = drifts[all_drift] print(f'low drift: interval [{lows.min():.4f}, {lows.max():.4f}] mean: {lows.mean():.4f}') print(f'mid drift: interval [{mids.min():.4f}, {mids.max():.4f}] mean: {mids.mean():.4f}') print(f'high drift: interval [{highs.min():.4f}, {highs.max():.4f}] mean: {highs.mean():.4f}') + print(f'all drift: interval [{all.min():.4f}, {all.max():.4f}] mean: {all.mean():.4f}') domain = 'Books-tfidf' diff --git a/Ordinal/tabular.py b/Ordinal/tabular.py index cdb1d9c..a6bfc51 100644 --- a/Ordinal/tabular.py +++ b/Ordinal/tabular.py @@ -284,7 +284,7 @@ class Table: if average: tab += ' & ' tab += self.average.latexCell('ave', row) - tab += '\\\\\hline\n' + tab += '\\\\\hline\n' return tab def latexRow(self, benchmark, endl='\\\\\hline\n'):