1
0
Fork 0

adding sample_weight to ordinal-aware classifiers

This commit is contained in:
Alejandro Moreo Fernandez 2022-03-10 18:28:49 +01:00
parent b2e161480e
commit ad64dfe2a0
5 changed files with 66 additions and 36 deletions

View File

@ -14,13 +14,9 @@ outpath = f'./tables/{domain}/{prot}/results.tex'
resultpath = join('./results', domain, prot) resultpath = join('./results', domain, prot)
methods = [qname for qname, *_ in quantifiers()] methods = [qname for qname, *_ in quantifiers()]
methods += [m+'-r' for m in methods] # methods += [m+'-r' for m in methods]
table = Table(benchmarks=['low', 'mid', 'high'], table = Table(benchmarks=['low', 'mid', 'high', 'all'], methods=methods, prec_mean=4, show_std=True, prec_std=4)
methods=methods,
prec_mean=4,
show_std=True,
prec_std=4)
for resultfile in glob(f'{resultpath}/*.csv'): for resultfile in glob(f'{resultpath}/*.csv'):
@ -29,6 +25,7 @@ for resultfile in glob(f'{resultpath}/*.csv'):
resultname = Path(resultfile).name resultname = Path(resultfile).name
method, drift, *other = resultname.replace('.csv', '').split('.') method, drift, *other = resultname.replace('.csv', '').split('.')
if other: if other:
continue
method += '-r' method += '-r'
table.add(drift, method, nmd) table.add(drift, method, nmd)
@ -37,9 +34,9 @@ os.makedirs(Path(outpath).parent, exist_ok=True)
tabular = """ tabular = """
\\resizebox{\\textwidth}{!}{% \\resizebox{\\textwidth}{!}{%
\\begin{tabular}{|c||""" + ('c|' * (table.nbenchmarks+1)) + """} \hline \\begin{tabular}{|c||""" + ('c|' * (table.nbenchmarks)) + """} \hline
""" """
tabular += table.latexTabularT() tabular += table.latexTabularT(average=False)
tabular += """ tabular += """
\end{tabular}% \end{tabular}%
}""" }"""

View File

@ -1,11 +1,10 @@
import itertools
from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression from sklearn.linear_model import LogisticRegression
import quapy as qp import quapy as qp
import numpy as np import numpy as np
from Ordinal.model import OrderedLogisticRegression, StackedClassifier, RegressionQuantification, RegressorClassifier from Ordinal.model import OrderedLogisticRegression, StackedClassifier, RegressionQuantification, RegressorClassifier, \
LogisticAT
from quapy.method.aggregative import PACC, CC, EMQ, PCC, ACC, SLD, HDy from quapy.method.aggregative import PACC, CC, EMQ, PCC, ACC, SLD, HDy
from quapy.data import LabelledCollection from quapy.data import LabelledCollection
from os.path import join from os.path import join
@ -18,6 +17,14 @@ from tqdm import tqdm
import mord import mord
#TODO:
# Ordinal LR, LAD -> balance sample_weight
# use BERT to extract features
# other domains? Kitchen, Electronics...
# try with the inverse of the distance
# add drift='all'
def load_test_samples(): def load_test_samples():
ids = np.load(join(datapath, domain, protocol, f'{drift}drift.test.id.npy')) ids = np.load(join(datapath, domain, protocol, f'{drift}drift.test.id.npy'))
ids = set(ids) ids = set(ids)
@ -34,22 +41,11 @@ def load_dev_samples():
yield sample.instances, sample.prevalence() yield sample.instances, sample.prevalence()
class LAD(mord.LAD):
def fit(self, X, y):
self.classes_ = sorted(np.unique(y))
return super().fit(X, y)
class OrdinalRidge(mord.OrdinalRidge):
def fit(self, X, y):
self.classes_ = sorted(np.unique(y))
return super().fit(X, y)
def quantifiers(): def quantifiers():
params_LR = {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']} params_LR = {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']}
params_OLR = {'alpha':np.logspace(-3, 3, 7)} # params_OLR = {'alpha':np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced']}
params_SVR = {'C': np.logspace(-3,3,7)} params_OLR = {'alpha': np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced']}
params_SVR = {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']}
# params_SVR = {'C': np.logspace(0, 1, 2)} # params_SVR = {'C': np.logspace(0, 1, 2)}
# baselines # baselines
@ -62,12 +58,12 @@ def quantifiers():
# with order-aware classifiers # with order-aware classifiers
# threshold-based ordinal regression (see https://pythonhosted.org/mord/) # threshold-based ordinal regression (see https://pythonhosted.org/mord/)
yield 'CC(OLR-AT)', CC(mord.LogisticAT()), params_OLR yield 'CC(OLR-AT)', CC(LogisticAT()), params_OLR
yield 'PCC(OLR-AT)', PCC(mord.LogisticAT()), params_OLR yield 'PCC(OLR-AT)', PCC(LogisticAT()), params_OLR
yield 'ACC(OLR-AT)', ACC(mord.LogisticAT()), params_OLR yield 'ACC(OLR-AT)', ACC(LogisticAT()), params_OLR
yield 'PACC(OLR-AT)', PACC(mord.LogisticAT()), params_OLR yield 'PACC(OLR-AT)', PACC(LogisticAT()), params_OLR
#yield 'HDy(OLR-AT)', HDy(mord.LogisticAT()), params_OLR #yield 'HDy(OLR-AT)', HDy(mord.LogisticAT()), params_OLR
yield 'SLD(OLR-AT)', EMQ(mord.LogisticAT()), params_OLR yield 'SLD(OLR-AT)', EMQ(LogisticAT()), params_OLR
# other options include mord.LogisticIT(alpha=1.), mord.LogisticSE(alpha=1.) # other options include mord.LogisticIT(alpha=1.), mord.LogisticSE(alpha=1.)
# regression-based ordinal regression (see https://pythonhosted.org/mord/) # regression-based ordinal regression (see https://pythonhosted.org/mord/)
@ -75,6 +71,7 @@ def quantifiers():
# the other implementation has OrdinalRidge(alpha=1.0) and LAD(C=1.0), used through my wrapper classes that add the classes_ attribute; those # the other implementation has OrdinalRidge(alpha=1.0) and LAD(C=1.0), used through my wrapper classes that add the classes_ attribute; those
# implement neither predict_proba nor decision_function # implement neither predict_proba nor decision_function
yield 'CC(SVR)', CC(RegressorClassifier()), params_SVR yield 'CC(SVR)', CC(RegressorClassifier()), params_SVR
yield 'CC-bal(SVR)', CC(RegressorClassifier()), params_SVR
# yield 'PCC(SVR)', PCC(RegressorClassifier()), params_SVR # yield 'PCC(SVR)', PCC(RegressorClassifier()), params_SVR
# yield 'PCC-cal(SVR)', PCC(RegressorClassifier()), params_SVR # yield 'PCC-cal(SVR)', PCC(RegressorClassifier()), params_SVR
# yield 'ACC(SVR)', ACC(RegressorClassifier()), params_SVR # yield 'ACC(SVR)', ACC(RegressorClassifier()), params_SVR
@ -137,7 +134,7 @@ if __name__ == '__main__':
train = pickle.load(open(join(datapath, domain, 'training_data.pkl'), 'rb')) train = pickle.load(open(join(datapath, domain, 'training_data.pkl'), 'rb'))
with open(join(resultpath, 'hyper.txt'), 'at') as foo: with open(join(resultpath, 'hyper.txt'), 'at') as foo:
for drift in ['low', 'mid', 'high']: for drift in ['low', 'mid', 'high', 'all']:
params = [(*qs, drift) for qs in quantifiers()] params = [(*qs, drift) for qs in quantifiers()]
hypers = qp.util.parallel(run_experiment, params, n_jobs=-2) hypers = qp.util.parallel(run_experiment, params, n_jobs=-2)
for h in hypers: for h in hypers:

View File

@ -10,6 +10,8 @@ from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import StandardScaler from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVR, SVR from sklearn.svm import LinearSVR, SVR
from statsmodels.miscmodels.ordinal_model import OrderedModel from statsmodels.miscmodels.ordinal_model import OrderedModel
import mord
from sklearn.utils.class_weight import compute_class_weight
class OrderedLogisticRegression: class OrderedLogisticRegression:
@ -134,15 +136,20 @@ class RegressionQuantification:
class RegressorClassifier(BaseEstimator, ClassifierMixin): class RegressorClassifier(BaseEstimator, ClassifierMixin):
def __init__(self, C=1.0): def __init__(self, C=1.0, class_weight=None):
self.C = C self.C = C
self.class_weight = class_weight
def fit(self, X, y): def fit(self, X, y, sample_weight=None):
self.regressor = LinearSVR(C=self.C) self.regressor = LinearSVR(C=self.C)
# self.regressor = SVR() # self.regressor = SVR()
# self.regressor = Ridge(normalize=True) # self.regressor = Ridge(normalize=True)
self.nclasses = len(np.unique(y)) classes = sorted(np.unique(y))
self.regressor.fit(X, y) self.nclasses = len(classes)
if self.class_weight == 'balanced':
class_weight = compute_class_weight('balanced', classes=classes, y=y)
sample_weight = class_weight[y]
self.regressor.fit(X, y, sample_weight=sample_weight)
return self return self
def predict(self, X): def predict(self, X):
@ -179,3 +186,28 @@ class RegressorClassifier(BaseEstimator, ClassifierMixin):
self.C = params['C'] self.C = params['C']
class LogisticAT(mord.LogisticAT):
    """``mord.LogisticAT`` with an optional ``class_weight='balanced'`` mode.

    When ``class_weight`` is ``'balanced'``, every training instance is
    weighted by the balanced weight of its class (as computed by
    ``sklearn.utils.class_weight.compute_class_weight``) and those weights are
    handed to the parent ``fit`` through ``sample_weight``.

    Parameters
    ----------
    alpha : float, default=1.0
        Forwarded unchanged to ``mord.LogisticAT.__init__``.
    class_weight : {None, 'balanced'}, default=None
        ``None`` leaves ``sample_weight`` untouched; ``'balanced'`` derives
        per-instance weights from the class frequencies observed in ``y``.
    """

    def __init__(self, alpha=1.0, class_weight=None):
        assert class_weight in [None, 'balanced'], 'unexpected value for class_weight'
        self.class_weight = class_weight
        super(LogisticAT, self).__init__(alpha=alpha)

    def fit(self, X, y, sample_weight=None):
        """Fit the model, optionally re-weighting instances to balance classes.

        Parameters
        ----------
        X : training instances, passed through to the parent ``fit``.
        y : training labels, one per instance.
        sample_weight : array-like or None
            Optional per-instance weights. In ``'balanced'`` mode they are
            multiplied by the per-class balancing weights instead of being
            discarded.

        Returns
        -------
        Whatever ``mord.LogisticAT.fit`` returns.

        Raises
        ------
        ValueError
            If ``class_weight`` holds a value other than ``None`` or
            ``'balanced'`` (e.g. after being changed via ``set_params``,
            which bypasses the constructor check).
        """
        if self.class_weight not in (None, 'balanced'):
            raise ValueError('unexpected value for class_weight')

        if self.class_weight == 'balanced':
            labels = np.asarray(y)
            classes = np.unique(labels)  # sorted distinct labels
            per_class = compute_class_weight('balanced', classes=classes, y=labels)
            # Map every label to its *position* in `classes`; indexing the
            # weight vector with the raw label values is only valid when the
            # labels happen to be exactly 0..K-1.
            positions = np.searchsorted(classes, labels)
            balanced = per_class[positions]
            if sample_weight is None:
                sample_weight = balanced
            else:
                # Keep the caller's weights and scale them by the class weights.
                sample_weight = np.asarray(sample_weight) * balanced

        return super(LogisticAT, self).fit(X, y, sample_weight=sample_weight)
class LAD(mord.LAD):
    """Thin wrapper around ``mord.LAD`` that records ``classes_`` when fitted."""

    def fit(self, X, y):
        """Store the sorted distinct labels of ``y`` in ``classes_`` and fit.

        Returns whatever the parent ``fit`` returns.
        """
        distinct_labels = np.unique(y)
        self.classes_ = sorted(distinct_labels)
        return super(LAD, self).fit(X, y)
class OrdinalRidge(mord.OrdinalRidge):
    """Thin wrapper around ``mord.OrdinalRidge`` that records ``classes_`` when fitted."""

    def fit(self, X, y):
        """Store the sorted distinct labels of ``y`` in ``classes_`` and fit.

        Returns whatever the parent ``fit`` returns.
        """
        distinct_labels = np.unique(y)
        self.classes_ = sorted(distinct_labels)
        return super(OrdinalRidge, self).fit(X, y)

View File

@ -19,15 +19,19 @@ def partition_by_drift(split, training_prevalence):
order = np.argsort(drifts) order = np.argsort(drifts)
nD = len(order) nD = len(order)
low_drift, mid_drift, high_drift = order[:nD // 3], order[nD // 3:2 * nD // 3], order[2 * nD // 3:] low_drift, mid_drift, high_drift = order[:nD // 3], order[nD // 3:2 * nD // 3], order[2 * nD // 3:]
all_drift = np.arange(nD)
np.save(join(datapath, domain, 'app', f'lowdrift.{split}.id.npy'), low_drift) np.save(join(datapath, domain, 'app', f'lowdrift.{split}.id.npy'), low_drift)
np.save(join(datapath, domain, 'app', f'middrift.{split}.id.npy'), mid_drift) np.save(join(datapath, domain, 'app', f'middrift.{split}.id.npy'), mid_drift)
np.save(join(datapath, domain, 'app', f'highdrift.{split}.id.npy'), high_drift) np.save(join(datapath, domain, 'app', f'highdrift.{split}.id.npy'), high_drift)
np.save(join(datapath, domain, 'app', f'alldrift.{split}.id.npy'), all_drift)
lows = drifts[low_drift] lows = drifts[low_drift]
mids = drifts[mid_drift] mids = drifts[mid_drift]
highs = drifts[high_drift] highs = drifts[high_drift]
all = drifts[all_drift]
print(f'low drift: interval [{lows.min():.4f}, {lows.max():.4f}] mean: {lows.mean():.4f}') print(f'low drift: interval [{lows.min():.4f}, {lows.max():.4f}] mean: {lows.mean():.4f}')
print(f'mid drift: interval [{mids.min():.4f}, {mids.max():.4f}] mean: {mids.mean():.4f}') print(f'mid drift: interval [{mids.min():.4f}, {mids.max():.4f}] mean: {mids.mean():.4f}')
print(f'high drift: interval [{highs.min():.4f}, {highs.max():.4f}] mean: {highs.mean():.4f}') print(f'high drift: interval [{highs.min():.4f}, {highs.max():.4f}] mean: {highs.mean():.4f}')
print(f'all drift: interval [{all.min():.4f}, {all.max():.4f}] mean: {all.mean():.4f}')
domain = 'Books-tfidf' domain = 'Books-tfidf'