forked from moreo/QuaPy
Add regression-based adjustment of prevalence estimates using the validation set; initial experiments look promising.
This commit is contained in:
parent
b982a51103
commit
5df355a4e1
|
@ -16,7 +16,7 @@ from tqdm import tqdm
|
||||||
domain = 'Books-tfidf'
|
domain = 'Books-tfidf'
|
||||||
datapath = './data'
|
datapath = './data'
|
||||||
protocol = 'app'
|
protocol = 'app'
|
||||||
drift = 'low'
|
drift = 'high'
|
||||||
|
|
||||||
train = pickle.load(open(join(datapath, domain, 'training_data.pkl'), 'rb'))
|
train = pickle.load(open(join(datapath, domain, 'training_data.pkl'), 'rb'))
|
||||||
|
|
||||||
|
@ -37,26 +37,33 @@ def load_dev_samples():
|
||||||
|
|
||||||
print('fitting the quantifier')
|
print('fitting the quantifier')
|
||||||
|
|
||||||
|
# q = EMQ(LogisticRegression(class_weight='balanced'))
|
||||||
# q = PACC(LogisticRegression(class_weight='balanced'))
|
# q = PACC(LogisticRegression(class_weight='balanced'))
|
||||||
# q = PACC(OrderedLogisticRegression())
|
q = PACC(OrderedLogisticRegression())
|
||||||
# q = PACC(StackedClassifier(LogisticRegression(class_weight='balanced')))
|
# q = PACC(StackedClassifier(LogisticRegression(class_weight='balanced')))
|
||||||
# q = RegressionQuantification(PCC(LogisticRegression(class_weight='balanced')), val_samples_generator=load_dev_samples)
|
# q = RegressionQuantification(PCC(LogisticRegression(class_weight='balanced')), val_samples_generator=load_dev_samples)
|
||||||
q = PACC(RegressorClassifier())
|
# q = ACC(RegressorClassifier())
|
||||||
|
|
||||||
q = qp.model_selection.GridSearchQ(
|
param_grid = {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']}
|
||||||
q,
|
# param_grid = {'C': np.logspace(-3,3,14)}
|
||||||
# {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']},
|
# param_grid = {'alpha':np.logspace(-8, 6, 15)}
|
||||||
{'C': np.logspace(-3,3,14)},
|
|
||||||
1000,
|
# q = qp.model_selection.GridSearchQ(
|
||||||
'gen',
|
# q,
|
||||||
error=mnmd,
|
# param_grid,
|
||||||
val_split=load_dev_samples,
|
# 1000,
|
||||||
n_jobs=-1,
|
# 'gen',
|
||||||
refit=False,
|
# error=mnmd,
|
||||||
verbose=True)
|
# val_split=load_dev_samples,
|
||||||
|
# n_jobs=-1,
|
||||||
|
# refit=False,
|
||||||
|
# verbose=True)
|
||||||
|
|
||||||
q.fit(train)
|
q.fit(train)
|
||||||
|
|
||||||
|
# q = RegressionQuantification(q, val_samples_generator=load_dev_samples)
|
||||||
|
# q.fit(None)
|
||||||
|
|
||||||
print('[done]')
|
print('[done]')
|
||||||
|
|
||||||
report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
|
report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
|
||||||
|
@ -64,6 +71,14 @@ mean_nmd = report['nmd'].mean()
|
||||||
std_nmd = report['nmd'].std()
|
std_nmd = report['nmd'].std()
|
||||||
print(f'{mean_nmd:.4f} +-{std_nmd:.4f}')
|
print(f'{mean_nmd:.4f} +-{std_nmd:.4f}')
|
||||||
|
|
||||||
|
q = RegressionQuantification(q, val_samples_generator=load_dev_samples)
|
||||||
|
q.fit(None)
|
||||||
|
|
||||||
|
report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
|
||||||
|
mean_nmd = report['nmd'].mean()
|
||||||
|
std_nmd = report['nmd'].std()
|
||||||
|
print(f'[regression-correction] {mean_nmd:.4f} +-{std_nmd:.4f}')
|
||||||
|
|
||||||
# drift='high'
|
# drift='high'
|
||||||
# report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
|
# report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
|
||||||
# mean_nmd = report['nmd'].mean()
|
# mean_nmd = report['nmd'].mean()
|
||||||
|
|
|
@ -0,0 +1,172 @@
|
||||||
|
from copy import deepcopy
|
||||||
|
import numpy as np
|
||||||
|
from sklearn.base import BaseEstimator, ClassifierMixin
|
||||||
|
from sklearn.calibration import CalibratedClassifierCV
|
||||||
|
from sklearn.decomposition import TruncatedSVD
|
||||||
|
from sklearn.linear_model import LogisticRegression, Ridge
|
||||||
|
from scipy.sparse import issparse
|
||||||
|
from sklearn.multiclass import OneVsRestClassifier
|
||||||
|
from sklearn.multioutput import MultiOutputRegressor
|
||||||
|
from sklearn.preprocessing import StandardScaler
|
||||||
|
from sklearn.svm import LinearSVR, SVR
|
||||||
|
from statsmodels.miscmodels.ordinal_model import OrderedModel
|
||||||
|
|
||||||
|
|
||||||
|
class OrderedLogisticRegression:
    """Ordinal classifier backed by statsmodels' OrderedModel.

    Exposes a minimal sklearn-like interface (fit / predict / predict_proba)
    so it can be plugged into quantifiers such as PACC. Sparse inputs are
    first reduced with TruncatedSVD, since OrderedModel needs dense matrices.
    """

    def __init__(self, model='logit'):
        assert model in ['logit', 'probit'], 'unknown ordered model, valid ones are logit or probit'
        # link distribution assumed by the ordered model ('logit' or 'probit')
        self.model = model

    def fit(self, X, y):
        """Fit the ordered model on (X, y); returns self (sklearn convention).

        fix: the original did not return self, breaking fit(...).predict(...)
        chaining and the sklearn estimator contract.
        """
        if issparse(X):
            # OrderedModel cannot work with sparse design matrices: densify via SVD
            self.svd = TruncatedSVD(500)
            X = self.svd.fit_transform(X)
        self.learner = OrderedModel(y, X, distr=self.model)
        # skip_hessian: standard errors are not needed for prediction
        self.res_prob = self.learner.fit(method='bfgs', disp=False, skip_hessian=True)
        return self

    def predict(self, X):
        """Return the most probable class index for each row of X."""
        prob = self.predict_proba(X)
        return np.argmax(prob, axis=1)

    def predict_proba(self, X):
        """Return the posterior probability of each ordered class for each row of X."""
        if issparse(X):
            assert hasattr(self, 'svd'), \
                'X matrix in predict is sparse, but the method has not been fit with sparse type'
            X = self.svd.transform(X)
        return self.res_prob.model.predict(self.res_prob.params, exog=X)
|
||||||
|
|
||||||
|
|
||||||
|
class StackedClassifier:  # aka Funnelling Monolingual
    """Two-tier stack: a probabilistic base classifier produces posteriors,
    which are standardized and fed to a meta classifier trained on the
    same labels."""

    def __init__(self, base_estimator=LogisticRegression()):
        if not hasattr(base_estimator, 'predict_proba'):
            print('the estimator does not seem to be probabilistic: calibrating')
            base_estimator = CalibratedClassifierCV(base_estimator)
        # independent copies: base and meta are trained on different inputs
        self.base = deepcopy(base_estimator)
        self.meta = deepcopy(base_estimator)
        # z-scores the posterior matrix before it reaches the meta classifier
        self.norm = StandardScaler()

    def fit(self, X, y):
        """Train the base on raw instances, then the meta on normalized posteriors."""
        self.base.fit(X, y)
        posteriors = self.norm.fit_transform(self.base.predict_proba(X))
        self.meta.fit(posteriors, y)
        return self

    def _project(self, X):
        # map raw instances onto the normalized posterior space of the base
        return self.norm.transform(self.base.predict_proba(X))

    def predict(self, X):
        """Meta-level hard predictions for the instances in X."""
        return self.meta.predict(self._project(X))

    def predict_proba(self, X):
        """Meta-level posterior probabilities for the instances in X."""
        return self.meta.predict_proba(self._project(X))
|
||||||
|
|
||||||
|
|
||||||
|
class RegressionQuantification:
    """Wraps a base quantifier and corrects its prevalence estimates with a
    multi-output regressor fit on validation samples (estimated -> true)."""

    def __init__(self,
                 base_quantifier,
                 regression='svr',
                 val_samples_generator=None,
                 norm=True):
        self.base_quantifier = base_quantifier
        if isinstance(regression, str):
            assert regression in ['ridge', 'svr'], 'unknown regression model'
            if regression == 'ridge':
                # NOTE(review): Ridge(normalize=...) was removed in scikit-learn >= 1.2
                # — verify the pinned sklearn version supports it
                self.reg = Ridge(normalize=norm)
            elif regression == 'svr':
                self.reg = MultiOutputRegressor(LinearSVR())
        else:
            # a pre-built regressor instance was handed in directly
            self.reg = regression
        self.regression = regression
        self.val_samples_generator = val_samples_generator

    def generate_validation_samples(self):
        """Quantify every validation sample; return (estimated, true) prevalence arrays."""
        estimated, true_prevs = [], []
        for sample, prev in self.val_samples_generator():
            true_prevs.append(prev)
            estimated.append(self.base_quantifier.quantify(sample))
        return np.asarray(estimated), np.asarray(true_prevs)

    def fit(self, data):
        """Fit the base quantifier (when data is given) and then the correction regressor."""
        print('fitting quantifier')
        if data is not None:
            self.base_quantifier.fit(data)
        print('generating val samples')
        estimated, true_prevs = self.generate_validation_samples()
        print('fitting regressor')
        self.reg.fit(estimated, true_prevs)
        print('[done]')
        return self

    def quantify(self, instances):
        """Return the regression-corrected prevalence estimate, re-normalized to sum 1."""
        raw = self.base_quantifier.quantify(instances).reshape(1, -1)
        corrected = self.reg.predict(raw)
        # the regressor output need not be a distribution: re-normalize it
        corrected = corrected / corrected.sum()
        return corrected.flatten()

    def get_params(self, deep=True):
        # model selection operates on the base quantifier's hyperparameters
        return self.base_quantifier.get_params()

    def set_params(self, **params):
        self.base_quantifier.set_params(**params)
|
||||||
|
|
||||||
|
|
||||||
|
class RegressorClassifier(BaseEstimator, ClassifierMixin):
    """Classifier over ordinal labels 0..n-1 built on top of a regressor:
    class predictions are obtained by rounding and clipping the regressed
    value, and pseudo-posteriors decrease linearly with the distance between
    the regressed value and each class index."""

    def __init__(self):
        # regress directly onto the (ordinal) class indices
        self.regressor = LinearSVR()

    def fit(self, X, y):
        """Fit the underlying regressor; y is assumed to hold labels 0..n-1."""
        self.nclasses = len(np.unique(y))
        self.regressor.fit(X, y)
        return self

    def predict(self, X):
        """Round each regressed value to the nearest valid class index."""
        r = self.regressor.predict(X)
        # clip to the valid label range [0, nclasses-1]
        c = np.clip(np.round(r), 0, self.nclasses - 1)
        # fix: np.int alias was removed in NumPy >= 1.24; use the builtin int
        return c.astype(int)

    def predict_proba(self, X):
        """Triangular pseudo-posteriors: mass 1 at the regressed value, linearly
        decaying to 0 one class index away (rows over classes 0..n-1)."""
        r = self.regressor.predict(X)
        nC = len(self.classes_)
        r = np.clip(r, 0, nC - 1)
        dists = np.abs(np.tile(np.arange(nC), (len(r), 1)) - r.reshape(-1, 1))
        invdist = 1 - dists
        invdist[invdist < 0] = 0
        return invdist

    @property
    def classes_(self):
        # labels are assumed to be the contiguous range 0..nclasses-1
        return np.arange(self.nclasses)

    def get_params(self, deep=True):
        return self.regressor.get_params()

    def set_params(self, **params):
        self.regressor.set_params(**params)
|
||||||
|
|
|
@ -0,0 +1,40 @@
|
||||||
|
import numpy as np
|
||||||
|
import quapy as qp
|
||||||
|
from Ordinal.evaluation import nmd
|
||||||
|
from Ordinal.utils import load_samples_pkl
|
||||||
|
from quapy.data import LabelledCollection
|
||||||
|
import pickle
|
||||||
|
import os
|
||||||
|
from os.path import join
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
|
||||||
|
def partition_by_drift(split, training_prevalence):
    """Partition the samples of a split into low/mid/high drift thirds.

    Drift is measured as the NMD between the training prevalence and each
    sample's prevalence. The sample indices of each third are saved as .npy
    files under data/<domain>/app, and the drift intervals are printed.
    """
    assert split in ['dev', 'test'], 'invalid split name'
    # expected sample counts, used only for the progress bar
    total = 1000 if split == 'dev' else 5000
    sample_gen = load_samples_pkl(join(datapath, domain, 'app', f'{split}_samples'))
    drifts = np.asarray([
        nmd(training_prevalence, sample.prevalence())
        for sample in tqdm(sample_gen, total=total)
    ])
    order = np.argsort(drifts)
    nD = len(order)
    # split the drift-sorted indices into three equal-sized groups
    low_drift = order[:nD // 3]
    mid_drift = order[nD // 3:2 * nD // 3]
    high_drift = order[2 * nD // 3:]
    np.save(join(datapath, domain, 'app', f'lowdrift.{split}.id.npy'), low_drift)
    np.save(join(datapath, domain, 'app', f'middrift.{split}.id.npy'), mid_drift)
    np.save(join(datapath, domain, 'app', f'highdrift.{split}.id.npy'), high_drift)
    lows = drifts[low_drift]
    mids = drifts[mid_drift]
    highs = drifts[high_drift]
    print(f'low drift: interval [{lows.min():.4f}, {lows.max():.4f}] mean: {lows.mean():.4f}')
    print(f'mid drift: interval [{mids.min():.4f}, {mids.max():.4f}] mean: {mids.mean():.4f}')
    print(f'high drift: interval [{highs.min():.4f}, {highs.max():.4f}] mean: {highs.mean():.4f}')
|
||||||
|
|
||||||
|
|
||||||
|
# experiment setup: processed domain and data location
domain = 'Books-tfidf'
datapath = './data'

# the training prevalence is the reference point against which sample drift is measured
# fix: use a context manager so the pickle file handle is closed (was a leaked open())
with open(join(datapath, domain, 'training_data.pkl'), 'rb') as fin:
    training = pickle.load(fin)

partition_by_drift('dev', training.prevalence())
partition_by_drift('test', training.prevalence())
|
||||||
|
|
|
@ -0,0 +1,54 @@
|
||||||
|
import quapy as qp
|
||||||
|
from quapy.data import LabelledCollection
|
||||||
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
|
from os.path import join
|
||||||
|
import os
|
||||||
|
import pickle
|
||||||
|
from utils import load_samples
|
||||||
|
from tqdm import tqdm
|
||||||
|
import shutil
|
||||||
|
|
||||||
|
|
||||||
|
# location of the raw (text) samples and the domain to vectorize
datapath = './data'
domain = 'Books'
# the tfidf-vectorized version of the dataset is written under '<domain>-tfidf'
outname = domain + '-tfidf'
|
||||||
|
|
||||||
|
def save_preprocessing_info(transformer):
    """Record a human-readable description of the fitted vectorizer alongside the data."""
    info_path = join(datapath, outname, 'prep-info.txt')
    with open(info_path, 'wt') as foo:
        foo.write(f'{str(transformer)}\n')
|
||||||
|
|
||||||
|
|
||||||
|
# replicate the directory layout of the source domain for the tfidf-processed output;
# the prevalence files do not depend on the document representation, so they are
# copied over verbatim for both evaluation protocols (app and npp)
os.makedirs(join(datapath, outname), exist_ok=True)
os.makedirs(join(datapath, outname, 'app'), exist_ok=True)
os.makedirs(join(datapath, outname, 'app', 'dev_samples'), exist_ok=True)
os.makedirs(join(datapath, outname, 'app', 'test_samples'), exist_ok=True)
shutil.copyfile(join(datapath, domain, 'app', 'dev_prevalences.txt'), join(datapath, outname, 'app', 'dev_prevalences.txt'))
shutil.copyfile(join(datapath, domain, 'app', 'test_prevalences.txt'), join(datapath, outname, 'app', 'test_prevalences.txt'))
os.makedirs(join(datapath, outname, 'npp'), exist_ok=True)
os.makedirs(join(datapath, outname, 'npp', 'dev_samples'), exist_ok=True)
os.makedirs(join(datapath, outname, 'npp', 'test_samples'), exist_ok=True)
shutil.copyfile(join(datapath, domain, 'npp', 'dev_prevalences.txt'), join(datapath, outname, 'npp', 'dev_prevalences.txt'))
shutil.copyfile(join(datapath, domain, 'npp', 'test_prevalences.txt'), join(datapath, outname, 'npp', 'test_prevalences.txt'))
|
||||||
|
|
||||||
|
|
||||||
|
# fit the vectorizer on the training set only; dev/test samples are transformed with it later
tfidf = TfidfVectorizer(sublinear_tf=True, ngram_range=(1,2), min_df=5)

train = LabelledCollection.load(join(datapath, domain, 'training_data.txt'), loader_func=qp.data.reader.from_text)
train.instances = tfidf.fit_transform(train.instances)
save_preprocessing_info(tfidf)
# fix: use a context manager so the output file handle is closed (was a leaked open())
with open(join(datapath, outname, 'training_data.pkl'), 'wb') as fout:
    pickle.dump(train, fout, pickle.HIGHEST_PROTOCOL)
|
||||||
|
|
||||||
|
|
||||||
|
def transform_folder_samples(protocol, splitname):
    """Transform every sample under data/<domain>/<protocol>/<splitname> with the
    already-fitted tfidf vectorizer and pickle the result under the output domain,
    preserving each sample's positional index as its file name."""
    in_folder = join(datapath, domain, protocol, splitname)
    for i, sample in tqdm(enumerate(load_samples(in_folder, classes=train.classes_))):
        sample.instances = tfidf.transform(sample.instances)
        # fix: use a context manager so each output file handle is closed promptly
        # (the original leaked one open() per sample)
        with open(join(datapath, outname, protocol, splitname, f'{i}.pkl'), 'wb') as fout:
            pickle.dump(sample, fout, pickle.HIGHEST_PROTOCOL)
|
||||||
|
|
||||||
|
|
||||||
|
# vectorize all dev/test samples for both evaluation protocols
transform_folder_samples('app', 'dev_samples')
transform_folder_samples('app', 'test_samples')
transform_folder_samples('npp', 'dev_samples')
transform_folder_samples('npp', 'test_samples')
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue