forked from moreo/QuaPy
regression-based adjustment using the validation set; seems to be working
This commit is contained in:
parent
b982a51103
commit
5df355a4e1
|
@ -16,7 +16,7 @@ from tqdm import tqdm
|
|||
domain = 'Books-tfidf'
|
||||
datapath = './data'
|
||||
protocol = 'app'
|
||||
drift = 'low'
|
||||
drift = 'high'
|
||||
|
||||
train = pickle.load(open(join(datapath, domain, 'training_data.pkl'), 'rb'))
|
||||
|
||||
|
@ -37,26 +37,33 @@ def load_dev_samples():
|
|||
|
||||
print('fitting the quantifier')
|
||||
|
||||
# q = EMQ(LogisticRegression(class_weight='balanced'))
|
||||
# q = PACC(LogisticRegression(class_weight='balanced'))
|
||||
# q = PACC(OrderedLogisticRegression())
|
||||
q = PACC(OrderedLogisticRegression())
|
||||
# q = PACC(StackedClassifier(LogisticRegression(class_weight='balanced')))
|
||||
# q = RegressionQuantification(PCC(LogisticRegression(class_weight='balanced')), val_samples_generator=load_dev_samples)
|
||||
q = PACC(RegressorClassifier())
|
||||
# q = ACC(RegressorClassifier())
|
||||
|
||||
q = qp.model_selection.GridSearchQ(
|
||||
q,
|
||||
# {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']},
|
||||
{'C': np.logspace(-3,3,14)},
|
||||
1000,
|
||||
'gen',
|
||||
error=mnmd,
|
||||
val_split=load_dev_samples,
|
||||
n_jobs=-1,
|
||||
refit=False,
|
||||
verbose=True)
|
||||
param_grid = {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']}
|
||||
# param_grid = {'C': np.logspace(-3,3,14)}
|
||||
# param_grid = {'alpha':np.logspace(-8, 6, 15)}
|
||||
|
||||
# q = qp.model_selection.GridSearchQ(
|
||||
# q,
|
||||
# param_grid,
|
||||
# 1000,
|
||||
# 'gen',
|
||||
# error=mnmd,
|
||||
# val_split=load_dev_samples,
|
||||
# n_jobs=-1,
|
||||
# refit=False,
|
||||
# verbose=True)
|
||||
|
||||
q.fit(train)
|
||||
|
||||
# q = RegressionQuantification(q, val_samples_generator=load_dev_samples)
|
||||
# q.fit(None)
|
||||
|
||||
print('[done]')
|
||||
|
||||
report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
|
||||
|
@ -64,6 +71,14 @@ mean_nmd = report['nmd'].mean()
|
|||
std_nmd = report['nmd'].std()
|
||||
print(f'{mean_nmd:.4f} +-{std_nmd:.4f}')
|
||||
|
||||
q = RegressionQuantification(q, val_samples_generator=load_dev_samples)
|
||||
q.fit(None)
|
||||
|
||||
report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
|
||||
mean_nmd = report['nmd'].mean()
|
||||
std_nmd = report['nmd'].std()
|
||||
print(f'[regression-correction] {mean_nmd:.4f} +-{std_nmd:.4f}')
|
||||
|
||||
# drift='high'
|
||||
# report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
|
||||
# mean_nmd = report['nmd'].mean()
|
||||
|
|
|
@ -0,0 +1,172 @@
|
|||
from copy import deepcopy
|
||||
import numpy as np
|
||||
from sklearn.base import BaseEstimator, ClassifierMixin
|
||||
from sklearn.calibration import CalibratedClassifierCV
|
||||
from sklearn.decomposition import TruncatedSVD
|
||||
from sklearn.linear_model import LogisticRegression, Ridge
|
||||
from scipy.sparse import issparse
|
||||
from sklearn.multiclass import OneVsRestClassifier
|
||||
from sklearn.multioutput import MultiOutputRegressor
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.svm import LinearSVR, SVR
|
||||
from statsmodels.miscmodels.ordinal_model import OrderedModel
|
||||
|
||||
|
||||
class OrderedLogisticRegression:
    """Ordinal (ordered) regression wrapper around statsmodels' OrderedModel.

    Sparse inputs are first projected to a dense 500-dimensional space via
    TruncatedSVD, since OrderedModel requires a dense design matrix.
    """

    def __init__(self, model='logit'):
        # only the two link functions supported by statsmodels are accepted
        assert model in ['logit', 'probit'], 'unknown ordered model, valid ones are logit or probit'
        self.model = model

    def fit(self, X, y):
        """Fit the ordered model on (X, y); densifies X through SVD when sparse."""
        if issparse(X):
            self.svd = TruncatedSVD(500)
            X = self.svd.fit_transform(X)
        self.learner = OrderedModel(y, X, distr=self.model)
        # bfgs without the Hessian keeps fitting fast; only point estimates are needed
        self.res_prob = self.learner.fit(method='bfgs', disp=False, skip_hessian=True)

    def predict(self, X):
        """Return the most likely ordinal class index for each row of X."""
        return self.predict_proba(X).argmax(axis=1)

    def predict_proba(self, X):
        """Return the posterior probability matrix (n_samples x n_classes)."""
        if issparse(X):
            assert hasattr(self, 'svd'), \
                'X matrix in predict is sparse, but the method has not been fit with sparse type'
            X = self.svd.transform(X)
        return self.res_prob.model.predict(self.res_prob.params, exog=X)
||||
|
||||
|
||||
class StackedClassifier:  # aka Funnelling Monolingual
    """Two-level stack: a base classifier feeds its standardized posterior
    probabilities to a meta classifier trained on the same labels."""

    def __init__(self, base_estimator=LogisticRegression()):
        if not hasattr(base_estimator, 'predict_proba'):
            # wrap non-probabilistic estimators so predict_proba becomes available
            print('the estimator does not seem to be probabilistic: calibrating')
            base_estimator = CalibratedClassifierCV(base_estimator)
        # independent copies so base and meta levels are trained separately
        self.base = deepcopy(base_estimator)
        self.meta = deepcopy(base_estimator)
        self.norm = StandardScaler()

    def fit(self, X, y):
        """Fit the base level on raw features, then the meta level on the
        (standardized) posteriors the base level produces for X."""
        self.base.fit(X, y)
        posteriors = self.norm.fit_transform(self.base.predict_proba(X))
        self.meta.fit(posteriors, y)
        return self

    def _project(self, X):
        # map raw inputs into the meta-level feature space (standardized posteriors)
        return self.norm.transform(self.base.predict_proba(X))

    def predict(self, X):
        """Predict labels with the meta classifier."""
        return self.meta.predict(self._project(X))

    def predict_proba(self, X):
        """Predict class probabilities with the meta classifier."""
        return self.meta.predict_proba(self._project(X))
|
||||
|
||||
|
||||
class RegressionQuantification:
    """Regression-based correction of a base quantifier's estimates.

    The base quantifier first estimates the prevalence of a set of validation
    samples with known true prevalences; a (multi-output) regressor is then
    trained to map estimated prevalences onto true ones. At inference time the
    base quantifier's output is passed through the regressor and re-normalized
    into a valid prevalence vector.
    """

    def __init__(self,
                 base_quantifier,
                 regression='svr',
                 val_samples_generator=None,
                 norm=True):
        """
        :param base_quantifier: any quantifier implementing fit/quantify
        :param regression: 'ridge', 'svr', or a pre-built regressor instance
        :param val_samples_generator: callable yielding (instances, prevalence)
            pairs used to train the correction regressor
        :param norm: whether the ridge regressor normalizes its inputs
        """
        self.base_quantifier = base_quantifier
        if isinstance(regression, str):
            assert regression in ['ridge', 'svr'], 'unknown regression model'
            if regression == 'ridge':
                # NOTE(review): `normalize` was removed from Ridge in sklearn>=1.2;
                # use a StandardScaler pipeline if upgrading -- TODO confirm version
                self.reg = Ridge(normalize=norm)
            elif regression == 'svr':
                self.reg = MultiOutputRegressor(LinearSVR())
        else:
            # any object exposing fit(X, y) / predict(X) works here
            self.reg = regression
        self.regression = regression
        self.val_samples_generator = val_samples_generator

    def generate_validation_samples(self):
        """Quantify each validation sample; return (estimated, true) prevalence arrays."""
        Xs, ys = [], []
        for instances, prevalence in self.val_samples_generator():
            ys.append(prevalence)
            Xs.append(self.base_quantifier.quantify(instances))
        return np.asarray(Xs), np.asarray(ys)

    def fit(self, data):
        """Fit the base quantifier (skipped when data is None, meaning it is
        already fit) and train the correction regressor on validation samples."""
        print('fitting quantifier')
        if data is not None:
            self.base_quantifier.fit(data)
        print('generating val samples')
        Xs, ys = self.generate_validation_samples()
        print('fitting regressor')
        self.reg.fit(Xs, ys)
        print('[done]')
        return self

    def quantify(self, instances):
        """Return the regression-corrected prevalence estimate for a sample."""
        Xs = self.base_quantifier.quantify(instances).reshape(1, -1)
        Xs = self.reg.predict(Xs).flatten()
        # the regressor output is unconstrained: clip negative components before
        # re-normalizing, so the result is a valid probability distribution
        # (the previous division by the raw sum could yield negative or >1
        # "prevalences", or divide by zero)
        adjusted = np.clip(Xs, 0, None)
        total = adjusted.sum()
        if total > 0:
            adjusted = adjusted / total
        else:
            # degenerate case (all components non-positive): fall back to uniform
            adjusted = np.full_like(adjusted, 1 / len(adjusted))
        return adjusted

    def get_params(self, deep=True):
        return self.base_quantifier.get_params()

    def set_params(self, **params):
        self.base_quantifier.set_params(**params)
|
||||
|
||||
|
||||
class RegressorClassifier(BaseEstimator, ClassifierMixin):
|
||||
def __init__(self):
|
||||
self.regressor = LinearSVR()
|
||||
# self.regressor = SVR()
|
||||
# self.regressor = Ridge(normalize=True)
|
||||
|
||||
|
||||
def fit(self, X, y):
|
||||
self.nclasses = len(np.unique(y))
|
||||
self.regressor.fit(X, y)
|
||||
return self
|
||||
|
||||
def predict(self, X):
|
||||
r = self.regressor.predict(X)
|
||||
c = np.round(r)
|
||||
c[c<0]=0
|
||||
c[c>(self.nclasses-1)]=self.nclasses-1
|
||||
return c.astype(np.int)
|
||||
|
||||
def predict_proba(self, X):
|
||||
r = self.regressor.predict(X)
|
||||
nC = len(self.classes_)
|
||||
r = np.clip(r, 0, nC - 1)
|
||||
dists = np.abs(np.tile(np.arange(nC), (len(r), 1)) - r.reshape(-1,1))
|
||||
invdist = 1 - dists
|
||||
invdist[invdist < 0] = 0
|
||||
return invdist
|
||||
|
||||
@property
|
||||
def classes_(self):
|
||||
return np.arange(self.nclasses)
|
||||
|
||||
def get_params(self, deep=True):
|
||||
return self.regressor.get_params()
|
||||
|
||||
def set_params(self, **params):
|
||||
self.regressor.set_params(**params)
|
||||
|
|
@ -0,0 +1,40 @@
|
|||
import numpy as np
|
||||
import quapy as qp
|
||||
from Ordinal.evaluation import nmd
|
||||
from Ordinal.utils import load_samples_pkl
|
||||
from quapy.data import LabelledCollection
|
||||
import pickle
|
||||
import os
|
||||
from os.path import join
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
def partition_by_drift(split, training_prevalence):
    """Partition the samples of a split ('dev' or 'test') into three
    equally-sized groups (low/mid/high) according to their prevalence drift
    (nmd) w.r.t. the training prevalence; save the sample ids of each group
    and print per-group drift statistics."""
    assert split in ['dev', 'test'], 'invalid split name'
    expected = 1000 if split == 'dev' else 5000
    samples_path = join(datapath, domain, 'app', f'{split}_samples')
    drifts = np.asarray([
        nmd(training_prevalence, sample.prevalence())
        for sample in tqdm(load_samples_pkl(samples_path), total=expected)
    ])
    order = np.argsort(drifts)
    nD = len(order)
    cut1, cut2 = nD // 3, 2 * nD // 3
    groups = {
        'low': order[:cut1],
        'mid': order[cut1:cut2],
        'high': order[cut2:],
    }
    np.save(join(datapath, domain, 'app', f'lowdrift.{split}.id.npy'), groups['low'])
    np.save(join(datapath, domain, 'app', f'middrift.{split}.id.npy'), groups['mid'])
    np.save(join(datapath, domain, 'app', f'highdrift.{split}.id.npy'), groups['high'])
    for name, ids in groups.items():
        vals = drifts[ids]
        print(f'{name} drift: interval [{vals.min():.4f}, {vals.max():.4f}] mean: {vals.mean():.4f}')
|
||||
|
||||
|
||||
# experiment configuration: pre-vectorized (tfidf) Books domain
domain = 'Books-tfidf'
datapath = './data'

# load the pickled, already-vectorized training collection
training = pickle.load(open(join(datapath,domain,'training_data.pkl'), 'rb'))

# partition dev and test samples into low/mid/high drift thirds
# w.r.t. the training prevalence
partition_by_drift('dev', training.prevalence())
partition_by_drift('test', training.prevalence())
|
||||
|
|
@ -0,0 +1,54 @@
|
|||
import quapy as qp
|
||||
from quapy.data import LabelledCollection
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from os.path import join
|
||||
import os
|
||||
import pickle
|
||||
from utils import load_samples
|
||||
from tqdm import tqdm
|
||||
import shutil
|
||||
|
||||
|
||||
# input/output locations: raw text domain in, tfidf-vectorized domain out
datapath = './data'
domain = 'Books'
outname = domain + '-tfidf'  # output folder name for the vectorized version
|
||||
|
||||
def save_preprocessing_info(transformer):
    """Record a textual description of the fitted vectorizer next to the data."""
    info_path = join(datapath, outname, 'prep-info.txt')
    with open(info_path, 'wt') as fout:
        fout.write(f'{str(transformer)}\n')
|
||||
|
||||
|
||||
# replicate the directory layout of the raw domain for the vectorized output,
# and copy over the (unchanged) prevalence files for both protocols
os.makedirs(join(datapath, outname), exist_ok=True)
os.makedirs(join(datapath, outname, 'app'), exist_ok=True)
os.makedirs(join(datapath, outname, 'app', 'dev_samples'), exist_ok=True)
os.makedirs(join(datapath, outname, 'app', 'test_samples'), exist_ok=True)
shutil.copyfile(join(datapath, domain, 'app', 'dev_prevalences.txt'), join(datapath, outname, 'app', 'dev_prevalences.txt'))
shutil.copyfile(join(datapath, domain, 'app', 'test_prevalences.txt'), join(datapath, outname, 'app', 'test_prevalences.txt'))
os.makedirs(join(datapath, outname, 'npp'), exist_ok=True)
os.makedirs(join(datapath, outname, 'npp', 'dev_samples'), exist_ok=True)
os.makedirs(join(datapath, outname, 'npp', 'test_samples'), exist_ok=True)
shutil.copyfile(join(datapath, domain, 'npp', 'dev_prevalences.txt'), join(datapath, outname, 'npp', 'dev_prevalences.txt'))
shutil.copyfile(join(datapath, domain, 'npp', 'test_prevalences.txt'), join(datapath, outname, 'npp', 'test_prevalences.txt'))
|
||||
|
||||
|
||||
# word- and bigram-level tfidf with sublinear tf scaling; rare terms pruned
tfidf = TfidfVectorizer(sublinear_tf=True, ngram_range=(1,2), min_df=5)

# fit the vectorizer on the training collection only; dev/test samples are
# transformed below with the already-fitted vocabulary
train = LabelledCollection.load(join(datapath, domain, 'training_data.txt'), loader_func=qp.data.reader.from_text)
train.instances = tfidf.fit_transform(train.instances)
save_preprocessing_info(tfidf)
pickle.dump(train, open(join(datapath, outname, 'training_data.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)
|
||||
|
||||
|
||||
def transform_folder_samples(protocol, splitname):
    """Vectorize every sample of a protocol/split folder with the fitted tfidf
    and pickle each result under the output domain, preserving sample order."""
    in_folder = join(datapath, domain, protocol, splitname)
    sample_iter = enumerate(load_samples(in_folder, classes=train.classes_))
    for idx, sample in tqdm(sample_iter):
        sample.instances = tfidf.transform(sample.instances)
        out_path = join(datapath, outname, protocol, splitname, f'{idx}.pkl')
        pickle.dump(sample, open(out_path, 'wb'), pickle.HIGHEST_PROTOCOL)
|
||||
|
||||
|
||||
# vectorize dev/test samples for both sampling protocols (app and npp)
transform_folder_samples('app', 'dev_samples')
transform_folder_samples('app', 'test_samples')
transform_folder_samples('npp', 'dev_samples')
transform_folder_samples('npp', 'test_samples')
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue