1
0
Fork 0

regression-based adjustment using the validation set; seems to be working

This commit is contained in:
Alejandro Moreo Fernandez 2022-03-08 18:24:30 +01:00
parent b982a51103
commit 5df355a4e1
4 changed files with 295 additions and 14 deletions

View File

@ -16,7 +16,7 @@ from tqdm import tqdm
domain = 'Books-tfidf'
datapath = './data'
protocol = 'app'
drift = 'low'
drift = 'high'
train = pickle.load(open(join(datapath, domain, 'training_data.pkl'), 'rb'))
@ -37,26 +37,33 @@ def load_dev_samples():
print('fitting the quantifier')
# q = EMQ(LogisticRegression(class_weight='balanced'))
# q = PACC(LogisticRegression(class_weight='balanced'))
# q = PACC(OrderedLogisticRegression())
q = PACC(OrderedLogisticRegression())
# q = PACC(StackedClassifier(LogisticRegression(class_weight='balanced')))
# q = RegressionQuantification(PCC(LogisticRegression(class_weight='balanced')), val_samples_generator=load_dev_samples)
q = PACC(RegressorClassifier())
# q = ACC(RegressorClassifier())
q = qp.model_selection.GridSearchQ(
q,
# {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']},
{'C': np.logspace(-3,3,14)},
1000,
'gen',
error=mnmd,
val_split=load_dev_samples,
n_jobs=-1,
refit=False,
verbose=True)
param_grid = {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']}
# param_grid = {'C': np.logspace(-3,3,14)}
# param_grid = {'alpha':np.logspace(-8, 6, 15)}
# q = qp.model_selection.GridSearchQ(
# q,
# param_grid,
# 1000,
# 'gen',
# error=mnmd,
# val_split=load_dev_samples,
# n_jobs=-1,
# refit=False,
# verbose=True)
q.fit(train)
# q = RegressionQuantification(q, val_samples_generator=load_dev_samples)
# q.fit(None)
print('[done]')
report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
@ -64,6 +71,14 @@ mean_nmd = report['nmd'].mean()
std_nmd = report['nmd'].std()
print(f'{mean_nmd:.4f} +-{std_nmd:.4f}')
q = RegressionQuantification(q, val_samples_generator=load_dev_samples)
q.fit(None)
report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
mean_nmd = report['nmd'].mean()
std_nmd = report['nmd'].std()
print(f'[regression-correction] {mean_nmd:.4f} +-{std_nmd:.4f}')
# drift='high'
# report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
# mean_nmd = report['nmd'].mean()

172
Ordinal/model.py Normal file
View File

@ -0,0 +1,172 @@
from copy import deepcopy
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.calibration import CalibratedClassifierCV
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression, Ridge
from scipy.sparse import issparse
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVR, SVR
from statsmodels.miscmodels.ordinal_model import OrderedModel
class OrderedLogisticRegression:
    """Ordered (cumulative-link) regression wrapper around statsmodels'
    OrderedModel, exposing a sklearn-like fit/predict/predict_proba API.

    :param model: link distribution, either 'logit' or 'probit'
    """
    def __init__(self, model='logit'):
        assert model in ['logit', 'probit'], 'unknown ordered model, valid ones are logit or probit'
        self.model = model

    def fit(self, X, y):
        # OrderedModel does not handle sparse inputs: reduce to 500 dense
        # dimensions via truncated SVD first
        if issparse(X):
            self.svd = TruncatedSVD(500)
            X = self.svd.fit_transform(X)
        self.learner = OrderedModel(y, X, distr=self.model)
        # skip_hessian speeds up fitting; standard errors are not needed here
        self.res_prob = self.learner.fit(method='bfgs', disp=False, skip_hessian=True)
        # sklearn convention: fit returns the estimator (original returned None)
        return self

    def predict(self, X):
        prob = self.predict_proba(X)
        return np.argmax(prob, axis=1)

    def predict_proba(self, X):
        if issparse(X):
            # the SVD projection only exists if fit received sparse data
            assert hasattr(self, 'svd'), \
                'X matrix in predict is sparse, but the method has not been fit with sparse type'
            X = self.svd.transform(X)
        return self.res_prob.model.predict(self.res_prob.params, exog=X)
class StackedClassifier:  # aka Funnelling Monolingual
    """Two-level stacking: a base probabilistic classifier produces posterior
    probabilities which, after standardization, feed a meta classifier of the
    same type. Non-probabilistic base estimators are calibrated first."""

    def __init__(self, base_estimator=LogisticRegression()):
        if not hasattr(base_estimator, 'predict_proba'):
            print('the estimator does not seem to be probabilistic: calibrating')
            base_estimator = CalibratedClassifierCV(base_estimator)
        self.base = deepcopy(base_estimator)
        self.meta = deepcopy(base_estimator)
        self.norm = StandardScaler()

    def _meta_features(self, X):
        # base posteriors, standardized with the scaler fit during training
        posteriors = self.base.predict_proba(X)
        return self.norm.transform(posteriors)

    def fit(self, X, y):
        self.base.fit(X, y)
        posteriors = self.base.predict_proba(X)
        posteriors = self.norm.fit_transform(posteriors)
        self.meta.fit(posteriors, y)
        return self

    def predict(self, X):
        return self.meta.predict(self._meta_features(X))

    def predict_proba(self, X):
        return self.meta.predict_proba(self._meta_features(X))
class RegressionQuantification:
    """Learns a regression-based correction on top of a base quantifier:
    a (multi-output) regressor is trained to map the prevalence vectors
    estimated by the base quantifier on validation samples onto the true
    prevalence vectors of those samples."""

    def __init__(self,
                 base_quantifier,
                 regression='svr',
                 val_samples_generator=None,
                 norm=True):
        # base_quantifier: quantifier whose estimates will be corrected
        # regression: 'ridge', 'svr', or a ready regressor with fit/predict
        # val_samples_generator: callable yielding (instances, true_prevalence)
        # norm: only used as Ridge(normalize=...) when regression == 'ridge'
        self.base_quantifier = base_quantifier
        if isinstance(regression, str):
            assert regression in ['ridge', 'svr'], 'unknown regression model'
            if regression == 'ridge':
                # NOTE(review): Ridge's `normalize` argument was removed in
                # scikit-learn >= 1.2; this path requires an older sklearn -- confirm
                self.reg = Ridge(normalize=norm)
            elif regression == 'svr':
                self.reg = MultiOutputRegressor(LinearSVR())
        else:
            self.reg = regression
        # alternatives tried during experimentation, kept for reference:
        # self.reg = MultiTaskLassoCV(normalize=norm)
        # self.reg = KernelRidge(kernel='rbf')
        # self.reg = LassoLarsCV(normalize=norm)
        # self.reg = MultiTaskElasticNetCV(normalize=norm) <- good
        # self.reg = LinearRegression(normalize=norm) # <- good
        # self.reg = MultiOutputRegressor(ARDRegression(normalize=norm)) # <- quite good, even without norm
        # self.reg = MultiOutputRegressor(BayesianRidge(normalize=False)) # <- quite good, even without norm
        # self.reg = MultiOutputRegressor(SGDRegressor()) # slow, does not work
        self.regression = regression
        self.val_samples_generator = val_samples_generator
        # self.norm = StandardScaler()
        # self.covs = covs

    def generate_validation_samples(self):
        # builds the regression training set: X = estimated prevalences on
        # validation samples, y = the corresponding true prevalences
        Xs, ys = [], []
        for instances, prevalence in self.val_samples_generator():
            ys.append(prevalence)
            Xs.append(self.base_quantifier.quantify(instances))
        Xs = np.asarray(Xs)
        ys = np.asarray(ys)
        return Xs, ys

    def fit(self, data):
        # data=None skips refitting the base quantifier (assumed already fit)
        print('fitting quantifier')
        if data is not None:
            self.base_quantifier.fit(data)
        print('generating val samples')
        Xs, ys = self.generate_validation_samples()
        # Xs = self.norm.fit_transform(Xs)
        print('fitting regressor')
        self.reg.fit(Xs, ys)
        print('[done]')
        return self

    def quantify(self, instances):
        Xs = self.base_quantifier.quantify(instances).reshape(1, -1)
        # Xs = self.norm.transform(Xs)
        Xs = self.reg.predict(Xs)
        # Xs = self.norm.inverse_transform(Xs)
        # re-normalize so the corrected vector sums to 1;
        # NOTE(review): regression outputs may contain negative values, which
        # survive this normalization -- confirm this is intended
        adjusted = Xs / Xs.sum()
        # adjusted = np.clip(Xs, 0, 1)
        adjusted = adjusted.flatten()
        return adjusted

    def get_params(self, deep=True):
        # expose the base quantifier's params so model selection tunes it
        return self.base_quantifier.get_params()

    def set_params(self, **params):
        self.base_quantifier.set_params(**params)
class RegressorClassifier(BaseEstimator, ClassifierMixin):
    """Ordinal 'classifier' built on a regressor: class labels are treated as
    points on the real line; hard predictions round/clip the regression output
    to valid labels, and posteriors split mass between the two nearest labels."""

    def __init__(self):
        self.regressor = LinearSVR()
        # self.regressor = SVR()
        # self.regressor = Ridge(normalize=True)

    def fit(self, X, y):
        # assumes y contains every class in 0..nclasses-1 -- TODO confirm upstream
        self.nclasses = len(np.unique(y))
        self.regressor.fit(X, y)
        return self

    def predict(self, X):
        r = self.regressor.predict(X)
        # round to the nearest class and clip to the valid label range
        c = np.clip(np.round(r), 0, self.nclasses - 1)
        # fix: np.int was removed in NumPy 1.24; use the builtin int dtype
        return c.astype(int)

    def predict_proba(self, X):
        r = self.regressor.predict(X)
        nC = len(self.classes_)
        r = np.clip(r, 0, nC - 1)
        # triangular "posterior": mass is shared by the two classes adjacent to
        # the (clipped) regression value; each row sums to 1 for nC >= 2
        dists = np.abs(np.tile(np.arange(nC), (len(r), 1)) - r.reshape(-1, 1))
        invdist = 1 - dists
        invdist[invdist < 0] = 0
        return invdist

    @property
    def classes_(self):
        return np.arange(self.nclasses)

    def get_params(self, deep=True):
        return self.regressor.get_params()

    def set_params(self, **params):
        self.regressor.set_params(**params)

View File

@ -0,0 +1,40 @@
import numpy as np
import quapy as qp
from Ordinal.evaluation import nmd
from Ordinal.utils import load_samples_pkl
from quapy.data import LabelledCollection
import pickle
import os
from os.path import join
from tqdm import tqdm
def partition_by_drift(split, training_prevalence):
    """Partitions the samples of a split ('dev' or 'test') into three
    equally-sized groups (low/mid/high) according to the NMD drift between
    each sample's prevalence and the training prevalence; saves the sample
    ids of each group as .npy files and prints per-group drift statistics."""
    assert split in ['dev', 'test'], 'invalid split name'
    total = 1000 if split == 'dev' else 5000
    drift_values = []
    sample_gen = load_samples_pkl(join(datapath, domain, 'app', f'{split}_samples'))
    for sample in tqdm(sample_gen, total=total):
        drift_values.append(nmd(training_prevalence, sample.prevalence()))
    drift_values = np.asarray(drift_values)
    order = np.argsort(drift_values)
    nD = len(order)
    # note: the upper cut is (2*nD)//3, which is not always 2*(nD//3)
    cut1, cut2 = nD // 3, (2 * nD) // 3
    groups = [
        ('low', order[:cut1]),
        ('mid', order[cut1:cut2]),
        ('high', order[cut2:]),
    ]
    for name, ids in groups:
        np.save(join(datapath, domain, 'app', f'{name}drift.{split}.id.npy'), ids)
    for name, ids in groups:
        vals = drift_values[ids]
        print(f'{name} drift: interval [{vals.min():.4f}, {vals.max():.4f}] mean: {vals.mean():.4f}')
# entry point: compute the low/mid/high drift partitions for both the dev and
# the test samples of the Books-tfidf domain, w.r.t. the training prevalence
domain = 'Books-tfidf'
datapath = './data'
training = pickle.load(open(join(datapath,domain,'training_data.pkl'), 'rb'))
partition_by_drift('dev', training.prevalence())
partition_by_drift('test', training.prevalence())

View File

@ -0,0 +1,54 @@
import quapy as qp
from quapy.data import LabelledCollection
from sklearn.feature_extraction.text import TfidfVectorizer
from os.path import join
import os
import pickle
from utils import load_samples
from tqdm import tqdm
import shutil
# source domain and the name under which its tfidf-preprocessed copy is stored
datapath = './data'
domain = 'Books'
outname = domain + '-tfidf'
def save_preprocessing_info(transformer):
    """Stores a textual description of the vectorizer next to the
    preprocessed dataset, for traceability."""
    info_path = join(datapath, outname, 'prep-info.txt')
    with open(info_path, 'wt') as info_file:
        info_file.write(f'{str(transformer)}\n')
# replicate the directory layout of the source domain for the preprocessed copy
os.makedirs(join(datapath, outname), exist_ok=True)
# 'app' protocol folders (presumably artificial-prevalence samples -- confirm)
os.makedirs(join(datapath, outname, 'app'), exist_ok=True)
os.makedirs(join(datapath, outname, 'app', 'dev_samples'), exist_ok=True)
os.makedirs(join(datapath, outname, 'app', 'test_samples'), exist_ok=True)
# prevalence files do not depend on the document representation: copy as-is
shutil.copyfile(join(datapath, domain, 'app', 'dev_prevalences.txt'), join(datapath, outname, 'app', 'dev_prevalences.txt'))
shutil.copyfile(join(datapath, domain, 'app', 'test_prevalences.txt'), join(datapath, outname, 'app', 'test_prevalences.txt'))
# 'npp' protocol folders (presumably natural-prevalence samples -- confirm)
os.makedirs(join(datapath, outname, 'npp'), exist_ok=True)
os.makedirs(join(datapath, outname, 'npp', 'dev_samples'), exist_ok=True)
os.makedirs(join(datapath, outname, 'npp', 'test_samples'), exist_ok=True)
shutil.copyfile(join(datapath, domain, 'npp', 'dev_prevalences.txt'), join(datapath, outname, 'npp', 'dev_prevalences.txt'))
shutil.copyfile(join(datapath, domain, 'npp', 'test_prevalences.txt'), join(datapath, outname, 'npp', 'test_prevalences.txt'))
# fit the vectorizer on the training data only; samples are transformed later
tfidf = TfidfVectorizer(sublinear_tf=True, ngram_range=(1,2), min_df=5)
train = LabelledCollection.load(join(datapath, domain, 'training_data.txt'), loader_func=qp.data.reader.from_text)
train.instances = tfidf.fit_transform(train.instances)
save_preprocessing_info(tfidf)
pickle.dump(train, open(join(datapath, outname, 'training_data.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)
def transform_folder_samples(protocol, splitname):
    """Applies the fitted tfidf vectorizer to every sample found in the given
    protocol/split folder of the source domain and pickles the transformed
    samples into the corresponding folder of the output domain."""
    in_folder = join(datapath, domain, protocol, splitname)
    samples = load_samples(in_folder, classes=train.classes_)
    for i, sample in tqdm(enumerate(samples)):
        sample.instances = tfidf.transform(sample.instances)
        out_path = join(datapath, outname, protocol, splitname, f'{i}.pkl')
        pickle.dump(sample, open(out_path, 'wb'), pickle.HIGHEST_PROTOCOL)
# apply the fitted tfidf transformation to every protocol/split sample folder
transform_folder_samples('app', 'dev_samples')
transform_folder_samples('app', 'test_samples')
transform_folder_samples('npp', 'dev_samples')
transform_folder_samples('npp', 'test_samples')