From 5df355a4e1f65484ececc78923dbdcedf01028c1 Mon Sep 17 00:00:00 2001 From: Alex Moreo Date: Tue, 8 Mar 2022 18:24:30 +0100 Subject: [PATCH] regression-based adjustment using the validation set; seems to be working --- Ordinal/main.py | 43 ++++--- Ordinal/model.py | 172 ++++++++++++++++++++++++++ Ordinal/partition_dataset_by_shift.py | 40 ++++++ Ordinal/preprocess_dataset.py | 54 ++++++++ 4 files changed, 295 insertions(+), 14 deletions(-) create mode 100644 Ordinal/model.py create mode 100644 Ordinal/partition_dataset_by_shift.py create mode 100644 Ordinal/preprocess_dataset.py diff --git a/Ordinal/main.py b/Ordinal/main.py index 9567aad..dbdadeb 100644 --- a/Ordinal/main.py +++ b/Ordinal/main.py @@ -16,7 +16,7 @@ from tqdm import tqdm domain = 'Books-tfidf' datapath = './data' protocol = 'app' -drift = 'low' +drift = 'high' train = pickle.load(open(join(datapath, domain, 'training_data.pkl'), 'rb')) @@ -37,26 +37,33 @@ def load_dev_samples(): print('fitting the quantifier') +# q = EMQ(LogisticRegression(class_weight='balanced')) # q = PACC(LogisticRegression(class_weight='balanced')) -# q = PACC(OrderedLogisticRegression()) +q = PACC(OrderedLogisticRegression()) # q = PACC(StackedClassifier(LogisticRegression(class_weight='balanced'))) # q = RegressionQuantification(PCC(LogisticRegression(class_weight='balanced')), val_samples_generator=load_dev_samples) -q = PACC(RegressorClassifier()) +# q = ACC(RegressorClassifier()) -q = qp.model_selection.GridSearchQ( - q, -# {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']}, - {'C': np.logspace(-3,3,14)}, - 1000, - 'gen', - error=mnmd, - val_split=load_dev_samples, - n_jobs=-1, - refit=False, - verbose=True) +param_grid = {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']} +# param_grid = {'C': np.logspace(-3,3,14)} +# param_grid = {'alpha':np.logspace(-8, 6, 15)} + +# q = qp.model_selection.GridSearchQ( +# q, +# param_grid, +# 1000, +# 'gen', +# error=mnmd, +# val_split=load_dev_samples, +# 
n_jobs=-1, +# refit=False, +# verbose=True) q.fit(train) +# q = RegressionQuantification(q, val_samples_generator=load_dev_samples) +# q.fit(None) + print('[done]') report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd]) @@ -64,6 +71,14 @@ mean_nmd = report['nmd'].mean() std_nmd = report['nmd'].std() print(f'{mean_nmd:.4f} +-{std_nmd:.4f}') +q = RegressionQuantification(q, val_samples_generator=load_dev_samples) +q.fit(None) + +report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd]) +mean_nmd = report['nmd'].mean() +std_nmd = report['nmd'].std() +print(f'[regression-correction] {mean_nmd:.4f} +-{std_nmd:.4f}') + # drift='high' # report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd]) # mean_nmd = report['nmd'].mean() diff --git a/Ordinal/model.py b/Ordinal/model.py new file mode 100644 index 0000000..fb0c985 --- /dev/null +++ b/Ordinal/model.py @@ -0,0 +1,172 @@ +from copy import deepcopy +import numpy as np +from sklearn.base import BaseEstimator, ClassifierMixin +from sklearn.calibration import CalibratedClassifierCV +from sklearn.decomposition import TruncatedSVD +from sklearn.linear_model import LogisticRegression, Ridge +from scipy.sparse import issparse +from sklearn.multiclass import OneVsRestClassifier +from sklearn.multioutput import MultiOutputRegressor +from sklearn.preprocessing import StandardScaler +from sklearn.svm import LinearSVR, SVR +from statsmodels.miscmodels.ordinal_model import OrderedModel + + +class OrderedLogisticRegression: + def __init__(self, model='logit'): + assert model in ['logit', 'probit'], 'unknown ordered model, valid ones are logit or probit' + self.model = model + + def fit(self, X, y): + if issparse(X): + self.svd = TruncatedSVD(500) + X = self.svd.fit_transform(X) + self.learner = OrderedModel(y, X, distr=self.model) + self.res_prob = self.learner.fit(method='bfgs', disp=False, skip_hessian=True) + + def 
class RegressionQuantification:
    """Quantifier wrapper that post-corrects prevalence estimates with a regressor.

    The base quantifier is applied to every validation sample, and a regressor
    is trained to map the estimated prevalence vectors onto the true
    prevalence vectors of those samples. At prediction time the base estimate
    is passed through the regressor and renormalised.

    Args:
        base_quantifier: object exposing ``fit(data)``, ``quantify(instances)``,
            ``get_params()`` and ``set_params(**params)``.
        regression: ``'ridge'``, ``'svr'`` or a ready-made regressor object
            exposing ``fit(X, Y)`` and ``predict(X)``.
        val_samples_generator: zero-argument callable yielding
            ``(instances, prevalence)`` pairs.
        norm: only used when ``regression='ridge'``; if true, the inputs are
            rescaled before the ridge regressor is applied.

    Raises:
        ValueError: if ``regression`` is a string other than ``'ridge'`` or
            ``'svr'``.
    """

    def __init__(self,
                 base_quantifier,
                 regression='svr',
                 val_samples_generator=None,
                 norm=True):
        self.base_quantifier = base_quantifier
        self.reg = self._make_regressor(regression, norm)
        self.regression = regression
        self.val_samples_generator = val_samples_generator

    @staticmethod
    def _make_regressor(regression, norm):
        """Return the regressor selected by ``regression`` (name or object)."""
        # Author's notes on alternatives tried during development:
        # MultiTaskElasticNetCV and LinearRegression worked well; ARDRegression
        # and BayesianRidge (wrapped in MultiOutputRegressor) worked quite well
        # even without normalisation; SGDRegressor was slow and did not work.
        if not isinstance(regression, str):
            return regression
        if regression == 'ridge':
            if not norm:
                return Ridge()
            # Ridge(normalize=...) was removed from scikit-learn; scaling is
            # done in an explicit preprocessing step instead.
            from sklearn.pipeline import make_pipeline
            return make_pipeline(StandardScaler(with_mean=False), Ridge())
        if regression == 'svr':
            return MultiOutputRegressor(LinearSVR())
        # Raised explicitly (not asserted) so the check survives `python -O`.
        raise ValueError('unknown regression model')

    def generate_validation_samples(self):
        """Build the regressor's training set from the validation samples.

        Returns:
            A pair ``(Xs, ys)`` of arrays: the base quantifier's estimate for
            each sample and the prevalence yielded with that sample.

        Raises:
            ValueError: if no ``val_samples_generator`` was provided.
        """
        if self.val_samples_generator is None:
            raise ValueError('val_samples_generator is required to fit the regressor')
        estimated, true = [], []
        for instances, prevalence in self.val_samples_generator():
            true.append(prevalence)
            estimated.append(self.base_quantifier.quantify(instances))
        return np.asarray(estimated), np.asarray(true)

    def fit(self, data):
        """Fit the base quantifier (unless ``data`` is None) and the regressor.

        Passing ``data=None`` reuses an already-fitted base quantifier and only
        trains the correction regressor.

        Returns:
            self
        """
        print('fitting quantifier')
        if data is not None:
            self.base_quantifier.fit(data)
        print('generating val samples')
        Xs, ys = self.generate_validation_samples()
        print('fitting regressor')
        self.reg.fit(Xs, ys)
        print('[done]')
        return self

    def quantify(self, instances):
        """Return the regression-corrected prevalence estimate as a 1-D array.

        The regressor output is unconstrained, so negative components are
        clipped to zero before normalising. If nothing positive remains, the
        uniform distribution is returned rather than dividing by zero.
        """
        estimate = self.base_quantifier.quantify(instances).reshape(1, -1)
        corrected = np.asarray(self.reg.predict(estimate), dtype=float).flatten()
        corrected = np.clip(corrected, 0, None)
        total = corrected.sum()
        if total <= 0:
            return np.full(corrected.shape, 1.0 / corrected.size)
        return corrected / total

    def get_params(self, deep=True):
        """Return the base quantifier's parameters (``deep`` is not forwarded)."""
        return self.base_quantifier.get_params()

    def set_params(self, **params):
        """Forward ``params`` to the base quantifier and return ``self``."""
        self.base_quantifier.set_params(**params)
        return self
def partition_by_drift(split, training_prevalence):
    """Partition the APP samples of ``split`` into low/mid/high drift terciles.

    Drift is the ``nmd`` between ``training_prevalence`` and each sample's own
    prevalence. Samples are sorted by drift and their indices are cut into
    three consecutive groups, saved as ``lowdrift.<split>.id.npy``,
    ``middrift.<split>.id.npy`` and ``highdrift.<split>.id.npy`` under
    ``<datapath>/<domain>/app``. A min/max/mean summary is printed per group.

    Reads the module-level ``datapath`` and ``domain`` at call time.

    Args:
        split: ``'dev'`` or ``'test'``.
        training_prevalence: prevalence vector of the training set.

    Raises:
        ValueError: if ``split`` is invalid, or fewer than three samples are
            found (at least one group would be empty).
    """
    if split not in ('dev', 'test'):
        raise ValueError('invalid split name')

    # Expected number of samples per split; only used for the progress bar.
    total = 1000 if split == 'dev' else 5000
    app_dir = join(datapath, domain, 'app')

    samples = load_samples_pkl(join(app_dir, f'{split}_samples'))
    drifts = np.asarray([
        nmd(training_prevalence, sample.prevalence())
        for sample in tqdm(samples, total=total)
    ])

    n_samples = len(drifts)
    if n_samples < 3:
        # Fail before writing anything rather than crash later on min()/max()
        # of an empty group.
        raise ValueError(
            f'need at least 3 samples to build drift terciles, found {n_samples}'
        )

    order = np.argsort(drifts)
    terciles = {
        'low': order[:n_samples // 3],
        'mid': order[n_samples // 3:2 * n_samples // 3],
        'high': order[2 * n_samples // 3:],
    }

    for name, ids in terciles.items():
        np.save(join(app_dir, f'{name}drift.{split}.id.npy'), ids)

    for name, ids in terciles.items():
        values = drifts[ids]
        print(f'{name} drift: interval [{values.min():.4f}, {values.max():.4f}] mean: {values.mean():.4f}')
def save_preprocessing_info(transformer, path=None):
    """Write the string form of ``transformer`` to a text file.

    Args:
        transformer: object whose ``str()`` describes the preprocessing step.
        path: destination file. Defaults to ``prep-info.txt`` inside
            ``<datapath>/<outname>`` (module-level settings read at call time).
    """
    if path is None:
        path = join(datapath, outname, 'prep-info.txt')
    # Explicit encoding so the file does not depend on the platform default.
    with open(path, 'wt', encoding='utf-8') as fout:
        fout.write(str(transformer) + '\n')
def transform_folder_samples(protocol, splitname):
    """Vectorise every raw sample of one protocol/split and pickle the result.

    Samples are loaded from ``<datapath>/<domain>/<protocol>/<splitname>``,
    their instances are replaced by ``tfidf.transform(...)`` of the originals,
    and each one is written as ``<i>.pkl`` (``i`` is the enumeration index)
    under ``<datapath>/<outname>/<protocol>/<splitname>``.

    Relies on the module-level ``tfidf`` and ``train`` objects and on the
    output directory already existing.

    Args:
        protocol: protocol sub-folder name (the script uses 'app' and 'npp').
        splitname: samples sub-folder name (e.g. 'dev_samples').
    """
    source_dir = join(datapath, domain, protocol, splitname)
    target_dir = join(datapath, outname, protocol, splitname)

    samples = load_samples(source_dir, classes=train.classes_)
    for i, sample in tqdm(enumerate(samples)):
        sample.instances = tfidf.transform(sample.instances)
        # Close each file deterministically; otherwise one handle stays open
        # per sample until the garbage collector gets to it.
        with open(join(target_dir, f'{i}.pkl'), 'wb') as fout:
            pickle.dump(sample, fout, pickle.HIGHEST_PROTOCOL)