last experiments before the meeting with unipi
This commit is contained in:
parent
21d052313c
commit
4f3a6a4169
|
@ -14,17 +14,21 @@ from copy import deepcopy
|
||||||
|
|
||||||
np.set_printoptions(linewidth=np.inf)
|
np.set_printoptions(linewidth=np.inf)
|
||||||
|
|
||||||
|
|
||||||
def classifier():
|
def classifier():
|
||||||
return LogisticRegressionCV()
|
return LogisticRegressionCV()
|
||||||
|
|
||||||
|
|
||||||
def quantifiers():
|
def quantifiers():
|
||||||
cls = classifier()
|
cls = classifier()
|
||||||
yield 'MLPE', MLPE()
|
yield 'MLPE', MLPE()
|
||||||
yield 'CC', CC(cls)
|
# yield 'CC', CC(cls)
|
||||||
yield 'PCC', PCC(cls)
|
yield 'PCC', PCC(cls)
|
||||||
yield 'ACC', ACC(cls)
|
# yield 'ACC', ACC(cls)
|
||||||
yield 'PACC', PACC(cls)
|
# yield 'PACC', PACC(cls)
|
||||||
yield 'MS', MS(cls)
|
# yield 'MS', MS(cls)
|
||||||
|
yield 'SModelLR', StatModelLR()
|
||||||
|
yield 'SModel', StatModel(mean=prob_mean, scale=prob_std)
|
||||||
# yield 'MS2', MS2(cls)
|
# yield 'MS2', MS2(cls)
|
||||||
# yield 'SLD', EMQ(cls)
|
# yield 'SLD', EMQ(cls)
|
||||||
|
|
||||||
|
@ -35,6 +39,7 @@ Atr, Xtr, ytr = load_csv(survey_y, use_yhat=True)
|
||||||
|
|
||||||
preprocessor = Preprocessor()
|
preprocessor = Preprocessor()
|
||||||
Xtr = preprocessor.fit_transform(Xtr)
|
Xtr = preprocessor.fit_transform(Xtr)
|
||||||
|
prob_mean, prob_std = preprocessor.get_mean_std(column=-1) # get the mean and std of the "prob" column
|
||||||
|
|
||||||
data = get_dataset_by_area(Atr, Xtr, ytr)
|
data = get_dataset_by_area(Atr, Xtr, ytr)
|
||||||
n_areas = len(data)
|
n_areas = len(data)
|
||||||
|
@ -58,10 +63,8 @@ for aggr in ['median', 'mean']:
|
||||||
table = Table(name=f'adjacent{aggr}', benchmarks=benchmarks, methods=methods, stat_test=None, color_mode='local')
|
table = Table(name=f'adjacent{aggr}', benchmarks=benchmarks, methods=methods, stat_test=None, color_mode='local')
|
||||||
table.format.mean_prec = 4
|
table.format.mean_prec = 4
|
||||||
table.format.show_std = False
|
table.format.show_std = False
|
||||||
table.format.sta = False
|
|
||||||
table.format.remove_zero = True
|
table.format.remove_zero = True
|
||||||
|
|
||||||
|
|
||||||
for q_name, q in quantifiers():
|
for q_name, q in quantifiers():
|
||||||
# pretrain quantifiers per area
|
# pretrain quantifiers per area
|
||||||
pretrained_area_q = []
|
pretrained_area_q = []
|
||||||
|
|
|
@ -25,6 +25,8 @@ def quantifiers():
|
||||||
yield 'ACC', ACC(cls)
|
yield 'ACC', ACC(cls)
|
||||||
yield 'PACC', PACC(cls)
|
yield 'PACC', PACC(cls)
|
||||||
yield 'SLD', EMQ(cls)
|
yield 'SLD', EMQ(cls)
|
||||||
|
yield 'SModelLR', StatModelLR()
|
||||||
|
yield 'SModel', StatModel(mean=prob_mean, scale=prob_std)
|
||||||
|
|
||||||
|
|
||||||
survey_y = './data/survey_y.csv'
|
survey_y = './data/survey_y.csv'
|
||||||
|
@ -33,6 +35,7 @@ Atr, Xtr, ytr = load_csv(survey_y, use_yhat=True)
|
||||||
|
|
||||||
preprocessor = Preprocessor()
|
preprocessor = Preprocessor()
|
||||||
Xtr = preprocessor.fit_transform(Xtr)
|
Xtr = preprocessor.fit_transform(Xtr)
|
||||||
|
prob_mean, prob_std = preprocessor.get_mean_std(column=-1) # get the mean and std of the "prob" column
|
||||||
|
|
||||||
data = get_dataset_by_area(Atr, Xtr, ytr)
|
data = get_dataset_by_area(Atr, Xtr, ytr)
|
||||||
n_areas = len(data)
|
n_areas = len(data)
|
||||||
|
|
|
@ -1,5 +1,10 @@
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
|
||||||
|
|
||||||
|
from quapy.data import LabelledCollection
|
||||||
|
from quapy.method.base import BaseQuantifier
|
||||||
|
import quapy.functional as F
|
||||||
from sklearn.preprocessing import StandardScaler
|
from sklearn.preprocessing import StandardScaler
|
||||||
|
|
||||||
np.set_printoptions(linewidth=np.inf)
|
np.set_printoptions(linewidth=np.inf)
|
||||||
|
@ -43,7 +48,7 @@ def get_dataset_by_area(A, X, y=None):
|
||||||
|
|
||||||
class AdjMatrix:
|
class AdjMatrix:
|
||||||
|
|
||||||
def __init__(self, path):
|
def __init__(self, path, add_diagonal=False):
|
||||||
df = pd.read_csv(path)
|
df = pd.read_csv(path)
|
||||||
|
|
||||||
area_codes = df.columns[1:].values
|
area_codes = df.columns[1:].values
|
||||||
|
@ -54,7 +59,12 @@ class AdjMatrix:
|
||||||
print(values)
|
print(values)
|
||||||
self.area2idx = {area:i for i, area in enumerate(area_codes)}
|
self.area2idx = {area:i for i, area in enumerate(area_codes)}
|
||||||
self.idx2area = area_codes
|
self.idx2area = area_codes
|
||||||
self.M = np.asarray(values)
|
self.M = np.asarray(values, dtype=int)
|
||||||
|
if add_diagonal:
|
||||||
|
# adding the diagonal has the effect of considering an area to be adjacent to itself. This is useful when
|
||||||
|
# the model is trained using survey_y.csv data and tested using cens_y.csv, but should not be done when
|
||||||
|
# the model is trained and tested on survey_y.csv
|
||||||
|
self.M += np.eye(self.M.shape[0], dtype=int)
|
||||||
|
|
||||||
def adjacent(self, cod_1, cod_2):
|
def adjacent(self, cod_1, cod_2):
|
||||||
idx1 = self.area2idx[cod_1]
|
idx1 = self.area2idx[cod_1]
|
||||||
|
@ -87,4 +97,76 @@ class Preprocessor:
|
||||||
def fit_transform(self, X, y=None):
|
def fit_transform(self, X, y=None):
|
||||||
return self.fit(X, y).transform(X)
|
return self.fit(X, y).transform(X)
|
||||||
|
|
||||||
|
def get_mean_std(self, column):
|
||||||
|
mean = self.scaler.mean_[column]
|
||||||
|
std = self.scaler.scale_[column]
|
||||||
|
return mean, std
|
||||||
|
|
||||||
|
|
||||||
|
class StatModel(BaseQuantifier):
|
||||||
|
"""
|
||||||
|
This method is a wrapper that simply returns the expected value of column "prob" as the prediction.
|
||||||
|
The column "prob" comes from a different model used by our statisticians and is pre-computed, so this
|
||||||
|
method actually simply reports the average.
|
||||||
|
|
||||||
|
:param posteriors_column: index of the column "prob" in the csv. The default value is -1 since
|
||||||
|
it is the last column in both survey_y.csv and cens_y.csv
|
||||||
|
:param mean: indicates the mean of the column. If specified, then the column is assumed to be
|
||||||
|
standardized, and the inverse function is applied in order to recover the posterior probability
|
||||||
|
in the range [0,1]
|
||||||
|
:param scale: indicates the scale of the column. If specified, then the column is assumed to be
|
||||||
|
standardized, and the inverse function is applied in order to recover the posterior probability
|
||||||
|
in the range [0,1]
|
||||||
|
"""
|
||||||
|
def __init__(self, posteriors_column=-1, mean=0, scale=1):
|
||||||
|
self.posteriors_column = posteriors_column
|
||||||
|
self.mean = mean
|
||||||
|
self.scale = scale
|
||||||
|
|
||||||
|
def fit(self, data: LabelledCollection):
|
||||||
|
return self
|
||||||
|
|
||||||
|
def quantify(self, instances):
|
||||||
|
prob = instances[:, self.posteriors_column]
|
||||||
|
# reconvert the z-scored variable to its original status
|
||||||
|
prob = zscore_inv(prob, self.mean, self.scale)
|
||||||
|
prob_ave = np.mean(prob)
|
||||||
|
print('Model', prob_ave)
|
||||||
|
prev = F.as_binary_prevalence(prob_ave)
|
||||||
|
return prev
|
||||||
|
|
||||||
|
|
||||||
|
def zscore_inv(X, mean, scale):
|
||||||
|
return X*scale + mean
|
||||||
|
|
||||||
|
|
||||||
|
class StatModelLR(BaseQuantifier):
|
||||||
|
"""
|
||||||
|
This method is a wrapper that recalibrates the column "prob" via Logistic Regression.
|
||||||
|
The column "prob" comes from a different model used by our statisticians and is pre-computed.
|
||||||
|
|
||||||
|
:param posteriors_column: index of the column "prob" in the csv. The default value is -1 since
|
||||||
|
it is the last column in both survey_y.csv and cens_y.csv
|
||||||
|
"""
|
||||||
|
def __init__(self, posteriors_column=-1, mean=0, scale=1):
|
||||||
|
self.posteriors_column = posteriors_column
|
||||||
|
self.mean = mean
|
||||||
|
self.scale = scale
|
||||||
|
self.lr = LogisticRegressionCV()
|
||||||
|
|
||||||
|
def fit(self, data: LabelledCollection):
|
||||||
|
X = data.X[:,self.posteriors_column].reshape(-1,1)
|
||||||
|
# reconvert the z-scored variable to its original status
|
||||||
|
X = zscore_inv(X, self.mean, self.scale)
|
||||||
|
y = data.y
|
||||||
|
self.lr.fit(X, y)
|
||||||
|
return self
|
||||||
|
|
||||||
|
def quantify(self, instances):
|
||||||
|
prob = instances[:, self.posteriors_column].reshape(-1,1)
|
||||||
|
# reconvert the z-scored variable to its original status
|
||||||
|
prob = zscore_inv(prob, self.mean, self.scale)
|
||||||
|
calib_prob = self.lr.predict_proba(prob)[:,-1]
|
||||||
|
prob_ave = np.mean(calib_prob)
|
||||||
|
prev = F.as_binary_prevalence(prob_ave)
|
||||||
|
return prev
|
|
@ -39,7 +39,6 @@ class CombinationRule(ABC):
|
||||||
return prevalence
|
return prevalence
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def optimize_ensemble(area_data: Iterable, q: BaseQuantifier, Madj=None, hyper=None, error='mae'):
|
def optimize_ensemble(area_data: Iterable, q: BaseQuantifier, Madj=None, hyper=None, error='mae'):
|
||||||
if hyper is None:
|
if hyper is None:
|
||||||
hyper = {
|
hyper = {
|
||||||
|
|
|
@ -23,6 +23,8 @@ def quantifiers():
|
||||||
yield 'ACC', ACC(cls)
|
yield 'ACC', ACC(cls)
|
||||||
yield 'PACC', PACC(cls)
|
yield 'PACC', PACC(cls)
|
||||||
yield 'SLD', SLD(cls)
|
yield 'SLD', SLD(cls)
|
||||||
|
yield 'SModelLR', StatModelLR()
|
||||||
|
yield 'SModel', StatModel(mean=prob_mean, scale=prob_std)
|
||||||
|
|
||||||
|
|
||||||
survey_y = './data/survey_y.csv'
|
survey_y = './data/survey_y.csv'
|
||||||
|
@ -31,6 +33,7 @@ Atr, Xtr, ytr = load_csv(survey_y, use_yhat=True)
|
||||||
|
|
||||||
preprocessor = Preprocessor()
|
preprocessor = Preprocessor()
|
||||||
Xtr = preprocessor.fit_transform(Xtr)
|
Xtr = preprocessor.fit_transform(Xtr)
|
||||||
|
prob_mean, prob_std = preprocessor.get_mean_std(column=-1) # get the mean and std of the "prob" column
|
||||||
|
|
||||||
trains = get_dataset_by_area(Atr, Xtr, ytr)
|
trains = get_dataset_by_area(Atr, Xtr, ytr)
|
||||||
n_areas = len(trains)
|
n_areas = len(trains)
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
from matplotlib.cm import get_cmap
|
from matplotlib.pyplot import get_cmap
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from matplotlib import cm
|
from matplotlib import cm
|
||||||
from scipy.stats import ttest_ind_from_stats
|
from scipy.stats import ttest_ind_from_stats
|
||||||
|
|
Loading…
Reference in New Issue