first trials
This commit is contained in:
parent
390fa24103
commit
289c474ea5
|
@ -0,0 +1,124 @@
|
|||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
|
||||
from sklearn.svm import LinearSVC
|
||||
from tqdm import tqdm
|
||||
|
||||
import quapy as qp
|
||||
from quapy.method.aggregative import EMQ, PACC, CC, PCC, MS2, MS
|
||||
from quapy.data import LabelledCollection
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
|
||||
# Never wrap printed arrays, however wide they are.
np.set_printoptions(linewidth=np.inf)

# Paths of the CSV inputs.
cens_y = "./data/cens_y.csv"
survey_y = "./data/survey_y.csv"
|
||||
|
||||
|
||||
def load_csv(file, use_yhat=True):
    """Read a CSV file and split it into area codes, covariates and labels.

    Args:
        file: path (or any other input accepted by ``pandas.read_csv``).
        use_yhat: when true, the ``'y.hat'`` column is included among the
            covariates, placed right before ``'prob'``.

    Returns:
        ``(A, X, y)`` when the file has a ``'y.true'`` column, ``(A, X)``
        otherwise. ``A`` holds the ``'cod.prov'`` values, ``X`` the covariate
        matrix (one column per covariate) and ``y`` the ``'y.true'`` values.

    As a side effect, the distinct values of every covariate column are
    printed.
    """
    area_col, label_col = 'cod.prov', 'y.true'

    feature_cols = ['owner', 'eta', 'work', 'sex', 'year_edu', 'hsize']
    if use_yhat:
        feature_cols.append('y.hat')
    feature_cols.append('prob')

    frame = pd.read_csv(file)
    X = frame[feature_cols].values
    A = frame[area_col].values

    for i, cov in enumerate(feature_cols):
        print(f'values of col {i} "{cov}" {np.unique(X[:,i])}')

    # Unlabelled files yield no label vector at all.
    if label_col not in frame.columns:
        return A, X
    return A, X, frame[label_col].values
|
||||
|
||||
|
||||
def get_dataset_by_area(A, X, y=None):
    """Partition the rows of ``X`` (and of ``y``, if given) by area code.

    Args:
        A: array with one area code per row of ``X``.
        X: covariate matrix.
        y: optional array of labels aligned with the rows of ``X``.

    Returns:
        A list of ``(area, X_area, y_area)`` tuples, one per distinct value
        in ``A``, in the sorted order produced by ``np.unique``; ``y_area``
        is ``None`` when ``y`` is not given.
    """
    def rows_of(area):
        mask = A == area
        return area, X[mask], (None if y is None else y[mask])

    return [rows_of(area) for area in np.unique(A)]
|
||||
|
||||
|
||||
class Preprocessor:
    """Z-score standardization of selected columns of a covariate matrix.

    Wraps a ``StandardScaler`` that is fitted on, and applied to, only the
    columns listed in ``standardize_col_ids``; all other columns pass
    through unchanged.

    Args:
        standardize_col_ids: indices of the columns to standardize, e.g.
            ``[1, 4, 5]`` for eta, year_edu, hsize. ``None`` (the default)
            standardizes every column of the matrix handed to ``fit`` /
            ``transform``, whatever its width.
    """

    def __init__(self, standardize_col_ids=None):
        self.scaler = StandardScaler()
        self.standardize_col_ids = (
            None if standardize_col_ids is None
            else np.asarray(standardize_col_ids)
        )

    def _column_ids(self, X):
        """Return the column indices to standardize for the matrix ``X``."""
        if self.standardize_col_ids is None:
            return np.arange(X.shape[1])
        return self.standardize_col_ids

    def fit(self, X, y=None):
        """Fit the scaler on the selected columns of ``X``; returns ``self``.

        ``y`` is accepted for API symmetry and ignored.
        """
        X = np.asarray(X)
        self.scaler.fit(X[:, self._column_ids(X)])
        return self

    def transform(self, X):
        """Return a float copy of ``X`` with the selected columns z-scored.

        The input is never modified: writing the z-scores back into ``X``
        would corrupt the caller's data (and truncate the scores whenever
        ``X`` has an integer dtype).
        """
        X_out = np.array(X, dtype=float)  # always a fresh float copy
        col_ids = self._column_ids(X_out)
        X_out[:, col_ids] = self.scaler.transform(X_out[:, col_ids])
        return X_out

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return its standardized copy."""
        return self.fit(X, y).transform(X)
|
||||
|
||||
|
||||
# Cross-area experiment: fit a quantifier on each area of the survey data and
# evaluate it on every other area, measuring the mean absolute error (MAE)
# between the true and the predicted class prevalence.

# Classifier / quantifier under test; the commented-out lines are the
# alternatives that can be swapped in.
# cls = LinearSVC()
cls = LogisticRegressionCV(class_weight='balanced', Cs=10)
q = CC(cls)
# q = PCC(cls)
# q = PACC(cls)
# q = EMQ(cls)
# q = MS(cls)

# Ate, Xte = load_csv(cens_y)
Atr, Xtr, ytr = load_csv(survey_y, use_yhat=True)

preprocessor = Preprocessor()
# Xtr_proc = preprocessor.fit_transform(Xtr)
# big_train = LabelledCollection(Xtr_proc, ytr)
# q.fit(big_train)

trains = get_dataset_by_area(Atr, Xtr, ytr)
# tests = get_dataset_by_area(Ate, Xte)

n_area = len(trains)

# results[i, j] = MAE of the quantifier trained on area i and tested on
# area j; the diagonal (i == j) is never evaluated and stays at 0.
results = np.zeros(shape=(n_area, n_area))

for i, (_, Xi, yi) in tqdm(enumerate(trains), total=n_area):
    # Standardize a float copy so the raw per-area matrices stored in
    # `trains` are never overwritten; otherwise every pass of these loops
    # would re-standardize data that had already been standardized.
    Xi_proc = preprocessor.fit_transform(Xi.astype(float))
    tr = LabelledCollection(Xi_proc, yi)
    q.fit(tr)
    len_tr = len(tr)
    # len_tr = len(big_train)
    for j, (_, Xj, yj) in enumerate(trains):
        if i == j:
            continue
        # Test areas are scaled with the statistics of the training area.
        Xj_proc = preprocessor.transform(Xj.astype(float))
        te = LabelledCollection(Xj_proc, yj)
        pred_prev = q.quantify(te.X)
        true_prev = te.prevalence()
        err = qp.error.mae(true_prev, pred_prev)
        print(f'{i=} {j=} [#train={len_tr}] true_prev={true_prev[1]:.3f} pred_prev={pred_prev[1]:.3f} {err=:.4f}')
        results[i, j] = err

print(results)

# Average only the train/test pairs that were actually evaluated: including
# the zero diagonal would bias the mean error downwards. With a single area
# there is no such pair and the mean is NaN.
off_diagonal = ~np.eye(n_area, dtype=bool)
print(f'mean results = {results[off_diagonal].mean():.4f}')
|
||||
|
||||
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue