first trials

This commit is contained in:
Alejandro Moreo Fernandez 2024-03-15 18:02:05 +01:00
parent 390fa24103
commit 289c474ea5
1 changed files with 124 additions and 0 deletions

124
Census/main.py Normal file
View File

@ -0,0 +1,124 @@
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import LinearSVC
from tqdm import tqdm
import quapy as qp
from quapy.method.aggregative import EMQ, PACC, CC, PCC, MS2, MS
from quapy.data import LabelledCollection
from sklearn.preprocessing import StandardScaler
# Print numpy arrays without width-based line wrapping (e.g. the results matrix).
np.set_printoptions(linewidth=np.inf)
# Input CSV locations, relative to the current working directory.
# NOTE(review): presumably census-level vs. survey-level records — confirm against the data.
cens_y = './data/cens_y.csv'
survey_y = './data/survey_y.csv'
def load_csv(file, use_yhat=True):
    """Read a CSV and split it into area codes, covariates and (optionally) labels.

    Args:
        file: anything accepted by ``pandas.read_csv`` (path or file-like object).
        use_yhat: when true, the ``'y.hat'`` column is included among the
            covariates (placed just before ``'prob'``).

    Returns:
        ``(A, X, y)`` when the file has a ``'y.true'`` column, otherwise
        ``(A, X)``; ``A`` holds the ``'cod.prov'`` values, ``X`` the covariate
        matrix and ``y`` the ``'y.true'`` values.

    Side effects:
        Prints the distinct values found in every covariate column.
    """
    table = pd.read_csv(file)

    # Covariate order matters: downstream code addresses columns by position.
    feature_names = ['owner', 'eta', 'work', 'sex', 'year_edu', 'hsize']
    if use_yhat:
        feature_names.append('y.hat')
    feature_names.append('prob')

    features = table[feature_names].values
    areas = table['cod.prov'].values

    for idx, name in enumerate(feature_names):
        distinct = np.unique(features[:, idx])
        print(f'values of col {idx} "{name}" {distinct}')

    if 'y.true' not in table.columns:
        return areas, features
    return areas, features, table['y.true'].values
def get_dataset_by_area(A, X, y=None):
    """Group the rows of ``X`` (and ``y``, if given) by their area code.

    Args:
        A: 1-D array of area codes, one per row of ``X``.
        X: covariate matrix.
        y: optional label array aligned with ``X``.

    Returns:
        A list of ``(area, X_area, y_area)`` tuples, one per distinct value in
        ``A`` in the sorted order produced by ``np.unique``; ``y_area`` is
        ``None`` when ``y`` is not supplied.
    """
    area_masks = ((area, A == area) for area in np.unique(A))
    return [
        (area, X[mask], y[mask] if y is not None else None)
        for area, mask in area_masks
    ]
class Preprocessor:
    """Z-score selected feature columns, leaving the caller's array untouched.

    The previous version wrote the scaled values back into the input array and
    returned that same object. Any array reused across calls was therefore
    standardised again and again, and an integer input silently truncated the
    z-scores. ``transform`` now always returns a fresh floating-point copy.

    Args:
        standardize_col_ids: positional indices of the columns to standardise.
            ``None`` (the default) means every column of whatever matrix is
            passed in, so the class no longer assumes exactly 8 columns.
    """

    def __init__(self, standardize_col_ids=None):
        self.scaler = StandardScaler()
        # e.g. np.asarray([1, 4, 5]) restricts scaling to eta, year_edu, hsize
        self.standardize_col_ids = (
            None if standardize_col_ids is None else np.asarray(standardize_col_ids)
        )

    def _columns(self, X):
        """Return the column indices to standardise for matrix ``X``."""
        if self.standardize_col_ids is None:
            return np.arange(X.shape[1])
        return self.standardize_col_ids

    def fit(self, X, y=None):
        """Fit the scaler on the selected columns of ``X``; ``y`` is ignored.

        Returns:
            ``self``, so calls can be chained.
        """
        X = np.asarray(X)
        self.scaler.fit(X[:, self._columns(X)])
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose selected columns are standardised.

        The input is never modified. The result dtype is promoted to at least
        float64 so that z-scores are not truncated for integer inputs.
        """
        X = np.asarray(X)
        cols = self._columns(X)
        scaled = self.scaler.transform(X[:, cols])
        # astype always copies here, which is what protects the caller's data.
        out = X.astype(np.result_type(X.dtype, np.float64))
        out[:, cols] = scaled
        return out

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return its standardised copy."""
        return self.fit(X, y).transform(X)
# Cross-area experiment: for every area, fit the quantifier on that area's
# data and evaluate the estimated prevalence on each of the other areas.

# Underlying classifier (alternative kept for quick switching).
# cls = LinearSVC()
cls = LogisticRegressionCV(class_weight='balanced', Cs=10)

# Quantification method under test (alternatives kept for quick switching).
q = CC(cls)
# q = PCC(cls)
# q = PACC(cls)
# q = EMQ(cls)
# q = MS(cls)

# Ate, Xte = load_csv(cens_y)
Atr, Xtr, ytr = load_csv(survey_y, use_yhat=True)

preprocessor = Preprocessor()

# Xtr_proc = preprocessor.fit_transform(Xtr)
# big_train = LabelledCollection(Xtr_proc, ytr)
# q.fit(big_train)

trains = get_dataset_by_area(Atr, Xtr, ytr)
# tests = get_dataset_by_area(Ate, Xte)

n_area = len(trains)
# results[i, j] = MAE when training on area i and testing on area j;
# the diagonal is never evaluated and stays at 0.
results = np.zeros(shape=(n_area, n_area))

for i, (Ai, Xi, yi) in tqdm(enumerate(trains), total=n_area):
    # Always hand the preprocessor a private float copy: the arrays stored in
    # `trains` are reused on every outer iteration, and standardising them in
    # place would compound the scaling from one iteration to the next.
    Xi = preprocessor.fit_transform(Xi.astype(float))
    tr = LabelledCollection(Xi, yi)
    q.fit(tr)
    len_tr = len(tr)
    # len_tr = len(big_train)
    for j, (Aj, Xj, yj) in enumerate(trains):
        if i == j:
            continue
        Xj = preprocessor.transform(Xj.astype(float))
        te = LabelledCollection(Xj, yj)
        pred_prev = q.quantify(te.X)
        true_prev = te.prevalence()
        err = qp.error.mae(true_prev, pred_prev)
        print(f'{i=} {j=} [#train={len_tr}] true_prev={true_prev[1]:.3f} pred_prev={pred_prev[1]:.3f} {err=:.4f}')
        results[i, j] = err

print(results)
# Average only the evaluated (off-diagonal) pairs; including the zero diagonal
# would understate the mean error by a factor of (n_area - 1) / n_area.
off_diagonal = ~np.eye(n_area, dtype=bool)
mean_err = results[off_diagonal].mean() if n_area > 1 else float('nan')
print(f'mean results = {mean_err:.4f}')