"""First cross-area quantification trials on the survey data.

Reconstructed from the patch "first trials" (commit 289c474, Alejandro Moreo,
2024-03-15), which creates ``Census/main.py``.

The survey file is split by area (column ``cod.prov``). For every ordered pair
of distinct areas ``(i, j)`` a quantifier is trained on area ``i`` and asked
to estimate the class prevalence of area ``j``; the mean absolute error
between estimated and true prevalence is stored in ``results[i, j]``.
"""

import numpy as np
import pandas as pd

np.set_printoptions(linewidth=np.inf)


# Paths are relative to the working directory the script is launched from.
cens_y = './data/cens_y.csv'
survey_y = './data/survey_y.csv'


def load_csv(file, use_yhat=True, verbose=True):
    """Read a data file and split it into area codes, covariates and labels.

    Args:
        file: anything accepted by ``pandas.read_csv`` (path or buffer).
        use_yhat: whether the ``y.hat`` column is included among the
            covariates (it is placed right before ``prob``).
        verbose: print the distinct values found in every covariate column.

    Returns:
        ``(A, X, y)`` when the file has a ``y.true`` column, ``(A, X)``
        otherwise, where ``A`` holds the ``cod.prov`` values, ``X`` the
        covariate matrix and ``y`` the ``y.true`` values.
    """
    df = pd.read_csv(file)

    covariates = ['owner', 'eta', 'work', 'sex', 'year_edu', 'hsize']
    if use_yhat:
        covariates.append('y.hat')
    covariates.append('prob')

    X = df[covariates].values
    A = df['cod.prov'].values

    if verbose:
        for i, cov in enumerate(covariates):
            print(f'values of col {i} "{cov}" {np.unique(X[:,i])}')

    if 'y.true' in df.columns:
        return A, X, df['y.true'].values
    return A, X


def get_dataset_by_area(A, X, y=None):
    """Group the rows of ``X`` (and ``y``, if given) by their area code.

    Args:
        A: 1-D array with one area code per row of ``X``.
        X: covariate matrix.
        y: optional 1-D array of labels aligned with ``X``.

    Returns:
        A list of ``(area, X_area, y_area)`` tuples, one per distinct value
        of ``A`` in the sorted order produced by ``np.unique``; ``y_area`` is
        ``None`` when ``y`` is not given.
    """
    groups = []
    for area in np.unique(A):
        sel = A == area
        groups.append((area, X[sel], None if y is None else y[sel]))
    return groups


def off_diagonal_mean(results):
    """Return the mean of the off-diagonal cells of a 2-D array.

    The diagonal of the error matrix is never evaluated (an area is not
    tested on itself) and stays at its initial 0, so it must not take part in
    the average. Returns ``nan`` when there is no off-diagonal cell.
    """
    results = np.asarray(results, dtype=float)
    off_diagonal = ~np.eye(results.shape[0], results.shape[1], dtype=bool)
    if not off_diagonal.any():
        return float('nan')
    return float(results[off_diagonal].mean())


class Preprocessor:
    """Z-score a subset of the columns of a covariate matrix.

    Args:
        standardize_col_ids: indices of the columns to standardize. ``None``
            (the default) means every column of the matrix passed in, which
            works for both the 8-column (``use_yhat=True``) and the 7-column
            (``use_yhat=False``) layouts produced by ``load_csv``.
    """

    def __init__(self, standardize_col_ids=None):
        # Imported here so that the data helpers of this module remain
        # importable on machines without scikit-learn.
        from sklearn.preprocessing import StandardScaler

        self.scaler = StandardScaler()
        # e.g. [1, 4, 5] to standardize only eta, year_edu and hsize
        self.standardize_col_ids = (
            None if standardize_col_ids is None
            else np.asarray(standardize_col_ids)
        )

    def _col_ids(self, X):
        """Resolve the configured column indices against the width of X."""
        if self.standardize_col_ids is None:
            return np.arange(X.shape[1])
        return self.standardize_col_ids

    def fit(self, X, y=None):
        """Fit the scaler on the selected columns of ``X``; returns self."""
        X = np.asarray(X)
        self.scaler.fit(X[:, self._col_ids(X)])
        return self

    def transform(self, X):
        """Return a standardized float copy of ``X``.

        The input is never modified: the same per-area matrices are reused
        as test sets across many training areas, and writing the z-scores
        back into them would silently re-standardize already standardized
        data on the next iteration.
        """
        X = np.array(X, dtype=float)  # np.array copies by default
        cols = self._col_ids(X)
        X[:, cols] = self.scaler.transform(X[:, cols])
        return X

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return its standardized copy."""
        return self.fit(X, y).transform(X)


def main():
    """Run the train-on-area-i / test-on-area-j experiment and print errors."""
    # Experiment-only dependencies; the unused names are kept so that the
    # commented-out alternatives below can be switched on directly.
    from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
    from sklearn.svm import LinearSVC
    from tqdm import tqdm

    import quapy as qp
    from quapy.method.aggregative import EMQ, PACC, CC, PCC, MS2, MS
    from quapy.data import LabelledCollection

    # classifier = LinearSVC()
    classifier = LogisticRegressionCV(class_weight='balanced', Cs=10)
    q = CC(classifier)
    # q = PCC(classifier)
    # q = PACC(classifier)
    # q = EMQ(classifier)
    # q = MS(classifier)

    # Ate, Xte = load_csv(cens_y)
    Atr, Xtr, ytr = load_csv(survey_y, use_yhat=True)

    preprocessor = Preprocessor()
    # Xtr_proc = preprocessor.fit_transform(Xtr)
    # big_train = LabelledCollection(Xtr_proc, ytr)
    # q.fit(big_train)

    trains = get_dataset_by_area(Atr, Xtr, ytr)
    # tests = get_dataset_by_area(Ate, Xte)

    n_area = len(trains)
    results = np.zeros(shape=(n_area, n_area))

    for i, (Ai, Xi, yi) in tqdm(enumerate(trains), total=n_area):
        # The scaler is re-fitted on every training area and then applied,
        # unchanged, to each of the other areas.
        tr = LabelledCollection(preprocessor.fit_transform(Xi), yi)
        q.fit(tr)
        len_tr = len(tr)
        # len_tr = len(big_train)
        for j, (Aj, Xj, yj) in enumerate(trains):
            if i == j:
                continue
            te = LabelledCollection(preprocessor.transform(Xj), yj)
            pred_prev = q.quantify(te.X)
            true_prev = te.prevalence()
            err = qp.error.mae(true_prev, pred_prev)
            # index 1 is reported as the prevalence of interest
            # NOTE(review): assumes a binary problem -- confirm
            print(f'{i=} {j=} [#train={len_tr}] true_prev={true_prev[1]:.3f} pred_prev={pred_prev[1]:.3f} {err=:.4f}')
            results[i, j] = err

    print(results)
    print(f'mean results = {off_diagonal_mean(results):.4f}')


if __name__ == '__main__':
    main()