From 289c474ea519ddd0f2abd5be1dc9fa7bddc9cdf0 Mon Sep 17 00:00:00 2001
From: Alejandro Moreo <alejandro.moreo@isti.cnr.it>
Date: Fri, 15 Mar 2024 18:02:05 +0100
Subject: [PATCH] first trials

---
 Census/main.py | 124 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 124 insertions(+)
 create mode 100644 Census/main.py

diff --git a/Census/main.py b/Census/main.py
new file mode 100644
index 0000000..435f71d
--- /dev/null
+++ b/Census/main.py
@@ -0,0 +1,124 @@
+import numpy as np
+import pandas as pd
+from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
+from sklearn.svm import LinearSVC
+from tqdm import tqdm
+
+import quapy as qp
+from quapy.method.aggregative import EMQ, PACC, CC, PCC, MS2, MS
+from quapy.data import LabelledCollection
+from sklearn.preprocessing import StandardScaler
+
+np.set_printoptions(linewidth=np.inf)  # never wrap printed arrays
+
+# input CSV files, relative to the working directory
+cens_y = './data/cens_y.csv'
+survey_y = './data/survey_y.csv'
+
+
+def load_csv(file, use_yhat=True):
+    """Load a CSV as area codes, covariate matrix and (if present) labels.
+
+    :param file: CSV path or buffer, passed as is to `pd.read_csv`
+    :param use_yhat: if True, 'y.hat' is included among the covariates
+    :return: `(A, X, y)` if the file has a 'y.true' column, else `(A, X)`;
+        `A` are the 'cod.prov' values and `X` the covariate matrix
+    """
+    table = pd.read_csv(file)
+    covariates = ['owner', 'eta', 'work', 'sex', 'year_edu', 'hsize', 'prob']
+    if use_yhat:
+        covariates.insert(-1, 'y.hat')  # goes right before 'prob'
+    X = table[covariates].values
+    A = table['cod.prov'].values
+    # print the distinct values of every covariate as a quick sanity check
+    for i, cov in enumerate(covariates):
+        print(f'values of col {i} "{cov}" {np.unique(X[:,i])}')
+
+    if 'y.true' not in table.columns:
+        return A, X
+    return A, X, table['y.true'].values
+
+
+def get_dataset_by_area(A, X, y=None):
+    """Split the instances (and labels, if given) into one group per area.
+
+    :param A: array with the area code of every row of `X`
+    :param X: covariate matrix, selected row-wise with boolean masks over `A`
+    :param y: optional labels aligned with `X`; if `None`, groups carry `None`
+    :return: list of `(area, X_area, y_area)` in the order of `np.unique(A)`
+    """
+    # one boolean row mask per distinct area code
+    masks = ((area, A == area) for area in np.unique(A))
+    return [(area, X[m], None if y is None else y[m]) for area, m in masks]
+
+
+class Preprocessor:
+    """Z-scores the selected columns (all by default) on a copy of the input."""
+    def __init__(self, standardize_col_ids=None):
+        self.scaler = StandardScaler()
+        self.standardize_col_ids = standardize_col_ids  # None: all columns
+
+    def fit(self, X, y=None):
+        if self.standardize_col_ids is None:  # was hard-coded to 8 columns
+            self.standardize_col_ids = np.arange(X.shape[1])
+        self.scaler.fit(X[:, self.standardize_col_ids])
+        return self
+
+    def transform(self, X):
+        X = np.array(X, dtype=float)  # float copy: caller's array stays intact
+        X[:, self.standardize_col_ids] = self.scaler.transform(X[:, self.standardize_col_ids])
+        return X
+
+    def fit_transform(self, X, y=None):
+        return self.fit(X, y).transform(X)
+
+
+# classifier behind the quantifier (alternative: cls = LinearSVC())
+cls = LogisticRegressionCV(class_weight='balanced', Cs=10)
+q = CC(cls)
+# q = PCC(cls)
+# q = PACC(cls)
+# q = EMQ(cls)
+# q = MS(cls)
+
+# Ate, Xte = load_csv(cens_y)
+# survey data: area codes, covariates (with 'y.hat') and 'y.true' labels
+Atr, Xtr, ytr = load_csv(survey_y, use_yhat=True)
+
+preprocessor = Preprocessor()
+# disabled variant: fit a single quantifier on the whole survey
+# Xtr_proc = preprocessor.fit_transform(Xtr)
+# big_train = LabelledCollection(Xtr_proc, ytr)
+# q.fit(big_train)
+trains = get_dataset_by_area(Atr, Xtr, ytr)  # one (area, X, y) triple per area
+# tests  = get_dataset_by_area(Ate, Xte)
+
+
+# Cross-area evaluation: fit the quantifier on area i, test it on every
+# other area j; results[i, j] is the MAE between true and predicted prevalence.
+n_area = len(trains)
+results = np.zeros(shape=(n_area, n_area))
+
+for i, (Ai, Xi, yi) in tqdm(enumerate(trains), total=n_area):
+    # copies keep `trains` raw even if the preprocessor writes in place
+    tr = LabelledCollection(preprocessor.fit_transform(Xi.copy()), yi)
+    q.fit(tr)
+    len_tr = len(tr)
+    for j, (Aj, Xj, yj) in enumerate(trains):
+        if i == j:
+            continue
+        te = LabelledCollection(preprocessor.transform(Xj.copy()), yj)
+        pred_prev = q.quantify(te.X)
+        true_prev = te.prevalence()
+        err = qp.error.mae(true_prev, pred_prev)
+        print(f'{i=} {j=} [#train={len_tr}] true_prev={true_prev[1]:.3f} pred_prev={pred_prev[1]:.3f} {err=:.4f}')
+        results[i, j] = err
+
+print(results)
+# the diagonal is never evaluated; leave its zeros out of the average
+print(f'mean results = {results[~np.eye(n_area, dtype=bool)].mean():.4f}')
+
+
+
+
+