import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import LinearSVC
from tqdm import tqdm
import quapy as qp
from quapy.method.aggregative import EMQ, PACC, CC, PCC, MS2, MS
from quapy.data import LabelledCollection
from sklearn.preprocessing import StandardScaler


# Print full matrices on a single row instead of wrapping them.
np.set_printoptions(linewidth=np.inf)

cens_y = './data/cens_y.csv'
survey_y = './data/survey_y.csv'


def load_csv(file, use_yhat=True):
    """Load a CSV file and split it into area codes, covariates and labels.

    Args:
        file: path (or any object accepted by ``pandas.read_csv``) of the CSV.
        use_yhat: when True the ``'y.hat'`` column is included among the
            covariates (placed right before ``'prob'``); when False it is
            left out.

    Returns:
        ``(A, X, y)`` if the file has a ``'y.true'`` column, otherwise
        ``(A, X)``. ``A`` holds the values of the ``'cod.prov'`` column,
        ``X`` the covariate matrix (one column per covariate, in the order
        listed below) and ``y`` the values of ``'y.true'``.

    Side effects:
        Prints the unique values found in every covariate column.
    """
    df = pd.read_csv(file)

    cod_area = 'cod.prov'
    y_true = 'y.true'

    # Build the covariate list once; 'y.hat' is optional and, when present,
    # sits between 'hsize' and 'prob'.
    covariates = ['owner', 'eta', 'work', 'sex', 'year_edu', 'hsize']
    if use_yhat:
        covariates.append('y.hat')
    covariates.append('prob')

    X = df[covariates].values
    A = df[cod_area].values

    for i, cov in enumerate(covariates):
        print(f'values of col {i} "{cov}" {np.unique(X[:,i])}')

    # NOTE: the arity of the return value depends on whether the file is
    # labelled; callers unpack 3 values for labelled files and 2 otherwise.
    if y_true in df.columns:
        y = df[y_true].values
        return A, X, y
    return A, X


def get_dataset_by_area(A, X, y=None):
    """Split the data into one chunk per distinct area code.

    Args:
        A: array of area codes, one per row of ``X``.
        X: covariate matrix.
        y: optional array of labels, one per row of ``X``.

    Returns:
        A list of ``(area, X_area, y_area)`` tuples, one per distinct value
        of ``A`` in the (sorted) order returned by ``np.unique``. ``y_area``
        is ``None`` when ``y`` is not given.
    """
    lc = []
    for area in np.unique(A):
        sel = (A == area)
        ysel = y[sel] if y is not None else None
        lc.append((area, X[sel], ysel))
    return lc


class Preprocessor:
    """Z-score standardization of a chosen subset of columns.

    Args:
        standardize_col_ids: indices of the columns to standardize, e.g.
            ``[1, 4, 5]`` for eta, year_edu and hsize. ``None`` (default)
            standardizes every column of the matrix seen at fit time, so
            the class works regardless of how many covariates are loaded.
    """

    def __init__(self, standardize_col_ids=None):
        self.scaler = StandardScaler()
        self.standardize_col_ids = standardize_col_ids
        # Column indices actually used; resolved in fit().
        self.col_ids_ = None

    def fit(self, X, y=None):
        """Fit the scaler on the selected columns of ``X``; returns self."""
        X = np.asarray(X)
        if self.standardize_col_ids is None:
            # Default: everything, whatever the number of columns is.
            self.col_ids_ = np.arange(X.shape[1])
        else:
            self.col_ids_ = np.asarray(self.standardize_col_ids)
        self.scaler.fit(X[:, self.col_ids_])
        return self

    def transform(self, X):
        """Return a standardized float copy of ``X``.

        The input array is left untouched. Working on a float copy also
        prevents z-scores from being truncated when ``X`` has an integer
        dtype.

        Raises:
            RuntimeError: if called before ``fit``.
        """
        if self.col_ids_ is None:
            raise RuntimeError('Preprocessor must be fitted before calling transform')
        X = np.array(X, dtype=float)  # np.array copies by default
        X[:, self.col_ids_] = self.scaler.transform(X[:, self.col_ids_])
        return X

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return its standardized copy."""
        return self.fit(X, y).transform(X)


# ---------------------------------------------------------------------------
# Cross-area experiment: train a quantifier on each area and evaluate it on
# every other area.
# ---------------------------------------------------------------------------

# Classifier / quantifier under test; the commented lines are alternatives.
# cls = LinearSVC()
cls = LogisticRegressionCV(class_weight='balanced', Cs=10)
q = CC(cls)
# q = PCC(cls)
# q = PACC(cls)
# q = EMQ(cls)
# q = MS(cls)

# Ate, Xte = load_csv(cens_y)
Atr, Xtr, ytr = load_csv(survey_y, use_yhat=True)

preprocessor = Preprocessor()
Xtr = preprocessor.fit_transform(Xtr)
# Xtr_proc = preprocessor.fit_transform(Xtr)
# big_train = LabelledCollection(Xtr_proc, ytr)
# q.fit(big_train)

trains = get_dataset_by_area(Atr, Xtr, ytr)
# tests = get_dataset_by_area(Ate, Xte)

n_area = len(trains)

# results[i, j] = error of the quantifier trained on area i, tested on area j.
# The diagonal is never evaluated and stays at 0.
results = np.zeros(shape=(n_area, n_area))

for i, (Ai, Xi, yi) in tqdm(enumerate(trains), total=n_area):
    # Xi = preprocessor.fit_transform(Xi)
    tr = LabelledCollection(Xi, yi)
    q.fit(tr)
    len_tr = len(tr)
    # len_tr = len(big_train)
    for j, (Aj, Xj, yj) in enumerate(trains):
        if i == j:
            continue
        # Xj = preprocessor.transform(Xj)
        te = LabelledCollection(Xj, yj)
        pred_prev = q.quantify(te.X)
        true_prev = te.prevalence()
        err = qp.error.mae(true_prev, pred_prev)
        print(f'{i=} {j=} [#train={len_tr}] true_prev={true_prev[1]:.3f} pred_prev={pred_prev[1]:.3f} {err=:.4f}')
        results[i, j] = err

print(results)

# Average only over the cells that were actually evaluated: including the
# untouched zero diagonal would scale the mean error down by (n-1)/n.
off_diagonal = ~np.eye(n_area, dtype=bool)
mean_err = results[off_diagonal].mean() if n_area > 1 else float('nan')
print(f'mean results = {mean_err:.4f}')