import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import LinearSVC
from tqdm import tqdm

import quapy as qp
from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as MLPE
from quapy.method.aggregative import EMQ, PACC, CC, PCC, MS2, MS, ACC
from quapy.data import LabelledCollection
from sklearn.preprocessing import StandardScaler

np.set_printoptions(linewidth=np.inf)

# Input files: the survey file carries the ground-truth column 'y.true';
# the census file does not.
cens_y = './data/cens_y.csv'
survey_y = './data/survey_y.csv'


def load_csv(file, use_yhat=True):
    """Load one of the census/survey CSV files.

    Parameters
    ----------
    file : str
        Path to the CSV file.
    use_yhat : bool
        If True, include the classifier-prediction column 'y.hat' among
        the covariates; otherwise leave it out.

    Returns
    -------
    (A, X, y) when the file contains the ground-truth column 'y.true',
    otherwise (A, X). A holds the area codes ('cod.prov'), X the covariate
    matrix, y the true labels.
    """
    df = pd.read_csv(file)

    cod_area = 'cod.prov'
    if use_yhat:
        covariates = ['owner', 'eta', 'work', 'sex', 'year_edu', 'hsize', 'y.hat', 'prob']
    else:
        covariates = ['owner', 'eta', 'work', 'sex', 'year_edu', 'hsize', 'prob']
    y_true = 'y.true'

    X = df[covariates].values
    A = df[cod_area].values

    # Sanity dump of the distinct values observed in each covariate column.
    for i, cov in enumerate(covariates):
        print(f'values of col {i} "{cov}" {np.unique(X[:,i])}')

    if y_true in df.columns:
        y = df[y_true].values
        return A, X, y
    else:
        return A, X


def get_dataset_by_area(A, X, y=None):
    """Partition (X, y) by area code into triples (area, X_area, y_area).

    y may be None (unlabelled data), in which case the third element of
    every triple is None. Areas are returned in np.unique order.
    """
    lc = []
    for area in np.unique(A):
        sel = (A == area)
        Xsel = X[sel]
        ysel = y[sel] if y is not None else None
        lc.append((area, Xsel, ysel))
    return lc


class Preprocessor:
    """Z-score standardization of a fixed subset of covariate columns.

    Wraps a StandardScaler applied to `standardize_col_ids`
    (currently all 8 covariates).
    """

    def __init__(self):
        self.scaler = StandardScaler()
        # self.standardize_col_ids = np.asarray([1, 4, 5])  # eta, year_edu, hsize
        self.standardize_col_ids = np.arange(8)  # everything

    def fit(self, X, y=None):
        """Fit the scaler on the selected columns of X; returns self."""
        self.scaler.fit(X[:, self.standardize_col_ids])
        return self

    def transform(self, X):
        """Return a standardized copy of X.

        NOTE(fix): the original implementation wrote the z-scored values
        back into the caller's array in place; we now operate on a copy so
        callers' data is never silently mutated.
        """
        X = X.copy()
        X[:, self.standardize_col_ids] = self.scaler.transform(X[:, self.standardize_col_ids])
        return X

    def fit_transform(self, X, y=None):
        """Convenience: fit on X, then return the transformed copy."""
        return self.fit(X, y).transform(X)


# Ate, Xte = load_csv(cens_y)
Atr, Xtr, ytr = load_csv(survey_y, use_yhat=True)

preprocessor = Preprocessor()
Xtr = preprocessor.fit_transform(Xtr)
# Xtr_proc = preprocessor.fit_transform(Xtr)
# big_train = LabelledCollection(Xtr_proc, ytr)
# q.fit(big_train)

trains = get_dataset_by_area(Atr, Xtr, ytr)
# tests = get_dataset_by_area(Ate, Xte)

n_area = len(trains)

# cls = LinearSVC()
cls = LogisticRegression()
# cls = LogisticRegressionCV(class_weight='balanced', Cs=10)

# Cross-area evaluation: for each quantifier, train on every area i and
# evaluate the predicted prevalence on every other area j, accumulating
# the mean absolute error in an (n_area x n_area) matrix.
# NOTE(fix): a leftover debug `import sys; sys.exit()` inside the inner
# loop aborted the whole experiment after the first (i, j) pair, leaving
# the results matrix and the summary prints unreachable; it was removed.
# The dead `q = EMQ(cls)` store (immediately shadowed by the loop
# variable) was removed as well.
for q in [CC(cls), PCC(cls), ACC(cls), PACC(cls), EMQ(cls), MLPE()]:
    results = np.zeros(shape=(n_area, n_area))
    for i, (Ai, Xi, yi) in tqdm(enumerate(trains), total=n_area):
        # Xi = preprocessor.fit_transform(Xi)
        tr = LabelledCollection(Xi, yi)
        q.fit(tr)
        len_tr = len(tr)
        for j, (Aj, Xj, yj) in enumerate(trains):
            if i == j:
                continue  # never evaluate on the training area itself
            # Xj = preprocessor.transform(Xj)
            te = LabelledCollection(Xj, yj)
            pred_prev = q.quantify(te.X)
            true_prev = te.prevalence()
            # qp.environ["SAMPLE_SIZE"] = len(te)
            # err = qp.error.mrae(true_prev, pred_prev)
            err = qp.error.mae(true_prev, pred_prev)
            print(f'{i=} {j=} [#train={len_tr}] true_prev={true_prev[1]:.3f} pred_prev={pred_prev[1]:.3f} {err=:.4f}')
            results[i, j] = err

    q_name = q.__class__.__name__
    # print(results)
    print(f'{q_name} mean results = {results.mean():.4f}')
    # The untested diagonal holds zeros; add the identity so those cells
    # cannot win the per-column minimum computed next.
    results += np.eye(results.shape[0])
    print(results.min(axis=0).mean())