Some experiments run; not much to say though.

Alejandro Moreo Fernandez 2024-03-27 16:43:28 +01:00
parent 7ee224521a
commit 1f3b1597dc
12 changed files with 1313 additions and 61 deletions


@@ -0,0 +1,87 @@
import numpy as np
from sklearn.linear_model import LogisticRegressionCV
from quapy.data import LabelledCollection
from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as MLPE
from quapy.method.aggregative import CC, PCC, ACC, PACC, EMQ
from commons import *
from table import Table
from tqdm import tqdm
import quapy as qp

np.set_printoptions(linewidth=np.inf)


def classifier():
    return LogisticRegressionCV()


def quantifiers():
    cls = classifier()
    yield 'MLPE', MLPE()
    yield 'CC', CC(cls)
    yield 'PCC', PCC(cls)
    yield 'ACC', ACC(cls)
    yield 'PACC', PACC(cls)
    yield 'SLD', EMQ(cls)


survey_y = './data/survey_y.csv'

Atr, Xtr, ytr = load_csv(survey_y, use_yhat=True)

preprocessor = Preprocessor()
Xtr = preprocessor.fit_transform(Xtr)

data = get_dataset_by_area(Atr, Xtr, ytr)
n_areas = len(data)

Madj = AdjMatrix('./data/matrice_adiacenza.csv')

areas = [Ai for Ai, _, _ in data]
q_names = [q_name for q_name, _ in quantifiers()]

# tables = []
text_outputs = []

benchmarks = [f'te-{Ai}' for Ai in areas]  # areas used as test
# 'PACC-cat' means a PACC quantifier trained on the concatenation of the areas adjacent to the test area
methods = [f'{q_name}-cat' for q_name in q_names]

table = Table(name='adjacentconcat', benchmarks=benchmarks, methods=methods, stat_test=None, color_mode='local')
table.format.mean_prec = 4
table.format.show_std = False
table.format.sta = False
table.format.remove_zero = True

for q_name, q in quantifiers():
    for i, (Ai, Xi, yi) in tqdm(enumerate(data), total=n_areas):
        # training: concatenate the areas adjacent to the test area Ai
        trainings = [LabelledCollection(Xj, yj) for Aj, Xj, yj in data if Aj != Ai and Aj in Madj.get_adjacent(Ai)]
        print(f'for test Ai={Ai} the adjacent areas are {Madj.get_adjacent(Ai)}: len={len(trainings)}')
        tr = LabelledCollection.join(*trainings)
        q.fit(tr)

        # test
        te = LabelledCollection(Xi, yi)
        qp.environ["SAMPLE_SIZE"] = len(te)
        pred_prev = q.quantify(te.X)
        true_prev = te.prevalence()
        err = qp.error.mae(true_prev, pred_prev)

        method_name = f'{q_name}-cat'
        table.add(benchmark=f'te-{Ai}', method=method_name, v=err)

# text_outputs.append(f'{q_name} got mean {table.all_mean():.5f}, best mean {table.get_method_values("Best").mean():.5f}')

Table.LatexPDF(f'./results/adjacentconcat/doc.pdf', [table])

# with open(f'./results/classifier/output.txt', 'tw') as foo:
#     foo.write('\n'.join(text_outputs))


@@ -0,0 +1,101 @@
import numpy as np
from sklearn.linear_model import LogisticRegressionCV
from Census.methods import AreaQuantifier, AggregationRule
from quapy.data import LabelledCollection
from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as MLPE
from quapy.method.aggregative import CC, PCC, ACC, PACC, EMQ, MS, MS2
from commons import *
from table import Table
from tqdm import tqdm
import quapy as qp
from copy import deepcopy

np.set_printoptions(linewidth=np.inf)


def classifier():
    return LogisticRegressionCV()


def quantifiers():
    cls = classifier()
    yield 'MLPE', MLPE()
    yield 'CC', CC(cls)
    yield 'PCC', PCC(cls)
    yield 'ACC', ACC(cls)
    yield 'PACC', PACC(cls)
    yield 'MS', MS(cls)
    # yield 'MS2', MS2(cls)
    # yield 'SLD', EMQ(cls)


survey_y = './data/survey_y.csv'

Atr, Xtr, ytr = load_csv(survey_y, use_yhat=True)

preprocessor = Preprocessor()
Xtr = preprocessor.fit_transform(Xtr)

data = get_dataset_by_area(Atr, Xtr, ytr)
n_areas = len(data)

areas = [Ai for Ai, _, _ in data]
q_names = [q_name for q_name, _ in quantifiers()]

Madj = AdjMatrix('./data/matrice_adiacenza.csv')

tables = []
text_outputs = []

benchmarks = [f'te-{Ai}' for Ai in areas]  # areas used as test

for aggr in ['median', 'mean']:
    # 'PACC-median' means the median of the prevalences predicted by the per-area PACC quantifiers
    # of the areas adjacent to the test area
    methods = [f'{q_name}-{aggr}' for q_name in q_names]

    table = Table(name=f'adjacent{aggr}', benchmarks=benchmarks, methods=methods, stat_test=None, color_mode='local')
    table.format.mean_prec = 4
    table.format.show_std = False
    table.format.sta = False
    table.format.remove_zero = True

    for q_name, q in quantifiers():
        # pretrain quantifiers per area
        pretrained_area_q = []
        for i, (Ai, Xi, yi) in tqdm(enumerate(data), total=n_areas):
            q_i = deepcopy(q)
            q_i.fit(LabelledCollection(Xi, yi))
            pretrained_area_q.append(AreaQuantifier(Ai, q_i))

        for i, (Ai, Xi, yi) in tqdm(enumerate(data), total=n_areas):
            # compose members of the rule (quantifiers are already fit)
            # training
            area_quantifiers = [qA_j for qA_j in pretrained_area_q if qA_j.area != Ai]
            rule = AggregationRule(area_quantifiers, adjacent_matrix=Madj, aggr=aggr)

            # test
            te = LabelledCollection(Xi, yi)
            qp.environ["SAMPLE_SIZE"] = len(te)
            pred_prev = rule.predict(Ai, te.X)
            true_prev = te.prevalence()
            err = qp.error.mae(true_prev, pred_prev)

            method_name = f'{q_name}-{aggr}'
            table.add(benchmark=f'te-{Ai}', method=method_name, v=err)

        # text_outputs.append(f'{q_name} got mean {table.all_mean():.5f}, best mean {table.get_method_values("Best").mean():.5f}')

    tables.append(table)

Table.LatexPDF(f'./results/adjacentaggregation/doc.pdf', tables)

# with open(f'./results/classifier/output.txt', 'tw') as foo:
#     foo.write('\n'.join(text_outputs))


@@ -0,0 +1,95 @@
import numpy as np
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from Census.methods import AreaQuantifier, AggregationRule, optimize_ensemble
from quapy.data import LabelledCollection
from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as MLPE
from quapy.method.aggregative import CC, PCC, ACC, PACC, EMQ, MS, MS2
from commons import *
from table import Table
from tqdm import tqdm
import quapy as qp
from copy import deepcopy

np.set_printoptions(linewidth=np.inf)


def classifier():
    return LogisticRegression()


def quantifiers():
    cls = classifier()
    # yield 'MLPE', MLPE()
    yield 'CC', CC(cls)
    yield 'PCC', PCC(cls)
    yield 'ACC', ACC(cls)
    yield 'PACC', PACC(cls)
    yield 'MS', MS(cls)
    # yield 'MS2', MS2(cls)
    # yield 'SLD', EMQ(cls)


survey_y = './data/survey_y.csv'

Atr, Xtr, ytr = load_csv(survey_y, use_yhat=True)

preprocessor = Preprocessor()
Xtr = preprocessor.fit_transform(Xtr)

data = get_dataset_by_area(Atr, Xtr, ytr)
n_areas = len(data)

areas = [Ai for Ai, _, _ in data]
q_names = [q_name for q_name, _ in quantifiers()]

Madj = AdjMatrix('./data/matrice_adiacenza.csv')

tables = []
text_outputs = []

benchmarks = [f'te-{Ai}' for Ai in areas]  # areas used as test

for aggr in ['median', 'mean']:
    # 'PACC-median' means the median of the prevalences predicted by the per-area PACC quantifiers
    # of the areas adjacent to the test area, with hyperparameters optimized against the other areas
    methods = [f'{q_name}-{aggr}' for q_name in q_names]

    table = Table(name=f'adjacent{aggr}optim', benchmarks=benchmarks, methods=methods, stat_test=None, color_mode='local')
    table.format.mean_prec = 4
    table.format.show_std = False
    table.format.sta = False
    table.format.remove_zero = True

    for q_name, q in quantifiers():
        for i, (Ai, Xi, yi) in tqdm(enumerate(data), total=n_areas):
            # compose members of the rule (quantifiers are optimized wrt the rest of the areas)
            # training
            other_area = [(Aj, Xj, yj) for Aj, Xj, yj in data if Aj != Ai]
            area_quantifiers = optimize_ensemble(other_area, q, Madj)
            rule = AggregationRule(area_quantifiers, adjacent_matrix=Madj, aggr=aggr)

            # test
            te = LabelledCollection(Xi, yi)
            qp.environ["SAMPLE_SIZE"] = len(te)
            pred_prev = rule.predict(Ai, te.X)
            true_prev = te.prevalence()
            err = qp.error.mae(true_prev, pred_prev)

            method_name = f'{q_name}-{aggr}'
            table.add(benchmark=f'te-{Ai}', method=method_name, v=err)

        # text_outputs.append(f'{q_name} got mean {table.all_mean():.5f}, best mean {table.get_method_values("Best").mean():.5f}')

    tables.append(table)

Table.LatexPDF(f'./results/adjacentaggregationoptim/doc.pdf', tables)

# with open(f'./results/classifier/output.txt', 'tw') as foo:
#     foo.write('\n'.join(text_outputs))

Census/allconcat_3.py (new file)

@@ -0,0 +1,84 @@
import numpy as np
from sklearn.linear_model import LogisticRegressionCV
from quapy.data import LabelledCollection
from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as MLPE
from quapy.method.aggregative import CC, PCC, ACC, PACC, EMQ
from commons import *
from table import Table
from tqdm import tqdm
import quapy as qp

np.set_printoptions(linewidth=np.inf)


def classifier():
    return LogisticRegressionCV()


def quantifiers():
    cls = classifier()
    yield 'MLPE', MLPE()
    yield 'CC', CC(cls)
    yield 'PCC', PCC(cls)
    yield 'ACC', ACC(cls)
    yield 'PACC', PACC(cls)
    yield 'SLD', EMQ(cls)


survey_y = './data/survey_y.csv'

Atr, Xtr, ytr = load_csv(survey_y, use_yhat=True)

preprocessor = Preprocessor()
Xtr = preprocessor.fit_transform(Xtr)

data = get_dataset_by_area(Atr, Xtr, ytr)
n_areas = len(data)

areas = [Ai for Ai, _, _ in data]
q_names = [q_name for q_name, _ in quantifiers()]

# tables = []
text_outputs = []

benchmarks = [f'te-{Ai}' for Ai in areas]  # areas used as test
# 'PACC-cat' means a PACC quantifier trained on the concatenation of all areas but the test one
methods = [f'{q_name}-cat' for q_name in q_names]

table = Table(name='allconcat', benchmarks=benchmarks, methods=methods, stat_test=None, color_mode='local')
table.format.mean_prec = 4
table.format.show_std = False
table.format.sta = False
table.format.remove_zero = True

for q_name, q in quantifiers():
    for i, (Ai, Xi, yi) in tqdm(enumerate(data), total=n_areas):
        # training: concatenate all areas except the test area Ai
        trainings = [LabelledCollection(Xj, yj) for Aj, Xj, yj in data if Aj != Ai]
        tr = LabelledCollection.join(*trainings)
        q.fit(tr)

        # test
        te = LabelledCollection(Xi, yi)
        qp.environ["SAMPLE_SIZE"] = len(te)
        pred_prev = q.quantify(te.X)
        true_prev = te.prevalence()
        err = qp.error.mae(true_prev, pred_prev)

        method_name = f'{q_name}-cat'
        table.add(benchmark=f'te-{Ai}', method=method_name, v=err)

# text_outputs.append(f'{q_name} got mean {table.all_mean():.5f}, best mean {table.get_method_values("Best").mean():.5f}')

Table.LatexPDF(f'./results/allconcat/doc.pdf', [table])

# with open(f'./results/classifier/output.txt', 'tw') as foo:
#     foo.write('\n'.join(text_outputs))

Census/allmedian_3.1.py (new file)

@@ -0,0 +1,96 @@
import numpy as np
from sklearn.linear_model import LogisticRegressionCV
from Census.methods import AreaQuantifier, AggregationRule
from quapy.data import LabelledCollection
from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as MLPE
from quapy.method.aggregative import CC, PCC, ACC, PACC, EMQ
from commons import *
from table import Table
from tqdm import tqdm
import quapy as qp
from copy import deepcopy

np.set_printoptions(linewidth=np.inf)


def classifier():
    return LogisticRegressionCV()


def quantifiers():
    cls = classifier()
    yield 'MLPE', MLPE()
    yield 'CC', CC(cls)
    yield 'PCC', PCC(cls)
    yield 'ACC', ACC(cls)
    yield 'PACC', PACC(cls)
    yield 'SLD', EMQ(cls)


survey_y = './data/survey_y.csv'

Atr, Xtr, ytr = load_csv(survey_y, use_yhat=True)

preprocessor = Preprocessor()
Xtr = preprocessor.fit_transform(Xtr)

data = get_dataset_by_area(Atr, Xtr, ytr)
n_areas = len(data)

areas = [Ai for Ai, _, _ in data]
q_names = [q_name for q_name, _ in quantifiers()]

tables = []
text_outputs = []

benchmarks = [f'te-{Ai}' for Ai in areas]  # areas used as test

for aggr in ['median', 'mean']:
    # 'PACC-median' means the median of the prevalences predicted by the per-area PACC quantifiers
    # of all areas but the test one
    methods = [f'{q_name}-{aggr}' for q_name in q_names]

    table = Table(name=f'all{aggr}', benchmarks=benchmarks, methods=methods, stat_test=None, color_mode='local')
    table.format.mean_prec = 4
    table.format.show_std = False
    table.format.sta = False
    table.format.remove_zero = True

    for q_name, q in quantifiers():
        # pretrain quantifiers per area
        pretrained_area_q = []
        for i, (Ai, Xi, yi) in tqdm(enumerate(data), total=n_areas):
            q_i = deepcopy(q)
            q_i.fit(LabelledCollection(Xi, yi))
            pretrained_area_q.append(AreaQuantifier(Ai, q_i))

        for i, (Ai, Xi, yi) in tqdm(enumerate(data), total=n_areas):
            # compose members of the rule (quantifiers are already fit)
            # training
            area_quantifiers = [qA_j for qA_j in pretrained_area_q if qA_j.area != Ai]
            rule = AggregationRule(area_quantifiers, aggr=aggr)

            # test
            te = LabelledCollection(Xi, yi)
            qp.environ["SAMPLE_SIZE"] = len(te)
            pred_prev = rule.predict(Ai, te.X)
            true_prev = te.prevalence()
            err = qp.error.mae(true_prev, pred_prev)

            method_name = f'{q_name}-{aggr}'
            table.add(benchmark=f'te-{Ai}', method=method_name, v=err)

        # text_outputs.append(f'{q_name} got mean {table.all_mean():.5f}, best mean {table.get_method_values("Best").mean():.5f}')

    tables.append(table)

Table.LatexPDF(f'./results/allaggregation/doc.pdf', tables)

# with open(f'./results/classifier/output.txt', 'tw') as foo:
#     foo.write('\n'.join(text_outputs))


@@ -0,0 +1,70 @@
import numpy as np
from sklearn.linear_model import LogisticRegressionCV
from sklearn.svm import SVC, LinearSVC
from tqdm import tqdm
from commons import *
from table import Table

np.set_printoptions(linewidth=np.inf)


def classifiers():
    yield 'LR-opt', LogisticRegressionCV(class_weight='balanced', Cs=10)
    yield 'LR-def', LogisticRegressionCV()
    yield 'SVM-linear', LinearSVC()
    yield 'SVM-rbf', SVC(kernel='rbf')


survey_y = './data/survey_y.csv'

Atr, Xtr, ytr = load_csv(survey_y, use_yhat=True)

preprocessor = Preprocessor()
Xtr = preprocessor.fit_transform(Xtr)

trains = get_dataset_by_area(Atr, Xtr, ytr)
n_areas = len(trains)
areas = [Ai for Ai, _, _ in trains]

tables = []
text_outputs = []

benchmarks = [f'te-{Ai}' for Ai in areas]  # areas used as test
methods = [f'tr-{Ai}' for Ai in areas]  # areas on which a classifier is trained

for cls_name, c in classifiers():
    table = Table(name=cls_name, benchmarks=benchmarks, methods=methods, stat_test=None, color_mode='local', lower_is_better=False)
    table.format.mean_prec = 4
    table.format.show_std = False
    table.format.sta = False
    table.format.remove_zero = True

    for i, (Ai, Xi, yi) in tqdm(enumerate(trains), total=n_areas):
        c.fit(Xi, yi)
        for j, (Aj, Xj, yj) in enumerate(trains):
            if i == j: continue
            pred_labels = c.predict(Xj)
            true_labels = yj
            acc = (pred_labels == true_labels).mean()
            table.add(benchmark=f'te-{Aj}', method=f'tr-{Ai}', v=acc)

    for test in benchmarks:
        values = table.get_benchmark_values(test)
        table.add(benchmark=test, method='Best', v=max(values))
        table.add(benchmark=test, method='Worst', v=min(values))
        table.add(benchmark=test, method='AVE', v=np.mean(values))

    tables.append(table)
    text_outputs.append(f'{cls_name} got mean {table.all_mean():.5f}')

Table.LatexPDF(f'./results/classifier/doc.pdf', tables)

with open(f'./results/classifier/output.txt', 'tw') as foo:
    foo.write('\n'.join(text_outputs))

Census/commons.py (new file)

@@ -0,0 +1,90 @@
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

np.set_printoptions(linewidth=np.inf)


def load_csv(file, use_yhat=True):
    df = pd.read_csv(file)

    cod_area = 'cod.prov'
    if use_yhat:
        covariates = ['owner', 'eta', 'work', 'sex', 'year_edu', 'hsize', 'y.hat', 'prob']
    else:
        covariates = ['owner', 'eta', 'work', 'sex', 'year_edu', 'hsize', 'prob']
    y_true = 'y.true'

    X = df[covariates].values
    A = df[cod_area].values

    # for i, cov in enumerate(covariates):
    #     print(f'values of col {i} "{cov}" {np.unique(X[:,i])}')

    if y_true in df.columns:
        y = df[y_true].values
        return A, X, y
    else:
        return A, X


def get_dataset_by_area(A, X, y=None):
    data = []
    for area in np.unique(A):
        sel = (A == area)
        Xsel = X[sel]
        if y is not None:
            ysel = y[sel]
        else:
            ysel = None
        data.append((area, Xsel, ysel))
    return data


class AdjMatrix:

    def __init__(self, path):
        df = pd.read_csv(path)

        area_codes = df.columns[1:].values
        area_codes = np.asarray([int(c) for c in area_codes])

        values = df.values[:, 1:]

        print(area_codes)
        print(values)

        self.area2idx = {area: i for i, area in enumerate(area_codes)}
        self.idx2area = area_codes
        self.M = np.asarray(values)

    def adjacent(self, cod_1, cod_2):
        idx1 = self.area2idx[cod_1]
        idx2 = self.area2idx[cod_2]
        return (self.M[idx1, idx2] == 1)

    def get_adjacent(self, cod):
        idx = self.area2idx[cod]
        idx_adj = np.argwhere(self.M[idx] == 1).flatten()
        return self.idx2area[idx_adj]


class Preprocessor:

    def __init__(self):
        self.scaler = StandardScaler()
        # self.standardize_col_ids = np.asarray([1, 4, 5])  # eta, year_edu, hsize
        self.standardize_col_ids = np.arange(8)  # everything

    def fit(self, X, y=None):
        Xsel = X[:, self.standardize_col_ids]
        self.scaler.fit(Xsel)
        return self

    def transform(self, X):
        Xsel = X[:, self.standardize_col_ids]
        Xsel_zscore = self.scaler.transform(Xsel)
        X[:, self.standardize_col_ids] = Xsel_zscore
        return X

    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)

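For reference, a minimal sketch of the data contract that commons.py expects; the CSV below is fabricated (area codes and all values invented) and only illustrates the columns read by load_csv and the per-area split:

# toy example (invented data), only to illustrate the column contract of load_csv
import pandas as pd
from commons import load_csv, get_dataset_by_area, Preprocessor

toy = pd.DataFrame({
    'cod.prov': [45, 45, 46, 46],
    'owner':    [1, 0, 1, 1],
    'eta':      [34, 61, 28, 50],
    'work':     [1, 0, 1, 1],
    'sex':      [0, 1, 1, 0],
    'year_edu': [13, 8, 18, 11],
    'hsize':    [3, 2, 1, 4],
    'y.hat':    [1, 0, 1, 0],
    'prob':     [0.8, 0.2, 0.9, 0.4],
    'y.true':   [1, 0, 1, 1],
})
toy.to_csv('toy_survey.csv', index=False)

A, X, y = load_csv('toy_survey.csv', use_yhat=True)   # X holds the 8 covariate columns
X = Preprocessor().fit_transform(X)                   # z-scores all 8 columns
for area, Xa, ya in get_dataset_by_area(A, X, y):
    print(area, Xa.shape, ya.shape)                   # one (X, y) block per area code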

@@ -9,6 +9,7 @@ from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as MLPE
 from quapy.method.aggregative import EMQ, PACC, CC, PCC, MS2, MS, ACC
 from quapy.data import LabelledCollection
 from sklearn.preprocessing import StandardScaler
+from commons import *

 np.set_printoptions(linewidth=np.inf)
@@ -16,67 +17,6 @@ np.set_printoptions(linewidth=np.inf)
 cens_y = './data/cens_y.csv'
 survey_y = './data/survey_y.csv'

-def load_csv(file, use_yhat=True):
-    df = pd.read_csv(file)
-    cod_area = 'cod.prov'
-    if use_yhat:
-        covariates = ['owner', 'eta', 'work', 'sex', 'year_edu', 'hsize', 'y.hat', 'prob']
-    else:
-        covariates = ['owner', 'eta', 'work', 'sex', 'year_edu', 'hsize', 'prob']
-    y_true = 'y.true'
-    X = df[covariates].values
-    A = df[cod_area].values
-    for i, cov in enumerate(covariates):
-        print(f'values of col {i} "{cov}" {np.unique(X[:,i])}')
-    if y_true in df.columns:
-        y = df[y_true].values
-        return A, X, y
-    else:
-        return A, X
-
-def get_dataset_by_area(A, X, y=None):
-    lc = []
-    for area in np.unique(A):
-        sel = (A == area)
-        Xsel = X[sel]
-        if y is not None:
-            ysel = y[sel]
-        else:
-            ysel = None
-        lc.append((area, Xsel, ysel))
-    return lc
-
-class Preprocessor:
-    def __init__(self):
-        self.scaler = StandardScaler()
-        # self.standardize_col_ids = np.asarray([1, 4, 5])  # eta, year_edu, hsize
-        self.standardize_col_ids = np.arange(8)  # everything
-
-    def fit(self, X, y=None):
-        Xsel = X[:, self.standardize_col_ids]
-        self.scaler.fit(Xsel)
-        return self
-
-    def transform(self, X):
-        Xsel = X[:, self.standardize_col_ids]
-        Xsel_zscore = self.scaler.transform(Xsel)
-        X[:, self.standardize_col_ids] = Xsel_zscore
-        return X
-
-    def fit_transform(self, X, y=None):
-        return self.fit(X, y).transform(X)
-
 # Ate, Xte = load_csv(cens_y)
 Atr, Xtr, ytr = load_csv(survey_y, use_yhat=True)

Census/methods.py (new file)

@@ -0,0 +1,111 @@
from abc import abstractmethod, ABC
from copy import deepcopy
from typing import List, Iterable
import numpy as np
import quapy as qp
from quapy.method.aggregative import AggregativeQuantifier
from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as MLPE
from quapy.data import LabelledCollection
from quapy.method.base import BaseQuantifier


class AreaQuantifier:
    def __init__(self, area: int, quantifier: BaseQuantifier):
        self.area = area
        self.quantifier = quantifier

    def quantify(self, X):
        return self.quantifier.quantify(X)


class CombinationRule(ABC):
    def __init__(self, area_quantifiers: List[AreaQuantifier]):
        self.area_quantifiers = area_quantifiers

    @abstractmethod
    def select_quantifiers(self, area: int, X):
        ...

    @abstractmethod
    def combination(self, choice, X):
        ...

    def predict(self, area: int, X):
        choice = self.select_quantifiers(area, X)
        prevalence = self.combination(choice, X)
        return prevalence


def optimize_ensemble(area_data: Iterable, q: BaseQuantifier, Madj=None, hyper=None, error='mae'):
    if hyper is None:
        hyper = {
            'classifier__C': np.logspace(-4, 4, 9),
            'classifier__class_weight': ['balanced', None]
        }
    labelled_collections = [(A, LabelledCollection(X, y)) for A, X, y in area_data]
    area_quantifiers = []
    for A, lc in labelled_collections:
        if Madj is None:
            rest = [lc_j for Aj, lc_j in labelled_collections if Aj != A]
        else:
            rest = [lc_j for Aj, lc_j in labelled_collections if Aj != A and Aj in Madj.get_adjacent(A)]
        q = optim(q, lc, rest, hyper, error)
        area_quantifiers.append(AreaQuantifier(A, q))
    return area_quantifiers


class AggregationRule(CombinationRule):
    def __init__(self, area_quantifiers: List[AreaQuantifier], adjacent_matrix: 'AdjMatrix' = None, aggr='median'):
        assert aggr in ['mean', 'median'], f'unknown {aggr=}'
        self.area_quantifiers = area_quantifiers
        self.adjacent_matrix = adjacent_matrix
        self.aggr = aggr

    def select_quantifiers(self, area: int, X):
        if self.adjacent_matrix is None:
            chosen = self.area_quantifiers
        else:
            adjacent = self.adjacent_matrix.get_adjacent(area)
            chosen = [q_i for q_i in self.area_quantifiers if q_i.area in adjacent]
        return chosen

    def combination(self, choice, X):
        prevs = np.asarray([q.quantify(X) for q in choice])
        if self.aggr == 'median':
            prev = np.median(prevs, axis=0)
        elif self.aggr == 'mean':
            prev = np.mean(prevs, axis=0)
        else:
            raise NotImplementedError(f'{self.aggr=} not implemented')
        return prev


def optim(q: BaseQuantifier, train: LabelledCollection, labelled_collections: Iterable[LabelledCollection], hyper: dict, error='mae'):
    q = deepcopy(q)
    prot = qp.protocol.IterateProtocol(labelled_collections)
    try:
        mod_sel = qp.model_selection.GridSearchQ(
            model=q,
            param_grid=hyper,
            protocol=prot,
            error=error,
            refit=False,
            n_jobs=-1
        ).fit(train)
        fitted = mod_sel.best_model_
    except ValueError:
        print(f'method {q} failed; training without model selection')
        fitted = q.fit(train)
    return fitted

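For reference, a minimal sketch of how AreaQuantifier and AggregationRule compose, mirroring what allmedian_3.1.py does; the area codes and data below are invented:

# toy usage sketch (invented areas, random data)
import numpy as np
from sklearn.linear_model import LogisticRegression
from quapy.data import LabelledCollection
from quapy.method.aggregative import PACC
from Census.methods import AreaQuantifier, AggregationRule

rng = np.random.default_rng(0)
area_qs = []
for area in [1, 2, 3]:
    X = rng.normal(size=(200, 5))        # toy covariates
    y = rng.integers(0, 2, size=200)     # toy binary labels
    q = PACC(LogisticRegression())
    q.fit(LabelledCollection(X, y))      # one quantifier per area
    area_qs.append(AreaQuantifier(area, q))

# evaluate on area 1: aggregate the quantifiers of the remaining areas
rule = AggregationRule([aq for aq in area_qs if aq.area != 1], aggr='median')
X_test = rng.normal(size=(100, 5))
print(rule.predict(area=1, X=X_test))    # median of the predicted prevalence vectors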
Census/pairwise_2.py (new file)

@@ -0,0 +1,86 @@
import numpy as np
from sklearn.linear_model import LogisticRegressionCV
from quapy.data import LabelledCollection
from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as MLPE
from quapy.method.aggregative import CC, PCC, ACC, PACC, EMQ
from commons import *
from table import Table
from tqdm import tqdm
import quapy as qp

np.set_printoptions(linewidth=np.inf)


def classifier():
    # return LogisticRegressionCV(class_weight='balanced', Cs=10)
    return LogisticRegressionCV()


def quantifiers():
    cls = classifier()
    yield 'MLPE', MLPE()
    yield 'CC', CC(cls)
    yield 'PCC', PCC(cls)
    yield 'ACC', ACC(cls)
    yield 'PACC', PACC(cls)


survey_y = './data/survey_y.csv'

Atr, Xtr, ytr = load_csv(survey_y, use_yhat=True)

preprocessor = Preprocessor()
Xtr = preprocessor.fit_transform(Xtr)

trains = get_dataset_by_area(Atr, Xtr, ytr)
n_areas = len(trains)
areas = [Ai for Ai, _, _ in trains]

tables = []
text_outputs = []

benchmarks = [f'te-{Ai}' for Ai in areas]  # areas used as test
methods = [f'tr-{Ai}' for Ai in areas]  # areas on which a quantifier is trained

for q_name, q in quantifiers():
    table = Table(name=q_name, benchmarks=benchmarks, methods=methods, stat_test=None, color_mode='global')
    table.format.mean_prec = 4
    table.format.show_std = False
    table.format.sta = False
    table.format.remove_zero = True
    table.with_mean = True

    for i, (Ai, Xi, yi) in tqdm(enumerate(trains), total=n_areas):
        tr = LabelledCollection(Xi, yi)
        q.fit(tr)
        len_tr = len(tr)
        for j, (Aj, Xj, yj) in enumerate(trains):
            if i == j: continue
            te = LabelledCollection(Xj, yj)
            qp.environ["SAMPLE_SIZE"] = len(te)
            pred_prev = q.quantify(te.X)
            true_prev = te.prevalence()
            # err = qp.error.mrae(true_prev, pred_prev)
            err = qp.error.mae(true_prev, pred_prev)
            table.add(benchmark=f'te-{Aj}', method=f'tr-{Ai}', v=err)

    for test in benchmarks:
        values = table.get_benchmark_values(test)
        table.add(benchmark=test, method='Best', v=min(values))

    tables.append(table)
    text_outputs.append(f'{q_name} got mean {table.all_mean():.5f}, best mean {table.get_method_values("Best").mean():.5f}')

Table.LatexPDF(f'./results/pairwise/doc.pdf', tables)

with open(f'./results/classifier/output.txt', 'tw') as foo:
    foo.write('\n'.join(text_outputs))

Census/table.py (new file)

@@ -0,0 +1,476 @@
import numpy as np
from typing import Union, List
from collections.abc import Iterable
from dataclasses import dataclass
from scipy.stats import wilcoxon, ttest_ind_from_stats
import pandas as pd
import os
from pathlib import Path


@dataclass
class CellFormat:
    mean_prec: int = 3
    std_prec: int = 3
    show_std: bool = True
    remove_zero: bool = False
    color: bool = True
    maxtone: int = 50


class Cell:

    def __init__(self, format: CellFormat, group: 'CellGroup'):
        self.values = []
        self.format = format
        self.touch()
        self.group = group
        self.group.register_cell(self)

    def __len__(self):
        return len(self.values)

    def mean(self):
        if self.mean_ is None:
            self.mean_ = np.mean(self.values)
        return self.mean_

    def std(self):
        if self.std_ is None:
            self.std_ = np.std(self.values)
        return self.std_

    def touch(self):
        # invalidates the cached mean and std
        self.mean_ = None
        self.std_ = None

    def append(self, v: Union[float, Iterable]):
        if isinstance(v, Iterable):
            self.values.extend(v)
        else:
            self.values.append(v)
        self.touch()

    def isEmpty(self):
        return len(self) == 0

    def isBest(self):
        best = self.group.best()
        if best is not None:
            return (best == self) or (np.isclose(best.mean(), self.mean()))
        return False

    def print_mean(self):
        if self.isEmpty():
            return ''
        else:
            return f'{self.mean():.{self.format.mean_prec}f}'

    def print(self):
        if self.isEmpty():
            return ''

        # mean
        # ---------------------------------------------------
        mean = self.print_mean()
        if self.format.remove_zero:
            mean = mean.replace('0.', '.')

        # std ?
        # ---------------------------------------------------
        if self.format.show_std:
            std = f' $\pm$ {self.std():.{self.format.std_prec}f}'
        else:
            std = ''

        # bold or statistical test
        # ---------------------------------------------------
        if self.isBest():
            str_cell = f'\\textbf{{{mean}{std}}}'
        else:
            comp_symbol = ''
            pval = self.group.compare(self)
            if pval is not None:
                if 0.005 > pval:
                    comp_symbol = ''
                elif 0.05 > pval >= 0.005:
                    comp_symbol = '$^{\dag}$'
                elif pval >= 0.05:
                    comp_symbol = '${\ddag}$'
            str_cell = f'{mean}{comp_symbol}{std}'

        # color ?
        # ---------------------------------------------------
        if self.format.color:
            str_cell += ' ' + self.group.color(self)

        return str_cell


class CellGroup:

    def __init__(self, lower_is_better=True, stat_test='wilcoxon', color_mode='local', color_global_min=None, color_global_max=None):
        assert stat_test in ['wilcoxon', 'ttest', None], \
            f"unknown {stat_test=}, valid ones are wilcoxon, ttest, or None"
        assert color_mode in ['local', 'global'], \
            f"unknown {color_mode=}, valid ones are local and global"
        if (color_global_min is not None or color_global_max is not None) and color_mode == 'local':
            print('warning: color_global_min and color_global_max are only considered when color_mode==global')
        self.cells = []
        self.lower_is_better = lower_is_better
        self.stat_test = stat_test
        self.color_mode = color_mode
        self.color_global_min = color_global_min
        self.color_global_max = color_global_max

    def register_cell(self, cell: Cell):
        self.cells.append(cell)

    def non_empty_cells(self):
        return [c for c in self.cells if not c.isEmpty()]

    def max(self):
        cells = self.non_empty_cells()
        if len(cells) > 0:
            return cells[np.argmax([c.mean() for c in cells])]
        return None

    def min(self):
        cells = self.non_empty_cells()
        if len(cells) > 0:
            return cells[np.argmin([c.mean() for c in cells])]
        return None

    def best(self) -> Cell:
        return self.min() if self.lower_is_better else self.max()

    def worst(self) -> Cell:
        return self.max() if self.lower_is_better else self.min()

    def isEmpty(self):
        return len(self.non_empty_cells()) == 0

    def compare(self, cell: Cell):
        best = self.best()
        best_n = len(best)
        cell_n = len(cell)
        if best_n > 0 and cell_n > 0:
            if self.stat_test == 'wilcoxon':
                try:
                    _, p_val = wilcoxon(best.values, cell.values)
                except ValueError:
                    p_val = None
                return p_val
            elif self.stat_test == 'ttest':
                best_mean, best_std = best.mean(), best.std()
                cell_mean, cell_std = cell.mean(), cell.std()
                _, p_val = ttest_ind_from_stats(best_mean, best_std, best_n, cell_mean, cell_std, cell_n)
                return p_val
            elif self.stat_test is None:
                return None
            else:
                raise ValueError(f'unknown statistical test {self.stat_test}')
        else:
            return None

    def color(self, cell: Cell):
        if cell.isEmpty():
            return ''
        cell_mean = cell.mean()
        if self.color_mode == 'local':
            best = self.best()
            worst = self.worst()
            if best is None or worst is None:
                return ''
            best_mean = best.mean()
            worst_mean = worst.mean()
            if best_mean == worst_mean:
                return ''
            # normalize val in [0,1]
            maxval = max(best_mean, worst_mean)
            minval = min(best_mean, worst_mean)
        else:
            maxval = self.color_global_max
            minval = self.color_global_min

        normval = (cell_mean - minval) / (maxval - minval)

        if self.lower_is_better:
            normval = 1 - normval

        normval = np.clip(normval, 0, 1)
        normval = normval * 2 - 1  # rescale to [-1,1]

        if normval < 0:
            color = 'red'
            tone = cell.format.maxtone * (-normval)
        else:
            color = 'green'
            tone = cell.format.maxtone * normval
        return f'\cellcolor{{{color}!{int(tone)}}}'


class Table:

    def __init__(self,
                 name,
                 benchmarks=None,
                 methods=None,
                 format: CellFormat = None,
                 lower_is_better=True,
                 stat_test='wilcoxon',
                 color_mode='local',
                 with_mean=True):
        self.name = name
        self.benchmarks = [] if benchmarks is None else benchmarks
        self.methods = [] if methods is None else methods
        self.format = format if format is not None else CellFormat()
        self.lower_is_better = lower_is_better
        self.stat_test = stat_test
        self.color_mode = color_mode
        self.with_mean = with_mean
        self.only_full_mean = True  # if False, compute the mean of partially empty methods also

        if self.color_mode == 'global':
            self.color_global_min = 0
            self.color_global_max = 1
        else:
            self.color_global_min = None
            self.color_global_max = None

        self.T = {}
        self.groups = {}

    def add(self, benchmark, method, v):
        cell = self.get(benchmark, method)
        cell.append(v)

    def get_benchmarks(self):
        return self.benchmarks

    def get_methods(self):
        return self.methods

    def n_benchmarks(self):
        return len(self.benchmarks)

    def n_methods(self):
        return len(self.methods)

    def _new_group(self):
        return CellGroup(self.lower_is_better, self.stat_test, color_mode=self.color_mode,
                         color_global_max=self.color_global_max, color_global_min=self.color_global_min)

    def get(self, benchmark, method) -> Cell:
        if benchmark not in self.benchmarks:
            self.benchmarks.append(benchmark)
        if benchmark not in self.groups:
            self.groups[benchmark] = self._new_group()
        if method not in self.methods:
            self.methods.append(method)
        b_idx = self.benchmarks.index(benchmark)
        m_idx = self.methods.index(method)
        idx = tuple((b_idx, m_idx))
        if idx not in self.T:
            self.T[idx] = Cell(self.format, group=self.groups[benchmark])
        cell = self.T[idx]
        return cell

    def get_value(self, benchmark, method) -> float:
        return self.get(benchmark, method).mean()

    def get_benchmark(self, benchmark):
        cells = [self.get(benchmark, method=m) for m in self.get_methods()]
        cells = [c for c in cells if not c.isEmpty()]
        return cells

    def get_method(self, method):
        cells = [self.get(benchmark=b, method=method) for b in self.get_benchmarks()]
        cells = [c for c in cells if not c.isEmpty()]
        return cells

    def get_method_means(self, method_order):
        mean_group = self._new_group()
        cells = []
        for method in method_order:
            method_mean = Cell(self.format, group=mean_group)
            for bench in self.get_benchmarks():
                mean_value = self.get_value(benchmark=bench, method=method)
                if not np.isnan(mean_value):
                    method_mean.append(mean_value)
            cells.append(method_mean)
        return cells

    def get_benchmark_values(self, benchmark):
        values = np.asarray([c.mean() for c in self.get_benchmark(benchmark)])
        return values

    def get_method_values(self, method):
        values = np.asarray([c.mean() for c in self.get_method(method)])
        return values

    def all_mean(self):
        values = [c.mean() for c in self.T.values() if not c.isEmpty()]
        return np.mean(values)

    def print(self):  # todo: missing method names?
        data_dict = {}
        data_dict['Benchmark'] = [b for b in self.get_benchmarks()]
        for method in self.get_methods():
            data_dict[method] = [self.get(bench, method).print_mean() for bench in self.get_benchmarks()]
        df = pd.DataFrame(data_dict)
        pd.set_option('display.max_columns', None)
        pd.set_option('display.max_rows', None)
        print(df.to_string(index=False))

    def tabular(self, path=None, benchmark_replace=None, method_replace=None, benchmark_order=None, method_order=None, transpose=False):
        if benchmark_replace is None:
            benchmark_replace = {}
        if method_replace is None:
            method_replace = {}
        if benchmark_order is None:
            benchmark_order = self.get_benchmarks()
        if method_order is None:
            method_order = self.get_methods()

        if transpose:
            row_order, row_replace = method_order, method_replace
            col_order, col_replace = benchmark_order, benchmark_replace
        else:
            row_order, row_replace = benchmark_order, benchmark_replace
            col_order, col_replace = method_order, method_replace

        n_cols = len(col_order)
        add_mean_col = self.with_mean and transpose
        add_mean_row = self.with_mean and not transpose
        last_col_idx = n_cols + 2 if add_mean_col else n_cols + 1

        if self.with_mean:
            mean_cells = self.get_method_means(method_order)

        lines = []
        lines.append('\\begin{tabular}{|c' + '|c' * n_cols + ('||c' if add_mean_col else '') + "|}")
        lines.append(f'\\cline{{2-{last_col_idx}}}')

        l = '\multicolumn{1}{c|}{} & '
        l += ' & '.join([col_replace.get(col, col) for col in col_order])
        if add_mean_col:
            l += ' & Ave.'
        l += ' \\\\\\hline'
        lines.append(l)

        for i, row in enumerate(row_order):
            rowname = row_replace.get(row, row)
            l = rowname + ' & '
            l += ' & '.join([
                self.get(benchmark=col if transpose else row, method=row if transpose else col).print()
                for col in col_order
            ])
            if add_mean_col:
                l += ' & ' + mean_cells[i].print()
            l += ' \\\\\\hline'
            lines.append(l)

        if add_mean_row:
            lines.append('\hline')
            l = 'Ave. & '
            l += ' & '.join([mean_cell.print() for mean_cell in mean_cells])
            l += ' \\\\\\hline'
            lines.append(l)

        lines.append('\\end{tabular}')
        tabular_tex = '\n'.join(lines)

        if path is not None:
            parent = Path(path).parent
            if parent:
                os.makedirs(parent, exist_ok=True)
            with open(path, 'wt') as foo:
                foo.write(tabular_tex)

        return tabular_tex

    def table(self, tabular_path, benchmark_replace=None, method_replace=None, resizebox=True, caption=None, label=None, benchmark_order=None, method_order=None, transpose=False):
        if benchmark_replace is None:
            benchmark_replace = {}
        if method_replace is None:
            method_replace = {}

        lines = []
        lines.append('\\begin{table}')
        lines.append('\center')
        if resizebox:
            lines.append('\\resizebox{\\textwidth}{!}{%')

        tabular_str = self.tabular(tabular_path, benchmark_replace, method_replace, benchmark_order, method_order, transpose)
        if tabular_path is None:
            lines.append(tabular_str)
        else:
            lines.append(f'\input{{tables/{Path(tabular_path).name}}}')

        if resizebox:
            lines.append('}%')

        if caption is None:
            caption = tabular_path.replace('_', '\_')
        lines.append(f'\caption{{{caption}}}')
        if label is not None:
            lines.append(f'\label{{{label}}}')
        lines.append('\end{table}')

        table_tex = '\n'.join(lines)
        return table_tex

    def document(self, tex_path, tabular_dir='tables', *args, **kwargs):
        Table.Document(tex_path, tables=[self], tabular_dir=tabular_dir, *args, **kwargs)

    def latexPDF(self, pdf_path, tabular_dir='tables', *args, **kwargs):
        return Table.LatexPDF(pdf_path, tables=[self], tabular_dir=tabular_dir, *args, **kwargs)

    @classmethod
    def Document(cls, tex_path, tables: List['Table'], tabular_dir='tables', *args, **kwargs):
        lines = []
        lines.append('\\documentclass[10pt,a4paper]{article}')
        lines.append('\\usepackage[utf8]{inputenc}')
        lines.append('\\usepackage{amsmath}')
        lines.append('\\usepackage{amsfonts}')
        lines.append('\\usepackage{amssymb}')
        lines.append('\\usepackage{graphicx}')
        lines.append('\\usepackage{xcolor}')
        lines.append('\\usepackage{colortbl}')
        lines.append('')
        lines.append('\\begin{document}')
        for table in tables:
            lines.append('')
            lines.append(table.table(os.path.join(Path(tex_path).parent, tabular_dir, table.name + '_table.tex'), *args, **kwargs))
        lines.append('\\end{document}')

        document = '\n'.join(lines)

        parent = Path(tex_path).parent
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(tex_path, 'wt') as foo:
            foo.write(document)

        return document

    @classmethod
    def LatexPDF(cls, pdf_path: str, tables: List['Table'], tabular_dir: str = 'tables', *args, **kwargs):
        assert pdf_path.endswith('.pdf'), f'{pdf_path=} does not seem a valid name for a pdf file'
        tex_path = pdf_path.replace('.pdf', '.tex')
        cls.Document(tex_path, tables, tabular_dir, *args, **kwargs)
        dir = Path(pdf_path).parent
        pwd = os.getcwd()
        print('currently in', pwd)
        print("[Tables Done] running latex")
        os.chdir(dir)
        os.system('pdflatex ' + Path(tex_path).name)
        basename = Path(tex_path).name.replace('.tex', '')
        os.system(f'rm {basename}.aux {basename}.bbl {basename}.blg {basename}.log {basename}.out {basename}.dvi')
        os.chdir(pwd)

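For reference, a minimal sketch of the Table API defined above; benchmark/method names and values are invented:

# toy example: build a small table and render it
from table import Table

t = Table(name='toy', benchmarks=['b1', 'b2'], methods=['m1', 'm2'], stat_test=None)
t.format.show_std = False
t.add(benchmark='b1', method='m1', v=0.10)
t.add(benchmark='b1', method='m2', v=0.20)
t.add(benchmark='b2', method='m1', v=0.30)
t.add(benchmark='b2', method='m2', v=0.25)
t.print()            # plain-text preview of the cell means
tex = t.tabular()    # LaTeX tabular as a string (nothing written when path is None)
# Table.LatexPDF('./results/toy/doc.pdf', [t])  # would also compile a PDF with pdflatex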
Census/tmp.py (new file)

@@ -0,0 +1,16 @@
import numpy as np
import pandas as pd
from Census.commons import AdjMatrix, load_csv, get_dataset_by_area
census = './data/cens_y.csv'
Areas, X = load_csv(census, use_yhat=True)
data = get_dataset_by_area(Areas, X)
areas = [a for a, *_ in data]
print(f'Area codes={areas}')
A = AdjMatrix('./data/matrice_adiacenza.csv')
print(A.adjacent(45, 46))
print(A.get_adjacent(50))