forked from moreo/QuaPy
90 lines
2.8 KiB
Python
90 lines
2.8 KiB
Python
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
from sklearn.linear_model import LogisticRegression
|
|
import quapy as qp
|
|
import numpy as np
|
|
|
|
from Ordinal.model import OrderedLogisticRegression, StackedClassifier, RegressionQuantification, RegressorClassifier
|
|
from quapy.method.aggregative import PACC, CC, EMQ, PCC, ACC
|
|
from quapy.data import LabelledCollection
|
|
from os.path import join
|
|
from utils import load_samples, load_samples_pkl
|
|
from evaluation import nmd, mnmd
|
|
from time import time
|
|
import pickle
|
|
from tqdm import tqdm
|
|
|
|
domain = 'Books-tfidf'
|
|
datapath = './data'
|
|
protocol = 'app'
|
|
drift = 'high'
|
|
|
|
train = pickle.load(open(join(datapath, domain, 'training_data.pkl'), 'rb'))
|
|
|
|
|
|
def load_test_samples():
|
|
ids = np.load(join(datapath, domain, protocol, f'{drift}drift.test.id.npy'))
|
|
ids = set(ids)
|
|
for sample in tqdm(load_samples_pkl(join(datapath, domain, protocol, 'test_samples'), filter=ids), total=len(ids)):
|
|
yield sample.instances, sample.prevalence()
|
|
|
|
|
|
def load_dev_samples():
|
|
ids = np.load(join(datapath, domain, protocol, f'{drift}drift.dev.id.npy'))
|
|
ids = set(ids)
|
|
for sample in tqdm(load_samples_pkl(join(datapath, domain, protocol, 'dev_samples'), filter=ids), total=len(ids)):
|
|
yield sample.instances, sample.prevalence()
|
|
|
|
|
|
print('fitting the quantifier')
|
|
|
|
# q = EMQ(LogisticRegression(class_weight='balanced'))
|
|
# q = PACC(LogisticRegression(class_weight='balanced'))
|
|
q = PACC(OrderedLogisticRegression())
|
|
# q = PACC(StackedClassifier(LogisticRegression(class_weight='balanced')))
|
|
# q = RegressionQuantification(PCC(LogisticRegression(class_weight='balanced')), val_samples_generator=load_dev_samples)
|
|
# q = ACC(RegressorClassifier())
|
|
|
|
param_grid = {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']}
|
|
# param_grid = {'C': np.logspace(-3,3,14)}
|
|
# param_grid = {'alpha':np.logspace(-8, 6, 15)}
|
|
|
|
# q = qp.model_selection.GridSearchQ(
|
|
# q,
|
|
# param_grid,
|
|
# 1000,
|
|
# 'gen',
|
|
# error=mnmd,
|
|
# val_split=load_dev_samples,
|
|
# n_jobs=-1,
|
|
# refit=False,
|
|
# verbose=True)
|
|
|
|
q.fit(train)
|
|
|
|
# q = RegressionQuantification(q, val_samples_generator=load_dev_samples)
|
|
# q.fit(None)
|
|
|
|
print('[done]')
|
|
|
|
report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
|
|
mean_nmd = report['nmd'].mean()
|
|
std_nmd = report['nmd'].std()
|
|
print(f'{mean_nmd:.4f} +-{std_nmd:.4f}')
|
|
|
|
q = RegressionQuantification(q, val_samples_generator=load_dev_samples)
|
|
q.fit(None)
|
|
|
|
report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
|
|
mean_nmd = report['nmd'].mean()
|
|
std_nmd = report['nmd'].std()
|
|
print(f'[regression-correction] {mean_nmd:.4f} +-{std_nmd:.4f}')
|
|
|
|
# drift='high'
|
|
# report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
|
|
# mean_nmd = report['nmd'].mean()
|
|
# std_nmd = report['nmd'].std()
|
|
# print(f'{mean_nmd:.4f} +-{std_nmd:.4f}')
|
|
|
|
|
|
|