forked from moreo/QuaPy
trying ordinal classification
This commit is contained in:
parent
f285e936ad
commit
b982a51103
|
@ -1,35 +1,74 @@
|
||||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
from sklearn.linear_model import LogisticRegression
|
from sklearn.linear_model import LogisticRegression
|
||||||
import quapy as qp
|
import quapy as qp
|
||||||
from method.aggregative import PACC, CC, EMQ
|
import numpy as np
|
||||||
|
|
||||||
|
from Ordinal.model import OrderedLogisticRegression, StackedClassifier, RegressionQuantification, RegressorClassifier
|
||||||
|
from quapy.method.aggregative import PACC, CC, EMQ, PCC, ACC
|
||||||
from quapy.data import LabelledCollection
|
from quapy.data import LabelledCollection
|
||||||
from os.path import join
|
from os.path import join
|
||||||
from utils import load_samples
|
from utils import load_samples, load_samples_pkl
|
||||||
from evaluation import nmd
|
from evaluation import nmd, mnmd
|
||||||
|
from time import time
|
||||||
|
import pickle
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
# --- experiment configuration ---------------------------------------------
domain = 'Books-tfidf'   # pre-vectorized (tfidf) dump of the Books dataset
datapath = './data'
protocol = 'app'         # sampling protocol subdirectory for dev/test samples
drift = 'low'            # which drift split ('{drift}drift.*.id.npy') to evaluate

# The training set is stored already vectorized (see the '-tfidf' domain name),
# so no TfidfVectorizer fitting is needed here.
# Use a context manager so the file handle is closed deterministically
# (the original passed a bare open() to pickle.load and leaked the handle).
with open(join(datapath, domain, 'training_data.pkl'), 'rb') as fin:
    train = pickle.load(fin)
|
|
||||||
|
|
||||||
|
|
||||||
def load_test_samples():
    """Yield (instances, prevalence) pairs for the pickled test samples
    belonging to the selected drift split."""
    # ids of the test samples assigned to the chosen drift regime
    sample_ids = set(np.load(join(datapath, domain, protocol, f'{drift}drift.test.id.npy')))
    samples_path = join(datapath, domain, protocol, 'test_samples')
    for sample in tqdm(load_samples_pkl(samples_path, filter=sample_ids), total=len(sample_ids)):
        yield sample.instances, sample.prevalence()
|
||||||
|
|
||||||
|
|
||||||
q = EMQ(LogisticRegression())
|
def load_dev_samples():
    """Yield (instances, prevalence) pairs for the pickled dev samples
    belonging to the selected drift split."""
    # ids of the dev samples assigned to the chosen drift regime
    sample_ids = set(np.load(join(datapath, domain, protocol, f'{drift}drift.dev.id.npy')))
    samples_path = join(datapath, domain, protocol, 'dev_samples')
    for sample in tqdm(load_samples_pkl(samples_path, filter=sample_ids), total=len(sample_ids)):
        yield sample.instances, sample.prevalence()
|
||||||
|
|
||||||
|
|
||||||
|
print('fitting the quantifier')

# quantifiers tried in earlier runs, kept for reference:
# q = PACC(LogisticRegression(class_weight='balanced'))
# q = PACC(OrderedLogisticRegression())
# q = PACC(StackedClassifier(LogisticRegression(class_weight='balanced')))
# q = RegressionQuantification(PCC(LogisticRegression(class_weight='balanced')), val_samples_generator=load_dev_samples)
q = PACC(RegressorClassifier())

# model selection over the regularization strength C, using the dev-sample
# generator as validation protocol and mnmd (from the local evaluation module)
# as the selection error
q = qp.model_selection.GridSearchQ(
    q,
    # {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']},
    {'C': np.logspace(-3,3,14)},
    1000,
    'gen',
    error=mnmd,
    val_split=load_dev_samples,
    n_jobs=-1,
    refit=False,
    verbose=True)

q.fit(train)
print('[done]')

# evaluate on the test-sample generator and report mean/std NMD
report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
mean_nmd = report['nmd'].mean()
std_nmd = report['nmd'].std()
print(f'{mean_nmd:.4f} +-{std_nmd:.4f}')

# drift='high'
# report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
# mean_nmd = report['nmd'].mean()
# std_nmd = report['nmd'].std()
# print(f'{mean_nmd:.4f} +-{std_nmd:.4f}')
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -3,6 +3,7 @@ from quapy.data import LabelledCollection
|
||||||
from glob import glob
|
from glob import glob
|
||||||
import os
|
import os
|
||||||
from os.path import join
|
from os.path import join
|
||||||
|
import pickle
|
||||||
|
|
||||||
|
|
||||||
def load_samples(path_dir, classes):
|
def load_samples(path_dir, classes):
|
||||||
|
@ -11,3 +12,11 @@ def load_samples(path_dir, classes):
|
||||||
yield LabelledCollection.load(join(path_dir, f'{id}.txt'), loader_func=qp.data.reader.from_text, classes=classes)
|
yield LabelledCollection.load(join(path_dir, f'{id}.txt'), loader_func=qp.data.reader.from_text, classes=classes)
|
||||||
|
|
||||||
|
|
||||||
|
def load_samples_pkl(path_dir, filter=None):
    """Yield the pickled sample objects stored in `path_dir`.

    Samples are expected to be files named `<id>.pkl` with consecutive
    integer ids starting at 0; they are yielded in id order.

    :param path_dir: directory containing the `<id>.pkl` sample files
    :param filter: optional collection of ids (e.g. a set) to keep;
        None (default) yields every sample
    """
    nsamples = len(glob(join(path_dir, '*.pkl')))
    for id in range(nsamples):
        if filter is not None and id not in filter:
            continue
        # context manager closes each file promptly; the original passed a
        # bare open() to pickle.load and leaked the handle to the GC
        with open(join(path_dir, f'{id}.pkl'), 'rb') as fin:
            yield pickle.load(fin)
|
||||||
|
|
||||||
|
|
|
@ -1,9 +1,10 @@
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from scipy.sparse import dok_matrix
|
from scipy.sparse import dok_matrix
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
from time import time
|
||||||
|
|
||||||
|
|
||||||
def from_text(path, encoding='utf-8', verbose=1, class2int=True):
|
def from_text(path, encoding='utf-8', verbose=0, class2int=True):
|
||||||
"""
|
"""
|
||||||
Reads a labelled collection of documents.
|
Reads a labelled collection of documents.
|
||||||
File format <0 or 1>\t<document>\n
|
File format <0 or 1>\t<document>\n
|
||||||
|
|
Loading…
Reference in New Issue