"""Evaluate several quantification methods on pre-retrieved test samples.

The script vectorizes the training texts with TF-IDF, fits each quantifier
returned by ``methods()`` on the training collection, and reports the mean of
the evaluation report (MAE and MRAE requested) over the test samples found in
``./data/test_data_*.txt``.
"""

from glob import glob
from os.path import join

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import quapy as qp
import quapy.functional as F
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC
from quapy.protocol import AbstractProtocol
from quapy.data.base import LabelledCollection

# NOTE(review): this import has no ``quapy.`` prefix, unlike the others; it
# presumably relies on a ``method`` package being importable from the working
# directory -- confirm it is not meant to be ``quapy.method.non_aggregative``.
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation


def methods():
    """Lazily yield ``(name, quantifier)`` pairs to be evaluated.

    MLPE is built without a classifier; every other quantifier wraps its own
    fresh ``LogisticRegression(n_jobs=-1)`` instance, created only when the
    generator reaches it.
    """
    yield ('MLPE', MaximumLikelihoodPrevalenceEstimation())
    aggregative = (
        ('CC', ClassifyAndCount),
        ('ACC', ACC),
        ('PCC', PCC),
        ('PACC', PACC),
        ('EMQ', EMQ),
    )
    for name, quantifier_cls in aggregative:
        yield (name, quantifier_cls(LogisticRegression(n_jobs=-1)))


def load_txt_sample(path, verbose=False):
    """Load one tab-separated sample file.

    :param path: path of the file, read with ``pandas.read_csv(sep='\\t')``
    :param verbose: if True, print progress messages around the read
    :return: tuple ``(X, y)`` holding the ``'text'`` column and the
        ``'first_letter_category'`` column of the file
    """
    if verbose:
        print(f'loading {path}...', end='')

    df = pd.read_csv(path, sep='\t')

    if verbose:
        print('[done]')

    X = df['text']
    y = df['first_letter_category']

    return X, y


class RetrievedSamples(AbstractProtocol):
    """Protocol that yields one test sample per ``test_data_*.txt`` file.

    :param path_dir: directory that is searched for ``test_data_*.txt`` files
    :param load_fn: callable mapping a file path to a pair ``(X, y)``
    :param vectorizer: already-fitted object whose ``transform`` is applied to
        the loaded ``X``
    :param classes: class labels passed on to every ``LabelledCollection``
    """

    def __init__(self, path_dir: str, load_fn, vectorizer, classes):
        self.path_dir = path_dir
        self.load_fn = load_fn
        self.vectorizer = vectorizer
        self.classes = classes

    def __call__(self):
        """Yield ``sample.Xp`` for every matching file, in sorted path order.

        A warning is printed (the sample is still yielded) when a file does not
        contain ``qp.environ['SAMPLE_SIZE']`` instances.
        """
        # glob() returns paths in arbitrary, filesystem-dependent order; sorting
        # makes the sequence of samples reproducible across runs and machines.
        for path in sorted(glob(join(self.path_dir, 'test_data_*.txt'))):
            X, y = self.load_fn(path)
            expected_size = qp.environ['SAMPLE_SIZE']
            if len(X) != expected_size:
                print(f'[warning]: file {path} contains {len(X)} instances (expected: {expected_size})')
            # assert len(X) == qp.environ['SAMPLE_SIZE'], f'unexpected sample size for file {file}, found {len(X)}'
            X = self.vectorizer.transform(X)
            sample = LabelledCollection(X, y, classes=self.classes)
            # The script below unpacks each yielded item as a pair (X, p).
            yield sample.Xp


qp.environ['SAMPLE_SIZE'] = 100
data_path = './data'
train_path = join(data_path, 'train_data.txt')
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5)

training = LabelledCollection.load(train_path, loader_func=load_txt_sample, verbose=True)

# training = training.sampling(1000)

# Fit the TF-IDF vocabulary on the raw training texts, then rebuild the
# training collection on top of the vectorized instances.
Xtr, ytr = training.Xy
Xtr = tfidf.fit_transform(Xtr)
print('Xtr shape = ', Xtr.shape)

training = LabelledCollection(Xtr, ytr)
classes = training.classes_

test_prot = RetrievedSamples(data_path, load_fn=load_txt_sample, vectorizer=tfidf, classes=classes)

print('Training prevalence:', F.strprev(training.prevalence()))
for _, p in test_prot():
    print('Test prevalence:', F.strprev(p))

for method_name, quantifier in methods():
    print('training ', method_name)
    quantifier.fit(training)
    print('[done]')

    report = qp.evaluation.evaluation_report(quantifier, test_prot, error_metrics=['mae', 'mrae'], verbose=True)

    # NOTE(review): if the report also carries non-numeric columns, recent
    # pandas versions may need ``report.mean(numeric_only=True)`` -- confirm.
    print(report.mean())