QuaPy/examples/4.using_pretrained_classifi...

"""
Aggregative quantifiers use an underlying classifier. Often, one has one pre-trained classifier available, and
needs to use this classifier at the basis of a quantification system. In such cases, the classifier should not
be retrained, but only used to issue classifier predictions for the quantifier.
In this example, we show how to instantiate a quantifier with a pre-trained classifier.
"""
from typing import List, Dict

import quapy as qp
from quapy.method.aggregative import PACC
from sklearn.base import BaseEstimator, ClassifierMixin
from transformers import pipeline
import numpy as np
import quapy.functional as F


# A scikit-learn's style wrapper for a huggingface-based pre-trained transformer for binary sentiment classification
class HFTextClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, model_name='distilbert-base-uncased-finetuned-sst-2-english'):
        self.pipe = pipeline("sentiment-analysis", model=model_name)
        self.classes_ = np.asarray([0,1])

    def fit(self, X, y=None):
        return self

    def _binary_decisions(self, transformer_output: List[Dict]):
        return np.array([(1 if p['label']=='POSITIVE' else 0) for p in transformer_output], dtype=int)

    def predict(self, X):
        X = list(map(str, X))
        preds = self.pipe(X, truncation=True)
        return self._binary_decisions(preds)

    def predict_proba(self, X):
        X = list(map(str, X))
        n_examples = len(X)
        preds = self.pipe(X, truncation=True)
        decisions = self._binary_decisions(preds)
        scores = np.array([p['score'] for p in preds], dtype=float)
        probas = np.zeros(shape=(len(X), 2), dtype=float)
        probas[np.arange(n_examples),decisions] = scores
        probas[np.arange(n_examples),~decisions] = 1-scores
        return probas

# load a sentiment dataset
dataset = qp.datasets.fetch_reviews('imdb', tfidf=False)  # raw text
train, test = dataset.training, dataset.test

# instantiate a pre-trained classifier
clf = HFTextClassifier()

# Let us fit a quantifier based on our pre-trained classifier.
# Note that, since the classifier is already fit, we will use the entire training set for
# learning the aggregation function of the quantifier.
# To do so, we only need to indicate "fit_classifier"=False, as follows:
quantifier = PACC(clf, fit_classifier=False)   # Probabilistic Classify & Count using a pre-trained model

print('training PACC...')
quantifier.fit(*train.Xy)

# let us simulate some shifted test data...
new_prevalence = [0.75, 0.25]
shifted_test = test.sampling(500, *new_prevalence, random_state=0)

# and do some evaluation
print('predicting with PACC...')
estim_prevalence = quantifier.predict(shifted_test.X)

print('Result:\n'+('='*20))
print(f'training prevalence: {F.strprev(train.prevalence())}')
print(f'(shifted) test prevalence: {F.strprev(shifted_test.prevalence())}')
print(f'estimated prevalence: {F.strprev(estim_prevalence)}')

absolute_error = qp.error.ae(new_prevalence, estim_prevalence)
print(f'absolute error={absolute_error:.4f}')