import os
from collections import defaultdict
from pathlib import Path
from typing import List, Dict

import kagglehub
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.interpolate import CubicSpline
from qunfold import KMM
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression as LR
from tqdm import tqdm
from transformers import pipeline

import quapy.functional as F
from quapy.data import LabelledCollection
from quapy.method.aggregative import CC, ACC, DistributionMatchingY, EMQ, KDEyML
from quapy.method.composable import QUnfoldWrapper
from quapy.method.non_aggregative import DistributionMatchingX

pd.set_option('display.max_columns', None)
pd.set_option('display.width', 2000)
pd.set_option('display.max_rows', None)
pd.set_option("display.expand_frame_repr", False)
pd.set_option("display.precision", 4)
pd.set_option("display.float_format", "{:.4f}".format)


def prepare_xy_date_blocks(df, freq="M"):
    """
    df: DataFrame with columns 'text', 'airline_sentiment', 'tweet_created'
    freq: frequency of the temporal blocks ('D', 'W', 'M', etc.)

    Returns:
        X: array of texts
        y: np.ndarray of labels
        date: np.ndarray of integer block indices (one per document)
        idx2date: list with the temporal limits of each block (tuples)
    """
    df["tweet_created"] = pd.to_datetime(df["tweet_created"], errors="coerce")
    df = df.sort_values("tweet_created").reset_index(drop=True)

    X = df["text"].astype(str).values
    y = df["airline_sentiment"].values

    # group dates by the requested frequency
    date_groups = df["tweet_created"].dt.to_period(freq)

    # assign an integer index to each date block
    unique_periods = date_groups.unique()
    period_to_idx = {p: i for i, p in enumerate(unique_periods)}
    date = np.asarray([period_to_idx[p] for p in date_groups])

    # get the true limits of the period intervals
    idx2date = []
    for p in unique_periods:
        start = p.start_time
        end = p.end_time
        idx2date.append((start, end))

    return X, y, date, idx2date


def prepare_labelled_collections():
    # loads and prepares the Twitter US Airlines Sentiment dataset (from Kaggle);
    # returns a labelled collection for the training data (days 0 and 1), a list of
    # test sets (days 2 to 8), and the time limits of each test period.
    # The dataset is originally ternary (negative, neutral, positive); we binarize it
    # by discarding the neutral class.

    # download the latest version
    path = kagglehub.dataset_download("crowdflower/twitter-airline-sentiment")
    df = pd.read_csv(Path(path) / 'Tweets.csv')
    X, y, date, idx2date = prepare_xy_date_blocks(df, freq="D")

    # binarize: drop neutral, then map negative -> 0 and positive -> 1
    keep_idx = (y != 'neutral')
    X = X[keep_idx]
    y = y[keep_idx]
    date = date[keep_idx]
    y[y != 'negative'] = 1
    y[y == 'negative'] = 0
    y = y.astype(int)

    # use days 0 and 1 for training, the rest for test
    X_train, y_train = X[date <= 1], y[date <= 1]
    train = LabelledCollection(X_train, y_train)
    print(f'training has {len(train)} docs and prevalence={F.strprev(train.prevalence())} classes={train.classes_}')

    tests = []
    test_init = []
    for date_i in range(2, max(date) + 1):
        X_test_i, y_test_i = X[date == date_i], y[date == date_i]
        test_i = LabelledCollection(X_test_i, y_test_i, classes=train.classes_)
        print(f'test-{date_i} has {len(test_i)} docs and prevalence={F.strprev(test_i.prevalence())}')
        tests.append(test_i)
        test_init.append(idx2date[date_i])

    return train, tests, test_init
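
# Optional sanity check (a minimal sketch, not invoked anywhere): runs
# prepare_xy_date_blocks on a hypothetical two-day DataFrame and verifies that
# each tweet gets the integer index of its day block.
def _demo_prepare_xy_date_blocks():
    toy = pd.DataFrame({
        'text': ['great crew', 'lost my luggage', 'boarded on time'],
        'airline_sentiment': ['positive', 'negative', 'positive'],
        'tweet_created': ['2015-02-17 10:00', '2015-02-17 18:30', '2015-02-18 09:00'],
    })
    X, y, date, idx2date = prepare_xy_date_blocks(toy, freq='D')
    assert list(date) == [0, 0, 1]  # two tweets fall on day 0, one on day 1
    assert len(idx2date) == 2       # one (start, end) tuple per day block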
def smooth_curve(dates, values, num_points=300):
    """
    dates: list of timestamps
    values: list of Y-values
    num_points: number of points in the smooth curve

    Returns new_x, new_y for plotting a smooth line.
    """
    # convert datetimes to a numeric representation (seconds since the epoch)
    x = np.array([d.timestamp() for d in dates])
    y = np.array(values)

    # create a denser X-axis
    x_new = np.linspace(x.min(), x.max(), num_points)

    # smoothing spline
    spline = CubicSpline(x, y)
    y_new = spline(x_new)

    # convert the numeric x_new back to datetimes
    dates_new = [pd.to_datetime(t, unit='s') for t in x_new]

    return dates_new, y_new


def plot_prevalences(results_dict, target_class=1, target_label='positive', savepath=None):
    """
    Plot prevalence estimates over time for each method contained in results_dict.

    Parameters
    ----------
    results_dict : dict
        A dictionary where:
        - "date-start" : list of datetime-like objects
        - all other keys : list of prevalence vectors (arrays), e.g. [p_neg, p_pos];
          only the component indexed by target_class is plotted, as a percentage.
    """
    dates = results_dict["date-start"]

    # create the figure
    plt.figure(figsize=(20, 10))

    # plot one line per method (except "date-start")
    for method, values in results_dict.items():
        if method == "date-start":
            continue

        # extract the target-class component from each prevalence array
        target_component = [v[target_class] * 100 for v in values]

        dates_smooth, y_smooth = smooth_curve(dates, target_component)
        if method == 'true-prev':
            line, = plt.plot(dates_smooth, y_smooth, label=method, linewidth=3, linestyle='-')
        else:
            line, = plt.plot(dates_smooth, y_smooth, label=method, linewidth=2, linestyle='--')
        plt.plot(dates, target_component, 'o', markersize=10, color=line.get_color())

    # axis labels
    # plt.xlabel("Date")
    plt.ylabel("% of " + target_label + " tweets")

    # rotate date labels for readability
    plt.xticks(rotation=45)
    plt.minorticks_on()
    plt.grid(which='major', linestyle='-', linewidth=0.5)
    plt.grid(which='minor', linestyle=':', linewidth=0.3)

    # place the legend outside, to the right
    plt.legend(loc="center left", bbox_to_anchor=(1, 0.5))

    plt.tight_layout()
    if savepath is not None:
        os.makedirs(Path(savepath).parent, exist_ok=True)
        plt.savefig(savepath)
    else:
        plt.show()


class HDxDensify(DistributionMatchingX):
    # HDx on a dense low-dimensional projection of the (sparse) feature space
    def fit(self, X, y):
        self.reductor = TruncatedSVD(n_components=5, random_state=0)
        Xred = self.reductor.fit_transform(X)
        return super().fit(Xred, y)

    def predict(self, X):
        Xred = self.reductor.transform(X)
        return super().predict(Xred)


class QUnfoldWrapperDensify(QUnfoldWrapper):
    # same densification trick for qunfold-based methods
    def fit(self, X, y):
        self.reductor = TruncatedSVD(n_components=5, random_state=0)
        Xred = self.reductor.fit_transform(X)
        return super().fit(Xred, y)

    def predict(self, X):
        Xred = self.reductor.transform(X)
        return super().predict(Xred)


# A scikit-learn-style wrapper for a huggingface-based pre-trained transformer
# for binary sentiment classification
class HFTextClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, model_name='distilbert-base-uncased-finetuned-sst-2-english'):
        self.pipe = pipeline("sentiment-analysis", model=model_name)
        self.classes_ = np.asarray([0, 1])

    def fit(self, X, y=None):
        # the transformer is already fine-tuned; nothing to do
        return self

    def _binary_decisions(self, transformer_output: List[Dict]):
        return np.array([(1 if p['label'] == 'POSITIVE' else 0) for p in transformer_output], dtype=int)

    def predict(self, X):
        X = list(map(str, X))
        preds = self.pipe(X, truncation=True)
        return self._binary_decisions(preds)

    def predict_proba(self, X):
        X = list(map(str, X))
        n_examples = len(X)
        preds = self.pipe(X, truncation=True)
        decisions = self._binary_decisions(preds)
        scores = np.array([p['score'] for p in preds], dtype=float)
        # the pipeline only returns the score of the winning label; place it in
        # the winning column and its complement in the other column
        probas = np.zeros(shape=(n_examples, 2), dtype=float)
        probas[np.arange(n_examples), decisions] = scores
        probas[np.arange(n_examples), 1 - decisions] = 1 - scores
        return probas
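
# Quick usage sketch (not invoked anywhere; instantiating HFTextClassifier
# downloads the model): the pipeline emits one {'label', 'score'} dict per text,
# and predict_proba turns, e.g., {'label': 'POSITIVE', 'score': 0.9} into the
# row [0.1, 0.9].
def _demo_hf_classifier():
    clf = HFTextClassifier()
    texts = ['loved the crew', 'flight delayed again']
    print(clf.predict(texts))        # e.g. [1 0]
    print(clf.predict_proba(texts))  # each row sums to 1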
USE_LOGISTIC_REGRESSION = True

if USE_LOGISTIC_REGRESSION:
    new_classifier = lambda: LR()
    to_fit = True
else:
    pretrained = HFTextClassifier()
    new_classifier = lambda: pretrained
    to_fit = False


def methods():
    yield 'CC', CC(new_classifier(), fit_classifier=to_fit)
    yield 'ACC', ACC(new_classifier(), fit_classifier=to_fit)
    yield 'HDy', DistributionMatchingY(new_classifier(), fit_classifier=to_fit)
    yield 'HDx', HDxDensify()
    yield 'KMM', QUnfoldWrapperDensify(KMM())
    yield 'SLD', EMQ(new_classifier(), fit_classifier=to_fit)
    yield 'KDEy', KDEyML(new_classifier(), fit_classifier=to_fit)


train, tests, test_init = prepare_labelled_collections()

if USE_LOGISTIC_REGRESSION:
    # vectorize the text for logistic regression
    vectorizer = TfidfVectorizer(min_df=5, sublinear_tf=True)
    Xtr = vectorizer.fit_transform(train.X)
    train = LabelledCollection(Xtr, train.labels, train.classes_)
    for i in range(len(tests)):
        Xte = vectorizer.transform(tests[i].X)
        tests[i] = LabelledCollection(Xte, tests[i].labels, train.classes_)

results = defaultdict(list)
for test_i, test_init_i in zip(tests, test_init):
    results['true-prev'].append(test_i.prevalence())
    results['date-start'].append(test_init_i[0])

for q_name, quant in methods():
    quant.fit(*train.Xy)
    for test_i, test_init_i in tqdm(zip(tests, test_init), desc=f'{q_name} predicting', total=len(tests)):
        pred_i = quant.predict(test_i.X)
        results[q_name].append(pred_i)

suffix = '_lr' if USE_LOGISTIC_REGRESSION else '_transformer'
plot_prevalences(results, savepath=f'./plots_ieee/over_time{suffix}.pdf')
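
# Optional follow-up (a sketch; the helper below is defined but never called):
# arrange the estimated prevalences of the target class next to the true ones
# in a DataFrame indexed by the starting date of each test block.
def _results_table(results, target_class=1):
    rows = {name: [v[target_class] for v in vals]
            for name, vals in results.items() if name != 'date-start'}
    return pd.DataFrame(rows, index=results['date-start'])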