lazy index construction in labelled collection

This commit is contained in:
Alejandro Moreo Fernandez 2025-10-23 12:35:01 +02:00
parent 088ebcdd31
commit 41baeb78ca
3 changed files with 21 additions and 13 deletions

View File

@ -3,6 +3,7 @@ Change Log 0.2.1
- Improved documentation of confidence regions.
- Added ReadMe method by Daniel Hopkins and Gary King
- Internal index in LabelledCollection is now "lazy", and is only constructed if required.
Change Log 0.2.0
-----------------

View File

@ -7,7 +7,7 @@ import numpy as np
from experimental_non_aggregative.custom_vectorizers import *
from protocol import APP
from quapy.method.aggregative import _get_divergence, HDy, DistributionMatching
from quapy.method.aggregative import HDy, DistributionMatchingY
from quapy.method.base import BaseQuantifier
from scipy import optimize
import pandas as pd
@ -30,28 +30,30 @@ class DxS(BaseQuantifier):
# return np.asarray(instances.sum(axis=0) / instances.sum()).flatten()
def __as_distribution(self, instances):
dist = instances.sum(axis=0) / instances.sum()
dist = instances.mean(axis=0)
return np.asarray(dist).flatten()
def fit(self, data: LabelledCollection):
def fit(self, text_instances, labels):
text_instances, labels = data.Xy
classes = np.unique(labels)
if self.vectorizer is not None:
text_instances = self.vectorizer.fit_transform(text_instances, y=labels)
distributions = []
for class_i in data.classes_:
for class_i in classes:
distributions.append(self.__as_distribution(text_instances[labels == class_i]))
self.validation_distribution = np.asarray(distributions)
return self
def quantify(self, text_instances):
def predict(self, text_instances):
if self.vectorizer is not None:
text_instances = self.vectorizer.transform(text_instances)
test_distribution = self.__as_distribution(text_instances)
divergence = _get_divergence(self.divergence)
divergence = qp.functional.get_divergence(self.divergence)
n_classes, n_feats = self.validation_distribution.shape
def match(prev):
@ -121,10 +123,10 @@ if __name__ == '__main__':
hdy = HDy(LogisticRegression())
yield data, hdy, 'HDy'
dm = DistributionMatching(LogisticRegression(), divergence=div, nbins=5)
dm = DistributionMatchingY(LogisticRegression(), divergence=div, nbins=5)
yield data, dm, 'DM-5b'
dm = DistributionMatching(LogisticRegression(), divergence=div, nbins=10)
dm = DistributionMatchingY(LogisticRegression(), divergence=div, nbins=10)
yield data, dm, 'DM-10b'
@ -132,9 +134,9 @@ if __name__ == '__main__':
with open(result_path, 'wt') as csv:
csv.write(f'Method\tDataset\tMAE\tMRAE\n')
for data, quantifier, quant_name in gen_methods():
quantifier.fit(data.training)
quantifier.fit(*data.training.Xy)
report = qp.evaluation.evaluation_report(quantifier, APP(data.test, repeats=repeats), error_metrics=['mae','mrae'], verbose=True)
means = report.mean()
means = report.mean(numeric_only=True)
csv.write(f'{quant_name}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\n')
df = pd.read_csv(result_path, sep='\t')

View File

@ -33,7 +33,6 @@ class LabelledCollection:
else:
self.instances = np.asarray(instances)
self.labels = np.asarray(labels)
n_docs = len(self)
if classes is None:
self.classes_ = F.classes_from_labels(self.labels)
else:
@ -41,7 +40,13 @@ class LabelledCollection:
self.classes_.sort()
if len(set(self.labels).difference(set(classes))) > 0:
raise ValueError(f'labels ({set(self.labels)}) contain values not included in classes_ ({set(classes)})')
self.index = {class_: np.arange(n_docs)[self.labels == class_] for class_ in self.classes_}
self._index = None
@property
def index(self):
if self._index is None:
self._index = {class_: np.arange(len(self))[self.labels == class_] for class_ in self.classes_}
return self._index
@classmethod
def load(cls, path: str, loader_func: callable, classes=None, **loader_kwargs):