lazy index construction in labelled collection
This commit is contained in:
parent
088ebcdd31
commit
41baeb78ca
|
|
@ -3,6 +3,7 @@ Change Log 0.2.1
|
||||||
|
|
||||||
- Improved documentation of confidence regions.
|
- Improved documentation of confidence regions.
|
||||||
- Added ReadMe method by Daniel Hopkins and Gary King
|
- Added ReadMe method by Daniel Hopkins and Gary King
|
||||||
|
- The internal index in LabelledCollection is now "lazy": it is only constructed the first time it is accessed.
|
||||||
|
|
||||||
Change Log 0.2.0
|
Change Log 0.2.0
|
||||||
-----------------
|
-----------------
|
||||||
|
|
|
||||||
|
|
@ -7,7 +7,7 @@ import numpy as np
|
||||||
|
|
||||||
from experimental_non_aggregative.custom_vectorizers import *
|
from experimental_non_aggregative.custom_vectorizers import *
|
||||||
from protocol import APP
|
from protocol import APP
|
||||||
from quapy.method.aggregative import _get_divergence, HDy, DistributionMatching
|
from quapy.method.aggregative import HDy, DistributionMatchingY
|
||||||
from quapy.method.base import BaseQuantifier
|
from quapy.method.base import BaseQuantifier
|
||||||
from scipy import optimize
|
from scipy import optimize
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
@ -30,28 +30,30 @@ class DxS(BaseQuantifier):
|
||||||
# return np.asarray(instances.sum(axis=0) / instances.sum()).flatten()
|
# return np.asarray(instances.sum(axis=0) / instances.sum()).flatten()
|
||||||
|
|
||||||
def __as_distribution(self, instances):
|
def __as_distribution(self, instances):
|
||||||
dist = instances.sum(axis=0) / instances.sum()
|
dist = instances.mean(axis=0)
|
||||||
return np.asarray(dist).flatten()
|
return np.asarray(dist).flatten()
|
||||||
|
|
||||||
def fit(self, data: LabelledCollection):
|
def fit(self, text_instances, labels):
|
||||||
|
|
||||||
text_instances, labels = data.Xy
|
classes = np.unique(labels)
|
||||||
|
|
||||||
if self.vectorizer is not None:
|
if self.vectorizer is not None:
|
||||||
text_instances = self.vectorizer.fit_transform(text_instances, y=labels)
|
text_instances = self.vectorizer.fit_transform(text_instances, y=labels)
|
||||||
|
|
||||||
distributions = []
|
distributions = []
|
||||||
for class_i in data.classes_:
|
for class_i in classes:
|
||||||
distributions.append(self.__as_distribution(text_instances[labels == class_i]))
|
distributions.append(self.__as_distribution(text_instances[labels == class_i]))
|
||||||
|
|
||||||
self.validation_distribution = np.asarray(distributions)
|
self.validation_distribution = np.asarray(distributions)
|
||||||
|
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def quantify(self, text_instances):
|
def predict(self, text_instances):
|
||||||
if self.vectorizer is not None:
|
if self.vectorizer is not None:
|
||||||
text_instances = self.vectorizer.transform(text_instances)
|
text_instances = self.vectorizer.transform(text_instances)
|
||||||
|
|
||||||
test_distribution = self.__as_distribution(text_instances)
|
test_distribution = self.__as_distribution(text_instances)
|
||||||
divergence = _get_divergence(self.divergence)
|
divergence = qp.functional.get_divergence(self.divergence)
|
||||||
n_classes, n_feats = self.validation_distribution.shape
|
n_classes, n_feats = self.validation_distribution.shape
|
||||||
|
|
||||||
def match(prev):
|
def match(prev):
|
||||||
|
|
@ -121,10 +123,10 @@ if __name__ == '__main__':
|
||||||
hdy = HDy(LogisticRegression())
|
hdy = HDy(LogisticRegression())
|
||||||
yield data, hdy, 'HDy'
|
yield data, hdy, 'HDy'
|
||||||
|
|
||||||
dm = DistributionMatching(LogisticRegression(), divergence=div, nbins=5)
|
dm = DistributionMatchingY(LogisticRegression(), divergence=div, nbins=5)
|
||||||
yield data, dm, 'DM-5b'
|
yield data, dm, 'DM-5b'
|
||||||
|
|
||||||
dm = DistributionMatching(LogisticRegression(), divergence=div, nbins=10)
|
dm = DistributionMatchingY(LogisticRegression(), divergence=div, nbins=10)
|
||||||
yield data, dm, 'DM-10b'
|
yield data, dm, 'DM-10b'
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -132,9 +134,9 @@ if __name__ == '__main__':
|
||||||
with open(result_path, 'wt') as csv:
|
with open(result_path, 'wt') as csv:
|
||||||
csv.write(f'Method\tDataset\tMAE\tMRAE\n')
|
csv.write(f'Method\tDataset\tMAE\tMRAE\n')
|
||||||
for data, quantifier, quant_name in gen_methods():
|
for data, quantifier, quant_name in gen_methods():
|
||||||
quantifier.fit(data.training)
|
quantifier.fit(*data.training.Xy)
|
||||||
report = qp.evaluation.evaluation_report(quantifier, APP(data.test, repeats=repeats), error_metrics=['mae','mrae'], verbose=True)
|
report = qp.evaluation.evaluation_report(quantifier, APP(data.test, repeats=repeats), error_metrics=['mae','mrae'], verbose=True)
|
||||||
means = report.mean()
|
means = report.mean(numeric_only=True)
|
||||||
csv.write(f'{quant_name}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\n')
|
csv.write(f'{quant_name}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\n')
|
||||||
|
|
||||||
df = pd.read_csv(result_path, sep='\t')
|
df = pd.read_csv(result_path, sep='\t')
|
||||||
|
|
|
||||||
|
|
@ -33,7 +33,6 @@ class LabelledCollection:
|
||||||
else:
|
else:
|
||||||
self.instances = np.asarray(instances)
|
self.instances = np.asarray(instances)
|
||||||
self.labels = np.asarray(labels)
|
self.labels = np.asarray(labels)
|
||||||
n_docs = len(self)
|
|
||||||
if classes is None:
|
if classes is None:
|
||||||
self.classes_ = F.classes_from_labels(self.labels)
|
self.classes_ = F.classes_from_labels(self.labels)
|
||||||
else:
|
else:
|
||||||
|
|
@ -41,7 +40,13 @@ class LabelledCollection:
|
||||||
self.classes_.sort()
|
self.classes_.sort()
|
||||||
if len(set(self.labels).difference(set(classes))) > 0:
|
if len(set(self.labels).difference(set(classes))) > 0:
|
||||||
raise ValueError(f'labels ({set(self.labels)}) contain values not included in classes_ ({set(classes)})')
|
raise ValueError(f'labels ({set(self.labels)}) contain values not included in classes_ ({set(classes)})')
|
||||||
self.index = {class_: np.arange(n_docs)[self.labels == class_] for class_ in self.classes_}
|
self._index = None
|
||||||
|
|
||||||
|
@property
|
||||||
|
def index(self):
|
||||||
|
if self._index is None:
|
||||||
|
self._index = {class_: np.arange(len(self))[self.labels == class_] for class_ in self.classes_}
|
||||||
|
return self._index
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def load(cls, path: str, loader_func: callable, classes=None, **loader_kwargs):
|
def load(cls, path: str, loader_func: callable, classes=None, **loader_kwargs):
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue