From e81009e665fefe771f820c937febdfbc60783c22 Mon Sep 17 00:00:00 2001
From: Alex Moreo
Date: Thu, 3 Dec 2020 16:36:54 +0100
Subject: [PATCH] fixing dataset loading

---
 quapy/dataset/__init__.py      |  5 +++--
 quapy/dataset/base.py          |  3 +--
 quapy/dataset/preprocessing.py | 12 ++++++------
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/quapy/dataset/__init__.py b/quapy/dataset/__init__.py
index 0853ddb..e44efa4 100644
--- a/quapy/dataset/__init__.py
+++ b/quapy/dataset/__init__.py
@@ -1,4 +1,5 @@
 from .base import *
-from . import base
-from . import reader
+from .reader import *
 from . import preprocessing
+
+
diff --git a/quapy/dataset/base.py b/quapy/dataset/base.py
index 7086596..29a188f 100644
--- a/quapy/dataset/base.py
+++ b/quapy/dataset/base.py
@@ -1,6 +1,5 @@
 import numpy as np
-from scipy.sparse import issparse, dok_matrix
-from sklearn.feature_extraction.text import TfidfVectorizer
+from scipy.sparse import issparse
 from sklearn.model_selection import train_test_split
 from quapy.functional import artificial_prevalence_sampling
 from scipy.sparse import vstack
diff --git a/quapy/dataset/preprocessing.py b/quapy/dataset/preprocessing.py
index 1db2c27..a6259b2 100644
--- a/quapy/dataset/preprocessing.py
+++ b/quapy/dataset/preprocessing.py
@@ -1,7 +1,7 @@
+import numpy as np
 from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
 from dataset.base import Dataset
 from scipy.sparse import spmatrix
-import numpy as np
 from utils.util import parallelize
 from .base import LabelledCollection
 
@@ -17,8 +17,8 @@ def text2tfidf(dataset:Dataset, min_df=3, sublinear_tf=True, inplace=False, **kw
     :return: a new Dataset in csr_matrix format (if inplace=False) or a reference to the current Dataset (inplace=True)
     where the instances are stored in a csr_matrix of real-valued tfidf scores
     """
-    __check_type(dataset.training.instances, list, str)
-    __check_type(dataset.test.instances, list, str)
+    __check_type(dataset.training.instances, np.ndarray, str)
+    __check_type(dataset.test.instances, np.ndarray, str)
 
     vectorizer = TfidfVectorizer(min_df=min_df, sublinear_tf=sublinear_tf, **kwargs)
     training_documents = vectorizer.fit_transform(dataset.training.instances)
@@ -45,8 +45,8 @@ def reduce_columns(dataset:Dataset, min_df=5, inplace=False):
     :return: a new Dataset (if inplace=False) or a reference to the current Dataset (inplace=True)
     where the dimensions corresponding to infrequent instances have been removed
     """
-    __check_type(dataset.training, spmatrix)
-    __check_type(dataset.test, spmatrix)
+    __check_type(dataset.training.instances, spmatrix)
+    __check_type(dataset.test.instances, spmatrix)
     assert dataset.training.instances.shape[1] == dataset.test.instances.shape[1], 'unaligned vector spaces'
 
     def filter_by_occurrences(X, W):
@@ -101,7 +101,7 @@ def __check_type(container, container_type=None, element_type=None):
         assert isinstance(container, container_type), \
             f'unexpected type of container (expected {container_type}, found {type(container)})'
     if element_type:
-        assert isinstance(next(container), element_type), \
+        assert isinstance(container[0], element_type), \
            f'unexpected type of element (expected {container_type}, found {type(container)})'
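
A minimal sketch (not part of the patch itself) of why the element check moves from
next(container) to container[0]. It assumes, based only on the np.ndarray/str arguments
now passed to __check_type above, that the loaded instances are a numpy array of strings
rather than a list: numpy arrays are iterable but are not iterators, so next() on them
raises TypeError, while indexing still exposes an element whose type can be verified.

import numpy as np

# hypothetical stand-in for dataset.training.instances after loading (an assumption,
# inferred only from the __check_type(..., np.ndarray, str) calls in this patch)
instances = np.asarray(['some text', 'more text'])

try:
    next(instances)                       # np.ndarray is iterable but not an iterator
except TypeError as err:
    print(f'old check would fail: {err}')

print(isinstance(instances, np.ndarray))  # True: container check as in the patch
print(isinstance(instances[0], str))      # True: numpy string elements subclass str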