big refactor

This commit is contained in:
Alejandro Moreo Fernandez 2018-10-30 18:00:51 +01:00
parent bd09d635f6
commit 5dda876938
5 changed files with 213 additions and 159 deletions

README.md Normal file

@@ -1,43 +0,0 @@
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.feature_selection import SelectKBest, chi2
from doc_representation import *
nfolds = 2
do_feat_selection = True
params = [{'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],'class_weight':['balanced',None]}]
path = '/home/moreo/Dante/testi'
Xtr,ytr,ep1,ep2 = load_documents(path, by_sentences=True)
if do_feat_selection:
    print('feature selection')
    num_feats = int(0.1 * Xtr.shape[1])
    feature_selector = SelectKBest(chi2, k=num_feats)
    Xtr = feature_selector.fit_transform(Xtr,ytr)
    print('final shape={}'.format(Xtr.shape))
    ep1 = feature_selector.transform(ep1)
    ep2 = feature_selector.transform(ep2)
# learn a SVM
print('optimizing a SVM')
svm_base = LinearSVC()
svm_optimized = GridSearchCV(svm_base, param_grid=params, cv=nfolds)
svm_optimized.fit(Xtr,ytr)
print('Best params: {}'.format(svm_optimized.best_params_))
# evaluation of results
print('computing the cross-val score')
accuracies = cross_val_score(svm_optimized, Xtr, ytr, cv=nfolds, n_jobs=-1)
acc_mean, acc_std = accuracies.mean(), accuracies.std()
print('Accuracy={:.3f} (+-{:.3f})'.format(acc_mean, acc_std))
# final test
print('predicting the Epistolas')
ep1_ = svm_optimized.predict(ep1)
ep2_ = svm_optimized.predict(ep2)
print('Epistola1 acc = {:.3f} {}'.format(ep1_.mean(), ep1_))
print('Epistola2 acc = {:.3f} {}'.format(ep2_.mean(), ep2_))

@@ -1,46 +1,55 @@
from sklearn.svm import *
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.feature_selection import SelectKBest, chi2
from doc_representation2 import *
import numpy as np
from sklearn.metrics import f1_score, make_scorer
nfolds = 2
do_feat_selection = False
params = [{'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],'class_weight':['balanced',None]}]
probability=False
# SVM = SVC
SVM = LinearSVC
path = 'Data'
Xtr,ytr,ep2 = load_documents(path)
nfolds = 3
params = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000], 'class_weight':['balanced',None]}
if SVM is SVC:
    params['kernel']=['linear','rbf']
if do_feat_selection:
    print('feature selection')
    num_feats = int(0.1 * Xtr.shape[1])
    feature_selector = SelectKBest(chi2, k=num_feats)
    Xtr = feature_selector.fit_transform(Xtr,ytr)
    print('final shape={}'.format(Xtr.shape))
    #ep1 = feature_selector.transform(ep1)
    ep2 = feature_selector.transform(ep2)
path = '../testi'
Xtr,ytr,ep1,ep2 = load_documents(path, split_documents=True, function_words_freq=True, tfidf=True, tfidf_feat_selection_ratio=0.1)
# learn a SVM
print('optimizing a SVM')
svm_base = LinearSVC(max_iter=-1) # results were not converging, so I tried SVC rather than LinearSVC
svm_optimized = GridSearchCV(svm_base, param_grid=params, cv=nfolds)
#print ("Shape of X:", Xtr.shape)
svm_optimized.fit(Xtr, ytr)
#print('Best params: {}'.format(svm_optimized.best_params_))
# svm = SVM(probability=probability)
svm = SVM()
positive_examples = ytr.sum()
if positive_examples>nfolds:
    print('optimizing {}'.format(svm.__class__.__name__))
    svm = GridSearchCV(svm, param_grid=params, cv=nfolds, scoring=make_scorer(f1_score))
svm.fit(Xtr, ytr)
if isinstance(svm, GridSearchCV):
    print('Best params: {}'.format(svm.best_params_))
# evaluation of results
print('computing the cross-val score')
accuracies = cross_val_score(svm_optimized, Xtr, ytr, cv=nfolds, n_jobs=-1)
acc_mean, acc_std = accuracies.mean(), accuracies.std()
print('Accuracy={:.3f} (+-{:.3f})'.format(acc_mean, acc_std))
f1scores = cross_val_score(svm, Xtr, ytr, cv=nfolds, n_jobs=-1, scoring=make_scorer(f1_score))
f1_mean, f1_std = f1scores.mean(), f1scores.std()
print('F1-measure={:.3f} (+-{:.3f})\n'.format(f1_mean, f1_std))
# final test
print('predicting the Epistolas')
#ep1_ = svm_optimized.predict(ep1)
ep2= np.reshape(ep2, (1,-1))
ep2_ = svm_optimized.predict(ep2)
#print('Epistola1 acc = {:.3f} {}'.format(ep1_.mean(), ep1_))
print('Epistola2 acc = {:.3f} {}'.format(ep2_.mean(), ep2_))
def predictEpistola(ep, epistola_name):
    pred = svm.predict(ep)
    full_doc_prediction = pred[0]
    print('{} is from Dante: {}'.format(epistola_name, 'Yes' if full_doc_prediction == 1 else 'No'))
    if len(pred) > 1:  # rows beyond the first are the document's fragments
        fragment_predictions = pred[1:]
        print('fragments average {:.3f}, array={}'.format(fragment_predictions.mean(), fragment_predictions))
    if SVM is SVC and probability:
        prob = svm.predict_proba(ep)[:,1]
        np.set_printoptions(precision=2, linewidth=200)
        print('probabilistic view: full={:.3f}, fragments average {:.3f}, fragments={}'.format(prob[0], prob[1:].mean(), prob[1:]))
print('Predicting the Epistolas')
predictEpistola(ep1, 'Epistola 1')
predictEpistola(ep2, 'Epistola 2')
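
For readers unfamiliar with the pattern used in this script (wrapping the SVM in GridSearchCV with an F1 scorer and then cross-validating the wrapped estimator), here is a minimal self-contained sketch; X_demo and y_demo are synthetic placeholders, not data from this repository.

# Minimal sketch of the grid-search + cross-validated F1 pattern used above.
# X_demo / y_demo are synthetic placeholders, not data from this repository.
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import f1_score, make_scorer

rng = np.random.RandomState(0)
X_demo = rng.rand(40, 5)                 # 40 "documents", 5 features
y_demo = rng.randint(0, 2, size=40)      # binary authorship labels

param_grid = {'C': [0.01, 0.1, 1, 10], 'class_weight': ['balanced', None]}
clf = GridSearchCV(LinearSVC(), param_grid=param_grid, cv=3,
                   scoring=make_scorer(f1_score))
f1_scores = cross_val_score(clf, X_demo, y_demo, cv=3,
                            scoring=make_scorer(f1_score))
print('F1={:.3f} (+-{:.3f})'.format(f1_scores.mean(), f1_scores.std()))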

@@ -1,50 +0,0 @@
import os
from os.path import join
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
def load_documents(path, by_sentences=False):
    #read documents
    docs,y = [],[]
    for file in os.listdir(path):
        if file.startswith('EpistolaXIII_'): continue
        file_clean = file.replace('.txt','')
        author, textname = file_clean.split('_')[0],file_clean.split('_')[1]
        if by_sentences:
            lines = open(join(path, file)).readlines()
            docs.extend(lines)
            if author == 'Dante':
                y.extend([1] * len(lines))
            else:
                y.extend([0] * len(lines))
        else:
            docs.append(open(join(path,file)).read())
            if author == 'Dante':
                y.append(1)
            else:
                y.append(0)
    if not by_sentences:
        y = y + y
        docs = docs + docs
    if by_sentences:
        ep1 = open(join(path, 'EpistolaXIII_1.txt')).readlines()
        ep2 = open(join(path, 'EpistolaXIII_2.txt')).readlines()
    else:
        ep1 = [open(join(path, 'EpistolaXIII_1.txt')).read()]
        ep2 = [open(join(path, 'EpistolaXIII_2.txt')).read()]
    # document representation
    tfidf = TfidfVectorizer(sublinear_tf=True)
    X = tfidf.fit_transform(docs)
    y = np.array(y)
    Epistola1 = tfidf.transform(ep1)
    Epistola2 = tfidf.transform(ep2)
    print('documents read, shape={}'.format(X.shape))
    # print(y)
    return X, y, Epistola1, Epistola2

@@ -2,49 +2,187 @@ import nltk
import numpy as np
import os
from os.path import join
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from scipy.sparse import hstack, csr_matrix
function_words = ["et", "in", "de", "ad", "ut", "cum", "non", "per", "a", "que", "ex", "sed"]
def load_documents(path):
    X, y = [], []
    i=0;
# ------------------------------------------------------------------------
# document loading routine
# ------------------------------------------------------------------------
def _load_texts(path):
    # load the training data (all documents but Epistolas 1 and 2)
    documents = []
    authors = []
    ndocs=0
    for file in os.listdir(path):
        if file.startswith('EpistolaXIII_'): continue
        file_clean = file.replace('.txt','')
        author, textname = file_clean.split('_')[0],file_clean.split('_')[1]
        tokens = nltk.word_tokenize(open(join(path,file), encoding= "utf8").read())
        author_tokens = ([token.lower() for token in tokens
                          if any(char.isalpha() for char in token)])
        freqs= nltk.FreqDist(author_tokens)
        X.append([])
        #print(f"From {textname} by {author}:")
        for function_word in function_words:
            feature= (freqs[function_word]*1000)/len(author_tokens)
            #print(function_word, " = ", freqs[function_word], ", ", feature)
            X[i].append(feature)
        i+=1
        if author == "Dante":
            y.append(1)
        else:
            y.append(0)
    y= y + y
    X= X + X
    y= np.array(y)
    ep = []
    tokens = nltk.word_tokenize(open(join(path, 'EpistolaXIII_2.txt'), encoding= "utf8").read())
    ep2_tokens = ([token.lower() for token in tokens
                   if any(char.isalpha() for char in token)])
    freqs= nltk.FreqDist(ep2_tokens)
    #print("From Epistola XIII_2:")
    for function_word in function_words:
        feature= (freqs[function_word]*1000/len(ep2_tokens))
        ep.append(feature)
        #print(function_word, " = ", freqs[function_word], ", ", feature)
    ep2 = np.array(ep)
        text = open(join(path,file), encoding= "utf8").read()
    return X, y, ep2
        documents.append(text)
        authors.append(author)
        ndocs+=1
    # load the test data (Epistolas 1 and 2)
    ep1_text = open(join(path, 'EpistolaXIII_1.txt'), encoding="utf8").read()
    ep2_text = open(join(path, 'EpistolaXIII_2.txt'), encoding="utf8").read()
    return documents, authors, ep1_text, ep2_text
# ------------------------------------------------------------------------
# split policies
# ------------------------------------------------------------------------
# TODO: implement other split policies (e.g., overlapping ones, etc)
def _split_by_endline(text):
    return [t.strip() for t in text.split('\n') if t.strip()]
def splitter(documents, authors=None, split_policy=_split_by_endline):
    fragments = []
    authors_fragments = []
    for i, text in enumerate(documents):
        text_fragments = split_policy(text)
        fragments.extend(text_fragments)
        if authors is not None:
            authors_fragments.extend([authors[i]] * len(text_fragments))
    if authors is not None:
        return fragments, authors_fragments
    return fragments
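
The TODO above mentions overlapping split policies. As an illustration only (the name _split_by_sliding_window and its parameters are hypothetical, not part of this commit), such a policy could slide a window of lines over the text and be passed to splitter() or load_documents() through the split_policy argument:

# Hypothetical sketch of an overlapping split policy (not part of this commit):
# a window of `window` non-empty lines advanced by `stride` lines.
def _split_by_sliding_window(text, window=5, stride=3):
    lines = [t.strip() for t in text.split('\n') if t.strip()]
    if not lines:
        return []
    starts = list(range(0, max(len(lines) - window, 0) + 1, stride))
    if starts[-1] + window < len(lines):
        starts.append(len(lines) - window)  # make sure the tail of the document is covered
    return ['\n'.join(lines[s:s + window]) for s in starts]

Used as load_documents(path, split_documents=True, split_policy=_split_by_sliding_window), the resulting fragments would overlap instead of being disjoint lines.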
# ------------------------------------------------------------------------
# feature extraction methods
# ------------------------------------------------------------------------
# TODO: implement other feature extraction methods
def _features_function_words_freq(documents):
    """
    Extract features as the frequency (x1000) of the function words used in the documents
    :param documents: a list where each element is the text (string) of a document
    :return: a np.array of shape (D,F) where D is len(documents) and F is len(function_words)
    """
    features = []
    for text in documents:
        tokens = nltk.word_tokenize(text)
        author_tokens = ([token.lower() for token in tokens if any(char.isalpha() for char in token)])
        freqs = nltk.FreqDist(author_tokens)
        nwords = len(author_tokens)
        funct_words_freq = [1000. * freqs[function_word] / nwords for function_word in function_words]
        features.append(funct_words_freq)
    return np.array(features)
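
As a quick worked example of the feature above: if "et" occurs 12 times in a text of 4,000 alphabetic tokens, its feature value is 1000 * 12 / 4000 = 3.0. The toy call below is illustrative only (the strings are not corpus texts, and NLTK's 'punkt' tokenizer data must be available, as the module already requires):

# Toy usage of _features_function_words_freq (illustrative strings only).
demo_docs = ['et in de et non per et', 'sed que ex a']
demo_feats = _features_function_words_freq(demo_docs)
print(demo_feats.shape)   # (2, 12): one row per document, one column per function word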
def _features_tfidf(documents, tfidf_vectorizer=None):
    """
    Extract features as the tfidf matrix of the documents
    :param documents: a list where each element is the text (string) of a document
    :param tfidf_vectorizer: an optional TfidfVectorizer already fit on the training documents; if None, a new one is fit
    :return: a tuple M,V, where M is a sparse matrix of shape (D,F), with D being len(documents) and F the number of
    distinct words; and V is the TfidfVectorizer already fit
    """
    if tfidf_vectorizer is None:
        tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True)
        tfidf_vectorizer.fit(documents)
    features = tfidf_vectorizer.transform(documents)
    return features, tfidf_vectorizer
def _feature_selection(X, y, EP1, EP2, tfidf_feat_selection_ratio):
    nF = X.shape[1]
    num_feats = int(tfidf_feat_selection_ratio * nF)
    feature_selector = SelectKBest(chi2, k=num_feats)
    X = feature_selector.fit_transform(X, y)
    EP1 = feature_selector.transform(EP1)
    EP2 = feature_selector.transform(EP2)
    return X,EP1,EP2
def load_documents(path,
                   function_words_freq=True,
                   tfidf=False,
                   tfidf_feat_selection_ratio=1.,
                   split_documents=False,
                   split_policy=_split_by_endline,
                   verbose=True):
"""
Loads the documents contained in path applying a number of feature extraction policies. The directory is assumed to
contain files named according to <author>_<text_name>.txt plus two special files EpistolaXIII_1.txt and
EpistolaXIII_2.txt concerning the two documents whose authorship attribution is to be determined.
:param path: the path containing the texts, each named as <author>_<text_name>.txt
:param function_words_freq: add the frequency of function words as features
:param tfidf: add the tfidf as features
:param split_documents: whether to split text into smaller documents or not (currenty, the policy is to split by '\n').
Currently, the fragments resulting from the split are added to the pool of documents (i.e., they do not replace the
full documents, which are anyway retained).
:param split_policy: a callable that implements the split to be applied (ignored if split_documents=False)
:param verbose: show information by stdout or not
:return: np.arrays or csr_matrix (depending on whether tfidf is activated or not) X, y, EP1, EP2, where X is the
matrix of features for the training set and y are the labels (np.array);
EP1 and EP2 are the matrix of features for the epistola 1 (first row) and fragments (from row 2nd to last) if
split_documents=True) and 2 (similar)
"""
    documents, authors, ep1_text, ep2_text = _load_texts(path)
    ep1,ep2 = [ep1_text],[ep2_text]
    n_original_docs=len(documents)
    if split_documents:
        doc_fragments, authors_fragments = splitter(documents, authors, split_policy=split_policy)
        documents.extend(doc_fragments)
        authors.extend(authors_fragments)
        ep1.extend(splitter(ep1, split_policy=split_policy))
        ep2.extend(splitter(ep2, split_policy=split_policy))
    # represent the target vector
    y = np.array([(1 if author == "Dante" else 0) for author in authors])
    # initialize the document-by-feature vector
    X = np.empty((len(documents), 0))
    EP1 = np.empty((len(ep1), 0))
    EP2 = np.empty((len(ep2), 0))
    if function_words_freq:
        X = np.hstack((X,_features_function_words_freq(documents)))
        EP1 = np.hstack((EP1, _features_function_words_freq(ep1)))
        EP2 = np.hstack((EP2, _features_function_words_freq(ep2)))
    if tfidf:
        X_features, vectorizer = _features_tfidf(documents)
        ep1_features, _ = _features_tfidf(ep1, vectorizer)
        ep2_features, _ = _features_tfidf(ep2, vectorizer)
        if tfidf_feat_selection_ratio < 1.:
            if verbose: print('feature selection')
            X_features, ep1_features, ep2_features = \
                _feature_selection(X_features, y, ep1_features, ep2_features, tfidf_feat_selection_ratio)
        # matrix is sparse now
        X = hstack((csr_matrix(X), X_features))
        EP1 = hstack((csr_matrix(EP1), ep1_features))
        EP2 = hstack((csr_matrix(EP2), ep2_features))
    # print summary
    if verbose:
        print('load_documents: function_words_freq={} tfidf={}, split_documents={}, split_policy={}'
              .format(function_words_freq, tfidf, split_documents, split_policy.__name__))
        print('number of training (full) documents: {}'.format(n_original_docs))
        print('X shape (#documents,#features): {}'.format(X.shape))
        print('y prevalence: {:.2f}%'.format(y.mean()*100))
        print('Epistola 1 shape:', EP1.shape)
        print('Epistola 2 shape:', EP2.shape)
        print()
    return X, y, EP1, EP2
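
Finally, a hedged usage sketch of load_documents: the path and flag values below simply mirror the call already made in the main script, and the prints only show the returned shapes (row 0 of EP1/EP2 is the full epistola; with split_documents=True the remaining rows are its fragments).

# Usage sketch of load_documents, mirroring the call in the main script.
if __name__ == '__main__':
    X, y, EP1, EP2 = load_documents('../testi',
                                    function_words_freq=True,
                                    tfidf=True,
                                    tfidf_feat_selection_ratio=0.1,
                                    split_documents=True)
    print('X:', X.shape, 'y:', y.shape)
    print('EP1 (full doc + fragments):', EP1.shape)
    print('EP2 (full doc + fragments):', EP2.shape)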