big refactor
parent bd09d635f6
commit 5dda876938

@@ -1,43 +0,0 @@
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.feature_selection import SelectKBest, chi2
from doc_representation import *

nfolds = 2
do_feat_selection = True
params = [{'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],'class_weight':['balanced',None]}]

path = '/home/moreo/Dante/testi'
Xtr,ytr,ep1,ep2 = load_documents(path, by_sentences=True)

if do_feat_selection:
    print('feature selection')
    num_feats = int(0.1 * Xtr.shape[1])
    feature_selector = SelectKBest(chi2, k=num_feats)
    Xtr = feature_selector.fit_transform(Xtr,ytr)
    print('final shape={}'.format(Xtr.shape))
    ep1 = feature_selector.transform(ep1)
    ep2 = feature_selector.transform(ep2)


# learn a SVM
print('optimizing a SVM')
svm_base = LinearSVC()

svm_optimized = GridSearchCV(svm_base, param_grid=params, cv=nfolds)
svm_optimized.fit(Xtr,ytr)
print('Best params: {}'.format(svm_optimized.best_params_))

# evaluation of results
print('computing the cross-val score')
accuracies = cross_val_score(svm_optimized, Xtr, ytr, cv=nfolds, n_jobs=-1)
acc_mean, acc_std = accuracies.mean(), accuracies.std()
print('Accuracy={:.3f} (+-{:.3f})'.format(acc_mean, acc_std))

# final test
print('predicting the Epistolas')
ep1_ = svm_optimized.predict(ep1)
ep2_ = svm_optimized.predict(ep2)
print('Epistola1 acc = {:.3f} {}'.format(ep1_.mean(), ep1_))
print('Epistola2 acc = {:.3f} {}'.format(ep2_.mean(), ep2_))

@@ -1,46 +1,55 @@
from sklearn.svm import *
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.feature_selection import SelectKBest, chi2
from doc_representation2 import *
import numpy as np
from sklearn.metrics import f1_score, make_scorer

nfolds = 2
do_feat_selection = False
params = [{'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],'class_weight':['balanced',None]}]
probability=False
# SVM = SVC
SVM = LinearSVC

path = 'Data'
Xtr,ytr,ep2 = load_documents(path)
nfolds = 3
params = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000], 'class_weight':['balanced',None]}
if SVM is SVC:
    params['kernel']=['linear','rbf']

if do_feat_selection:
    print('feature selection')
    num_feats = int(0.1 * Xtr.shape[1])
    feature_selector = SelectKBest(chi2, k=num_feats)
    Xtr = feature_selector.fit_transform(Xtr,ytr)
    print('final shape={}'.format(Xtr.shape))
    #ep1 = feature_selector.transform(ep1)
    ep2 = feature_selector.transform(ep2)
path = '../testi'

Xtr,ytr,ep1,ep2 = load_documents(path, split_documents=True, function_words_freq=True, tfidf=True, tfidf_feat_selection_ratio=0.1)

# learn a SVM
print('optimizing a SVM')
svm_base = LinearSVC(max_iter=-1)  # results were not converging, so SVC was tried instead of LinearSVC

svm_optimized = GridSearchCV(svm_base, param_grid=params, cv=nfolds)
#print ("Shape of X:", Xtr.shape)
svm_optimized.fit(Xtr, ytr)
#print('Best params: {}'.format(svm_optimized.best_params_))
# svm = SVM(probability=probability)
svm = SVM()

positive_examples = ytr.sum()
if positive_examples>nfolds:
    print('optimizing {}'.format(svm.__class__.__name__))
    svm = GridSearchCV(svm, param_grid=params, cv=nfolds, scoring=make_scorer(f1_score))

svm.fit(Xtr, ytr)

if isinstance(svm, GridSearchCV):
    print('Best params: {}'.format(svm.best_params_))

# evaluation of results
print('computing the cross-val score')
accuracies = cross_val_score(svm_optimized, Xtr, ytr, cv=nfolds, n_jobs=-1)
acc_mean, acc_std = accuracies.mean(), accuracies.std()
print('Accuracy={:.3f} (+-{:.3f})'.format(acc_mean, acc_std))
f1scores = cross_val_score(svm, Xtr, ytr, cv=nfolds, n_jobs=-1, scoring=make_scorer(f1_score))
f1_mean, f1_std = f1scores.mean(), f1scores.std()
print('F1-measure={:.3f} (+-{:.3f})\n'.format(f1_mean, f1_std))

# final test
print('predicting the Epistolas')
#ep1_ = svm_optimized.predict(ep1)
ep2= np.reshape(ep2, (1,-1))
ep2_ = svm_optimized.predict(ep2)
#print('Epistola1 acc = {:.3f} {}'.format(ep1_.mean(), ep1_))
print('Epistola2 acc = {:.3f} {}'.format(ep2_.mean(), ep2_))
def predictEpistola(ep, epistola_name):
    pred = svm.predict(ep)
    full_doc_prediction = pred[0]
    print('{} is from Dante: {}'.format(epistola_name, 'Yes' if full_doc_prediction == 1 else 'No'))
    if len(pred) > 1:  # fragment predictions follow the full-document prediction
        fragment_predictions= pred[1:]
        print('fragments average {:.3f}, array={}'.format(fragment_predictions.mean(), fragment_predictions))
    if SVM is SVC and probability:
        prob = svm.predict_proba(ep)[:,1]
        np.set_printoptions(precision=2, linewidth=200)
        print('probabilistic view: full={:.3f}, fragments average {:.3f}, fragments={}'.format(prob[0], prob[1:].mean(), prob[1:]))

print('Predicting the Epistolas')
predictEpistola(ep1, 'Epistola 1')
predictEpistola(ep2, 'Epistola 2')

@@ -1,50 +0,0 @@
import os
from os.path import join
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

def load_documents(path, by_sentences=False):
    #read documents
    docs,y = [],[]
    for file in os.listdir(path):
        if file.startswith('EpistolaXIII_'): continue
        file_clean = file.replace('.txt','')
        author, textname = file_clean.split('_')[0],file_clean.split('_')[1]
        if by_sentences:
            lines = open(join(path, file)).readlines()
            docs.extend(lines)
            if author == 'Dante':
                y.extend([1] * len(lines))
            else:
                y.extend([0] * len(lines))
        else:
            docs.append(open(join(path,file)).read())
            if author == 'Dante':
                y.append(1)
            else:
                y.append(0)

    if not by_sentences:
        y = y + y
        docs = docs + docs

    if by_sentences:
        ep1 = open(join(path, 'EpistolaXIII_1.txt')).readlines()
        ep2 = open(join(path, 'EpistolaXIII_2.txt')).readlines()
    else:
        ep1 = [open(join(path, 'EpistolaXIII_1.txt')).read()]
        ep2 = [open(join(path, 'EpistolaXIII_2.txt')).read()]

    # document representation
    tfidf = TfidfVectorizer(sublinear_tf=True)
    X = tfidf.fit_transform(docs)
    y = np.array(y)
    Epistola1 = tfidf.transform(ep1)
    Epistola2 = tfidf.transform(ep2)

    print('documents read, shape={}'.format(X.shape))
    # print(y)

    return X, y, Epistola1, Epistola2

@@ -2,49 +2,187 @@ import nltk
import numpy as np
import os
from os.path import join
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from scipy.sparse import hstack, csr_matrix

function_words = ["et", "in", "de", "ad", "ut", "cum", "non", "per", "a", "que", "ex", "sed"]

def load_documents(path):
    X, y = [], []
    i=0;

# ------------------------------------------------------------------------
# document loading routine
# ------------------------------------------------------------------------
def _load_texts(path):
    # load the training data (all documents but Epistolas 1 and 2)
    documents = []
    authors = []
    ndocs=0
    for file in os.listdir(path):
        if file.startswith('EpistolaXIII_'): continue
        file_clean = file.replace('.txt','')
        author, textname = file_clean.split('_')[0],file_clean.split('_')[1]
        tokens = nltk.word_tokenize(open(join(path,file), encoding= "utf8").read())
        author_tokens = ([token.lower() for token in tokens
                          if any(char.isalpha() for char in token)])
        freqs= nltk.FreqDist(author_tokens)
        X.append([])
        #print(f"From {textname} by {author}:")
        for function_word in function_words:
            feature= (freqs[function_word]*1000)/len(author_tokens)
            #print(function_word, " = ", freqs[function_word], ", ", feature)
            X[i].append(feature)
        i+=1
        if author == "Dante":
            y.append(1)
        else:
            y.append(0)

    y= y + y
    X= X + X
    y= np.array(y)

    ep = []
    tokens = nltk.word_tokenize(open(join(path, 'EpistolaXIII_2.txt'), encoding= "utf8").read())
    ep2_tokens = ([token.lower() for token in tokens
                   if any(char.isalpha() for char in token)])
    freqs= nltk.FreqDist(ep2_tokens)
    #print("From Epistola XIII_2:")
    for function_word in function_words:
        feature= (freqs[function_word]*1000/len(ep2_tokens))
        ep.append(feature)
        #print(function_word, " = ", freqs[function_word], ", ", feature)
    ep2 = np.array(ep)
        text = open(join(path,file), encoding= "utf8").read()

    return X, y, ep2
        documents.append(text)
        authors.append(author)
        ndocs+=1

    # load the test data (Epistolas 1 and 2)
    ep1_text = open(join(path, 'EpistolaXIII_1.txt'), encoding="utf8").read()
    ep2_text = open(join(path, 'EpistolaXIII_2.txt'), encoding="utf8").read()

    return documents, authors, ep1_text, ep2_text



# ------------------------------------------------------------------------
# split policies
# ------------------------------------------------------------------------
# TODO: implement other split policies (e.g., overlapping ones, etc)
def _split_by_endline(text):
    return [t.strip() for t in text.split('\n') if t.strip()]


def splitter(documents, authors=None, split_policy=_split_by_endline):
    fragments = []
    authors_fragments = []
    for i, text in enumerate(documents):
        text_fragments = split_policy(text)
        fragments.extend(text_fragments)
        if authors is not None:
            authors_fragments.extend([authors[i]] * len(text_fragments))

    if authors is not None:
        return fragments, authors_fragments
    return fragments
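# Illustrative note (not part of the committed file): with the default
# _split_by_endline policy, splitter() yields one fragment per non-empty line
# and, when authors are given, a parallel list of labels, e.g.
#   frags, labs = splitter(['a\nb', 'c'], authors=['Dante', 'Other'])
#   # frags == ['a', 'b', 'c'], labs == ['Dante', 'Dante', 'Other']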

# ------------------------------------------------------------------------
# feature extraction methods
# ------------------------------------------------------------------------
# TODO: implement other feature extraction methods
def _features_function_words_freq(documents):
    """
    Extract features as the frequency (x1000) of the function words used in the documents
    :param documents: a list where each element is the text (string) of a document
    :return: a np.array of shape (D,F) where D is len(documents) and F is len(function_words)
    """
    features = []
    for text in documents:
        tokens = nltk.word_tokenize(text)
        author_tokens = ([token.lower() for token in tokens if any(char.isalpha() for char in token)])
        freqs = nltk.FreqDist(author_tokens)

        nwords = len(author_tokens)
        funct_words_freq = [1000. * freqs[function_word] / nwords for function_word in function_words]

        features.append(funct_words_freq)

    return np.array(features)
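# Illustrative (not in the committed file): with the 12 function_words above,
# _features_function_words_freq(['et non et', 'in de']) returns an array of
# shape (2, 12) whose entries are occurrences per 1000 tokens, e.g. 'et' in the
# first document gives 1000 * 2 / 3 ≈ 666.7.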

def _features_tfidf(documents, tfidf_vectorizer=None):
    """
    Extract features as tfidf matrix extracted from the documents
    :param documents: a list where each element is the text (string) of a document
    :return: a tuple M,V, where M is an np.array of shape (D,F), with D being the len(documents) and F the number of
    distinct words; and V is the TfidfVectorizer already fit
    """
    if tfidf_vectorizer is None:
        tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True)
        tfidf_vectorizer.fit(documents)

    features = tfidf_vectorizer.transform(documents)

    return features, tfidf_vectorizer


def _feature_selection(X, y, EP1, EP2, tfidf_feat_selection_ratio):
    nF = X.shape[1]
    num_feats = int(tfidf_feat_selection_ratio * nF)
    feature_selector = SelectKBest(chi2, k=num_feats)
    X = feature_selector.fit_transform(X, y)
    EP1 = feature_selector.transform(EP1)
    EP2 = feature_selector.transform(EP2)
    return X,EP1,EP2


def load_documents(path,
                   function_words_freq=True,
                   tfidf=False,
                   tfidf_feat_selection_ratio=1.,
                   split_documents=False,
                   split_policy = _split_by_endline,
                   verbose=True):
    """
    Loads the documents contained in path applying a number of feature extraction policies. The directory is assumed to
    contain files named according to <author>_<text_name>.txt plus two special files EpistolaXIII_1.txt and
    EpistolaXIII_2.txt concerning the two documents whose authorship attribution is to be determined.
    :param path: the path containing the texts, each named as <author>_<text_name>.txt
    :param function_words_freq: add the frequency of function words as features
    :param tfidf: add the tfidf as features
    :param split_documents: whether to split texts into smaller documents or not (currently, the policy is to split by '\n').
    Currently, the fragments resulting from the split are added to the pool of documents (i.e., they do not replace the
    full documents, which are anyway retained).
    :param split_policy: a callable that implements the split to be applied (ignored if split_documents=False)
    :param verbose: show information by stdout or not
    :return: np.arrays or csr_matrix (depending on whether tfidf is activated or not) X, y, EP1, EP2, where X is the
    matrix of features for the training set and y are the labels (np.array);
    EP1 and EP2 are the matrices of features for Epistola 1 and Epistola 2, respectively (if split_documents=True,
    the first row is the full epistola and the remaining rows are its fragments)
"""
|
||||
|
||||
documents, authors, ep1_text, ep2_text = _load_texts(path)
|
||||
ep1,ep2 = [ep1_text],[ep2_text]
|
||||
n_original_docs=len(documents)
|
||||
|
||||
if split_documents:
|
||||
doc_fragments, authors_fragments = splitter(documents, authors, split_policy=split_policy)
|
||||
documents.extend(doc_fragments)
|
||||
authors.extend(authors_fragments)
|
||||
|
||||
ep1.extend(splitter(ep1, split_policy=split_policy))
|
||||
ep2.extend(splitter(ep2, split_policy=split_policy))
|
||||
|
||||
|
||||
# represent the target vector
|
||||
y = np.array([(1 if author == "Dante" else 0) for author in authors])
|
||||
|
||||
# initialize the document-by-feature vector
|
||||
X = np.empty((len(documents), 0))
|
||||
EP1 = np.empty((len(ep1), 0))
|
||||
EP2 = np.empty((len(ep2), 0))
|
||||
|
||||
if function_words_freq:
|
||||
X = np.hstack((X,_features_function_words_freq(documents)))
|
||||
EP1 = np.hstack((EP1, _features_function_words_freq(ep1)))
|
||||
EP2 = np.hstack((EP2, _features_function_words_freq(ep2)))
|
||||
|
||||
if tfidf:
|
||||
X_features, vectorizer = _features_tfidf(documents)
|
||||
ep1_features, _ = _features_tfidf(ep1, vectorizer)
|
||||
ep2_features, _ = _features_tfidf(ep2, vectorizer)
|
||||
|
||||
if tfidf_feat_selection_ratio < 1.:
|
||||
if verbose: print('feature selection')
|
||||
X_features, ep1_features, ep2_features = \
|
||||
_feature_selection(X_features, y, ep1_features, ep2_features, tfidf_feat_selection_ratio)
|
||||
|
||||
# matrix is sparse now
|
||||
X = hstack((csr_matrix(X), X_features))
|
||||
EP1 = hstack((csr_matrix(EP1), ep1_features))
|
||||
EP2 = hstack((csr_matrix(EP2), ep2_features))
|
||||
|
||||
|
||||
# print summary
|
||||
if verbose:
|
||||
print('load_documents: function_words_freq={} tfidf={}, split_documents={}, split_policy={}'
|
||||
.format(function_words_freq, tfidf, split_documents, split_policy.__name__))
|
||||
print('number of training (full) documents: {}'.format(n_original_docs))
|
||||
print('X shape (#documents,#features): {}'.format(X.shape))
|
||||
print('y prevalence: {:.2f}%'.format(y.mean()*100))
|
||||
print('Epistola 1 shape:', EP1.shape)
|
||||
print('Epistola 2 shape:', EP2.shape)
|
||||
print()
|
||||
|
||||
    return X, y, EP1, EP2
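
A minimal usage sketch of the refactored loader (an illustration based on the classifier script in this commit, not part of the committed files); the path '../testi' and the load_documents parameters are copied from that script:

from sklearn.svm import LinearSVC
from doc_representation2 import load_documents

# load the training matrix X and labels y, plus the two Epistola feature matrices
Xtr, ytr, ep1, ep2 = load_documents('../testi', split_documents=True,
                                    function_words_freq=True, tfidf=True,
                                    tfidf_feat_selection_ratio=0.1)
svm = LinearSVC()
svm.fit(Xtr, ytr)

# the first row of each Epistola matrix is the full document; the remaining rows are its fragments
print('Epistola 1 full-document prediction:', svm.predict(ep1)[0])
print('Epistola 2 full-document prediction:', svm.predict(ep2)[0])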