From 5dda87693825ac4efca638bec5c84752671e10df Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Moreo=20Fern=C3=A1ndez?=
Date: Tue, 30 Oct 2018 18:00:51 +0100
Subject: [PATCH] big refactor

---
 README.md                  |   0
 src/classifier.py          |  43 --------
 src/classifier2.py         |  69 ++++++------
 src/doc_representation.py  |  50 ---------
 src/doc_representation2.py | 210 ++++++++++++++++++++++++++++++-------
 5 files changed, 213 insertions(+), 159 deletions(-)
 create mode 100644 README.md
 delete mode 100644 src/classifier.py
 delete mode 100644 src/doc_representation.py

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..e69de29
diff --git a/src/classifier.py b/src/classifier.py
deleted file mode 100644
index 874c02e..0000000
--- a/src/classifier.py
+++ /dev/null
@@ -1,43 +0,0 @@
-from sklearn.svm import LinearSVC
-from sklearn.model_selection import cross_val_score, GridSearchCV
-from sklearn.feature_selection import SelectKBest, chi2
-from doc_representation import *
-
-nfolds = 2
-do_feat_selection = True
-params = [{'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],'class_weight':['balanced',None]}]
-
-path = '/home/moreo/Dante/testi'
-Xtr,ytr,ep1,ep2 = load_documents(path, by_sentences=True)
-
-if do_feat_selection:
-    print('feature selection')
-    num_feats = int(0.1 * Xtr.shape[1])
-    feature_selector = SelectKBest(chi2, k=num_feats)
-    Xtr = feature_selector.fit_transform(Xtr,ytr)
-    print('final shape={}'.format(Xtr.shape))
-    ep1 = feature_selector.transform(ep1)
-    ep2 = feature_selector.transform(ep2)
-
-
-# learn a SVM
-print('optimizing a SVM')
-svm_base = LinearSVC()
-
-svm_optimized = GridSearchCV(svm_base, param_grid=params, cv=nfolds)
-svm_optimized.fit(Xtr,ytr)
-print('Best params: {}'.format(svm_optimized.best_params_))
-
-# evaluation of results
-print('computng the cross-val score')
-accuracies = cross_val_score(svm_optimized, Xtr, ytr, cv=nfolds, n_jobs=-1)
-acc_mean, acc_std = accuracies.mean(), accuracies.std()
-print('Accuracy={:.3f} (+-{:.3f})'.format(acc_mean, acc_std))
-
-# final test
-print('predicting the Epistolas')
-ep1_ = svm_optimized.predict(ep1)
-ep2_ = svm_optimized.predict(ep2)
-print('Epistola1 acc = {:.3f} {}'.format(ep1_.mean(), ep1_))
-print('Epistola2 acc = {:.3f} {}'.format(ep2_.mean(), ep2_))
-
diff --git a/src/classifier2.py b/src/classifier2.py
index 6a20a96..980c335 100644
--- a/src/classifier2.py
+++ b/src/classifier2.py
@@ -1,46 +1,55 @@
 from sklearn.svm import *
 from sklearn.model_selection import cross_val_score, GridSearchCV
-from sklearn.feature_selection import SelectKBest, chi2
 from doc_representation2 import *
-import numpy as np
+from sklearn.metrics import f1_score, make_scorer
 
-nfolds = 2
-do_feat_selection = False
-params = [{'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],'class_weight':['balanced',None]}]
+probability=False
+# SVM = SVC
+SVM = LinearSVC
 
-path = 'Data'
-Xtr,ytr,ep2 = load_documents(path)
+nfolds = 3
+params = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000], 'class_weight':['balanced',None]}
+if SVM is SVC:
+    params['kernel']=['linear','rbf']
 
-if do_feat_selection:
-    print('feature selection')
-    num_feats = int(0.1 * Xtr.shape[1])
-    feature_selector = SelectKBest(chi2, k=num_feats)
-    Xtr = feature_selector.fit_transform(Xtr,ytr)
-    print('final shape={}'.format(Xtr.shape))
-    #ep1 = feature_selector.transform(ep1)
-    ep2 = feature_selector.transform(ep2)
+path = '../testi'
+Xtr,ytr,ep1,ep2 = load_documents(path, split_documents=True, function_words_freq=True, tfidf=True, tfidf_feat_selection_ratio=0.1)
 
 # learn a SVM
-print('optimizing a SVM')
-svm_base = LinearSVC(max_iter=-1)  # results were not converging, so I tried SVC rather than LinearSVC
-svm_optimized = GridSearchCV(svm_base, param_grid=params, cv=nfolds)
- #print ("Shape of X:", Xtr.shape)
-svm_optimized.fit(Xtr, ytr)
- #print('Best params: {}'.format(svm_optimized.best_params_))
+# svm = SVM(probability=probability)
+svm = SVM()
+
+positive_examples = ytr.sum()
+if positive_examples>nfolds:
+    print('optimizing {}'.format(svm.__class__.__name__))
+    svm = GridSearchCV(svm, param_grid=params, cv=nfolds, scoring=make_scorer(f1_score))
+
+svm.fit(Xtr, ytr)
+
+if isinstance(svm, GridSearchCV):
+    print('Best params: {}'.format(svm.best_params_))
 
 # evaluation of results
 print('computing the cross-val score')
-accuracies = cross_val_score(svm_optimized, Xtr, ytr, cv=nfolds, n_jobs=-1)
-acc_mean, acc_std = accuracies.mean(), accuracies.std()
-print('Accuracy={:.3f} (+-{:.3f})'.format(acc_mean, acc_std))
+f1scores = cross_val_score(svm, Xtr, ytr, cv=nfolds, n_jobs=-1, scoring=make_scorer(f1_score))
+f1_mean, f1_std = f1scores.mean(), f1scores.std()
+print('F1-measure={:.3f} (+-{:.3f})\n'.format(f1_mean, f1_std))
 
 # final test
-print('predicting the Epistolas')
- #ep1_ = svm_optimized.predict(ep1)
-ep2= np.reshape(ep2, (1,-1))
-ep2_ = svm_optimized.predict(ep2)
- #print('Epistola1 acc = {:.3f} {}'.format(ep1_.mean(), ep1_))
-print('Epistola2 acc = {:.3f} {}'.format(ep2_.mean(), ep2_))
+def predictEpistola(ep, epistola_name):
+    pred = svm.predict(ep)
+    full_doc_prediction = pred[0]
+    print('{} is from Dante: {}'.format(epistola_name, 'Yes' if full_doc_prediction == 1 else 'No'))
+    if len(pred) > 1:
+        fragment_predictions = pred[1:]
+        print('fragments average {:.3f}, array={}'.format(fragment_predictions.mean(), fragment_predictions))
+    if SVM is SVC and probability:
+        prob = svm.predict_proba(ep)[:,1]
+        np.set_printoptions(precision=2, linewidth=200)
+        print('probabilistic view: full={:.3f}, fragments average {:.3f}, fragments={}'.format(prob[0], prob[1:].mean(), prob[1:]))
+print('Predicting the Epistolas')
+predictEpistola(ep1, 'Epistola 1')
+predictEpistola(ep2, 'Epistola 2')
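The new classifier2.py relies on the convention, established by load_documents in doc_representation2.py below, that the first row of each Epistola matrix is the full document and any further rows are its fragments. The following self-contained sketch is illustrative only and not part of the patch: it uses a small random dataset in place of the real feature matrices, but wires up the same F1-scored grid search and reads the prediction vector under that row convention.

    import numpy as np
    from sklearn.svm import LinearSVC
    from sklearn.model_selection import GridSearchCV
    from sklearn.metrics import f1_score, make_scorer

    # toy stand-ins for Xtr, ytr and one Epistola matrix (row 0 = full document, rows 1+ = fragments)
    rng = np.random.RandomState(0)
    Xtr, ytr = rng.rand(60, 12), rng.randint(0, 2, 60)
    ep = rng.rand(5, 12)

    params = {'C': [0.01, 0.1, 1, 10], 'class_weight': ['balanced', None]}
    svm = GridSearchCV(LinearSVC(), param_grid=params, cv=3, scoring=make_scorer(f1_score))
    svm.fit(Xtr, ytr)

    pred = svm.predict(ep)
    print('full document prediction:', pred[0])
    print('fragment predictions:', pred[1:])
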
diff --git a/src/doc_representation.py b/src/doc_representation.py
deleted file mode 100644
index 9e45ebf..0000000
--- a/src/doc_representation.py
+++ /dev/null
@@ -1,50 +0,0 @@
-import os
-from os.path import join
-from sklearn.feature_extraction.text import TfidfVectorizer
-import numpy as np
-
-def load_documents(path, by_sentences=False):
-    #read documents
-    docs,y = [],[]
-    for file in os.listdir(path):
-        if file.startswith('EpistolaXIII_'): continue
-        file_clean = file.replace('.txt','')
-        author, textname = file_clean.split('_')[0],file_clean.split('_')[1]
-        if by_sentences:
-            lines = open(join(path, file)).readlines()
-            docs.extend(lines)
-            if author == 'Dante':
-                y.extend([1] * len(lines))
-            else:
-                y.extend([0] * len(lines))
-        else:
-            docs.append(open(join(path,file)).read())
-            if author == 'Dante':
-                y.append(1)
-            else:
-                y.append(0)
-
-    if not by_sentences:
-        y = y + y
-        docs = docs + docs
-
-    if by_sentences:
-        ep1 = open(join(path, 'EpistolaXIII_1.txt')).readlines()
-        ep2 = open(join(path, 'EpistolaXIII_2.txt')).readlines()
-    else:
-        ep1 = [open(join(path, 'EpistolaXIII_1.txt' )).read()]
-        ep2 = [open(join(path, 'EpistolaXIII_2.txt')).read()]
-
-    # document representation
-    tfidf = TfidfVectorizer(sublinear_tf=True)
-    X = tfidf.fit_transform(docs)
-    y = np.array(y)
-    Epistola1 = tfidf.transform(ep1)
-    Epistola2 = tfidf.transform(ep2)
-
-    print('documents read, shape={}'.format(X.shape))
-    # print(y)
-
-    return X, y, Epistola1, Epistola2
-
-
diff --git a/src/doc_representation2.py b/src/doc_representation2.py
index 9ac77ca..d194d95 100644
--- a/src/doc_representation2.py
+++ b/src/doc_representation2.py
@@ -2,49 +2,187 @@ import nltk
 import numpy as np
 import os
 from os.path import join
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.feature_selection import SelectKBest
+from sklearn.feature_selection import chi2
+from scipy.sparse import hstack, csr_matrix
 
 function_words = ["et", "in", "de", "ad", "ut", "cum", "non", "per", "a", "que", "ex", "sed"]
 
-def load_documents(path):
-    X, y = [], []
-    i=0;
+
+
+# ------------------------------------------------------------------------
+# document loading routine
+# ------------------------------------------------------------------------
+def _load_texts(path):
+    # load the training data (all documents but Epistolas 1 and 2)
+    documents = []
+    authors = []
+    ndocs=0
     for file in os.listdir(path):
         if file.startswith('EpistolaXIII_'): continue
         file_clean = file.replace('.txt','')
         author, textname = file_clean.split('_')[0],file_clean.split('_')[1]
-        tokens = nltk.word_tokenize(open(join(path,file), encoding= "utf8").read())
-        author_tokens = ([token.lower() for token in tokens
-                          if any(char.isalpha() for char in token)])
-        freqs= nltk.FreqDist(author_tokens)
-        X.append([])
-        #print(f"From {textname} by {author}:")
-        for function_word in function_words:
-            feature= (freqs[function_word]*1000)/len(author_tokens)
-            #print(function_word, " = ", freqs[function_word], ", ", feature)
-            X[i].append(feature)
-        i+=1
-        if author == "Dante":
-            y.append(1)
-        else:
-            y.append(0)
-
-
-    y= y + y
-    X= X + X
-    y= np.array(y)
-
-    ep = []
-    tokens = nltk.word_tokenize(open(join(path, 'EpistolaXIII_2.txt'), encoding= "utf8").read())
-    ep2_tokens = ([token.lower() for token in tokens
-                   if any(char.isalpha() for char in token)])
-    freqs= nltk.FreqDist(ep2_tokens)
-    #print("From Epistola XIII_2:")
-    for function_word in function_words:
-        feature= (freqs[function_word]*1000/len(ep2_tokens))
-        ep.append(feature)
-        #print(function_word, " = ", freqs[function_word], ", ", feature)
-    ep2 = np.array(ep)
+        text = open(join(path,file), encoding= "utf8").read()
 
-    return X, y, ep2
+        documents.append(text)
+        authors.append(author)
+        ndocs+=1
+
+    # load the test data (Epistolas 1 and 2)
+    ep1_text = open(join(path, 'EpistolaXIII_1.txt'), encoding="utf8").read()
+    ep2_text = open(join(path, 'EpistolaXIII_2.txt'), encoding="utf8").read()
+
+    return documents, authors, ep1_text, ep2_text
+
+
+
+# ------------------------------------------------------------------------
+# split policies
+# ------------------------------------------------------------------------
+# TODO: implement other split policies (e.g., overlapping ones, etc)
+def _split_by_endline(text):
+    return [t.strip() for t in text.split('\n') if t.strip()]
+
+
+def splitter(documents, authors=None, split_policy=_split_by_endline):
+    fragments = []
+    authors_fragments = []
+    for i, text in enumerate(documents):
+        text_fragments = split_policy(text)
+        fragments.extend(text_fragments)
+        if authors is not None:
+            authors_fragments.extend([authors[i]] * len(text_fragments))
+
+    if authors is not None:
+        return fragments, authors_fragments
+    return fragments
+
+# ------------------------------------------------------------------------
+# feature extraction methods
+# ------------------------------------------------------------------------
+# TODO: implement other feature extraction methods
+def _features_function_words_freq(documents):
+    """
+    Extract features as the frequency (x1000) of the function words used in the documents
+    :param documents: a list where each element is the text (string) of a document
+    :return: a np.array of shape (D,F) where D is len(documents) and F is len(function_words)
+    """
+    features = []
+    for text in documents:
+        tokens = nltk.word_tokenize(text)
+        author_tokens = ([token.lower() for token in tokens if any(char.isalpha() for char in token)])
+        freqs = nltk.FreqDist(author_tokens)
+
+        nwords = len(author_tokens)
+        funct_words_freq = [1000. * freqs[function_word] / nwords for function_word in function_words]
+
+        features.append(funct_words_freq)
+
+    return np.array(features)
+
+
+def _features_tfidf(documents, tfidf_vectorizer=None):
+    """
+    Extract features as the tfidf matrix extracted from the documents
+    :param documents: a list where each element is the text (string) of a document
+    :return: a tuple M,V, where M is a (sparse) matrix of shape (D,F), with D being len(documents) and F the number of
+    distinct words; and V is the TfidfVectorizer already fit
+    """
+    if tfidf_vectorizer is None:
+        tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True)
+        tfidf_vectorizer.fit(documents)
+
+    features = tfidf_vectorizer.transform(documents)
+
+    return features, tfidf_vectorizer
+
+def _feature_selection(X, y, EP1, EP2, tfidf_feat_selection_ratio):
+    nF = X.shape[1]
+    num_feats = int(tfidf_feat_selection_ratio * nF)
+    feature_selector = SelectKBest(chi2, k=num_feats)
+    X = feature_selector.fit_transform(X, y)
+    EP1 = feature_selector.transform(EP1)
+    EP2 = feature_selector.transform(EP2)
+    return X,EP1,EP2
+
+def load_documents(path,
+                   function_words_freq=True,
+                   tfidf=False,
+                   tfidf_feat_selection_ratio=1.,
+                   split_documents=False,
+                   split_policy = _split_by_endline,
+                   verbose=True):
+    """
+    Loads the documents contained in path applying a number of feature extraction policies. The directory is assumed to
+    contain files named according to <author>_<textname>.txt plus two special files EpistolaXIII_1.txt and
+    EpistolaXIII_2.txt concerning the two documents whose authorship attribution is to be determined.
+    :param path: the path containing the texts, each named as <author>_<textname>.txt
+    :param function_words_freq: add the frequency of function words as features
+    :param tfidf: add the tfidf as features
+    :param split_documents: whether to split texts into smaller fragments or not (currently, the policy is to split
+    by '\n'). The fragments resulting from the split are added to the pool of documents (i.e., they do not replace the
+    full documents, which are anyway retained).
+    :param split_policy: a callable that implements the split to be applied (ignored if split_documents=False)
+    :param verbose: show information by stdout or not
+    :return: X, y, EP1, EP2 (np.arrays, or csr_matrix if tfidf is activated), where X is the feature matrix of the
+    training set and y its labels (np.array); EP1 and EP2 are the feature matrices of Epistola 1 and Epistola 2,
+    respectively, whose first row corresponds to the full document and, if split_documents=True, whose remaining
+    rows correspond to its fragments
+    """
+
+    documents, authors, ep1_text, ep2_text = _load_texts(path)
+    ep1,ep2 = [ep1_text],[ep2_text]
+    n_original_docs=len(documents)
+
+    if split_documents:
+        doc_fragments, authors_fragments = splitter(documents, authors, split_policy=split_policy)
+        documents.extend(doc_fragments)
+        authors.extend(authors_fragments)
+
+        ep1.extend(splitter(ep1, split_policy=split_policy))
+        ep2.extend(splitter(ep2, split_policy=split_policy))
+
+
+    # represent the target vector
+    y = np.array([(1 if author == "Dante" else 0) for author in authors])
+
+    # initialize the document-by-feature vector
+    X = np.empty((len(documents), 0))
+    EP1 = np.empty((len(ep1), 0))
+    EP2 = np.empty((len(ep2), 0))
+
+    if function_words_freq:
+        X = np.hstack((X,_features_function_words_freq(documents)))
+        EP1 = np.hstack((EP1, _features_function_words_freq(ep1)))
+        EP2 = np.hstack((EP2, _features_function_words_freq(ep2)))
+
+    if tfidf:
+        X_features, vectorizer = _features_tfidf(documents)
+        ep1_features, _ = _features_tfidf(ep1, vectorizer)
+        ep2_features, _ = _features_tfidf(ep2, vectorizer)
+
+        if tfidf_feat_selection_ratio < 1.:
+            if verbose: print('feature selection')
+            X_features, ep1_features, ep2_features = \
+                _feature_selection(X_features, y, ep1_features, ep2_features, tfidf_feat_selection_ratio)
+
+        # matrix is sparse now
+        X = hstack((csr_matrix(X), X_features))
+        EP1 = hstack((csr_matrix(EP1), ep1_features))
+        EP2 = hstack((csr_matrix(EP2), ep2_features))
+
+
+    # print summary
+    if verbose:
+        print('load_documents: function_words_freq={} tfidf={}, split_documents={}, split_policy={}'
+              .format(function_words_freq, tfidf, split_documents, split_policy.__name__))
+        print('number of training (full) documents: {}'.format(n_original_docs))
+        print('X shape (#documents,#features): {}'.format(X.shape))
+        print('y prevalence: {:.2f}%'.format(y.mean()*100))
+        print('Epistola 1 shape:', EP1.shape)
+        print('Epistola 2 shape:', EP2.shape)
+        print()
+
+    return X, y, EP1, EP2
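The split_policy hook above accepts any callable that maps a text to a list of fragments, and the TODO notes that overlapping policies are still to be implemented. As a sketch of that extension point (not part of the patch; the window and stride values are arbitrary assumptions), an overlapping split compatible with splitter and load_documents(split_policy=...) could look like this:

    # hypothetical split policy: groups consecutive lines into overlapping windows
    def _split_by_overlapping_windows(text, window=3, stride=2):
        lines = [t.strip() for t in text.split('\n') if t.strip()]
        if len(lines) <= window:
            return [' '.join(lines)] if lines else []
        return [' '.join(lines[i:i + window]) for i in range(0, len(lines) - window + 1, stride)]

    # usage (assuming the corpus layout expected by load_documents):
    # X, y, EP1, EP2 = load_documents('../testi', split_documents=True, tfidf=True,
    #                                 split_policy=_split_by_overlapping_windows)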