From 5dda87693825ac4efca638bec5c84752671e10df Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Moreo=20Fern=C3=A1ndez?=
Date: Tue, 30 Oct 2018 18:00:51 +0100
Subject: [PATCH] big refactor

---
 README.md                  |   0
 src/classifier.py          |  43 --------
 src/classifier2.py         |  69 ++++++------
 src/doc_representation.py  |  50 ---------
 src/doc_representation2.py | 210 ++++++++++++++++++++++++++++++-------
 5 files changed, 213 insertions(+), 159 deletions(-)
 create mode 100644 README.md
 delete mode 100644 src/classifier.py
 delete mode 100644 src/doc_representation.py

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..e69de29
diff --git a/src/classifier.py b/src/classifier.py
deleted file mode 100644
index 874c02e..0000000
--- a/src/classifier.py
+++ /dev/null
@@ -1,43 +0,0 @@
-from sklearn.svm import LinearSVC
-from sklearn.model_selection import cross_val_score, GridSearchCV
-from sklearn.feature_selection import SelectKBest, chi2
-from doc_representation import *
-
-nfolds = 2
-do_feat_selection = True
-params = [{'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],'class_weight':['balanced',None]}]
-
-path = '/home/moreo/Dante/testi'
-Xtr,ytr,ep1,ep2 = load_documents(path, by_sentences=True)
-
-if do_feat_selection:
-    print('feature selection')
-    num_feats = int(0.1 * Xtr.shape[1])
-    feature_selector = SelectKBest(chi2, k=num_feats)
-    Xtr = feature_selector.fit_transform(Xtr,ytr)
-    print('final shape={}'.format(Xtr.shape))
-    ep1 = feature_selector.transform(ep1)
-    ep2 = feature_selector.transform(ep2)
-
-
-# learn a SVM
-print('optimizing a SVM')
-svm_base = LinearSVC()
-
-svm_optimized = GridSearchCV(svm_base, param_grid=params, cv=nfolds)
-svm_optimized.fit(Xtr,ytr)
-print('Best params: {}'.format(svm_optimized.best_params_))
-
-# evaluation of results
-print('computng the cross-val score')
-accuracies = cross_val_score(svm_optimized, Xtr, ytr, cv=nfolds, n_jobs=-1)
-acc_mean, acc_std = accuracies.mean(), accuracies.std()
-print('Accuracy={:.3f} (+-{:.3f})'.format(acc_mean, acc_std))
-
-# final test
-print('predicting the Epistolas')
-ep1_ = svm_optimized.predict(ep1)
-ep2_ = svm_optimized.predict(ep2)
-print('Epistola1 acc = {:.3f} {}'.format(ep1_.mean(), ep1_))
-print('Epistola2 acc = {:.3f} {}'.format(ep2_.mean(), ep2_))
-
diff --git a/src/classifier2.py b/src/classifier2.py
index 6a20a96..980c335 100644
--- a/src/classifier2.py
+++ b/src/classifier2.py
@@ -1,46 +1,55 @@
 from sklearn.svm import *
 from sklearn.model_selection import cross_val_score, GridSearchCV
-from sklearn.feature_selection import SelectKBest, chi2
 from doc_representation2 import *
-import numpy as np
+from sklearn.metrics import f1_score, make_scorer
 
-nfolds = 2
-do_feat_selection = False
-params = [{'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],'class_weight':['balanced',None]}]
+probability=False
+# SVM = SVC
+SVM = LinearSVC
 
-path = 'Data'
-Xtr,ytr,ep2 = load_documents(path)
+nfolds = 3
+params = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000], 'class_weight':['balanced',None]}
+if SVM is SVC:
+    params['kernel']=['linear','rbf']
 
-if do_feat_selection:
-    print('feature selection')
-    num_feats = int(0.1 * Xtr.shape[1])
-    feature_selector = SelectKBest(chi2, k=num_feats)
-    Xtr = feature_selector.fit_transform(Xtr,ytr)
-    print('final shape={}'.format(Xtr.shape))
-    #ep1 = feature_selector.transform(ep1)
-    ep2 = feature_selector.transform(ep2)
+path = '../testi'
+Xtr,ytr,ep1,ep2 = load_documents(path, split_documents=True, function_words_freq=True, tfidf=True, tfidf_feat_selection_ratio=0.1)
 
 # learn a SVM
-print('optimizing a SVM')
-svm_base = LinearSVC(max_iter=-1)  # results were not converging, so I tried SVC rather than LinearSVC
-svm_optimized = GridSearchCV(svm_base, param_grid=params, cv=nfolds)
- #print ("Shape of X:", Xtr.shape)
-svm_optimized.fit(Xtr, ytr)
- #print('Best params: {}'.format(svm_optimized.best_params_))
+# svm = SVM(probability=probability)
+svm = SVM()
+
+positive_examples = ytr.sum()
+if positive_examples>nfolds:
+    print('optimizing {}'.format(svm.__class__.__name__))
+    svm = GridSearchCV(svm, param_grid=params, cv=nfolds, scoring=make_scorer(f1_score))
+
+svm.fit(Xtr, ytr)
+
+if isinstance(svm, GridSearchCV):
+    print('Best params: {}'.format(svm.best_params_))
 
 # evaluation of results
 print('computing the cross-val score')
-accuracies = cross_val_score(svm_optimized, Xtr, ytr, cv=nfolds, n_jobs=-1)
-acc_mean, acc_std = accuracies.mean(), accuracies.std()
-print('Accuracy={:.3f} (+-{:.3f})'.format(acc_mean, acc_std))
+f1scores = cross_val_score(svm, Xtr, ytr, cv=nfolds, n_jobs=-1, scoring=make_scorer(f1_score))
+f1_mean, f1_std = f1scores.mean(), f1scores.std()
+print('F1-measure={:.3f} (+-{:.3f})\n'.format(f1_mean, f1_std))
 
 # final test
-print('predicting the Epistolas')
- #ep1_ = svm_optimized.predict(ep1)
-ep2= np.reshape(ep2, (1,-1))
-ep2_ = svm_optimized.predict(ep2)
- #print('Epistola1 acc = {:.3f} {}'.format(ep1_.mean(), ep1_))
-print('Epistola2 acc = {:.3f} {}'.format(ep2_.mean(), ep2_))
+def predictEpistola(ep, epistola_name):
+    pred = svm.predict(ep)
+    full_doc_prediction = pred[0]
+    print('{} is from Dante: {}'.format(epistola_name, 'Yes' if full_doc_prediction == 1 else 'No'))
+    if len(pred) > 1:
+        fragment_predictions = pred[1:]
+        print('fragments average {:.3f}, array={}'.format(fragment_predictions.mean(), fragment_predictions))
+    if SVM is SVC and probability:
+        prob = svm.predict_proba(ep)[:,1]
+        np.set_printoptions(precision=2, linewidth=200)
+        print('probabilistic view: full={:.3f}, fragments average {:.3f}, fragments={}'.format(prob[0], prob[1:].mean(), prob[1:]))
+print('Predicting the Epistolas')
+predictEpistola(ep1, 'Epistola 1')
+predictEpistola(ep2, 'Epistola 2')
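The new classifier2.py relies on the convention, established by load_documents in doc_representation2.py below, that the first row of each Epistola matrix is the full document and any further rows are its fragments. The following self-contained sketch is illustrative only and not part of the patch: it uses a small random dataset in place of the real feature matrices, but wires up the same F1-scored grid search and reads the prediction vector under that row convention.

    import numpy as np
    from sklearn.svm import LinearSVC
    from sklearn.model_selection import GridSearchCV
    from sklearn.metrics import f1_score, make_scorer

    # toy stand-ins for Xtr, ytr and one Epistola matrix (row 0 = full document, rows 1+ = fragments)
    rng = np.random.RandomState(0)
    Xtr, ytr = rng.rand(60, 12), rng.randint(0, 2, 60)
    ep = rng.rand(5, 12)

    params = {'C': [0.01, 0.1, 1, 10], 'class_weight': ['balanced', None]}
    svm = GridSearchCV(LinearSVC(), param_grid=params, cv=3, scoring=make_scorer(f1_score))
    svm.fit(Xtr, ytr)

    pred = svm.predict(ep)
    print('full document prediction:', pred[0])
    print('fragment predictions:', pred[1:])
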
diff --git a/src/doc_representation.py b/src/doc_representation.py
deleted file mode 100644
index 9e45ebf..0000000
--- a/src/doc_representation.py
+++ /dev/null
@@ -1,50 +0,0 @@
-import os
-from os.path import join
-from sklearn.feature_extraction.text import TfidfVectorizer
-import numpy as np
-
-def load_documents(path, by_sentences=False):
-    #read documents
-    docs,y = [],[]
-    for file in os.listdir(path):
-        if file.startswith('EpistolaXIII_'): continue
-        file_clean = file.replace('.txt','')
-        author, textname = file_clean.split('_')[0],file_clean.split('_')[1]
-        if by_sentences:
-            lines = open(join(path, file)).readlines()
-            docs.extend(lines)
-            if author == 'Dante':
-                y.extend([1] * len(lines))
-            else:
-                y.extend([0] * len(lines))
-        else:
-            docs.append(open(join(path,file)).read())
-            if author == 'Dante':
-                y.append(1)
-            else:
-                y.append(0)
-
-    if not by_sentences:
-        y = y + y
-        docs = docs + docs
-
-    if by_sentences:
-        ep1 = open(join(path, 'EpistolaXIII_1.txt')).readlines()
-        ep2 = open(join(path, 'EpistolaXIII_2.txt')).readlines()
-    else:
-        ep1 = [open(join(path, 'EpistolaXIII_1.txt' )).read()]
-        ep2 = [open(join(path, 'EpistolaXIII_2.txt')).read()]
-
-    # document representation
-    tfidf = TfidfVectorizer(sublinear_tf=True)
-    X = tfidf.fit_transform(docs)
-    y = np.array(y)
-    Epistola1 = tfidf.transform(ep1)
-    Epistola2 = tfidf.transform(ep2)
-
-    print('documents read, shape={}'.format(X.shape))
-    # print(y)
-
-    return X, y, Epistola1, Epistola2
-
-
diff --git a/src/doc_representation2.py b/src/doc_representation2.py
index 9ac77ca..d194d95 100644
--- a/src/doc_representation2.py
+++ b/src/doc_representation2.py
@@ -2,49 +2,187 @@ import nltk
 import numpy as np
 import os
 from os.path import join
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.feature_selection import SelectKBest
+from sklearn.feature_selection import chi2
+from scipy.sparse import hstack, csr_matrix
 
 function_words = ["et", "in", "de", "ad", "ut", "cum", "non", "per", "a", "que", "ex", "sed"]
 
-def load_documents(path):
-    X, y = [], []
-    i=0;
+
+
+# ------------------------------------------------------------------------
+# document loading routine
+# ------------------------------------------------------------------------
+def _load_texts(path):
+    # load the training data (all documents but Epistolas 1 and 2)
+    documents = []
+    authors = []
+    ndocs=0
     for file in os.listdir(path):
         if file.startswith('EpistolaXIII_'): continue
         file_clean = file.replace('.txt','')
         author, textname = file_clean.split('_')[0],file_clean.split('_')[1]
-        tokens = nltk.word_tokenize(open(join(path,file), encoding= "utf8").read())
-        author_tokens = ([token.lower() for token in tokens
-                          if any(char.isalpha() for char in token)])
-        freqs= nltk.FreqDist(author_tokens)
-        X.append([])
-        #print(f"From {textname} by {author}:")
-        for function_word in function_words:
-            feature= (freqs[function_word]*1000)/len(author_tokens)
-            #print(function_word, " = ", freqs[function_word], ", ", feature)
-            X[i].append(feature)
-        i+=1
-        if author == "Dante":
-            y.append(1)
-        else:
-            y.append(0)
-
-
-    y= y + y
-    X= X + X
-    y= np.array(y)
-
-    ep = []
-    tokens = nltk.word_tokenize(open(join(path, 'EpistolaXIII_2.txt'), encoding= "utf8").read())
-    ep2_tokens = ([token.lower() for token in tokens
-                   if any(char.isalpha() for char in token)])
-    freqs= nltk.FreqDist(ep2_tokens)
-    #print("From Epistola XIII_2:")
-    for function_word in function_words:
-        feature= (freqs[function_word]*1000/len(ep2_tokens))
-        ep.append(feature)
-        #print(function_word, " = ", freqs[function_word], ", ", feature)
-    ep2 = np.array(ep)
+        text = open(join(path,file), encoding= "utf8").read()
 
-    return X, y, ep2
+        documents.append(text)
+        authors.append(author)
+        ndocs+=1
+
+    # load the test data (Epistolas 1 and 2)
+    ep1_text = open(join(path, 'EpistolaXIII_1.txt'), encoding="utf8").read()
+    ep2_text = open(join(path, 'EpistolaXIII_2.txt'), encoding="utf8").read()
+
+    return documents, authors, ep1_text, ep2_text
+
+
+
+# ------------------------------------------------------------------------
+# split policies
+# ------------------------------------------------------------------------
+# TODO: implement other split policies (e.g., overlapping ones, etc)
+def _split_by_endline(text):
+    return [t.strip() for t in text.split('\n') if t.strip()]
+
+
+def splitter(documents, authors=None, split_policy=_split_by_endline):
+    fragments = []
+    authors_fragments = []
+    for i, text in enumerate(documents):
+        text_fragments = split_policy(text)
+        fragments.extend(text_fragments)
+        if authors is not None:
+            authors_fragments.extend([authors[i]] * len(text_fragments))
+
+    if authors is not None:
+        return fragments, authors_fragments
+    return fragments
+
+# ------------------------------------------------------------------------
+# feature extraction methods
+# ------------------------------------------------------------------------
+# TODO: implement other feature extraction methods
+def _features_function_words_freq(documents):
+    """
+    Extract features as the frequency (x1000) of the function words used in the documents
+    :param documents: a list where each element is the text (string) of a document
+    :return: a np.array of shape (D,F) where D is len(documents) and F is len(function_words)
+    """
+    features = []
+    for text in documents:
+        tokens = nltk.word_tokenize(text)
+        author_tokens = ([token.lower() for token in tokens if any(char.isalpha() for char in token)])
+        freqs = nltk.FreqDist(author_tokens)
+
+        nwords = len(author_tokens)
+        funct_words_freq = [1000. * freqs[function_word] / nwords for function_word in function_words]
+
+        features.append(funct_words_freq)
+
+    return np.array(features)
+
+
+def _features_tfidf(documents, tfidf_vectorizer=None):
+    """
+    Extract features as the tfidf matrix extracted from the documents
+    :param documents: a list where each element is the text (string) of a document
+    :return: a tuple M,V, where M is a (sparse) matrix of shape (D,F), with D being len(documents) and F the number of
+    distinct words; and V is the TfidfVectorizer already fit
+    """
+    if tfidf_vectorizer is None:
+        tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True)
+        tfidf_vectorizer.fit(documents)
+
+    features = tfidf_vectorizer.transform(documents)
+
+    return features, tfidf_vectorizer
+
+def _feature_selection(X, y, EP1, EP2, tfidf_feat_selection_ratio):
+    nF = X.shape[1]
+    num_feats = int(tfidf_feat_selection_ratio * nF)
+    feature_selector = SelectKBest(chi2, k=num_feats)
+    X = feature_selector.fit_transform(X, y)
+    EP1 = feature_selector.transform(EP1)
+    EP2 = feature_selector.transform(EP2)
+    return X,EP1,EP2
+
+def load_documents(path,
+                   function_words_freq=True,
+                   tfidf=False,
+                   tfidf_feat_selection_ratio=1.,
+                   split_documents=False,
+                   split_policy = _split_by_endline,
+                   verbose=True):
+    """
+    Loads the documents contained in path applying a number of feature extraction policies. The directory is assumed to
+    contain files named according to <author>_<textname>.txt plus two special files EpistolaXIII_1.txt and
+    EpistolaXIII_2.txt concerning the two documents whose authorship attribution is to be determined.
+    :param path: the path containing the texts, each named as <author>_<textname>.txt
+    :param function_words_freq: add the frequency of function words as features
+    :param tfidf: add the tfidf as features
+    :param split_documents: whether to split texts into smaller fragments or not (currently, the policy is to split
+    by '\n'). The fragments resulting from the split are added to the pool of documents (i.e., they do not replace the
+    full documents, which are anyway retained).
+    :param split_policy: a callable that implements the split to be applied (ignored if split_documents=False)
+    :param verbose: show information by stdout or not
+    :return: X, y, EP1, EP2 (np.arrays, or csr_matrix if tfidf is activated), where X is the feature matrix of the
+    training set and y its labels (np.array); EP1 and EP2 are the feature matrices of Epistola 1 and Epistola 2,
+    respectively, whose first row corresponds to the full document and, if split_documents=True, whose remaining
+    rows correspond to its fragments
+    """
+
+    documents, authors, ep1_text, ep2_text = _load_texts(path)
+    ep1,ep2 = [ep1_text],[ep2_text]
+    n_original_docs=len(documents)
+
+    if split_documents:
+        doc_fragments, authors_fragments = splitter(documents, authors, split_policy=split_policy)
+        documents.extend(doc_fragments)
+        authors.extend(authors_fragments)
+
+        ep1.extend(splitter(ep1, split_policy=split_policy))
+        ep2.extend(splitter(ep2, split_policy=split_policy))
+
+
+    # represent the target vector
+    y = np.array([(1 if author == "Dante" else 0) for author in authors])
+
+    # initialize the document-by-feature vector
+    X = np.empty((len(documents), 0))
+    EP1 = np.empty((len(ep1), 0))
+    EP2 = np.empty((len(ep2), 0))
+
+    if function_words_freq:
+        X = np.hstack((X,_features_function_words_freq(documents)))
+        EP1 = np.hstack((EP1, _features_function_words_freq(ep1)))
+        EP2 = np.hstack((EP2, _features_function_words_freq(ep2)))
+
+    if tfidf:
+        X_features, vectorizer = _features_tfidf(documents)
+        ep1_features, _ = _features_tfidf(ep1, vectorizer)
+        ep2_features, _ = _features_tfidf(ep2, vectorizer)
+
+        if tfidf_feat_selection_ratio < 1.:
+            if verbose: print('feature selection')
+            X_features, ep1_features, ep2_features = \
+                _feature_selection(X_features, y, ep1_features, ep2_features, tfidf_feat_selection_ratio)
+
+        # matrix is sparse now
+        X = hstack((csr_matrix(X), X_features))
+        EP1 = hstack((csr_matrix(EP1), ep1_features))
+        EP2 = hstack((csr_matrix(EP2), ep2_features))
+
+
+    # print summary
+    if verbose:
+        print('load_documents: function_words_freq={} tfidf={}, split_documents={}, split_policy={}'
+              .format(function_words_freq, tfidf, split_documents, split_policy.__name__))
+        print('number of training (full) documents: {}'.format(n_original_docs))
+        print('X shape (#documents,#features): {}'.format(X.shape))
+        print('y prevalence: {:.2f}%'.format(y.mean()*100))
+        print('Epistola 1 shape:', EP1.shape)
+        print('Epistola 2 shape:', EP2.shape)
+        print()
+
+    return X, y, EP1, EP2
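The split_policy hook above accepts any callable that maps a text to a list of fragments, and the TODO notes that overlapping policies are still to be implemented. As a sketch of that extension point (not part of the patch; the window and stride values are arbitrary assumptions), an overlapping split compatible with splitter and load_documents(split_policy=...) could look like this:

    # hypothetical split policy: groups consecutive lines into overlapping windows
    def _split_by_overlapping_windows(text, window=3, stride=2):
        lines = [t.strip() for t in text.split('\n') if t.strip()]
        if len(lines) <= window:
            return [' '.join(lines)] if lines else []
        return [' '.join(lines[i:i + window]) for i in range(0, len(lines) - window + 1, stride)]

    # usage (assuming the corpus layout expected by load_documents):
    # X, y, EP1, EP2 = load_documents('../testi', split_documents=True, tfidf=True,
    #                                 split_policy=_split_by_overlapping_windows)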