From 04f0eb17edf17f58e99f8047ffb42adb8325a570 Mon Sep 17 00:00:00 2001
From: Alex Moreo <alejandro.moreo@isti.cnr.it>
Date: Fri, 27 Nov 2020 21:04:00 +0100
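Subject: [PATCH] final refinements: unify feature-extraction config, run all learners

* settings: replace config_loo and config_unk with a single
  config_feature_extraction dict; author_identification_loo.py and
  author_identification_unknown.py both build their FeatureExtractor from it
  (the latter still sets 'feature_selection_ratio' from args.featsel).
* experiments.sh: loop over the learners lr, svm and mnb, running
  author_identification_loo.py on MedLatin1/MedLatin2 and
  author_identification_unknown.py on the EpistolaXIII_1, EpistolaXIII_2 and
  Epistola_ArigoVII texts, logging each run under ../results/.
* helpers.check_log_unknown: set args.unknown_name unconditionally instead of
  only when no --log path is given.
* model.AuthorshipVerificator.fit: drop the hyperparam_optimization argument
  and its param_grid check; LinearSVC is now created with max_iter=2500 and
  random_state=self.random_seed.
* model: remove the get_valid_folds helper.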

---
 src/author_identification_loo.py     |  2 +-
 src/author_identification_unknown.py |  4 ++--
 src/experiments.sh                   | 21 ++++++++++++++++-----
 src/helpers.py                       |  2 +-
 src/model.py                         | 25 +++----------------------
 src/settings.py                      |  5 +----
 6 files changed, 24 insertions(+), 35 deletions(-)

diff --git a/src/author_identification_loo.py b/src/author_identification_loo.py
index 3349946..dcc0009 100755
--- a/src/author_identification_loo.py
+++ b/src/author_identification_loo.py
@@ -37,7 +37,7 @@ def main():
             n_full_docs = len(positive) + len(negative)
             print(f'read {n_full_docs} documents from {path}')
 
-            feature_extractor = FeatureExtractor(**settings.config_loo)
+            feature_extractor = FeatureExtractor(**settings.config_feature_extraction)
 
             Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative)
             frange_chgrams = feature_extractor.feature_range['_cngrams_task']
diff --git a/src/author_identification_unknown.py b/src/author_identification_unknown.py
index 5d7b25c..7c957d8 100755
--- a/src/author_identification_unknown.py
+++ b/src/author_identification_unknown.py
@@ -38,8 +38,8 @@ def main():
             n_full_docs = len(positive) + len(negative)
             print(f'read {n_full_docs} documents from {path}')
 
-            settings.config_unk['feature_selection_ratio'] = args.featsel
-            feature_extractor = FeatureExtractor(**settings.config_unk)
+            settings.config_feature_extraction['feature_selection_ratio'] = args.featsel
+            feature_extractor = FeatureExtractor(**settings.config_feature_extraction)
 
             Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative)
             frange_chgrams = feature_extractor.feature_range['_cngrams_task']
diff --git a/src/experiments.sh b/src/experiments.sh
index ddee056..5f577ec 100755
--- a/src/experiments.sh
+++ b/src/experiments.sh
@@ -9,11 +9,22 @@ if [ ! -d $corpus ]; then
   rm ../MedLatin.zip
 fi
 
-PY="python3 author_identification_loo.py"
+PYLOO="python3 author_identification_loo.py"
+PYUNK="python3 author_identification_unknown.py"
+
 MedLatin1="../MedLatin/Corpora/MedLatin1"
 MedLatin2="../MedLatin/Corpora/MedLatin2"
-EP1="../MedLatin/Epistle/EpistolaXIII_1.txt"
-EP2="../MedLatin/Epistle/EpistolaXIII_2.txt"
 
-$PY $MedLatin1 ALL --log ./resultsLoo_EP1.txt
-$PY $MedLatin2 ALL --log ./resultsLoo_EP2.txt
+EPXIII1="../MedLatin/Epistle/EpistolaXIII_1.txt"
+EPXIII2="../MedLatin/Epistle/EpistolaXIII_2.txt"
+EPXIV="../Epistola_ArigoVII.txt"
+
+for learner in lr svm mnb ; do
+  $PYLOO $MedLatin1 ALL --learner $learner --log ../results/resultsLOO_EP1_$learner.txt
+  $PYLOO $MedLatin2 ALL --learner $learner --log ../results/resultsLOO_EP2_$learner.txt
+
+  $PYUNK $MedLatin1 Dante $EPXIII1 --learner $learner --log ../results/resultsUNK_EP13_1_$learner.txt
+  $PYUNK $MedLatin2 Dante $EPXIII2 --learner $learner --log ../results/resultsUNK_EP13_2_$learner.txt
+  $PYUNK $MedLatin1 Dante $EPXIV --learner $learner --log ../results/resultsUNK_EP14_$learner.txt
+done
+
diff --git a/src/helpers.py b/src/helpers.py
index 9ce7dd0..a18e7c6 100644
--- a/src/helpers.py
+++ b/src/helpers.py
@@ -54,9 +54,9 @@ def check_log_loo(args):
 
 
 def check_log_unknown(args):
+    args.unknown_name = pathlib.Path(args.unknown).name
     if args.log is None:
         os.makedirs('../results', exist_ok=True)
         assert os.path.exists(args.unknown), f'file {args.unknown} does not exist'
-        args.unknown_name = pathlib.Path(args.unknown).name
         args.log = f'../results/Unknown{args.unknown_name}_Corpus{args.corpus_name}.Author{args.positive}.' \
                    f'fs{args.featsel}.classweight{str(args.class_weight)}.CLS{args.learner}.txt'
\ No newline at end of file
diff --git a/src/model.py b/src/model.py
index 2f0a125..e81838a 100755
--- a/src/model.py
+++ b/src/model.py
@@ -9,7 +9,6 @@ from util.evaluation import f1_metric
 from typing import List, Union
 
 
-
 class AuthorshipVerificator(BaseEstimator):
 
     def __init__(self, nfolds=10, param_grid=None, learner=None, C=1., alpha=0.001, class_weight='balanced',
@@ -24,10 +23,7 @@ class AuthorshipVerificator(BaseEstimator):
         self.feat_selection_slices = feat_selection_slices
         self.feat_selection_ratio = feat_selection_ratio
 
-    def fit(self, X, y, groups=None, hyperparam_optimization=True):
-        if self.param_grid is None and hyperparam_optimization:
-            raise ValueError('Param grid is None, but hyperparameter optimization is requested')
-
+    def fit(self, X, y, groups=None):
         if self.feat_selection_slices is not None:
             self.fs = MultiRangeFeatureSelector(self.feat_selection_slices, feat_sel=self.feat_selection_ratio)
             X = self.fs.fit(X, y).transform(X)
@@ -37,7 +33,7 @@ class AuthorshipVerificator(BaseEstimator):
                 C=self.C, class_weight=self.class_weight, max_iter=1000, random_state=self.random_seed, solver='lbfgs'
             )
         elif self.learner == 'svm':
-            self.classifier = LinearSVC(C=self.C, class_weight=self.class_weight)
+            self.classifier = LinearSVC(C=self.C, class_weight=self.class_weight, max_iter=2500, random_state=self.random_seed)
         elif self.learner == 'mnb':
             self.classifier = MultinomialNB(alpha=self.alpha)
 
@@ -47,7 +43,7 @@ class AuthorshipVerificator(BaseEstimator):
         if groups is None:
             groups = np.arange(len(y))
 
-        if hyperparam_optimization and (positive_examples >= self.nfolds) and (len(np.unique(groups[y==1])) > 1):
+        if (positive_examples >= self.nfolds) and (len(np.unique(groups[y==1])) > 1):
             folds = list(GroupKFold(n_splits=self.nfolds).split(X, y, groups))
             self.estimator = GridSearchCV(
                 self.classifier, param_grid=self.param_grid, cv=folds, scoring=make_scorer(f1_metric), n_jobs=-1,
@@ -135,18 +131,3 @@ class MultiRangeFeatureSelector(BaseEstimator, TransformerMixin):
     def __sort_ranges(self, ranges: List[slice]):
         return np.asarray(ranges)[np.argsort([r.start for r in ranges])[::-1]]
 
-
-def get_valid_folds(nfolds, X, y, groups, max_trials=100):
-    trials = 0
-    folds = list(GroupKFold(n_splits=nfolds).split(X, y, groups))
-    n_docs = len(y)
-    print(f'different classes={np.unique(y)}; #different documents={len(np.unique(groups))} positives={len(np.unique(groups[y==1]))}')
-    while any(len(np.unique(y[train])) < 2 for train, test in folds):
-        shuffle_index = np.random.permutation(n_docs)
-        X, y, groups = X[shuffle_index], y[shuffle_index], groups[shuffle_index]
-        folds = list(GroupKFold(n_splits=nfolds).split(X, y, groups))
-        print(f'\ttrial{trials}:{[len(np.unique(y[train])) for train, test in folds]}')
-        trials+=1
-        if trials>max_trials:
-            raise ValueError(f'could not meet condition after {max_trials} trials')
-    return folds
diff --git a/src/settings.py b/src/settings.py
index 3ca4fc4..f000cb7 100644
--- a/src/settings.py
+++ b/src/settings.py
@@ -46,7 +46,7 @@ param_grid = {
     'mnb': {'alpha': np.logspace(-7,-1,7)}
 }
 
-config_loo = {
+config_feature_extraction = {
     'function_words_freq': 'latin',
     'conjugations_freq': 'latin',
     'features_Mendenhall': True,
@@ -62,6 +62,3 @@ config_loo = {
     'window_size': 3,
     'normalize_features': True
 }
-
-config_unk = config_loo.copy()
-config_unk['feature_selection_ratio']=0.1
\ No newline at end of file