From 04f0eb17edf17f58e99f8047ffb42adb8325a570 Mon Sep 17 00:00:00 2001 From: Alex Moreo Date: Fri, 27 Nov 2020 21:04:00 +0100 Subject: [PATCH] final refinements --- src/author_identification_loo.py | 2 +- src/author_identification_unknown.py | 4 ++-- src/experiments.sh | 21 ++++++++++++++++----- src/helpers.py | 2 +- src/model.py | 25 +++---------------------- src/settings.py | 5 +---- 6 files changed, 24 insertions(+), 35 deletions(-) diff --git a/src/author_identification_loo.py b/src/author_identification_loo.py index 3349946..dcc0009 100755 --- a/src/author_identification_loo.py +++ b/src/author_identification_loo.py @@ -37,7 +37,7 @@ def main(): n_full_docs = len(positive) + len(negative) print(f'read {n_full_docs} documents from {path}') - feature_extractor = FeatureExtractor(**settings.config_loo) + feature_extractor = FeatureExtractor(**settings.config_feature_extraction) Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative) frange_chgrams = feature_extractor.feature_range['_cngrams_task'] diff --git a/src/author_identification_unknown.py b/src/author_identification_unknown.py index 5d7b25c..7c957d8 100755 --- a/src/author_identification_unknown.py +++ b/src/author_identification_unknown.py @@ -38,8 +38,8 @@ def main(): n_full_docs = len(positive) + len(negative) print(f'read {n_full_docs} documents from {path}') - settings.config_unk['feature_selection_ratio'] = args.featsel - feature_extractor = FeatureExtractor(**settings.config_unk) + settings.config_feature_extraction['feature_selection_ratio'] = args.featsel + feature_extractor = FeatureExtractor(**settings.config_feature_extraction) Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative) frange_chgrams = feature_extractor.feature_range['_cngrams_task'] diff --git a/src/experiments.sh b/src/experiments.sh index ddee056..5f577ec 100755 --- a/src/experiments.sh +++ b/src/experiments.sh @@ -9,11 +9,22 @@ if [ ! -d $corpus ]; then rm ../MedLatin.zip fi -PY="python3 author_identification_loo.py" +PYLOO="python3 author_identification_loo.py" +PYUNK="python3 author_identification_unknown.py" + MedLatin1="../MedLatin/Corpora/MedLatin1" MedLatin2="../MedLatin/Corpora/MedLatin2" -EP1="../MedLatin/Epistle/EpistolaXIII_1.txt" -EP2="../MedLatin/Epistle/EpistolaXIII_2.txt" -$PY $MedLatin1 ALL --log ./resultsLoo_EP1.txt -$PY $MedLatin2 ALL --log ./resultsLoo_EP2.txt +EPXIII1="../MedLatin/Epistle/EpistolaXIII_1.txt" +EPXIII2="../MedLatin/Epistle/EpistolaXIII_2.txt" +EPXIV="../Epistola_ArigoVII.txt" + +for learner in lr svm mnb ; do + $PYLOO $MedLatin1 ALL --learner $learner --log ../results/resultsLOO_EP1_$learner.txt + $PYLOO $MedLatin2 ALL --learner $learner --log ../results/resultsLOO_EP2_$learner.txt + + $PYUNK $MedLatin1 Dante $EPXIII1 --learner $learner --log ../results/resultsUNK_EP13_1_$learner.txt + $PYUNK $MedLatin2 Dante $EPXIII2 --learner $learner --log ../results/resultsUNK_EP13_2_$learner.txt + $PYUNK $MedLatin1 Dante $EPXIV --learner $learner --log ../results/resultsUNK_EP14_$learner.txt +done + diff --git a/src/helpers.py b/src/helpers.py index 9ce7dd0..a18e7c6 100644 --- a/src/helpers.py +++ b/src/helpers.py @@ -54,9 +54,9 @@ def check_log_loo(args): def check_log_unknown(args): + args.unknown_name = pathlib.Path(args.unknown).name if args.log is None: os.makedirs('../results', exist_ok=True) assert os.path.exists(args.unknown), f'file {args.unknown} does not exist' - args.unknown_name = pathlib.Path(args.unknown).name args.log = f'../results/Unknown{args.unknown_name}_Corpus{args.corpus_name}.Author{args.positive}.' \ f'fs{args.featsel}.classweight{str(args.class_weight)}.CLS{args.learner}.txt' \ No newline at end of file diff --git a/src/model.py b/src/model.py index 2f0a125..e81838a 100755 --- a/src/model.py +++ b/src/model.py @@ -9,7 +9,6 @@ from util.evaluation import f1_metric from typing import List, Union - class AuthorshipVerificator(BaseEstimator): def __init__(self, nfolds=10, param_grid=None, learner=None, C=1., alpha=0.001, class_weight='balanced', @@ -24,10 +23,7 @@ class AuthorshipVerificator(BaseEstimator): self.feat_selection_slices = feat_selection_slices self.feat_selection_ratio = feat_selection_ratio - def fit(self, X, y, groups=None, hyperparam_optimization=True): - if self.param_grid is None and hyperparam_optimization: - raise ValueError('Param grid is None, but hyperparameter optimization is requested') - + def fit(self, X, y, groups=None): if self.feat_selection_slices is not None: self.fs = MultiRangeFeatureSelector(self.feat_selection_slices, feat_sel=self.feat_selection_ratio) X = self.fs.fit(X, y).transform(X) @@ -37,7 +33,7 @@ class AuthorshipVerificator(BaseEstimator): C=self.C, class_weight=self.class_weight, max_iter=1000, random_state=self.random_seed, solver='lbfgs' ) elif self.learner == 'svm': - self.classifier = LinearSVC(C=self.C, class_weight=self.class_weight) + self.classifier = LinearSVC(C=self.C, class_weight=self.class_weight, max_iter=2500, random_state=self.random_seed) elif self.learner == 'mnb': self.classifier = MultinomialNB(alpha=self.alpha) @@ -47,7 +43,7 @@ class AuthorshipVerificator(BaseEstimator): if groups is None: groups = np.arange(len(y)) - if hyperparam_optimization and (positive_examples >= self.nfolds) and (len(np.unique(groups[y==1])) > 1): + if (positive_examples >= self.nfolds) and (len(np.unique(groups[y==1])) > 1): folds = list(GroupKFold(n_splits=self.nfolds).split(X, y, groups)) self.estimator = GridSearchCV( self.classifier, param_grid=self.param_grid, cv=folds, scoring=make_scorer(f1_metric), n_jobs=-1, @@ -135,18 +131,3 @@ class MultiRangeFeatureSelector(BaseEstimator, TransformerMixin): def __sort_ranges(self, ranges: List[slice]): return np.asarray(ranges)[np.argsort([r.start for r in ranges])[::-1]] - -def get_valid_folds(nfolds, X, y, groups, max_trials=100): - trials = 0 - folds = list(GroupKFold(n_splits=nfolds).split(X, y, groups)) - n_docs = len(y) - print(f'different classes={np.unique(y)}; #different documents={len(np.unique(groups))} positives={len(np.unique(groups[y==1]))}') - while any(len(np.unique(y[train])) < 2 for train, test in folds): - shuffle_index = np.random.permutation(n_docs) - X, y, groups = X[shuffle_index], y[shuffle_index], groups[shuffle_index] - folds = list(GroupKFold(n_splits=nfolds).split(X, y, groups)) - print(f'\ttrial{trials}:{[len(np.unique(y[train])) for train, test in folds]}') - trials+=1 - if trials>max_trials: - raise ValueError(f'could not meet condition after {max_trials} trials') - return folds diff --git a/src/settings.py b/src/settings.py index 3ca4fc4..f000cb7 100644 --- a/src/settings.py +++ b/src/settings.py @@ -46,7 +46,7 @@ param_grid = { 'mnb': {'alpha': np.logspace(-7,-1,7)} } -config_loo = { +config_feature_extraction = { 'function_words_freq': 'latin', 'conjugations_freq': 'latin', 'features_Mendenhall': True, @@ -62,6 +62,3 @@ config_loo = { 'window_size': 3, 'normalize_features': True } - -config_unk = config_loo.copy() -config_unk['feature_selection_ratio']=0.1 \ No newline at end of file