enron mail

commit 8ab808282a (parent 0be3e5547e)
@@ -2,16 +2,24 @@ from abc import ABC, abstractmethod
 import random
 import numpy as np
 from collections import Counter
+import os
+import pickle


 class LabelledCorpus:

     def __init__(self, documents, labels):
-        if not isinstance(documents, np.ndarray): documents = np.asarray(documents)
+        if not isinstance(documents, np.ndarray): documents = np.asarray(documents, dtype=str)
         if not isinstance(labels, np.ndarray): labels = np.asarray(labels)
         self.data = documents
         self.target = labels

+    def _tolist(self):
+        self.data = self.data.tolist()
+
+    def _toarray(self):
+        self.data = np.asarray(self.data, dtype=str)
+
     def __len__(self):
         return len(self.data)
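The new _tolist()/_toarray() helpers let the corpus be converted to a plain Python list right before pickling and back to a numpy string array after loading. A minimal sketch of the round-trip (assumed usage, not part of the commit; these hunks appear to belong to data/AuthorshipDataset.py, judging by the imports added to src/main.py further down):

    corpus = LabelledCorpus(['to be or not to be', 'call me Ishmael'], ['shakespeare', 'melville'])
    print(type(corpus.data))   # numpy.ndarray of unicode strings
    corpus._tolist()           # self.data becomes a plain list, friendlier to pickle
    corpus._toarray()          # back to a numpy str array
    print(len(corpus))         # 2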
@@ -27,8 +35,28 @@ class LabelledCorpus:


 class AuthorshipDataset(ABC):

-    def __init__(self, data_path, n_authors=-1, docs_by_author=-1, n_open_set_authors = 0, random_state=42):
+    @classmethod
+    def load(cls, loader, pickle_path=None, **kwargs):
+        #assert isinstance(loader, AuthorshipDataset), 'unknown loader'
+        if pickle_path and os.path.exists(pickle_path):
+            print(f'loading dataset image in {pickle_path}')
+            dataset = pickle.load(open(pickle_path, 'rb'))
+            dataset.train._toarray()
+            dataset.test._toarray()
+        else:
+            dataset = loader(**kwargs)
+            if pickle_path:
+                print(f'dumping dataset in {pickle_path} for faster load')
+                dataset.train._tolist()
+                dataset.test._tolist()
+                pickle.dump(dataset, open(pickle_path, 'wb'), pickle.HIGHEST_PROTOCOL)
+                dataset.train._toarray()
+                dataset.test._toarray()
+        return dataset
+
+    def __init__(self, data_path, n_authors=-1, docs_by_author=-1, n_open_set_authors=0, random_state=42):
         self.data_path = data_path
+        self.n_authors = n_authors
         random.seed(random_state)
         np.random.seed(random_state)
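AuthorshipDataset.load() acts as a small caching factory: if a pickle image already exists it is loaded and the corpora are restored to numpy arrays; otherwise the loader class is instantiated and, when a pickle_path is given, dumped for faster subsequent runs. A hedged usage sketch, assuming the EnronMail loader added below and a writable ../pickles directory:

    dataset = AuthorshipDataset.load(
        EnronMail,                                        # any AuthorshipDataset subclass
        pickle_path='../pickles/EnronMail50_1.pickle',    # cache image; pass None to skip caching
        mail_dir='../data/enron_mail_20150507/maildir/*',
        n_authors=50,
        docs_by_author=-1,
        random_state=1,
    )
    print(len(dataset.train), len(dataset.test))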
@@ -45,17 +73,14 @@ class AuthorshipDataset(ABC):

         super().__init__()


     @abstractmethod
     def _fetch_and_split(self):
         pass


     @abstractmethod
     def _check_n_authors(self, n_authors, n_open_set_authors):
         pass


     def _reduce_authors_documents(self, n_authors, n_docs_by_author, n_open_set_authors):

         if n_authors != -1 or n_docs_by_author != -1:
@@ -88,7 +113,6 @@ class AuthorshipDataset(ABC):
         else:
             self.test_out = None


     # reindex labels so that the unique labels are equal to range(#num_different_authors)
     # and unique training labels are range(#num_different_training_authors)
     def _remove_label_gaps(self):
@@ -131,11 +155,13 @@ class AuthorshipDataset(ABC):
             return

         author_doc_count = Counter(self.train.target)
-        to_remove = frozenset([id for id,count in author_doc_count.most_common() if count<docs_by_author])
+        to_remove = frozenset([id for id, count in author_doc_count.most_common() if count < docs_by_author])
         assert len(to_remove) < len(author_doc_count), 'impossible selection'
-        if len(to_remove)>0:
+        if len(to_remove) > 0:
             self.train = LabelledCorpus.filter(self.train, to_remove)
             self.test = LabelledCorpus.filter(self.test, to_remove)
             self.target_names = sorted(set(self.target_names) - to_remove)
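The tidied-up selection above drops every author that has fewer than docs_by_author training documents, then filters both corpora and the target names accordingly. A toy illustration of the Counter-based cut-off (hypothetical labels, not from the commit):

    from collections import Counter
    train_labels = ['a', 'a', 'a', 'b', 'c', 'c']
    docs_by_author = 2
    counts = Counter(train_labels)
    to_remove = frozenset(author for author, n in counts.most_common() if n < docs_by_author)
    print(to_remove)   # frozenset({'b'}): only 'b' falls below 2 documents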
@ -0,0 +1,107 @@
|
||||||
|
import eml_parser
|
||||||
|
from glob import glob
|
||||||
|
from sklearn.model_selection import train_test_split
|
||||||
|
from tqdm import tqdm
|
||||||
|
from data.AuthorshipDataset import AuthorshipDataset, LabelledCorpus
|
||||||
|
import numpy as np
|
||||||
|
from joblib import Parallel, delayed
|
||||||
|
from collections import Counter
|
||||||
|
|
||||||
|
|
||||||
|
class EnronMail(AuthorshipDataset):
|
||||||
|
|
||||||
|
NUM_AUTHORS = 150
|
||||||
|
MAX_MAIL_LENGHT = 5000 # in words
|
||||||
|
TEST_SIZE = 0.1
|
||||||
|
MIN_TOKENS = 10
|
||||||
|
|
||||||
|
def __init__(self, mail_dir='../data/enron_mail_20150507/maildir/*', n_authors=-1, docs_by_author=-1, n_open_set_authors=0, random_state=42):
|
||||||
|
self.mail_dir = mail_dir
|
||||||
|
super().__init__(mail_dir, n_authors, docs_by_author, n_open_set_authors, random_state)
|
||||||
|
|
||||||
|
def filter(self, base_str, filter_str):
|
||||||
|
if filter_str in base_str:
|
||||||
|
idx = base_str.index(filter_str)
|
||||||
|
base_str = base_str[:idx]
|
||||||
|
return base_str
|
||||||
|
|
||||||
|
def _fetch_and_split(self):
|
||||||
|
labels = []
|
||||||
|
data = []
|
||||||
|
|
||||||
|
path_list = self._get_most_prolific_authors(self.n_authors)
|
||||||
|
emails_authors = Parallel(n_jobs=-1)(
|
||||||
|
delayed(_fetch_emails_from_author)(author_path, EnronMail.MIN_TOKENS) for author_path in path_list
|
||||||
|
)
|
||||||
|
for emails, author in emails_authors:
|
||||||
|
data.extend(emails)
|
||||||
|
labels.extend([author]*len(emails))
|
||||||
|
|
||||||
|
target_names = sorted(np.unique(labels))
|
||||||
|
|
||||||
|
train_data, test_data, train_labels, test_labels = \
|
||||||
|
train_test_split(data, labels, test_size=EnronMail.TEST_SIZE, stratify=labels)
|
||||||
|
|
||||||
|
return LabelledCorpus(train_data, train_labels), LabelledCorpus(test_data, test_labels), target_names
|
||||||
|
|
||||||
|
def _check_n_authors(self, n_authors, n_open_set_authors):
|
||||||
|
pass
|
||||||
|
|
||||||
|
def _get_most_prolific_authors(self, n):
|
||||||
|
assert n <= EnronMail.NUM_AUTHORS, f'too many authors requested (maximum is {EnronMail.NUM_AUTHORS})'
|
||||||
|
author_paths = glob(self.mail_dir)
|
||||||
|
if n == -1:
|
||||||
|
return author_paths
|
||||||
|
author_count = Counter(
|
||||||
|
{author_path :
|
||||||
|
len(glob(f'{author_path}/sent/*.')) +
|
||||||
|
len(glob(f'{author_path}/sent_items/*.'))
|
||||||
|
for author_path in author_paths
|
||||||
|
})
|
||||||
|
return [path for path, count in author_count.most_common(n)]
|
||||||
|
|
||||||
|
|
||||||
|
def _fetch_emails_from_author(author_path, min_tokens):
|
||||||
|
subject_filters = ['fw:', 'fwd:', 're:']
|
||||||
|
body_filters = ['-----Original Message-----', '----- Forward', 'cc:', 'To:', 'to:', 'From:', 'from:']
|
||||||
|
|
||||||
|
parsed_mails = 0
|
||||||
|
author_mails = []
|
||||||
|
author_docs = 0
|
||||||
|
author_name = author_path[author_path.rindex('/') + 1:]
|
||||||
|
author_bar = tqdm(list(glob(f'{author_path}/sent/*.')) + list(glob(f'{author_path}/sent_items/*.')))
|
||||||
|
errors, trimmed = 0, 0
|
||||||
|
for email in author_bar:
|
||||||
|
author_bar.set_description(f'parsing for {author_path} errors={errors} trimmed={trimmed}')
|
||||||
|
raw_email = open(email, 'rb').read()
|
||||||
|
try:
|
||||||
|
parsed_mail = eml_parser.eml_parser.decode_email_b(raw_email, include_raw_body=True)
|
||||||
|
# subject = parsed_mail['header']['subject']
|
||||||
|
body = parsed_mail['body'][0]['content']
|
||||||
|
|
||||||
|
# for filter in subject_filters:
|
||||||
|
# if filter in subject.lower():
|
||||||
|
# continue
|
||||||
|
|
||||||
|
# for filter in body_filters:
|
||||||
|
# body = self.filter(body, filter)
|
||||||
|
|
||||||
|
# body = subject+'\n'+body
|
||||||
|
body_tokens = body.split()
|
||||||
|
ntokens = len(body_tokens)
|
||||||
|
if ntokens >= min_tokens:
|
||||||
|
if ntokens > EnronMail.MAX_MAIL_LENGHT:
|
||||||
|
trimmed += 1
|
||||||
|
body = ' '.join(body_tokens[:EnronMail.MAX_MAIL_LENGHT])
|
||||||
|
author_mails.append(body)
|
||||||
|
author_docs += 1
|
||||||
|
# if n_docs_by_author != -1 and author_docs >= n_docs_by_author:
|
||||||
|
# add_author = True
|
||||||
|
# break
|
||||||
|
|
||||||
|
parsed_mails += 1
|
||||||
|
except Exception:
|
||||||
|
errors += 1
|
||||||
|
print(f'ERROR in file {email}')
|
||||||
|
|
||||||
|
return author_mails, author_name
|
|
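The new loader (presumably data/fetch_enron_mail.py, given the import added to src/main.py below) walks each author's sent/ and sent_items/ folders, parses the raw messages with eml_parser in parallel, keeps bodies with at least MIN_TOKENS words, trims anything longer than MAX_MAIL_LENGHT, and returns a stratified train/test split. A minimal standalone sketch, assuming the Enron maildir has been downloaded to the default location:

    enron = EnronMail(mail_dir='../data/enron_mail_20150507/maildir/*', n_authors=10)
    print(f'{len(enron.train)} training / {len(enron.test)} test mails from {len(enron.target_names)} authors')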
@@ -5,7 +5,7 @@ def evaluation(y_true, y_pred):
     acc = accuracy_score(y_true, y_pred)
     macrof1 = f1_score(y_true, y_pred, average='macro')
     microf1 = f1_score(y_true, y_pred, average='micro')
-    print(f'acc={acc * 100:.2f}%')
-    print(f'macro-f1={macrof1:.2f}')
-    print(f'micro-f1={microf1:.2f}')
+    print(f'acc={acc * 100:.4f}%')
+    print(f'macro-f1={macrof1:.4f}')
+    print(f'micro-f1={microf1:.4f}')
     return acc, macrof1, microf1
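The only change in this hunk is the precision of the printed scores (two decimals to four). For reference, a tiny hedged example of the metrics the function reports:

    from sklearn.metrics import accuracy_score, f1_score
    y_true = [0, 0, 1, 1, 2]
    y_pred = [0, 0, 1, 0, 2]
    print(f'acc={accuracy_score(y_true, y_pred) * 100:.4f}%')           # 80.0000%
    print(f'macro-f1={f1_score(y_true, y_pred, average="macro"):.4f}')  # unweighted mean of per-class F1
    print(f'micro-f1={f1_score(y_true, y_pred, average="micro"):.4f}')  # equals accuracy for single-label data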
src/main.py (15 changed lines)
@@ -1,6 +1,7 @@
 import numpy as np
+from data.AuthorshipDataset import AuthorshipDataset
 from data.fetch_imdb62 import Imdb62
+from data.fetch_enron_mail import EnronMail
 from index import Index
 from model.classifiers import AuthorshipAttributionClassifier, SameAuthorClassifier, FullAuthorClassifier
 from data.fetch_victorian import Victorian

@@ -17,8 +18,9 @@ pad_length=3000
 batch_size=50
 n_epochs=256
 bigrams=False
-n_authors=-1
+n_authors=50
 docs_by_author=-1
+seed=1

 debug=False
 if debug:

@@ -28,8 +30,8 @@ if debug:
     pad_length=100
     batch_size=10
     n_epochs=20
-    n_authors = 5
-    docs_by_author = 10
+    n_authors = 50
+    docs_by_author = -1

 if torch.cuda.is_available():
     device = torch.device('cuda')

@@ -37,12 +39,15 @@ else:
     device = torch.device('cpu')
 print(f'running on {device}')

+dataset = AuthorshipDataset.load(EnronMail, pickle_path=f'../pickles/EnronMail{n_authors}_{seed}.pickle', mail_dir='../../authorship_analysis/data/enron_mail_20150507/maildir/*', n_authors=n_authors, docs_by_author=-1, random_state=seed)
 #dataset = Victorian(data_path='../../authorship_analysis/data/victoria', n_authors=5, docs_by_author=25)
-dataset = Imdb62(data_path='../../authorship_analysis/data/imdb62/imdb62.txt', n_authors=n_authors, docs_by_author=docs_by_author)
+#dataset = Imdb62(data_path='../../authorship_analysis/data/imdb62/imdb62.txt', n_authors=n_authors, docs_by_author=docs_by_author)
 Xtr, ytr = dataset.train.data, dataset.train.target
 Xte, yte = dataset.test.data, dataset.test.target
 A = np.unique(ytr)
 print(f'num authors={len(A)}')
+print(f'ntr = {len(Xtr)} nte = {len(Xte)}')
+#sys.exit(0)

 index = Index(analyzer='char', ngram_range=(2,2) if bigrams else (1,1))
 Xtr = index.fit_transform(Xtr)
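src/main.py now loads the Enron dataset through the cached AuthorshipDataset.load path and feeds it to a character-level Index (unigrams here, bigrams when bigrams=True) before padding to pad_length. The Index class itself is not part of this diff; the following is only a guessed sketch of what a character-to-integer index with padding might look like, not the project's actual implementation:

    def char_index(texts, pad_length):
        # hypothetical helper: map characters to integer ids, pad/truncate to a fixed length (0 = padding)
        vocab = {ch: i + 1 for i, ch in enumerate(sorted({c for t in texts for c in t}))}
        return [[vocab[c] for c in t[:pad_length]] + [0] * max(0, pad_length - len(t)) for t in texts], vocab

    ids, vocab = char_index(['dear john', 'hi mark'], pad_length=12)
    print(ids[0])   # [3, 4, 2, 12, 1, 7, 11, 5, 10, 0, 0, 0]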