enron mail

Alejandro Moreo Fernandez 2020-05-07 14:03:47 +02:00
parent 0be3e5547e
commit 8ab808282a
4 changed files with 154 additions and 16 deletions

data/AuthorshipDataset.py  View File

@@ -2,16 +2,24 @@ from abc import ABC, abstractmethod
 import random
 import numpy as np
 from collections import Counter
+import os
+import pickle


 class LabelledCorpus:

     def __init__(self, documents, labels):
-        if not isinstance(documents, np.ndarray): documents = np.asarray(documents)
+        if not isinstance(documents, np.ndarray): documents = np.asarray(documents, dtype=str)
         if not isinstance(labels, np.ndarray): labels = np.asarray(labels)
         self.data = documents
         self.target = labels

+    def _tolist(self):
+        self.data = self.data.tolist()
+
+    def _toarray(self):
+        self.data = np.asarray(self.data, dtype=str)
+
     def __len__(self):
         return len(self.data)

@@ -27,8 +35,28 @@ class LabelledCorpus:

 class AuthorshipDataset(ABC):

-    def __init__(self, data_path, n_authors=-1, docs_by_author=-1, n_open_set_authors = 0, random_state=42):
+    @classmethod
+    def load(cls, loader, pickle_path=None, **kwargs):
+        # assert isinstance(loader, AuthorshipDataset), 'unknown loader'
+        if pickle_path and os.path.exists(pickle_path):
+            print(f'loading dataset image in {pickle_path}')
+            dataset = pickle.load(open(pickle_path, 'rb'))
+            dataset.train._toarray()
+            dataset.test._toarray()
+        else:
+            dataset = loader(**kwargs)
+            if pickle_path:
+                print(f'dumping dataset in {pickle_path} for faster load')
+                dataset.train._tolist()
+                dataset.test._tolist()
+                pickle.dump(dataset, open(pickle_path, 'wb'), pickle.HIGHEST_PROTOCOL)
+                dataset.train._toarray()
+                dataset.test._toarray()
+        return dataset
+
+    def __init__(self, data_path, n_authors=-1, docs_by_author=-1, n_open_set_authors=0, random_state=42):
         self.data_path = data_path
+        self.n_authors = n_authors
         random.seed(random_state)
         np.random.seed(random_state)

@@ -45,17 +73,14 @@ class AuthorshipDataset(ABC):
         super().__init__()

     @abstractmethod
     def _fetch_and_split(self):
         pass

     @abstractmethod
     def _check_n_authors(self, n_authors, n_open_set_authors):
         pass

     def _reduce_authors_documents(self, n_authors, n_docs_by_author, n_open_set_authors):
         if n_authors != -1 or n_docs_by_author != -1:

@@ -88,7 +113,6 @@ class AuthorshipDataset(ABC):
         else:
             self.test_out = None

     # reindex labels so that the unique labels are equal to range(#num_different_authors)
     # and unique training labels are range(#num_different_training_authors)
     def _remove_label_gaps(self):

@@ -131,11 +155,13 @@ class AuthorshipDataset(ABC):
             return

         author_doc_count = Counter(self.train.target)
-        to_remove = frozenset([id for id,count in author_doc_count.most_common() if count<docs_by_author])
+        to_remove = frozenset([id for id, count in author_doc_count.most_common() if count < docs_by_author])
         assert len(to_remove) < len(author_doc_count), 'impossible selection'
-        if len(to_remove)>0:
+        if len(to_remove) > 0:
             self.train = LabelledCorpus.filter(self.train, to_remove)
             self.test = LabelledCorpus.filter(self.test, to_remove)
             self.target_names = sorted(set(self.target_names) - to_remove)
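
The new load() classmethod wraps any concrete loader with a pickle cache: the first call builds the dataset and dumps it (round-tripping the document arrays through plain lists around the dump), later calls read the pickled image back and restore the arrays. A minimal usage sketch, mirroring the call added in the last file of this commit; the pickle path is illustrative, and the Imdb62 arguments are the ones used elsewhere in this repository:

from data.AuthorshipDataset import AuthorshipDataset
from data.fetch_imdb62 import Imdb62

# first call builds Imdb62 and writes the cache file; subsequent calls with the
# same pickle_path load the cached image instead of re-parsing the corpus
dataset = AuthorshipDataset.load(
    Imdb62,
    pickle_path='../pickles/Imdb62.pickle',  # illustrative cache location
    data_path='../../authorship_analysis/data/imdb62/imdb62.txt',
    n_authors=-1,
    docs_by_author=-1
)
print(len(dataset.train), len(dataset.test))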

data/fetch_enron_mail.py  View File

@@ -0,0 +1,107 @@
import eml_parser
from glob import glob
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from data.AuthorshipDataset import AuthorshipDataset, LabelledCorpus
import numpy as np
from joblib import Parallel, delayed
from collections import Counter


class EnronMail(AuthorshipDataset):

    NUM_AUTHORS = 150
    MAX_MAIL_LENGHT = 5000  # in words
    TEST_SIZE = 0.1
    MIN_TOKENS = 10

    def __init__(self, mail_dir='../data/enron_mail_20150507/maildir/*', n_authors=-1, docs_by_author=-1, n_open_set_authors=0, random_state=42):
        self.mail_dir = mail_dir
        super().__init__(mail_dir, n_authors, docs_by_author, n_open_set_authors, random_state)

    def filter(self, base_str, filter_str):
        if filter_str in base_str:
            idx = base_str.index(filter_str)
            base_str = base_str[:idx]
        return base_str

    def _fetch_and_split(self):
        labels = []
        data = []
        path_list = self._get_most_prolific_authors(self.n_authors)
        emails_authors = Parallel(n_jobs=-1)(
            delayed(_fetch_emails_from_author)(author_path, EnronMail.MIN_TOKENS) for author_path in path_list
        )
        for emails, author in emails_authors:
            data.extend(emails)
            labels.extend([author] * len(emails))
        target_names = sorted(np.unique(labels))
        train_data, test_data, train_labels, test_labels = \
            train_test_split(data, labels, test_size=EnronMail.TEST_SIZE, stratify=labels)
        return LabelledCorpus(train_data, train_labels), LabelledCorpus(test_data, test_labels), target_names

    def _check_n_authors(self, n_authors, n_open_set_authors):
        pass

    def _get_most_prolific_authors(self, n):
        assert n <= EnronMail.NUM_AUTHORS, f'too many authors requested (maximum is {EnronMail.NUM_AUTHORS})'
        author_paths = glob(self.mail_dir)
        if n == -1:
            return author_paths
        author_count = Counter({
            author_path:
                len(glob(f'{author_path}/sent/*.')) +
                len(glob(f'{author_path}/sent_items/*.'))
            for author_path in author_paths
        })
        return [path for path, count in author_count.most_common(n)]


def _fetch_emails_from_author(author_path, min_tokens):
    subject_filters = ['fw:', 'fwd:', 're:']
    body_filters = ['-----Original Message-----', '----- Forward', 'cc:', 'To:', 'to:', 'From:', 'from:']
    parsed_mails = 0
    author_mails = []
    author_docs = 0
    author_name = author_path[author_path.rindex('/') + 1:]
    author_bar = tqdm(list(glob(f'{author_path}/sent/*.')) + list(glob(f'{author_path}/sent_items/*.')))
    errors, trimmed = 0, 0
    for email in author_bar:
        author_bar.set_description(f'parsing for {author_path} errors={errors} trimmed={trimmed}')
        raw_email = open(email, 'rb').read()
        try:
            parsed_mail = eml_parser.eml_parser.decode_email_b(raw_email, include_raw_body=True)
            # subject = parsed_mail['header']['subject']
            body = parsed_mail['body'][0]['content']
            # for filter in subject_filters:
            #     if filter in subject.lower():
            #         continue
            # for filter in body_filters:
            #     body = self.filter(body, filter)
            # body = subject+'\n'+body
            body_tokens = body.split()
            ntokens = len(body_tokens)
            if ntokens >= min_tokens:
                if ntokens > EnronMail.MAX_MAIL_LENGHT:
                    trimmed += 1
                    body = ' '.join(body_tokens[:EnronMail.MAX_MAIL_LENGHT])
                author_mails.append(body)
                author_docs += 1
                # if n_docs_by_author != -1 and author_docs >= n_docs_by_author:
                #     add_author = True
                #     break
            parsed_mails += 1
        except Exception:
            errors += 1
            print(f'ERROR in file {email}')
    return author_mails, author_name
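
The loader parses each author's sent and sent_items folders in parallel; _fetch_emails_from_author lives at module level (rather than as a method), presumably so that joblib can pickle it for its worker processes. For reference, a minimal direct-construction sketch, not part of the commit; mail_dir is the constructor's default and assumes the 2015-05-07 Enron dump extracted under ../data:

from data.fetch_enron_mail import EnronMail

# keep the 50 authors with the most sent mails, with no cap on documents per author
dataset = EnronMail(
    mail_dir='../data/enron_mail_20150507/maildir/*',
    n_authors=50,
    docs_by_author=-1
)
print(f'train={len(dataset.train)}  test={len(dataset.test)}')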

View File

@@ -5,7 +5,7 @@ def evaluation(y_true, y_pred):
     acc = accuracy_score(y_true, y_pred)
     macrof1 = f1_score(y_true, y_pred, average='macro')
     microf1 = f1_score(y_true, y_pred, average='micro')
-    print(f'acc={acc * 100:.2f}%')
-    print(f'macro-f1={macrof1:.2f}')
-    print(f'micro-f1={microf1:.2f}')
+    print(f'acc={acc * 100:.4f}%')
+    print(f'macro-f1={macrof1:.4f}')
+    print(f'micro-f1={microf1:.4f}')
     return acc, macrof1, microf1
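
The only change here is the report precision (two to four decimals). A self-contained toy check of what the function computes, with the body copied from the hunk above and the metric functions taken from sklearn.metrics as the hunk implies:

from sklearn.metrics import accuracy_score, f1_score

def evaluation(y_true, y_pred):  # copied from the hunk above
    acc = accuracy_score(y_true, y_pred)
    macrof1 = f1_score(y_true, y_pred, average='macro')
    microf1 = f1_score(y_true, y_pred, average='micro')
    print(f'acc={acc * 100:.4f}%')
    print(f'macro-f1={macrof1:.4f}')
    print(f'micro-f1={microf1:.4f}')
    return acc, macrof1, microf1

acc, macrof1, microf1 = evaluation([0, 0, 1, 1, 2], [0, 1, 1, 1, 2])
# prints: acc=80.0000%  macro-f1=0.8222  micro-f1=0.8000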

View File

@@ -1,6 +1,7 @@
 import numpy as np
+from data.AuthorshipDataset import AuthorshipDataset
 from data.fetch_imdb62 import Imdb62
+from data.fetch_enron_mail import EnronMail
 from index import Index
 from model.classifiers import AuthorshipAttributionClassifier, SameAuthorClassifier, FullAuthorClassifier
 from data.fetch_victorian import Victorian

@@ -17,8 +18,9 @@ pad_length=3000
 batch_size=50
 n_epochs=256
 bigrams=False
-n_authors=-1
+n_authors=50
 docs_by_author=-1
+seed=1
 debug=False

 if debug:

@@ -28,8 +30,8 @@ if debug:
     pad_length=100
     batch_size=10
     n_epochs=20
-    n_authors = 5
-    docs_by_author = 10
+    n_authors = 50
+    docs_by_author = -1

 if torch.cuda.is_available():
     device = torch.device('cuda')

@@ -37,12 +39,15 @@ else:
     device = torch.device('cpu')
 print(f'running on {device}')

+dataset = AuthorshipDataset.load(EnronMail, pickle_path=f'../pickles/EnronMail{n_authors}_{seed}.pickle', mail_dir='../../authorship_analysis/data/enron_mail_20150507/maildir/*', n_authors=n_authors, docs_by_author=-1, random_state=seed)
 #dataset = Victorian(data_path='../../authorship_analysis/data/victoria', n_authors=5, docs_by_author=25)
-dataset = Imdb62(data_path='../../authorship_analysis/data/imdb62/imdb62.txt', n_authors=n_authors, docs_by_author=docs_by_author)
+#dataset = Imdb62(data_path='../../authorship_analysis/data/imdb62/imdb62.txt', n_authors=n_authors, docs_by_author=docs_by_author)

 Xtr, ytr = dataset.train.data, dataset.train.target
 Xte, yte = dataset.test.data, dataset.test.target
 A = np.unique(ytr)
 print(f'num authors={len(A)}')
+print(f'ntr = {len(Xtr)} nte = {len(Xte)}')
+#sys.exit(0)

 index = Index(analyzer='char', ngram_range=(2,2) if bigrams else (1,1))
 Xtr = index.fit_transform(Xtr)