enron mail

commit 8ab808282a (parent 0be3e5547e)
@@ -2,16 +2,24 @@ from abc import ABC, abstractmethod
 import random
 import numpy as np
 from collections import Counter
+import os
+import pickle


 class LabelledCorpus:

     def __init__(self, documents, labels):
-        if not isinstance(documents, np.ndarray): documents = np.asarray(documents)
+        if not isinstance(documents, np.ndarray): documents = np.asarray(documents, dtype=str)
         if not isinstance(labels, np.ndarray): labels = np.asarray(labels)
         self.data = documents
         self.target = labels

+    def _tolist(self):
+        self.data = self.data.tolist()
+
+    def _toarray(self):
+        self.data = np.asarray(self.data, dtype=str)
+
     def __len__(self):
         return len(self.data)
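The new _tolist()/_toarray() helpers let the corpus be converted to a plain Python list right before pickling and back to a numpy string array after loading. A minimal sketch of the round-trip (assumed usage, not part of the commit; these hunks appear to belong to data/AuthorshipDataset.py, judging by the imports added to src/main.py further down):

    corpus = LabelledCorpus(['to be or not to be', 'call me Ishmael'], ['shakespeare', 'melville'])
    print(type(corpus.data))   # numpy.ndarray of unicode strings
    corpus._tolist()           # self.data becomes a plain list, friendlier to pickle
    corpus._toarray()          # back to a numpy str array
    print(len(corpus))         # 2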
@@ -27,8 +35,28 @@ class LabelledCorpus:


 class AuthorshipDataset(ABC):

-    def __init__(self, data_path, n_authors=-1, docs_by_author=-1, n_open_set_authors = 0, random_state=42):
+    @classmethod
+    def load(cls, loader, pickle_path=None, **kwargs):
+        #assert isinstance(loader, AuthorshipDataset), 'unknown loader'
+        if pickle_path and os.path.exists(pickle_path):
+            print(f'loading dataset image in {pickle_path}')
+            dataset = pickle.load(open(pickle_path, 'rb'))
+            dataset.train._toarray()
+            dataset.test._toarray()
+        else:
+            dataset = loader(**kwargs)
+            if pickle_path:
+                print(f'dumping dataset in {pickle_path} for faster load')
+                dataset.train._tolist()
+                dataset.test._tolist()
+                pickle.dump(dataset, open(pickle_path, 'wb'), pickle.HIGHEST_PROTOCOL)
+                dataset.train._toarray()
+                dataset.test._toarray()
+        return dataset
+
+    def __init__(self, data_path, n_authors=-1, docs_by_author=-1, n_open_set_authors=0, random_state=42):
         self.data_path = data_path
+        self.n_authors = n_authors
         random.seed(random_state)
         np.random.seed(random_state)
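AuthorshipDataset.load() acts as a small caching factory: if a pickle image already exists it is loaded and the corpora are restored to numpy arrays; otherwise the loader class is instantiated and, when a pickle_path is given, dumped for faster subsequent runs. A hedged usage sketch, assuming the EnronMail loader added below and a writable ../pickles directory:

    dataset = AuthorshipDataset.load(
        EnronMail,                                        # any AuthorshipDataset subclass
        pickle_path='../pickles/EnronMail50_1.pickle',    # cache image; pass None to skip caching
        mail_dir='../data/enron_mail_20150507/maildir/*',
        n_authors=50,
        docs_by_author=-1,
        random_state=1,
    )
    print(len(dataset.train), len(dataset.test))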
@@ -45,17 +73,14 @@ class AuthorshipDataset(ABC):

         super().__init__()


     @abstractmethod
     def _fetch_and_split(self):
         pass


     @abstractmethod
     def _check_n_authors(self, n_authors, n_open_set_authors):
         pass


     def _reduce_authors_documents(self, n_authors, n_docs_by_author, n_open_set_authors):

         if n_authors != -1 or n_docs_by_author != -1:
@@ -88,7 +113,6 @@ class AuthorshipDataset(ABC):
         else:
             self.test_out = None


     # reindex labels so that the unique labels are equal to range(#num_different_authors)
     # and unique training labels are range(#num_different_training_authors)
     def _remove_label_gaps(self):
@@ -131,11 +155,13 @@ class AuthorshipDataset(ABC):
             return

         author_doc_count = Counter(self.train.target)
-        to_remove = frozenset([id for id,count in author_doc_count.most_common() if count<docs_by_author])
+        to_remove = frozenset([id for id, count in author_doc_count.most_common() if count < docs_by_author])
         assert len(to_remove) < len(author_doc_count), 'impossible selection'
-        if len(to_remove)>0:
+        if len(to_remove) > 0:
             self.train = LabelledCorpus.filter(self.train, to_remove)
             self.test = LabelledCorpus.filter(self.test, to_remove)
             self.target_names = sorted(set(self.target_names) - to_remove)
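The tidied-up selection above drops every author that has fewer than docs_by_author training documents, then filters both corpora and the target names accordingly. A toy illustration of the Counter-based cut-off (hypothetical labels, not from the commit):

    from collections import Counter
    train_labels = ['a', 'a', 'a', 'b', 'c', 'c']
    docs_by_author = 2
    counts = Counter(train_labels)
    to_remove = frozenset(author for author, n in counts.most_common() if n < docs_by_author)
    print(to_remove)   # frozenset({'b'}): only 'b' falls below 2 documents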
@ -0,0 +1,107 @@
|
||||||
|
import eml_parser
|
||||||
|
from glob import glob
|
||||||
|
from sklearn.model_selection import train_test_split
|
||||||
|
from tqdm import tqdm
|
||||||
|
from data.AuthorshipDataset import AuthorshipDataset, LabelledCorpus
|
||||||
|
import numpy as np
|
||||||
|
from joblib import Parallel, delayed
|
||||||
|
from collections import Counter
|
||||||
|
|
||||||
|
|
||||||
|
class EnronMail(AuthorshipDataset):
|
||||||
|
|
||||||
|
NUM_AUTHORS = 150
|
||||||
|
MAX_MAIL_LENGHT = 5000 # in words
|
||||||
|
TEST_SIZE = 0.1
|
||||||
|
MIN_TOKENS = 10
|
||||||
|
|
||||||
|
def __init__(self, mail_dir='../data/enron_mail_20150507/maildir/*', n_authors=-1, docs_by_author=-1, n_open_set_authors=0, random_state=42):
|
||||||
|
self.mail_dir = mail_dir
|
||||||
|
super().__init__(mail_dir, n_authors, docs_by_author, n_open_set_authors, random_state)
|
||||||
|
|
||||||
|
def filter(self, base_str, filter_str):
|
||||||
|
if filter_str in base_str:
|
||||||
|
idx = base_str.index(filter_str)
|
||||||
|
base_str = base_str[:idx]
|
||||||
|
return base_str
|
||||||
|
|
||||||
|
def _fetch_and_split(self):
|
||||||
|
labels = []
|
||||||
|
data = []
|
||||||
|
|
||||||
|
path_list = self._get_most_prolific_authors(self.n_authors)
|
||||||
|
emails_authors = Parallel(n_jobs=-1)(
|
||||||
|
delayed(_fetch_emails_from_author)(author_path, EnronMail.MIN_TOKENS) for author_path in path_list
|
||||||
|
)
|
||||||
|
for emails, author in emails_authors:
|
||||||
|
data.extend(emails)
|
||||||
|
labels.extend([author]*len(emails))
|
||||||
|
|
||||||
|
target_names = sorted(np.unique(labels))
|
||||||
|
|
||||||
|
train_data, test_data, train_labels, test_labels = \
|
||||||
|
train_test_split(data, labels, test_size=EnronMail.TEST_SIZE, stratify=labels)
|
||||||
|
|
||||||
|
return LabelledCorpus(train_data, train_labels), LabelledCorpus(test_data, test_labels), target_names
|
||||||
|
|
||||||
|
def _check_n_authors(self, n_authors, n_open_set_authors):
|
||||||
|
pass
|
||||||
|
|
||||||
|
def _get_most_prolific_authors(self, n):
|
||||||
|
assert n <= EnronMail.NUM_AUTHORS, f'too many authors requested (maximum is {EnronMail.NUM_AUTHORS})'
|
||||||
|
author_paths = glob(self.mail_dir)
|
||||||
|
if n == -1:
|
||||||
|
return author_paths
|
||||||
|
author_count = Counter(
|
||||||
|
{author_path :
|
||||||
|
len(glob(f'{author_path}/sent/*.')) +
|
||||||
|
len(glob(f'{author_path}/sent_items/*.'))
|
||||||
|
for author_path in author_paths
|
||||||
|
})
|
||||||
|
return [path for path, count in author_count.most_common(n)]
|
||||||
|
|
||||||
|
|
||||||
|
def _fetch_emails_from_author(author_path, min_tokens):
|
||||||
|
subject_filters = ['fw:', 'fwd:', 're:']
|
||||||
|
body_filters = ['-----Original Message-----', '----- Forward', 'cc:', 'To:', 'to:', 'From:', 'from:']
|
||||||
|
|
||||||
|
parsed_mails = 0
|
||||||
|
author_mails = []
|
||||||
|
author_docs = 0
|
||||||
|
author_name = author_path[author_path.rindex('/') + 1:]
|
||||||
|
author_bar = tqdm(list(glob(f'{author_path}/sent/*.')) + list(glob(f'{author_path}/sent_items/*.')))
|
||||||
|
errors, trimmed = 0, 0
|
||||||
|
for email in author_bar:
|
||||||
|
author_bar.set_description(f'parsing for {author_path} errors={errors} trimmed={trimmed}')
|
||||||
|
raw_email = open(email, 'rb').read()
|
||||||
|
try:
|
||||||
|
parsed_mail = eml_parser.eml_parser.decode_email_b(raw_email, include_raw_body=True)
|
||||||
|
# subject = parsed_mail['header']['subject']
|
||||||
|
body = parsed_mail['body'][0]['content']
|
||||||
|
|
||||||
|
# for filter in subject_filters:
|
||||||
|
# if filter in subject.lower():
|
||||||
|
# continue
|
||||||
|
|
||||||
|
# for filter in body_filters:
|
||||||
|
# body = self.filter(body, filter)
|
||||||
|
|
||||||
|
# body = subject+'\n'+body
|
||||||
|
body_tokens = body.split()
|
||||||
|
ntokens = len(body_tokens)
|
||||||
|
if ntokens >= min_tokens:
|
||||||
|
if ntokens > EnronMail.MAX_MAIL_LENGHT:
|
||||||
|
trimmed += 1
|
||||||
|
body = ' '.join(body_tokens[:EnronMail.MAX_MAIL_LENGHT])
|
||||||
|
author_mails.append(body)
|
||||||
|
author_docs += 1
|
||||||
|
# if n_docs_by_author != -1 and author_docs >= n_docs_by_author:
|
||||||
|
# add_author = True
|
||||||
|
# break
|
||||||
|
|
||||||
|
parsed_mails += 1
|
||||||
|
except Exception:
|
||||||
|
errors += 1
|
||||||
|
print(f'ERROR in file {email}')
|
||||||
|
|
||||||
|
return author_mails, author_name
|
|
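The new loader (presumably data/fetch_enron_mail.py, given the import added to src/main.py below) walks each author's sent/ and sent_items/ folders, parses the raw messages with eml_parser in parallel, keeps bodies with at least MIN_TOKENS words, trims anything longer than MAX_MAIL_LENGHT, and returns a stratified train/test split. A minimal standalone sketch, assuming the Enron maildir has been downloaded to the default location:

    enron = EnronMail(mail_dir='../data/enron_mail_20150507/maildir/*', n_authors=10)
    print(f'{len(enron.train)} training / {len(enron.test)} test mails from {len(enron.target_names)} authors')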
@@ -5,7 +5,7 @@ def evaluation(y_true, y_pred):
     acc = accuracy_score(y_true, y_pred)
     macrof1 = f1_score(y_true, y_pred, average='macro')
     microf1 = f1_score(y_true, y_pred, average='micro')
-    print(f'acc={acc * 100:.2f}%')
-    print(f'macro-f1={macrof1:.2f}')
-    print(f'micro-f1={microf1:.2f}')
+    print(f'acc={acc * 100:.4f}%')
+    print(f'macro-f1={macrof1:.4f}')
+    print(f'micro-f1={microf1:.4f}')
     return acc, macrof1, microf1
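The only change in this hunk is the precision of the printed scores (two decimals to four). For reference, a tiny hedged example of the metrics the function reports:

    from sklearn.metrics import accuracy_score, f1_score
    y_true = [0, 0, 1, 1, 2]
    y_pred = [0, 0, 1, 0, 2]
    print(f'acc={accuracy_score(y_true, y_pred) * 100:.4f}%')           # 80.0000%
    print(f'macro-f1={f1_score(y_true, y_pred, average="macro"):.4f}')  # unweighted mean of per-class F1
    print(f'micro-f1={f1_score(y_true, y_pred, average="micro"):.4f}')  # equals accuracy for single-label data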
src/main.py (15 changed lines)
@@ -1,6 +1,7 @@
 import numpy as np
+from data.AuthorshipDataset import AuthorshipDataset
 from data.fetch_imdb62 import Imdb62
+from data.fetch_enron_mail import EnronMail
 from index import Index
 from model.classifiers import AuthorshipAttributionClassifier, SameAuthorClassifier, FullAuthorClassifier
 from data.fetch_victorian import Victorian

@@ -17,8 +18,9 @@ pad_length=3000
 batch_size=50
 n_epochs=256
 bigrams=False
-n_authors=-1
+n_authors=50
 docs_by_author=-1
+seed=1

 debug=False
 if debug:

@@ -28,8 +30,8 @@ if debug:
     pad_length=100
     batch_size=10
     n_epochs=20
-    n_authors = 5
-    docs_by_author = 10
+    n_authors = 50
+    docs_by_author = -1

 if torch.cuda.is_available():
     device = torch.device('cuda')

@@ -37,12 +39,15 @@ else:
     device = torch.device('cpu')
 print(f'running on {device}')

+dataset = AuthorshipDataset.load(EnronMail, pickle_path=f'../pickles/EnronMail{n_authors}_{seed}.pickle', mail_dir='../../authorship_analysis/data/enron_mail_20150507/maildir/*', n_authors=n_authors, docs_by_author=-1, random_state=seed)
 #dataset = Victorian(data_path='../../authorship_analysis/data/victoria', n_authors=5, docs_by_author=25)
-dataset = Imdb62(data_path='../../authorship_analysis/data/imdb62/imdb62.txt', n_authors=n_authors, docs_by_author=docs_by_author)
+#dataset = Imdb62(data_path='../../authorship_analysis/data/imdb62/imdb62.txt', n_authors=n_authors, docs_by_author=docs_by_author)
 Xtr, ytr = dataset.train.data, dataset.train.target
 Xte, yte = dataset.test.data, dataset.test.target
 A = np.unique(ytr)
 print(f'num authors={len(A)}')
+print(f'ntr = {len(Xtr)} nte = {len(Xte)}')
+#sys.exit(0)

 index = Index(analyzer='char', ngram_range=(2,2) if bigrams else (1,1))
 Xtr = index.fit_transform(Xtr)
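src/main.py now loads the Enron dataset through the cached AuthorshipDataset.load path and feeds it to a character-level Index (unigrams here, bigrams when bigrams=True) before padding to pad_length. The Index class itself is not part of this diff; the following is only a guessed sketch of what a character-to-integer index with padding might look like, not the project's actual implementation:

    def char_index(texts, pad_length):
        # hypothetical helper: map characters to integer ids, pad/truncate to a fixed length (0 = padding)
        vocab = {ch: i + 1 for i, ch in enumerate(sorted({c for t in texts for c in t}))}
        return [[vocab[c] for c in t[:pad_length]] + [0] * max(0, pad_length - len(t)) for t in texts], vocab

    ids, vocab = char_index(['dear john', 'hi mark'], pad_length=12)
    print(ids[0])   # [3, 4, 2, 12, 1, 7, 11, 5, 10, 0, 0, 0]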