From c54032f6d753431cca668717805f033ea14366d2 Mon Sep 17 00:00:00 2001 From: Alex Moreo Date: Sat, 9 May 2020 11:03:49 +0200 Subject: [PATCH] blogs dataset added --- src/data/fetch_blogs.py | 61 +++++++++++++++++++++++++++++++++++++++++ src/main.py | 6 +++- 2 files changed, 66 insertions(+), 1 deletion(-) create mode 100644 src/data/fetch_blogs.py diff --git a/src/data/fetch_blogs.py b/src/data/fetch_blogs.py new file mode 100644 index 0000000..5b4f927 --- /dev/null +++ b/src/data/fetch_blogs.py @@ -0,0 +1,61 @@ +import numpy as np +#import xml.etree.ElementTree +from data.AuthorshipDataset import LabelledCorpus, AuthorshipDataset +from glob import glob +from tqdm import tqdm +from sklearn.model_selection import train_test_split +from collections import Counter + + +# http://u.cs.biu.ac.il/~koppel/BlogCorpus.htm +# J. Schler, M. Koppel, S. Argamon and J. Pennebaker (2006). +# Effects of Age and Gender on Blogging in Proceedings of 2006 AAAI Spring Symposium on Computational Approaches for +# Analyzing Weblogs. 
+class Blogs(AuthorshipDataset): + + TEST_SIZE = 0.1 + + def __init__(self, data_path='../data/blogs', n_authors=-1, docs_by_author=-1, n_open_set_authors=0, random_state=42): + super().__init__(data_path, n_authors, docs_by_author, n_open_set_authors, random_state) + + def _fetch_and_split(self): + files = glob(f'{self.data_path}/*.xml') + + data, labels = [], [] + for file in tqdm(files): + posts, author = fetch_xml(file) + data.extend(posts) + labels.extend([author]*len(posts)) + + print(f'elements = {len(data)} from {len(np.unique(labels))}') + data, labels = _get_most_prolific_authors(data, labels, self.n_authors) + print(f'elements = {len(data)} from {len(np.unique(labels))}') + + target_names = sorted(np.unique(labels)) + + train_data, test_data, train_labels, test_labels = \ + train_test_split(data, labels, test_size=Blogs.TEST_SIZE, stratify=labels) + + return LabelledCorpus(train_data, train_labels), LabelledCorpus(test_data, test_labels), target_names + + + def _check_n_authors(self, n_authors, n_open_set_authors): + pass + + +def fetch_xml(path): + file = open(path, 'rt', encoding='ISO-8859-1').read() + posts = file.split('<post>')[1:] + posts = [post[:post.rindex('</post>')].strip() for post in posts] + author = path[path.rfind('/')+1:].replace('.xml','') + return posts, author + + +def _get_most_prolific_authors(data, labels, n): + if n == -1: + return data, labels + + author_selection = frozenset(label for label,_ in Counter(labels).most_common(n)) + return list(zip(*[(d,a) for d,a in zip(data,labels) if a in author_selection])) + + diff --git a/src/main.py b/src/main.py index 5db4d23..7edccf7 100644 --- a/src/main.py +++ b/src/main.py @@ -1,6 +1,7 @@ import argparse import numpy as np from data.AuthorshipDataset import AuthorshipDataset +from data.fetch_blogs import Blogs from data.fetch_imdb62 import Imdb62 from data.fetch_enron_mail import EnronMail from index import Index @@ -29,6 +30,9 @@ def main(opt): elif opt.dataset == 'victorian': loader = Victorian
data_path='../../authorship_analysis/data/victoria' + elif opt.dataset == 'blogs': + loader = Blogs + data_path = '../../authorship_analysis/data/blogs' dataset_name = f'{loader.__name__}_A{opt.authors}_D{opt.documents}_S{opt.seed}' pickle_path = None @@ -157,7 +161,7 @@ if __name__ == '__main__': requiredNamed.add_argument('-d', '--dataset', help='Name of the dataset', required=True, type=str) opt = parser.parse_args() - assert opt.dataset in ['enron', 'imdb62'], 'unknown dataset' + assert opt.dataset in ['enron', 'imdb62', 'blogs'], 'unknown dataset' create_path_if_not_exists(opt.output) create_path_if_not_exists(opt.log)