From c54032f6d753431cca668717805f033ea14366d2 Mon Sep 17 00:00:00 2001 From: Alex Moreo Date: Sat, 9 May 2020 11:03:49 +0200 Subject: [PATCH] blogs dataset added --- src/data/fetch_blogs.py | 61 +++++++++++++++++++++++++++++++++++++++++ src/main.py | 6 +++- 2 files changed, 66 insertions(+), 1 deletion(-) create mode 100644 src/data/fetch_blogs.py diff --git a/src/data/fetch_blogs.py b/src/data/fetch_blogs.py new file mode 100644 index 0000000..5b4f927 --- /dev/null +++ b/src/data/fetch_blogs.py @@ -0,0 +1,61 @@ +import numpy as np +#import xml.etree.ElementTree +from data.AuthorshipDataset import LabelledCorpus, AuthorshipDataset +from glob import glob +from tqdm import tqdm +from sklearn.model_selection import train_test_split +from collections import Counter + + +# http://u.cs.biu.ac.il/~koppel/BlogCorpus.htm +# J. Schler, M. Koppel, S. Argamon and J. Pennebaker (2006). +# Effects of Age and Gender on Blogging in Proceedings of 2006 AAAI Spring Symposium on Computational Approaches for +# Analyzing Weblogs. 
+class Blogs(AuthorshipDataset): + + TEST_SIZE = 0.1 + + def __init__(self, data_path='../data/blogs', n_authors=-1, docs_by_author=-1, n_open_set_authors=0, random_state=42): + super().__init__(data_path, n_authors, docs_by_author, n_open_set_authors, random_state) + + def _fetch_and_split(self): + files = glob(f'{self.data_path}/*.xml') + + data, labels = [], [] + for file in tqdm(files): + posts, author = fetch_xml(file) + data.extend(posts) + labels.extend([author]*len(posts)) + + print(f'elements = {len(data)} from {len(np.unique(labels))}') + data, labels = _get_most_prolific_authors(data, labels, self.n_authors) + print(f'elements = {len(data)} from {len(np.unique(labels))}') + + target_names = sorted(np.unique(labels)) + + train_data, test_data, train_labels, test_labels = \ + train_test_split(data, labels, test_size=Blogs.TEST_SIZE, stratify=labels) + + return LabelledCorpus(train_data, train_labels), LabelledCorpus(test_data, test_labels), target_names + + + def _check_n_authors(self, n_authors, n_open_set_authors): + pass + + +def fetch_xml(path): + file = open(path, 'rt', encoding='ISO-8859-1').read() + posts = file.split('<post>')[1:] + posts = [post[:post.rindex('</post>')].strip() for post in posts] + author = path[path.rfind('/')+1:].replace('.xml','') + return posts, author + + +def _get_most_prolific_authors(data, labels, n): + if n == -1: + return data, labels + + author_selection = frozenset(label for label,_ in Counter(labels).most_common(n)) + return list(zip(*[(d,a) for d,a in zip(data,labels) if a in author_selection])) + + diff --git a/src/main.py b/src/main.py index 5db4d23..7edccf7 100644 --- a/src/main.py +++ b/src/main.py @@ -1,6 +1,7 @@ import argparse import numpy as np from data.AuthorshipDataset import AuthorshipDataset +from data.fetch_blogs import Blogs from data.fetch_imdb62 import Imdb62 from data.fetch_enron_mail import EnronMail from index import Index @@ -29,6 +30,9 @@ def main(opt): elif opt.dataset == 'victorian': loader = Victorian
data_path='../../authorship_analysis/data/victoria' + elif opt.dataset == 'blogs': + loader = Blogs + data_path = '../../authorship_analysis/data/blogs' dataset_name = f'{loader.__name__}_A{opt.authors}_D{opt.documents}_S{opt.seed}' pickle_path = None @@ -157,7 +161,7 @@ if __name__ == '__main__': requiredNamed.add_argument('-d', '--dataset', help='Name of the dataset', required=True, type=str) opt = parser.parse_args() - assert opt.dataset in ['enron', 'imdb62'], 'unknown dataset' + assert opt.dataset in ['enron', 'imdb62', 'blogs'], 'unknown dataset' create_path_if_not_exists(opt.output) create_path_if_not_exists(opt.log)