blogs dataset added
parent f0b08278e4
commit c54032f6d7
data/fetch_blogs.py
@@ -0,0 +1,61 @@
import numpy as np
from data.AuthorshipDataset import LabelledCorpus, AuthorshipDataset
from glob import glob
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from collections import Counter


# Blog Authorship Corpus: http://u.cs.biu.ac.il/~koppel/BlogCorpus.htm
# J. Schler, M. Koppel, S. Argamon and J. Pennebaker (2006).
# Effects of Age and Gender on Blogging. In Proceedings of the 2006 AAAI
# Spring Symposium on Computational Approaches to Analyzing Weblogs.
class Blogs(AuthorshipDataset):

    TEST_SIZE = 0.1

    def __init__(self, data_path='../data/blogs', n_authors=-1, docs_by_author=-1, n_open_set_authors=0, random_state=42):
        super().__init__(data_path, n_authors, docs_by_author, n_open_set_authors, random_state)

    def _fetch_and_split(self):
        # One xml file per author, each containing all of that author's posts.
        files = glob(f'{self.data_path}/*.xml')

        data, labels = [], []
        for file in tqdm(files):
            posts, author = fetch_xml(file)
            data.extend(posts)
            labels.extend([author] * len(posts))

        print(f'{len(data)} posts from {len(np.unique(labels))} authors')
        data, labels = _get_most_prolific_authors(data, labels, self.n_authors)
        print(f'{len(data)} posts from {len(np.unique(labels))} authors after filtering')

        target_names = sorted(np.unique(labels))

        # Stratify on the author so train and test keep the same author proportions.
        train_data, test_data, train_labels, test_labels = \
            train_test_split(data, labels, test_size=Blogs.TEST_SIZE, stratify=labels)

        return LabelledCorpus(train_data, train_labels), LabelledCorpus(test_data, test_labels), target_names

    def _check_n_authors(self, n_authors, n_open_set_authors):
        pass
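
A minimal usage sketch. The public API of AuthorshipDataset is not part of this commit, so the sketch calls _fetch_and_split directly; the path and n_authors value are illustrative assumptions.

# Hypothetical usage; the base-class __init__ may already trigger loading,
# but its code is not shown in this commit.
blogs = Blogs(data_path='../data/blogs', n_authors=50)
train, test, target_names = blogs._fetch_and_split()
print(f'{len(target_names)} authors selected')
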

def fetch_xml(path):
    # The raw corpus files are often not well-formed XML, so rather than an
    # XML parser (e.g. xml.etree) the posts are extracted by string splitting.
    with open(path, 'rt', encoding='ISO-8859-1') as f:
        raw = f.read()
    posts = raw.split('<post>')[1:]
    posts = [post[:post.rindex('</post>')].strip() for post in posts]
    # The author label is the file name without its .xml extension.
    author = path[path.rfind('/') + 1:].replace('.xml', '')
    return posts, author
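
Corpus files follow the naming scheme <id>.<gender>.<age>.<industry>.<sign>.xml, so the author label is simply that stem. A hypothetical call (the file name below is illustrative, not taken from the commit):

posts, author = fetch_xml('../data/blogs/1000331.female.37.indUnk.Leo.xml')
# posts  -> one string per <post>...</post> block in the file
# author -> '1000331.female.37.indUnk.Leo'
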

def _get_most_prolific_authors(data, labels, n):
    # n == -1 means keep every author.
    if n == -1:
        return data, labels

    # Keep only documents whose author is among the n with the most posts.
    author_selection = frozenset(label for label, _ in Counter(labels).most_common(n))
    return list(zip(*[(d, a) for d, a in zip(data, labels) if a in author_selection]))
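
A toy run of the filter; note that the zip(*...) round trip hands back tuples, not lists:

data = ['d1', 'd2', 'd3', 'd4', 'd5']
labels = ['ann', 'bob', 'ann', 'cid', 'ann']
kept_data, kept_labels = _get_most_prolific_authors(data, labels, n=1)
# kept_data   -> ('d1', 'd3', 'd5')
# kept_labels -> ('ann', 'ann', 'ann')
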
@@ -1,6 +1,7 @@
 import argparse
 import numpy as np
 from data.AuthorshipDataset import AuthorshipDataset
+from data.fetch_blogs import Blogs
 from data.fetch_imdb62 import Imdb62
 from data.fetch_enron_mail import EnronMail
 from index import Index
@@ -29,6 +30,9 @@ def main(opt):
     elif opt.dataset == 'victorian':
         loader = Victorian
         data_path='../../authorship_analysis/data/victoria'
+    elif opt.dataset == 'blogs':
+        loader = Blogs
+        data_path = '../../authorship_analysis/data/blogs'

     dataset_name = f'{loader.__name__}_A{opt.authors}_D{opt.documents}_S{opt.seed}'
     pickle_path = None
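
The selected loader is presumably instantiated further down in main(), outside this hunk. A hedged sketch of that call, with argument names inferred from the Blogs constructor and the opt fields used in dataset_name; they are assumptions, not code from this diff:

# Hypothetical call site; argument names inferred, not shown in the diff.
dataset = loader(data_path=data_path, n_authors=opt.authors,
                 docs_by_author=opt.documents, random_state=opt.seed)
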
@@ -157,7 +161,7 @@ if __name__ == '__main__':
     requiredNamed.add_argument('-d', '--dataset', help='Name of the dataset', required=True, type=str)
     opt = parser.parse_args()

-    assert opt.dataset in ['enron', 'imdb62'], 'unknown dataset'
+    assert opt.dataset in ['enron', 'imdb62', 'blogs'], 'unknown dataset'

     create_path_if_not_exists(opt.output)
     create_path_if_not_exists(opt.log)