# Active learning simulation for e-Discovery: documents are iteratively moved from an unlabelled pool
# to the training set, and quantification methods (CC vs. EMQ) are used to estimate the prevalence of
# relevant documents in the pool and, from it, the recall attained so far.

import os.path
import sys
import sklearn.datasets
from sklearn.base import BaseEstimator
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import quapy as qp
from quapy.method.base import BaseQuantifier
from quapy.data import LabelledCollection
from quapy.method.aggregative import EMQ, CC, PACC, PCC, HDy, ACC
from quapy import functional as F
import numpy as np
from itertools import chain


def split_from_index(collection: LabelledCollection, index: np.ndarray):
    # splits the collection into the documents indexed by `index` and the remaining ones
    in_index_set = set(index)
    out_index_set = set(range(len(collection))) - in_index_set
    out_index = np.asarray(list(out_index_set), dtype=int)
    return collection.sampling_from_index(index), collection.sampling_from_index(out_index)


def relevance_sampling_index(pool: LabelledCollection, classifier: BaseEstimator, k: int):
    # indexes of the k pool documents with the highest posterior probability of being relevant (class 1)
    prob = classifier.predict_proba(pool.instances)[:, 1].flatten()
    top_relevant_idx = np.argsort(-prob)[:k]
    return top_relevant_idx


def uncertainty_sampling_index(pool: LabelledCollection, classifier: BaseEstimator, k: int):
    # indexes of the k pool documents the classifier is most uncertain about (posterior closest to 0.5)
    prob = classifier.predict_proba(pool.instances)[:, 1].flatten()
    top_uncertain_idx = np.argsort(np.abs(prob - 0.5))[:k]
    return top_uncertain_idx


def mix_rel_unc_sampling_index(pool: LabelledCollection, classifier: BaseEstimator, k: int):
    # interleaves relevance and uncertainty sampling, removes duplicates, and keeps the top k indexes
    relevance_idx = relevance_sampling_index(pool, classifier, k)
    uncertainty_idx = uncertainty_sampling_index(pool, classifier, k)
    interleave_idx = np.asarray(list(chain.from_iterable(zip(relevance_idx, uncertainty_idx))))
    _, unique_idx = np.unique(interleave_idx, return_index=True)
    top_interleaved_idx = interleave_idx[np.sort(unique_idx)][:k]
    return top_interleaved_idx


def negative_sampling_index(pool: LabelledCollection, classifier: BaseEstimator, k: int):
    # indexes of the k pool documents with the highest posterior probability of being non-relevant (class 0)
    prob = classifier.predict_proba(pool.instances)[:, 0].flatten()
    top_negative_idx = np.argsort(-prob)[:k]
    return top_negative_idx


def recall(train_prev, pool_prev, train_size, pool_size):
    # estimated recall: fraction of all positive documents that are already in the training (labelled) set
    frac_tr_pos = train_prev[1]
    frac_te_pos = pool_prev[1]
    recall = (frac_tr_pos * train_size) / (frac_tr_pos * train_size + frac_te_pos * pool_size)
    return recall


def NewClassifier():
    # return CalibratedClassifierCV(LinearSVC(class_weight='balanced'))
    return LogisticRegression(class_weight=None)


def NewQuantifier():
    return EMQ(CalibratedClassifierCV(NewClassifier()))


def create_dataset(datasetname):
    if datasetname == 'imdb.10K.75p':
        data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5)
        collection = data.training.sampling(10000, 0.75)
        return collection

    elif datasetname == 'RCV1.C4':
        X, y = sklearn.datasets.fetch_rcv1(subset='train', return_X_y=True)
        y = y.toarray()
        prev = y.mean(axis=0).flatten()
        # choose the first category having a positive prevalence in [0.1, 0.2] (a realistic scenario for e-Discovery);
        # this category happens to be the one with id 4
        target_cat = np.argwhere(np.logical_and(prev > 0.1, prev < 0.2)).flatten()[0]
        print('chosen cat', target_cat)
        y = y[:, target_cat].flatten()
        return LabelledCollection(X, y)


def estimate_prev_CC(train, pool):
    # baseline prevalence estimate via Classify & Count; also returns the underlying classifier
    q = CC(NewClassifier()).fit(train)
    return q.quantify(pool.instances), q.learner


def estimate_prev_Q(train, pool, classifier):
    # prevalence estimate via a proper quantification method (EMQ over a calibrated classifier)
    # q = qp.model_selection.GridSearchQ(
    #     ACC(LogisticRegression()),
    #     param_grid={'C': np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced']},
    #     sample_size=len(train),
    #     protocol='app',
    #     n_prevpoints=21,
    #     n_repetitions=10)

    q = NewQuantifier()
    # q = ACC(NewClassifier())

    # borrow (supposedly negative) pool documents
    # train_pos = train.counts()[1]
    # train_negs = train.counts()[0]
    # neg_idx = negative_sampling_index(pool, classifier, max(train_pos - train_negs, 5))
    # neg_sample = pool.sampling_from_index(neg_idx)
    # train_augmented = train + LabelledCollection(neg_sample.instances, [0] * len(neg_sample))
    # q.fit(train_augmented)

    q.fit(train)
    # q.fit(first_train)

    # bootstrap_prev = qp.evaluation.natural_prevalence_prediction(q, pool, sample_size=len(train), n_repetitions=50)[1].mean(axis=0).flatten()

    prev = q.quantify(pool.instances)
    return prev, None
    # return q.quantify(pool_instances), None


def tee(msg):
    # writes to the (global) results file and echoes to stdout
    foo.write(msg + '\n')
    foo.flush()
    print(msg)


datasetname = 'RCV1.C4'
collection = qp.util.pickled_resource(f'./dataset/{datasetname}.pkl', create_dataset, datasetname)
nD = len(collection)

# initial labelled data selection
init_nD = 1000
init_prev = [0.5, 0.5]
idx = collection.sampling_index(init_nD, *init_prev)
train, pool = split_from_index(collection, idx)
# first_train = LabelledCollection(train.instances, train.labels)

# number of documents to label at each active learning iteration
k = 100
recall_target = 0.99

outputdir = './results'
qp.util.create_if_not_exist(outputdir)

# sampling_fn, sampling_name = relevance_sampling_index, 'relevance'
sampling_fn, sampling_name = mix_rel_unc_sampling_index, 'mix'

q_name = NewQuantifier().__class__.__name__

experiment_suffix = f'{sampling_name}_{q_name}'

i = 0
with open(os.path.join(outputdir, f'{datasetname}_{experiment_suffix}.csv'), 'wt') as foo:
    # columns: iteration, % of the collection labelled, train/pool sizes, true train/pool prevalence,
    # pool prevalence estimated by the quantifier and by CC, true recall, recall estimated from the
    # quantifier and from CC, train-pool prior shift, and absolute errors of both prevalence estimates
    tee('it\t%\ttr-size\tte-size\ttr-prev\tte-prev\tte-estim\tte-estimCC\tR\tRhat\tRhatCC\tShift\tAE\tAE_CC')
    while True:
        # estimate the prevalence of the relevant class in the pool, both with CC and with the quantifier
        pool_p_hat_cc, classifier = estimate_prev_CC(train, pool)
        pool_p_hat, _ = estimate_prev_Q(train, pool, classifier)

        tr_p = train.prevalence()
        te_p = pool.prevalence()
        nDtr = len(train)
        nDte = len(pool)

        # recall estimated from CC and from the quantifier, vs. true recall
        r_hat_cc = recall(tr_p, pool_p_hat_cc, nDtr, nDte)
        r_hat = recall(tr_p, pool_p_hat, nDtr, nDte)
        r = recall(tr_p, te_p, nDtr, nDte)

        # prior probability shift between the training set and the pool
        tr_te_shift = qp.error.ae(tr_p, te_p)

        progress = 100 * nDtr / nD

        # absolute error of both prevalence estimates
        q_ae = qp.error.ae(te_p, pool_p_hat)
        cc_ae = qp.error.ae(te_p, pool_p_hat_cc)

        tee(
            f'{i}\t{progress:.2f}\t{nDtr}\t{nDte}\t{tr_p[1]:.3f}\t{te_p[1]:.3f}\t{pool_p_hat[1]:.3f}\t{pool_p_hat_cc[1]:.3f}'
            f'\t{r:.3f}\t{r_hat:.3f}\t{r_hat_cc:.3f}\t{tr_te_shift:.5f}\t{q_ae:.4f}\t{cc_ae:.4f}')

        if nDte < k:
            break

        # select the next batch of k documents and move them from the pool to the training set
        top_relevant_idx = sampling_fn(pool, classifier, k)
        selected, pool = split_from_index(pool, top_relevant_idx)
        train = train + selected
        i += 1