import pandas as pd
from matplotlib import pyplot as plt
import h2o
import numpy as np
from collections import defaultdict
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
import argparse
import sklearn
import sklearn.model_selection
import sklearn.preprocessing
import sklearn.svm
import sklearn.metrics
from sklearn.pipeline import Pipeline
from h2o.grid.grid_search import H2OGridSearch
import copy


def getBootstrap(groups, n_bootstrap_reps, traintestset, response, seed=None, batch_variable=None):
    """
    Returns a list of (train, test) set index tuples for each repetition of the bootstrapping.
    Bootstrapping here is group-based, meaning groups (i.e. patients) are selected together and will always
    end up all in either the training or the test set together. Splitting is non-stratified, but repetitions
    with < 5 cases of each class in the training set will not be accepted.
    :param groups:
    :param n_bootstrap_reps:
    :param traintestset:
    :param response:
    :param seed:
    :param batch_variable:
    :return: list of (train, test) indices
    """
    groups = pd.Series(groups)
    ix = np.arange(len(groups))
    unique = np.unique(groups)
    np.random.RandomState(seed).shuffle(unique)
    result = []
    rep = 0
    rng = np.random.RandomState(seed)
    while len(result) < n_bootstrap_reps:
        # Group-based bootstrap split (sketch; the exact sampling scheme is an assumption based on the
        # docstring): draw groups with replacement, put all samples of the drawn groups into the training
        # set, keep the out-of-bag groups as the test set, and reject repetitions that end up with fewer
        # than 5 cases of either class in the training set.
        rep += 1
        sampled_groups = np.unique(rng.choice(unique, size=len(unique), replace=True))
        in_train = groups.isin(sampled_groups).values
        train_ix = ix[in_train]
        test_ix = ix[~in_train]
        if len(test_ix) == 0:
            continue
        train_response = traintestset.iloc[train_ix][response]
        if (train_response == 0).sum() < 5 or (train_response == 1).sum() < 5:
            continue
        result.append((train_ix, test_ix))
    return result


# NOTE: the signature of this function, its docstring, and its "median" branch are inferred by analogy
# with the "mean" branch and the call in run_classification(); treat them as a sketch.
def prepare_for_boxplot_classification(df, nr_of_rounds, reference, averagemeasure, highest_threshold_for_class0=None):
    """
    Summarizes the per-repetition predictions (columns "0".."<nr_of_rounds - 1>" and "<i>_p1") with the chosen
    average measure ("median" or "mean"), compares them to the reference labels, and flags samples whose
    averaged p1 exceeds the (non cross-validated) threshold for 100% specificity, i.e. the highest averaged
    p1 of any class-0 sample.
    """
    if averagemeasure == "median":
        df = df.assign(**{"median prediction": df[[str(x) for x in range(nr_of_rounds)]].median(axis=1)})
        df = df.assign(**{"median p1": df[[str(x)+"_p1" for x in range(nr_of_rounds)]].median(axis=1)})
        df = df.assign(**{"median prediction - reference information": df["median prediction"]-df[reference].astype(float)})
        df = df.assign(**{"median p1 - reference information": df["median p1"]-df[reference].astype(float)})
        if not highest_threshold_for_class0:
            highest_threshold_for_class0 = df[df[reference] == 0]["median p1"].max(axis=0)
        df = df.assign(**{"threshold for 100% specificity (non CV'ed)": highest_threshold_for_class0})
        df = df.assign(**{"median p1 > threshold for 100% specificity (non CV'ed)": np.where(df['median p1'] > highest_threshold_for_class0, 1, 0)})
        df = df.assign(**{"median p1 > threshold for 100% specificity (non CV'ed) - reference information": df["median p1 > threshold for 100% specificity (non CV'ed)"]-df[reference].astype(float)})
    elif averagemeasure == "mean":
        df = df.assign(**{"mean_prediction": df[[str(x) for x in range(nr_of_rounds)]].mean(axis=1)})
        df = df.assign(**{"mean_p1": df[[str(x)+"_p1" for x in range(nr_of_rounds)]].mean(axis=1)})
        df = df.assign(**{"mean_prediction - reference information": df["mean_prediction"]-df[reference].astype(float)})
        df = df.assign(**{"mean_p1 - reference information": df["mean_p1"]-df[reference].astype(float)})
        if not highest_threshold_for_class0:
            highest_threshold_for_class0 = df[df[reference] == 0]["mean_p1"].max(axis=0)
        df = df.assign(**{"threshold for 100% specificity (non CV'ed)": highest_threshold_for_class0})
        df = df.assign(**{"mean p1 > threshold for 100% specificity (non CV'ed)": np.where(df['mean_p1'] > highest_threshold_for_class0, 1, 0)})
        df = df.assign(**{"mean p1 > threshold for 100% specificity (non CV'ed) - reference information": df["mean p1 > threshold for 100% specificity (non CV'ed)"]-df[reference].astype(float)})
    df = df.sort_values(by=[reference])
    return df


def run_classification(comparisonname, df, labelsamples, unclearsamples, response, predictors, response_name,
                       alternative_reference_for_unclears="", alternative_response_name="", metalearn=False,
                       n_bootstrap_reps=100, plot_only_these_samples="", myseed=42, max_mem=50, max_threads=15):
    np.random.seed(seed=myseed)
    predictors_orig = copy.copy(predictors)
    if not alternative_reference_for_unclears:
        alternative_reference_for_unclears = response
        alternative_response_name = response_name

    # prepare H2O and dataframes for H2O
    h2o.init(max_mem_size=str(max_mem)+"G", nthreads=max_threads)
    unclear_set = df[df["sample"].isin(unclearsamples)]
    traintestset = df[df["sample"].isin(labelsamples)]
    # shuffle the order of rows in the traintestset
    traintestset = traintestset.sample(frac=1, random_state=myseed).reset_index(drop=True)
    unclear_set_h2o = h2o.H2OFrame(unclear_set)
    unclear_set_h2o[[response]] = unclear_set_h2o[[response]].asfactor()

    # store the predictions in dataframes:
    out_of_sample_predictions_classification = {}
    out_of_sample_predictions_classification["bestmodel_classification_out_of_sample"] = traintestset[[response, "sample", "patient"]]
    out_of_sample_predictions_classification["bestmodel_classification_traintestset"] = pd.concat(
        [traintestset[[response, "sample", "patient"]], unclear_set[[response, "sample", "patient"]]], ignore_index=True)
    unclears_predictions_classification = {}
    unclears_predictions_classification["best_classification_prediction_of_unclears"] = unclear_set[[response, "sample", "patient", alternative_reference_for_unclears]]
    winner_names = []

    print("\n\n############################## \n Running binary classification procedure for task %s"
          " \n using predictors %s"
          "\n############################## \n " % (comparisonname, (predictors[:20]+["..."]) if len(predictors) > 20 else predictors))

    # store the performance of the models:
    performance_dict = defaultdict(list)
    i = -1
    for train_index, test_index in getBootstrap(groups=list(traintestset["patient"].values), n_bootstrap_reps=n_bootstrap_reps,
                                                traintestset=traintestset, response=response, seed=myseed):
        i += 1
        print("Running iteration %s out of %s" % (i+1, n_bootstrap_reps))
        # prepare sets:
        test_set = traintestset.iloc[test_index]
        training_set = traintestset.iloc[train_index]
        members_class_0 = len(training_set[training_set[response] == 0].values)
        members_class_1 = len(training_set[training_set[response] == 1].values)
        # oversample the minority class up to the size of the majority class:
        oversamplingfactor_class_0 = 1.0 if members_class_0 > members_class_1 else 1/(members_class_0/(members_class_1))
        oversamplingfactor_class_1 = 1.0 if members_class_1 > members_class_0 else 1/(members_class_1/(members_class_0))

        # validation of the splitting procedure:
        with open("split_log.txt", "w" if (i == 0) else "a") as outf:
            print("This is iteration %s out of %s" % (i+1, n_bootstrap_reps), file=outf)
            print("Size of training set:", len(training_set["sample"].values), file=outf)
            print("Size of test set:", len(test_set["sample"].values), file=outf)
            print("Fraction of samples that are in test set:",
                  len(test_set["sample"].values)/(len(training_set["sample"].values)+(len(test_set["sample"].values))), file=outf)
            print("Nr. of members of class 0: ", members_class_0, file=outf)
            print("Nr. of members of class 1: ", members_class_1, file=outf)
            print("Oversampling factor for class 0: ", oversamplingfactor_class_0, file=outf)
            print("Oversampling factor for class 1: ", oversamplingfactor_class_1, file=outf)
            print("Test set: ", sorted(test_set["patient"].values), file=outf)
            print("Training set: ", sorted(training_set["patient"].values), file=outf)
            print("Test set (response)", sorted(test_set[response].values), file=outf)
            print("Training set (response) ", sorted(training_set[response].values), file=outf)
            print("\n", file=outf)

        ##### actual training:
        models_and_performances = {}
        training_set_h2o = h2o.H2OFrame(training_set)
        training_set_h2o[[response]] = training_set_h2o[[response]].asfactor()
        h2o_train_with_this = training_set_h2o
        test_set_h2o = h2o.H2OFrame(test_set)
        test_set_h2o[[response]] = test_set_h2o[[response]].asfactor()
        predictor_name = "all"

        if metalearn:
            predictors = [x for x in predictors_orig if "___%s___" % (i) in x]
            elasticnet_params = {'alpha': [.1, .5, .7, .9, .95, .99, 1]}  # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNetCV.html#sklearn.linear_model.ElasticNetCV
            elasticnet_grid = H2OGridSearch(model=H2OGeneralizedLinearEstimator(nfolds=5, fold_assignment="stratified", seed=myseed+i,
                                                                                balance_classes=True, family="binomial", lambda_search=True,
                                                                                ignore_const_cols=False,
                                                                                class_sampling_factors=[oversamplingfactor_class_0, oversamplingfactor_class_1]),
                                            hyper_params=elasticnet_params)
            elasticnet_grid.train(x=predictors, y=response, training_frame=h2o_train_with_this[predictors+[response]])
            best_model_id_net = elasticnet_grid.get_grid(sort_by='auc', decreasing=True).model_ids[0]
            net_gridwinner = h2o.get_model(best_model_id_net)
            # this saves the standardized coefficients of the model in a dataframe (for each iteration), and calculates the median
            coeff_table_this_iteration = net_gridwinner._model_json['output']['coefficients_table'].as_data_frame()
            coeff_table_this_iteration.columns = ["names", "dummy", i]
            if i == 0:
                coeff_table = coeff_table_this_iteration[["names", i]]  # .to_csv("metalearner_%s_coeffs.csv"%(i,))
            else:
                coeff_table = pd.concat([coeff_table, coeff_table_this_iteration[i]], axis="columns")
            if i == n_bootstrap_reps-1:
                coeff_table = coeff_table.assign(**{"median_standardized_coefficient": coeff_table[list(range(0, n_bootstrap_reps))].median(axis="columns")})
                coeff_table.to_csv("metalearner_coeffs.csv")
            performance_dict["metalearner_inner_CV_"+predictor_name].append(net_gridwinner.model_performance(xval=True).auc())
            print(net_gridwinner.model_performance(xval=True).auc())
            models_and_performances["metalearner_"+predictor_name] = (net_gridwinner, net_gridwinner.model_performance(xval=True).auc())

        else:  # if not metalearner
            ########## linear SVM ###########
            if True:
                # shuffle=True so that random_state has an effect on the fold assignment
                inner_cv = sklearn.model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=myseed+i+100)
                SVCpipe = Pipeline([('scale', sklearn.preprocessing.StandardScaler()),
                                    ('SVC', sklearn.svm.SVC(probability=True, random_state=myseed+i))])
                param_grid = [{'SVC__C': [2**k for k in list(range(-5, 16, 2))], 'SVC__kernel': ["linear"]}]
                linearSVC = sklearn.model_selection.GridSearchCV(SVCpipe, param_grid, cv=inner_cv, scoring="roc_auc",
                                                                 n_jobs=max_threads, return_train_score=False)
                linearSVC.fit(training_set[predictors], training_set[response].values)
                bestlinearSVC = linearSVC.best_estimator_
                print(linearSVC.best_score_)
                performance_dict["SVM_inner_CV_"+predictor_name].append(linearSVC.best_score_)
                models_and_performances["SVM_"+predictor_name] = (bestlinearSVC, linearSVC.best_score_)

            ########## MLP #############
            if True:
                mlp = H2ODeepLearningEstimator(nfolds=5, fold_assignment="stratified", seed=myseed+i, reproducible=True,
                                               balance_classes=True, shuffle_training_data=True,
                                               class_sampling_factors=[oversamplingfactor_class_0, oversamplingfactor_class_1])  # ,epochs=1000
                mlp.train(x=predictors, y=response, training_frame=h2o_train_with_this[predictors+[response]])
                hyperparams_df = pd.DataFrame(mlp.get_params())
                hyperparams_df.to_csv("MLP_hyperparameters.csv")
                print(mlp.model_performance(xval=True).auc())
                performance_dict["MLP_inner_CV_"+predictor_name].append(mlp.model_performance(xval=True).auc())
                models_and_performances["MLP_"+predictor_name] = (mlp, mlp.model_performance(xval=True).auc())

            ############# RF ##########
            if True:
                rf = H2ORandomForestEstimator(nfolds=5, fold_assignment="stratified", seed=myseed+i, balance_classes=True,
                                              class_sampling_factors=[oversamplingfactor_class_0, oversamplingfactor_class_1], ntrees=200)
                rf.train(x=predictors, y=response, training_frame=h2o_train_with_this[predictors+[response]])
                performance_dict["RF_inner_CV"].append(rf.model_performance(xval=True).auc())
                models_and_performances["RF_"+predictor_name] = (rf, rf.model_performance(xval=True).auc())
                print(rf.model_performance(xval=True).auc())

            ##### GLM with elastic net #####
            if True:
                elasticnet_params = {'alpha': [.1, .5, .7, .9, .95, .99, 1]}  # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNetCV.html#sklearn.linear_model.ElasticNetCV
                elasticnet_grid = H2OGridSearch(model=H2OGeneralizedLinearEstimator(nfolds=5, fold_assignment="stratified", seed=myseed+i,
                                                                                    balance_classes=True, family="binomial", lambda_search=True,
                                                                                    class_sampling_factors=[oversamplingfactor_class_0, oversamplingfactor_class_1]),
                                                hyper_params=elasticnet_params)
                elasticnet_grid.train(x=predictors, y=response, training_frame=h2o_train_with_this[predictors+[response]])
                best_model_id_net = elasticnet_grid.get_grid(sort_by='auc', decreasing=True).model_ids[0]
                net_gridwinner = h2o.get_model(best_model_id_net)
                performance_dict["GLM_elnet_inner_CV_"+predictor_name].append(net_gridwinner.model_performance(xval=True).auc())
                print(net_gridwinner.model_performance(xval=True).auc())
                models_and_performances["GLM_elnet_"+predictor_name] = (net_gridwinner, net_gridwinner.model_performance(xval=True).auc())
                net_gridwinner._model_json['output']['coefficients_table'].as_data_frame().to_csv("GLM_coefficients_%s.csv" % (i))

        bestmodel_name = max(models_and_performances.keys(), key=(lambda key: models_and_performances[key][1]))
        winner_names.append(bestmodel_name)
        bestmodel = models_and_performances[bestmodel_name][0]
        testset_df = pd.DataFrame()
        testset_df["sample"] = test_set["sample"].values
        trainingset_df = pd.DataFrame()
        trainingset_df["sample"] = training_set["sample"].values

        if "SVM" not in bestmodel_name:
            ## H2o version
            performance_dict["best_inner_CV"].append(bestmodel.model_performance(xval=True).auc())
            performance_dict["best_outer_CV"].append(bestmodel.model_performance(test_data=test_set_h2o).auc())
            prediction_testset = bestmodel.predict(test_set_h2o[predictors]).as_data_frame()
            testset_df[str(i)] = prediction_testset["predict"].values
            testset_df[str(i)+"_p1"] = prediction_testset["p1"].values
            prediction_trainingset = bestmodel.predict(training_set_h2o[predictors]).as_data_frame()
            trainingset_df = trainingset_df.assign(**{"___"+str(i)+"___p1": prediction_trainingset["p1"].values})
            unclears_prediction_df = bestmodel.predict(unclear_set_h2o[predictors]).as_data_frame()
        else:
            # sklearn version
            performance_dict["best_inner_CV"].append(linearSVC.best_score_)
            prediction_testset = bestmodel.predict_proba(test_set[predictors])[:, 1]
            prediction_trainingset = bestmodel.predict_proba(training_set[predictors])[:, 1]
            performance_dict["best_outer_CV"].append(sklearn.metrics.roc_auc_score(y_true=test_set[response].values, y_score=prediction_testset))
            testset_df[str(i)+"_p1"] = prediction_testset
            testset_df[str(i)] = bestmodel.predict(test_set[predictors])
            trainingset_df = trainingset_df.assign(**{"___"+str(i)+"___p1": prediction_trainingset})
            unclears_prediction_df = pd.DataFrame()
            unclears_prediction_df["p1"] = bestmodel.predict_proba(unclear_set[predictors])[:, 1]
            unclears_prediction_df["predict"] = bestmodel.predict(unclear_set[predictors])

        unclears_predictions_classification["best_classification_prediction_of_unclears"] = \
            unclears_predictions_classification["best_classification_prediction_of_unclears"].assign(**{str(i): unclears_prediction_df["predict"].values})
        unclears_predictions_classification["best_classification_prediction_of_unclears"] = \
            unclears_predictions_classification["best_classification_prediction_of_unclears"].assign(**{str(i)+"_p1": unclears_prediction_df["p1"].values})
        out_of_sample_predictions_classification["bestmodel_classification_out_of_sample"] = \
            out_of_sample_predictions_classification["bestmodel_classification_out_of_sample"].merge(
                testset_df[["sample", str(i), str(i)+"_p1"]], left_on="sample", right_on="sample", how="left")
        testset_df2 = testset_df[["sample", str(i)+"_p1"]].rename({str(i)+"_p1": "___"+str(i)+"___p1"}, axis=1)
        unclears_prediction_df2 = unclears_prediction_df[["p1"]].rename({"p1": "___"+str(i)+"___p1"}, axis=1)
        unclears_prediction_df2["sample"] = unclear_set["sample"].values
        unclears_prediction_df2 = unclears_prediction_df2[["sample", "___"+str(i)+"___p1"]]
        out_of_sample_predictions_classification["bestmodel_classification_traintestset"] = \
            out_of_sample_predictions_classification["bestmodel_classification_traintestset"].merge(
                pd.concat([trainingset_df[["sample", "___"+str(i)+"___p1"]], testset_df2, unclears_prediction_df2], ignore_index=True),
                left_on="sample", right_on="sample", how="left")

        # Remove unnecessary dataframes to save memory:
        remove_this = list(h2o.ls()["key"].values)
        try:
            remove_this.remove(unclear_set_h2o.frame_id)
        except:
            pass
        h2o.remove(remove_this)

    print("The following values are means from %s iterations of bootstrapping" % (n_bootstrap_reps))
    for key, value in performance_dict.items():
        print(key, ":", round(np.mean(value), 3))
        print(key, ":", value)

    for name, prediction_frame in out_of_sample_predictions_classification.items():
        if name == "bestmodel_classification_traintestset":
            prediction_frame.drop([response, "patient"], axis=1).to_csv("trainingset_based_predictions_for_metalearner.csv", index=False)
        if prediction_frame.empty or name == "bestmodel_classification_traintestset":
            continue
        prediction_frame = prepare_for_boxplot_classification(
            prediction_frame[["sample", "patient", response]+[str(x) for x in range(n_bootstrap_reps)]+[str(x)+"_p1" for x in range(n_bootstrap_reps)]],
            n_bootstrap_reps, response, "mean")
        prediction_frame.to_csv(name+"_predictions_%s.csv" % (comparisonname,))
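
# ---------------------------------------------------------------------------------------------
# Hypothetical usage sketch (not part of the original pipeline above): illustrates how
# run_classification() could be wired up as a command-line entry point via the argparse import.
# All file names, column names, and defaults below are illustrative assumptions.
# ---------------------------------------------------------------------------------------------
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Bootstrapped binary classification with H2O and sklearn models.")
    parser.add_argument("--input_csv", required=True,
                        help="Table with one row per sample; assumed to contain 'sample' and 'patient' columns, "
                             "the binary response column, and the predictor columns.")
    parser.add_argument("--response", default="cancer", help="Name of the binary (0/1) response column (assumed).")
    parser.add_argument("--comparisonname", default="example_comparison")
    parser.add_argument("--n_bootstrap_reps", type=int, default=100)
    parser.add_argument("--seed", type=int, default=42)
    args = parser.parse_args()

    data = pd.read_csv(args.input_csv)
    # Assumption: samples with a known 0/1 label enter the bootstrapped train/test procedure,
    # while samples with a missing response are treated as "unclear" and only receive predictions.
    labelsamples = data.loc[data[args.response].notna(), "sample"].tolist()
    unclearsamples = data.loc[data[args.response].isna(), "sample"].tolist()
    predictors = [c for c in data.columns if c not in ("sample", "patient", args.response)]

    run_classification(comparisonname=args.comparisonname,
                       df=data,
                       labelsamples=labelsamples,
                       unclearsamples=unclearsamples,
                       response=args.response,
                       predictors=predictors,
                       response_name=args.response,
                       n_bootstrap_reps=args.n_bootstrap_reps,
                       myseed=args.seed)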