import pandas as pd
from matplotlib import pyplot as plt
import h2o
import numpy as np
from collections import defaultdict
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
import argparse
import sklearn
import sklearn.model_selection
import sklearn.preprocessing
import sklearn.svm
import sklearn.metrics
from sklearn.pipeline import Pipeline
from h2o.grid.grid_search import H2OGridSearch
import copy


def getBootstrap(groups, n_bootstrap_reps, traintestset, response, seed=None, batch_variable=None):
    """
    Returns a list of (train, test) set index tuples for each repetition of the bootstrapping.
    Bootstrapping here is group-based, meaning groups (i.e. patients) are selected together and will always
    end up all in either the training or the test set together. Splitting is non-stratified, but repetitions
    with < 5 cases of each class in the training set will not be accepted.
    :param groups:
    :param n_bootstrap_reps:
    :param traintestset:
    :param response:
    :param seed:
    :param batch_variable:
    :return: list of (train, test) indices
    """
    groups = pd.Series(groups)
    ix = np.arange(len(groups))
    unique = np.unique(groups)
    np.random.RandomState(seed).shuffle(unique)
    result = []
    rep = 0
    rng = np.random.RandomState(seed)
    while len(result) < n_bootstrap_reps:
        # Group-based bootstrap split (sketch; the exact sampling scheme is an assumption based on the
        # docstring): draw groups with replacement, put all samples of the drawn groups into the training
        # set, keep the out-of-bag groups as the test set, and reject repetitions that end up with fewer
        # than 5 cases of either class in the training set.
        rep += 1
        sampled_groups = np.unique(rng.choice(unique, size=len(unique), replace=True))
        in_train = groups.isin(sampled_groups).values
        train_ix = ix[in_train]
        test_ix = ix[~in_train]
        if len(test_ix) == 0:
            continue
        train_response = traintestset.iloc[train_ix][response]
        if (train_response == 0).sum() < 5 or (train_response == 1).sum() < 5:
            continue
        result.append((train_ix, test_ix))
    return result


# NOTE: the signature of this function, its docstring, and its "median" branch are inferred by analogy
# with the "mean" branch and the call in run_classification(); treat them as a sketch.
def prepare_for_boxplot_classification(df, nr_of_rounds, reference, averagemeasure, highest_threshold_for_class0=None):
    """
    Summarizes the per-repetition predictions (columns "0".."<nr_of_rounds - 1>" and "<i>_p1") with the chosen
    average measure ("median" or "mean"), compares them to the reference labels, and flags samples whose
    averaged p1 exceeds the (non cross-validated) threshold for 100% specificity, i.e. the highest averaged
    p1 of any class-0 sample.
    """
    if averagemeasure == "median":
        df = df.assign(**{"median prediction": df[[str(x) for x in range(nr_of_rounds)]].median(axis=1)})
        df = df.assign(**{"median p1": df[[str(x)+"_p1" for x in range(nr_of_rounds)]].median(axis=1)})
        df = df.assign(**{"median prediction - reference information": df["median prediction"]-df[reference].astype(float)})
        df = df.assign(**{"median p1 - reference information": df["median p1"]-df[reference].astype(float)})
        if not highest_threshold_for_class0:
            highest_threshold_for_class0 = df[df[reference] == 0]["median p1"].max(axis=0)
        df = df.assign(**{"threshold for 100% specificity (non CV'ed)": highest_threshold_for_class0})
        df = df.assign(**{"median p1 > threshold for 100% specificity (non CV'ed)": np.where(df['median p1'] > highest_threshold_for_class0, 1, 0)})
        df = df.assign(**{"median p1 > threshold for 100% specificity (non CV'ed) - reference information": df["median p1 > threshold for 100% specificity (non CV'ed)"]-df[reference].astype(float)})
    elif averagemeasure == "mean":
        df = df.assign(**{"mean_prediction": df[[str(x) for x in range(nr_of_rounds)]].mean(axis=1)})
        df = df.assign(**{"mean_p1": df[[str(x)+"_p1" for x in range(nr_of_rounds)]].mean(axis=1)})
        df = df.assign(**{"mean_prediction - reference information": df["mean_prediction"]-df[reference].astype(float)})
        df = df.assign(**{"mean_p1 - reference information": df["mean_p1"]-df[reference].astype(float)})
        if not highest_threshold_for_class0:
            highest_threshold_for_class0 = df[df[reference] == 0]["mean_p1"].max(axis=0)
        df = df.assign(**{"threshold for 100% specificity (non CV'ed)": highest_threshold_for_class0})
        df = df.assign(**{"mean p1 > threshold for 100% specificity (non CV'ed)": np.where(df['mean_p1'] > highest_threshold_for_class0, 1, 0)})
        df = df.assign(**{"mean p1 > threshold for 100% specificity (non CV'ed) - reference information": df["mean p1 > threshold for 100% specificity (non CV'ed)"]-df[reference].astype(float)})
    df = df.sort_values(by=[reference])
    return df


def run_classification(comparisonname, df, labelsamples, unclearsamples, response, predictors, response_name,
                       alternative_reference_for_unclears="", alternative_response_name="", metalearn=False,
                       n_bootstrap_reps=100, plot_only_these_samples="", myseed=42, max_mem=50, max_threads=15):
    np.random.seed(seed=myseed)
    predictors_orig = copy.copy(predictors)
    if not alternative_reference_for_unclears:
        alternative_reference_for_unclears = response
        alternative_response_name = response_name

    # prepare H2O and dataframes for H2O
    h2o.init(max_mem_size=str(max_mem)+"G", nthreads=max_threads)
    unclear_set = df[df["sample"].isin(unclearsamples)]
    traintestset = df[df["sample"].isin(labelsamples)]
    # shuffle the order of rows in the traintestset
    traintestset = traintestset.sample(frac=1, random_state=myseed).reset_index(drop=True)
    unclear_set_h2o = h2o.H2OFrame(unclear_set)
    unclear_set_h2o[[response]] = unclear_set_h2o[[response]].asfactor()

    # store the predictions in dataframes:
    out_of_sample_predictions_classification = {}
    out_of_sample_predictions_classification["bestmodel_classification_out_of_sample"] = traintestset[[response, "sample", "patient"]]
    out_of_sample_predictions_classification["bestmodel_classification_traintestset"] = pd.concat(
        [traintestset[[response, "sample", "patient"]], unclear_set[[response, "sample", "patient"]]], ignore_index=True)
    unclears_predictions_classification = {}
    unclears_predictions_classification["best_classification_prediction_of_unclears"] = unclear_set[[response, "sample", "patient", alternative_reference_for_unclears]]
    winner_names = []

    print("\n\n############################## \n Running binary classification procedure for task %s"
          " \n using predictors %s"
          "\n############################## \n " % (comparisonname, (predictors[:20]+["..."]) if len(predictors) > 20 else predictors))

    # store the performance of the models:
    performance_dict = defaultdict(list)
    i = -1
    for train_index, test_index in getBootstrap(groups=list(traintestset["patient"].values), n_bootstrap_reps=n_bootstrap_reps,
                                                traintestset=traintestset, response=response, seed=myseed):
        i += 1
        print("Running iteration %s out of %s" % (i+1, n_bootstrap_reps))
        # prepare sets:
        test_set = traintestset.iloc[test_index]
        training_set = traintestset.iloc[train_index]
        members_class_0 = len(training_set[training_set[response] == 0].values)
        members_class_1 = len(training_set[training_set[response] == 1].values)
        # oversample the minority class up to the size of the majority class:
        oversamplingfactor_class_0 = 1.0 if members_class_0 > members_class_1 else 1/(members_class_0/(members_class_1))
        oversamplingfactor_class_1 = 1.0 if members_class_1 > members_class_0 else 1/(members_class_1/(members_class_0))

        # validation of the splitting procedure:
        with open("split_log.txt", "w" if (i == 0) else "a") as outf:
            print("This is iteration %s out of %s" % (i+1, n_bootstrap_reps), file=outf)
            print("Size of training set:", len(training_set["sample"].values), file=outf)
            print("Size of test set:", len(test_set["sample"].values), file=outf)
            print("Fraction of samples that are in test set:",
                  len(test_set["sample"].values)/(len(training_set["sample"].values)+(len(test_set["sample"].values))), file=outf)
            print("Nr. of members of class 0: ", members_class_0, file=outf)
            print("Nr. of members of class 1: ", members_class_1, file=outf)
            print("Oversampling factor for class 0: ", oversamplingfactor_class_0, file=outf)
            print("Oversampling factor for class 1: ", oversamplingfactor_class_1, file=outf)
            print("Test set: ", sorted(test_set["patient"].values), file=outf)
            print("Training set: ", sorted(training_set["patient"].values), file=outf)
            print("Test set (response)", sorted(test_set[response].values), file=outf)
            print("Training set (response) ", sorted(training_set[response].values), file=outf)
            print("\n", file=outf)

        ##### actual training:
        models_and_performances = {}
        training_set_h2o = h2o.H2OFrame(training_set)
        training_set_h2o[[response]] = training_set_h2o[[response]].asfactor()
        h2o_train_with_this = training_set_h2o
        test_set_h2o = h2o.H2OFrame(test_set)
        test_set_h2o[[response]] = test_set_h2o[[response]].asfactor()
        predictor_name = "all"

        if metalearn:
            predictors = [x for x in predictors_orig if "___%s___" % (i) in x]
            elasticnet_params = {'alpha': [.1, .5, .7, .9, .95, .99, 1]}  # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNetCV.html#sklearn.linear_model.ElasticNetCV
            elasticnet_grid = H2OGridSearch(model=H2OGeneralizedLinearEstimator(nfolds=5, fold_assignment="stratified", seed=myseed+i,
                                                                                balance_classes=True, family="binomial", lambda_search=True,
                                                                                ignore_const_cols=False,
                                                                                class_sampling_factors=[oversamplingfactor_class_0, oversamplingfactor_class_1]),
                                            hyper_params=elasticnet_params)
            elasticnet_grid.train(x=predictors, y=response, training_frame=h2o_train_with_this[predictors+[response]])
            best_model_id_net = elasticnet_grid.get_grid(sort_by='auc', decreasing=True).model_ids[0]
            net_gridwinner = h2o.get_model(best_model_id_net)
            # this saves the standardized coefficients of the model in a dataframe (for each iteration), and calculates the median
            coeff_table_this_iteration = net_gridwinner._model_json['output']['coefficients_table'].as_data_frame()
            coeff_table_this_iteration.columns = ["names", "dummy", i]
            if i == 0:
                coeff_table = coeff_table_this_iteration[["names", i]]  # .to_csv("metalearner_%s_coeffs.csv"%(i,))
            else:
                coeff_table = pd.concat([coeff_table, coeff_table_this_iteration[i]], axis="columns")
            if i == n_bootstrap_reps-1:
                coeff_table = coeff_table.assign(**{"median_standardized_coefficient": coeff_table[list(range(0, n_bootstrap_reps))].median(axis="columns")})
                coeff_table.to_csv("metalearner_coeffs.csv")
            performance_dict["metalearner_inner_CV_"+predictor_name].append(net_gridwinner.model_performance(xval=True).auc())
            print(net_gridwinner.model_performance(xval=True).auc())
            models_and_performances["metalearner_"+predictor_name] = (net_gridwinner, net_gridwinner.model_performance(xval=True).auc())

        else:  # if not metalearner
            ########## linear SVM ###########
            if True:
                # shuffle=True so that random_state has an effect on the fold assignment
                inner_cv = sklearn.model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=myseed+i+100)
                SVCpipe = Pipeline([('scale', sklearn.preprocessing.StandardScaler()),
                                    ('SVC', sklearn.svm.SVC(probability=True, random_state=myseed+i))])
                param_grid = [{'SVC__C': [2**k for k in list(range(-5, 16, 2))], 'SVC__kernel': ["linear"]}]
                linearSVC = sklearn.model_selection.GridSearchCV(SVCpipe, param_grid, cv=inner_cv, scoring="roc_auc",
                                                                 n_jobs=max_threads, return_train_score=False)
                linearSVC.fit(training_set[predictors], training_set[response].values)
                bestlinearSVC = linearSVC.best_estimator_
                print(linearSVC.best_score_)
                performance_dict["SVM_inner_CV_"+predictor_name].append(linearSVC.best_score_)
                models_and_performances["SVM_"+predictor_name] = (bestlinearSVC, linearSVC.best_score_)

            ########## MLP #############
            if True:
                mlp = H2ODeepLearningEstimator(nfolds=5, fold_assignment="stratified", seed=myseed+i, reproducible=True,
                                               balance_classes=True, shuffle_training_data=True,
                                               class_sampling_factors=[oversamplingfactor_class_0, oversamplingfactor_class_1])  # ,epochs=1000
                mlp.train(x=predictors, y=response, training_frame=h2o_train_with_this[predictors+[response]])
                hyperparams_df = pd.DataFrame(mlp.get_params())
                hyperparams_df.to_csv("MLP_hyperparameters.csv")
                print(mlp.model_performance(xval=True).auc())
                performance_dict["MLP_inner_CV_"+predictor_name].append(mlp.model_performance(xval=True).auc())
                models_and_performances["MLP_"+predictor_name] = (mlp, mlp.model_performance(xval=True).auc())

            ############# RF ##########
            if True:
                rf = H2ORandomForestEstimator(nfolds=5, fold_assignment="stratified", seed=myseed+i, balance_classes=True,
                                              class_sampling_factors=[oversamplingfactor_class_0, oversamplingfactor_class_1], ntrees=200)
                rf.train(x=predictors, y=response, training_frame=h2o_train_with_this[predictors+[response]])
                performance_dict["RF_inner_CV"].append(rf.model_performance(xval=True).auc())
                models_and_performances["RF_"+predictor_name] = (rf, rf.model_performance(xval=True).auc())
                print(rf.model_performance(xval=True).auc())

            ##### GLM with elastic net #####
            if True:
                elasticnet_params = {'alpha': [.1, .5, .7, .9, .95, .99, 1]}  # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNetCV.html#sklearn.linear_model.ElasticNetCV
                elasticnet_grid = H2OGridSearch(model=H2OGeneralizedLinearEstimator(nfolds=5, fold_assignment="stratified", seed=myseed+i,
                                                                                    balance_classes=True, family="binomial", lambda_search=True,
                                                                                    class_sampling_factors=[oversamplingfactor_class_0, oversamplingfactor_class_1]),
                                                hyper_params=elasticnet_params)
                elasticnet_grid.train(x=predictors, y=response, training_frame=h2o_train_with_this[predictors+[response]])
                best_model_id_net = elasticnet_grid.get_grid(sort_by='auc', decreasing=True).model_ids[0]
                net_gridwinner = h2o.get_model(best_model_id_net)
                performance_dict["GLM_elnet_inner_CV_"+predictor_name].append(net_gridwinner.model_performance(xval=True).auc())
                print(net_gridwinner.model_performance(xval=True).auc())
                models_and_performances["GLM_elnet_"+predictor_name] = (net_gridwinner, net_gridwinner.model_performance(xval=True).auc())
                net_gridwinner._model_json['output']['coefficients_table'].as_data_frame().to_csv("GLM_coefficients_%s.csv" % (i))

        bestmodel_name = max(models_and_performances.keys(), key=(lambda key: models_and_performances[key][1]))
        winner_names.append(bestmodel_name)
        bestmodel = models_and_performances[bestmodel_name][0]
        testset_df = pd.DataFrame()
        testset_df["sample"] = test_set["sample"].values
        trainingset_df = pd.DataFrame()
        trainingset_df["sample"] = training_set["sample"].values

        if "SVM" not in bestmodel_name:
            ## H2o version
            performance_dict["best_inner_CV"].append(bestmodel.model_performance(xval=True).auc())
            performance_dict["best_outer_CV"].append(bestmodel.model_performance(test_data=test_set_h2o).auc())
            prediction_testset = bestmodel.predict(test_set_h2o[predictors]).as_data_frame()
            testset_df[str(i)] = prediction_testset["predict"].values
            testset_df[str(i)+"_p1"] = prediction_testset["p1"].values
            prediction_trainingset = bestmodel.predict(training_set_h2o[predictors]).as_data_frame()
            trainingset_df = trainingset_df.assign(**{"___"+str(i)+"___p1": prediction_trainingset["p1"].values})
            unclears_prediction_df = bestmodel.predict(unclear_set_h2o[predictors]).as_data_frame()
        else:
            # sklearn version
            performance_dict["best_inner_CV"].append(linearSVC.best_score_)
            prediction_testset = bestmodel.predict_proba(test_set[predictors])[:, 1]
            prediction_trainingset = bestmodel.predict_proba(training_set[predictors])[:, 1]
            performance_dict["best_outer_CV"].append(sklearn.metrics.roc_auc_score(y_true=test_set[response].values, y_score=prediction_testset))
            testset_df[str(i)+"_p1"] = prediction_testset
            testset_df[str(i)] = bestmodel.predict(test_set[predictors])
            trainingset_df = trainingset_df.assign(**{"___"+str(i)+"___p1": prediction_trainingset})
            unclears_prediction_df = pd.DataFrame()
            unclears_prediction_df["p1"] = bestmodel.predict_proba(unclear_set[predictors])[:, 1]
            unclears_prediction_df["predict"] = bestmodel.predict(unclear_set[predictors])

        unclears_predictions_classification["best_classification_prediction_of_unclears"] = \
            unclears_predictions_classification["best_classification_prediction_of_unclears"].assign(**{str(i): unclears_prediction_df["predict"].values})
        unclears_predictions_classification["best_classification_prediction_of_unclears"] = \
            unclears_predictions_classification["best_classification_prediction_of_unclears"].assign(**{str(i)+"_p1": unclears_prediction_df["p1"].values})
        out_of_sample_predictions_classification["bestmodel_classification_out_of_sample"] = \
            out_of_sample_predictions_classification["bestmodel_classification_out_of_sample"].merge(
                testset_df[["sample", str(i), str(i)+"_p1"]], left_on="sample", right_on="sample", how="left")
        testset_df2 = testset_df[["sample", str(i)+"_p1"]].rename({str(i)+"_p1": "___"+str(i)+"___p1"}, axis=1)
        unclears_prediction_df2 = unclears_prediction_df[["p1"]].rename({"p1": "___"+str(i)+"___p1"}, axis=1)
        unclears_prediction_df2["sample"] = unclear_set["sample"].values
        unclears_prediction_df2 = unclears_prediction_df2[["sample", "___"+str(i)+"___p1"]]
        out_of_sample_predictions_classification["bestmodel_classification_traintestset"] = \
            out_of_sample_predictions_classification["bestmodel_classification_traintestset"].merge(
                pd.concat([trainingset_df[["sample", "___"+str(i)+"___p1"]], testset_df2, unclears_prediction_df2], ignore_index=True),
                left_on="sample", right_on="sample", how="left")

        # Remove unnecessary dataframes to save memory:
        remove_this = list(h2o.ls()["key"].values)
        try:
            remove_this.remove(unclear_set_h2o.frame_id)
        except:
            pass
        h2o.remove(remove_this)

    print("The following values are means from %s iterations of bootstrapping" % (n_bootstrap_reps))
    for key, value in performance_dict.items():
        print(key, ":", round(np.mean(value), 3))
        print(key, ":", value)

    for name, prediction_frame in out_of_sample_predictions_classification.items():
        if name == "bestmodel_classification_traintestset":
            prediction_frame.drop([response, "patient"], axis=1).to_csv("trainingset_based_predictions_for_metalearner.csv", index=False)
        if prediction_frame.empty or name == "bestmodel_classification_traintestset":
            continue
        prediction_frame = prepare_for_boxplot_classification(
            prediction_frame[["sample", "patient", response]+[str(x) for x in range(n_bootstrap_reps)]+[str(x)+"_p1" for x in range(n_bootstrap_reps)]],
            n_bootstrap_reps, response, "mean")
        prediction_frame.to_csv(name+"_predictions_%s.csv" % (comparisonname,))
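
# ---------------------------------------------------------------------------------------------
# Hypothetical usage sketch (not part of the original pipeline above): illustrates how
# run_classification() could be wired up as a command-line entry point via the argparse import.
# All file names, column names, and defaults below are illustrative assumptions.
# ---------------------------------------------------------------------------------------------
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Bootstrapped binary classification with H2O and sklearn models.")
    parser.add_argument("--input_csv", required=True,
                        help="Table with one row per sample; assumed to contain 'sample' and 'patient' columns, "
                             "the binary response column, and the predictor columns.")
    parser.add_argument("--response", default="cancer", help="Name of the binary (0/1) response column (assumed).")
    parser.add_argument("--comparisonname", default="example_comparison")
    parser.add_argument("--n_bootstrap_reps", type=int, default=100)
    parser.add_argument("--seed", type=int, default=42)
    args = parser.parse_args()

    data = pd.read_csv(args.input_csv)
    # Assumption: samples with a known 0/1 label enter the bootstrapped train/test procedure,
    # while samples with a missing response are treated as "unclear" and only receive predictions.
    labelsamples = data.loc[data[args.response].notna(), "sample"].tolist()
    unclearsamples = data.loc[data[args.response].isna(), "sample"].tolist()
    predictors = [c for c in data.columns if c not in ("sample", "patient", args.response)]

    run_classification(comparisonname=args.comparisonname,
                       df=data,
                       labelsamples=labelsamples,
                       unclearsamples=unclearsamples,
                       response=args.response,
                       predictors=predictors,
                       response_name=args.response,
                       n_bootstrap_reps=args.n_bootstrap_reps,
                       myseed=args.seed)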