{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Figure 6: Machine-learning based detection of EwS and distinction from other sarcomas**\n",
    "\n",
    "Here, we are using a table (\"ML_input_features.xlsx\") containing all the features required as input, including features from global fragmentation, regional fragmentation & read depth, as well as features based on fragment coverage at EwS-specific DHSs. How these metrics were generated is outlined in the previous notebooks."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%bash\n",
    "# -nc: skip the download if the file is already present (without it, a re-run stores a second copy as ML_input_features.xlsx.1)\n",
    "wget -nc http://medical-epigenomics.org/papers/peneder2020_f17c4e3befc643ffbb31e69f43630748/data/ML_input_features.xlsx"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from scipy import stats\n",
    "import sys\n",
    "import os\n",
    "from contextlib import contextmanager\n",
    "sys.path.insert(0, os.getcwd())\n",
    "import binary_classifier_considering_patients as binary_classifier\n",
    "import subprocess\n",
    "from datetime import datetime\n",
    "from sklearn.linear_model import LinearRegression\n",
    "import pickle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# settings\n",
    "max_threads=10\n",
    "max_mem=100\n",
    "myseed=42\n",
    "metalearn=True\n",
    "run_baselearners=True\n",
    "skip_baselearn_if_present=True\n",
    "n_bootstrap_reps=10"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The functions in this cell are used in all ML experiments.\n",
    "# The actual training and testing is implemented in binary_classifier_considering_patients.py\n",
    "# (imported above as binary_classifier).\n",
    "\n",
    "@contextmanager\n",
    "def _working_directory(path):\n",
    "    \"\"\"Create `path` if necessary, change into it and always change back to the previous directory.\n",
    "\n",
    "    Changing back in a `finally` clause makes sure that a failing run does not leave the notebook\n",
    "    inside one of the nested output directories.\n",
    "    \"\"\"\n",
    "    previous_directory=os.getcwd()\n",
    "    os.makedirs(path,exist_ok=True)\n",
    "    os.chdir(path)\n",
    "    try:\n",
    "        yield\n",
    "    finally:\n",
    "        os.chdir(previous_directory)\n",
    "\n",
    "\n",
    "def run_classification(name, traintestset, unclear_set, response, response_name,alternative_reference_for_unclears,alternative_response_name,\n",
    "                      n_bootstrap_reps=n_bootstrap_reps):\n",
    "    \"\"\"Call the training and testing procedure for all feature sets and for the metalearner.\n",
    "\n",
    "    This is repeated for the three collections of feature sets (predictorset_fullx, predictorset_1x and\n",
    "    predictorset_0point1x; these globals have to be defined before the function is called).\n",
    "    All results are written to the directory `name` inside the current working directory:\n",
    "    one sub-directory per feature set and one \"METALEARNER_*\" sub-directory per collection.\n",
    "    The working directory is restored when the function returns or fails.\n",
    "\n",
    "    Parameters:\n",
    "        name: name of the comparison; used as output directory and as suffix of the result files.\n",
    "        traintestset: DataFrame with the labelled samples (columns \"sample\", \"patient\", `response` and the features).\n",
    "        unclear_set: DataFrame with the samples that are passed to the classifier as unclear samples.\n",
    "        response: name of the column that holds the label.\n",
    "        response_name: passed on unchanged to binary_classifier.run_classification.\n",
    "        alternative_reference_for_unclears: name of a column that is passed on for the unclear samples.\n",
    "        alternative_response_name: passed on unchanged to binary_classifier.run_classification.\n",
    "        n_bootstrap_reps: number of bootstrap repetitions passed on to the classifier.\n",
    "    \"\"\"\n",
    "    result_csv=\"bestmodel_classification_out_of_sample_predictions_\"+name+\".csv\"\n",
    "    all_samples=pd.concat([traintestset,unclear_set],axis=0)\n",
    "    labelsamples=list(traintestset[\"sample\"])\n",
    "    unclearsamples=list(unclear_set[\"sample\"])\n",
    "\n",
    "    with _working_directory(name):\n",
    "        for metalearn_dirname,predictors_w_names in [(\"METALEARNER_fullx\",predictorset_fullx),\n",
    "                                                     (\"METALEARNER_1x\",predictorset_1x),\n",
    "                                                     (\"METALEARNER_0.1x\", predictorset_0point1x)]:\n",
    "\n",
    "            # collects the predictions of each classifier (read from its output); these are the input features of the metalearner\n",
    "            trainingsetbased_p1=pd.DataFrame()\n",
    "\n",
    "            for predictorelem in predictors_w_names:\n",
    "                predictor_name=predictorelem[0]\n",
    "                predictors=predictorelem[1]\n",
    "                print(\"Running for\",predictor_name)\n",
    "\n",
    "                with _working_directory(predictor_name):\n",
    "                    already_present=skip_baselearn_if_present and os.path.isfile(result_csv)\n",
    "                    if run_baselearners and not already_present:\n",
    "                        cols=predictors+[\"sample\",\"patient\",response]\n",
    "                        if alternative_reference_for_unclears not in predictors:\n",
    "                            cols=cols+[alternative_reference_for_unclears]\n",
    "\n",
    "                        # run the actual classification using the given set of features\n",
    "                        binary_classifier.run_classification(comparisonname=name,df=all_samples[cols],\n",
    "                                                             labelsamples=labelsamples,\n",
    "                                                             unclearsamples=unclearsamples,\n",
    "                                                             response=response,predictors=predictors, response_name=response_name,\n",
    "                                                             alternative_reference_for_unclears=alternative_reference_for_unclears,\n",
    "                                                             alternative_response_name=alternative_response_name,\n",
    "                                                             n_bootstrap_reps=n_bootstrap_reps,max_threads=max_threads,max_mem=max_mem)\n",
    "\n",
    "                    # read the predictions of this classifier; the columns get the classifier name as prefix\n",
    "                    # such that the tables of all classifiers can be merged for the metalearner\n",
    "                    p1_thisclassifier=pd.read_csv(\"trainingset_based_predictions_for_metalearner.csv\")\n",
    "                    p1_thisclassifier=p1_thisclassifier.rename(columns={x:predictor_name+x for x in p1_thisclassifier.columns if x!=\"sample\"})\n",
    "\n",
    "                if trainingsetbased_p1.empty:\n",
    "                    trainingsetbased_p1=p1_thisclassifier\n",
    "                else:\n",
    "                    trainingsetbased_p1=pd.merge(trainingsetbased_p1,p1_thisclassifier,on=\"sample\",how=\"inner\")\n",
    "\n",
    "            ## metalearning :\n",
    "            if metalearn:\n",
    "                # Skip only if the result file of the metalearner exists (this is the file the plotting code\n",
    "                # reads from the METALEARNER_* directory). Checking only for the directory would mean\n",
    "                # that an interrupted run is never repeated.\n",
    "                if os.path.isfile(os.path.join(metalearn_dirname,result_csv)):\n",
    "                    print(\"Metalearning already done. skipping\")\n",
    "                    continue\n",
    "\n",
    "                metainput=pd.merge(all_samples,trainingsetbased_p1,on=\"sample\",how=\"inner\")\n",
    "                metapreds=[x for x in trainingsetbased_p1.columns if x!=\"sample\"]\n",
    "\n",
    "                with _working_directory(metalearn_dirname):\n",
    "                    # run the actual classification using the predictions of the previously run classifiers as input features\n",
    "                    binary_classifier.run_classification(comparisonname=name,df=metainput,\n",
    "                                                         labelsamples=labelsamples,\n",
    "                                                         unclearsamples=unclearsamples,\n",
    "                                                         response=response,predictors=metapreds,\n",
    "                                                         response_name=response_name,\n",
    "                                                         alternative_reference_for_unclears=alternative_reference_for_unclears,\n",
    "                                                         alternative_response_name=alternative_response_name,\n",
    "                                                         n_bootstrap_reps=n_bootstrap_reps,max_threads=max_threads,\n",
    "                                                         metalearn=True)\n",
    "\n",
    "\n",
    "def run_w_shuffled_lables(workdir,traintestset,response,nr_outerouter_folds_start,nr_outerouter_folds_end,name):\n",
    "    \"\"\"Repeat the classification with randomly permuted labels.\n",
    "\n",
    "    For every i from nr_outerouter_folds_start to nr_outerouter_folds_end (both included) the values of\n",
    "    `response` are permuted among the samples of `traintestset` (numpy seed: myseed+1000+i) and\n",
    "    run_classification is called with one bootstrap repetition. The results are written to\n",
    "    <workdir>/RANDOMIZED_labels_traintestset/<i>/. All samples of the global table `df` that are not\n",
    "    part of `traintestset` are used as unclear samples. The working directory is restored afterwards.\n",
    "    \"\"\"\n",
    "    ### with shuffled labels:\n",
    "    with _working_directory(os.path.join(workdir,\"RANDOMIZED_labels_traintestset\")):\n",
    "        for i in range(nr_outerouter_folds_start,nr_outerouter_folds_end+1):\n",
    "            np.random.seed(seed=myseed+1000+i)\n",
    "            with _working_directory(str(i)):\n",
    "                shuffledtraintestset=traintestset.assign(**{response:np.random.permutation(traintestset[response].values)})\n",
    "                unclear_set=df[~df[\"sample\"].isin(shuffledtraintestset[\"sample\"])]\n",
    "                unclear_set=unclear_set.assign(**{response:np.nan})\n",
    "                run_classification(name=name,traintestset=shuffledtraintestset,\n",
    "                                   unclear_set=unclear_set,response=response,response_name=\"RANDOM\"+response,\n",
    "                                   alternative_reference_for_unclears=\"is_genomic_tumor_evidence_available\",\n",
    "                                   alternative_response_name=\"Genomic tumor evidence available\",n_bootstrap_reps=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# load data\n",
    "df=pd.read_excel(\"ML_input_features.xlsx\")\n",
    "\n",
    "# Define the feature-sets for each of the coverage levels 12x, 1x, and 0.1x\n",
    "# to keep things simple, lists containing the feature-sets are already pickled and just loaded here:\n",
    "# (NOTE: unpickling can execute arbitrary code - only load pickle files from a trusted source)\n",
    "with open(\"predictorset_fullx.pickle\", \"rb\") as fp:\n",
    "    predictorset_fullx=pickle.load(fp)\n",
    "with open(\"predictorset_1x.pickle\", \"rb\") as fp:\n",
    "    predictorset_1x=pickle.load(fp)\n",
    "with open(\"predictorset_0point1x.pickle\", \"rb\") as fp:\n",
    "    predictorset_0point1x=pickle.load(fp)\n",
    "# df_all keeps all samples, df only the samples that are not of the type \"Non-EwS sarcoma\"\n",
    "df_all=df.copy()\n",
    "df=df[df[\"Sample type\"]!=\"Non-EwS sarcoma\"]\n",
    "\n",
    "# Define control sets\n",
    "our_controls=df[df[\"sample\"].str.contains(\"Ctrl\")]\n",
    "crist_controls=df[df[\"sample\"].str.contains(\"EGAR\")]\n",
    "ulz_ctrls=df[df[\"sample\"].str.contains(\"NPH\")]\n",
    "controls=pd.concat([our_controls,crist_controls,ulz_ctrls],axis=0)\n",
"non_ews_cancers=df[df[\"Sample type\"]==\"Non-EwS sarcoma\"]\n", "\n", "\n", "# Start the ML experiments\n", "if True:\n", " ##### Clinical evidence for tumor: YES vs. healthy CTRLs (seperately for each control set) #####\n", " for controlsetname, controlset in [(\"our_ctrls_only\",our_controls),(\"crist_ctrl_only\",crist_controls),(\"ulz_ctrl_only\",ulz_ctrls)]:\n", "\n", " os.makedirs(controlsetname,exist_ok=True)\n", " os.chdir(controlsetname)\n", " response=\"clinical data indicating presence of tumor (PET-SCAN, MRI, CT)\"\n", " clinical_evidence_yes=df[df[response]==\"yes\"]\n", " clinical_evidence_yes=clinical_evidence_yes.assign(**{response:1})\n", " controlset=controlset.assign(**{response:0})\n", " traintestset=pd.concat([clinical_evidence_yes,controlset],axis=0)\n", " unclear_set=df[~df[\"sample\"].isin(traintestset[\"sample\"])]\n", " unclear_set=unclear_set.assign(**{response:np.nan})\n", " name=\"Clinical_evidence_for_tumor_YES__vs__healthy_CTRLs\"\n", " run_classification(name=name,traintestset=traintestset,\n", " unclear_set=unclear_set,response=response,response_name=\"Clinical tumor evidence\",\n", " alternative_reference_for_unclears=\"is_genomic_tumor_evidence_available\",\n", " alternative_response_name=\"Genomic tumor evidence available\",\n", " n_bootstrap_reps=n_bootstrap_reps)\n", " os.chdir(\"..\")\n", "\n", "if False:\n", " ##### OUR vs CRISTIANO CTRLS #####\n", " response=\"is_crist_ctrl\"\n", " crist_controls1=crist_controls.assign(**{response:1})\n", " our_controls1=our_controls.assign(**{response:0})\n", " traintestset=pd.concat([our_controls1,crist_controls1],axis=0)\n", " unclear_set=df[~df[\"sample\"].isin(traintestset[\"sample\"])]\n", " unclear_set=unclear_set.assign(**{response:np.nan})\n", " name=\"is_crist_ctrl\"\n", " run_classification(name=name,traintestset=traintestset,\n", " unclear_set=unclear_set,response=response,response_name=\"Is cristiano et al ctrl\",\n", " 
alternative_reference_for_unclears=\"is_genomic_tumor_evidence_available\",\n", " alternative_response_name=\"Genomic tumor evidence available\")\n", "if False:\n", " ##### OUR vs ULZ CTRLS #####\n", " response=\"is_ulz_ctrl\"\n", " ulz_controls2=ulz_ctrls.assign(**{response:1})\n", " our_controls2=our_controls.assign(**{response:0})\n", " traintestset=pd.concat([our_controls2,ulz_controls2],axis=0)\n", " unclear_set=df[~df[\"sample\"].isin(traintestset[\"sample\"])]\n", " unclear_set=unclear_set.assign(**{response:np.nan})\n", " name=\"is_ulz_ctrl\"\n", " run_classification(name=name,traintestset=traintestset,\n", " unclear_set=unclear_set,response=response,response_name=\"Is ulz et al ctrl\",\n", " alternative_reference_for_unclears=\"is_genomic_tumor_evidence_available\",\n", " alternative_response_name=\"Genomic tumor evidence available\")\n", "\n", "if False:\n", " #### Diagnostic EwS vs Ctrls from this study\n", " response=\"is_diag_EwS\"\n", " diag=df[df[\"sample timepoint\"]==\"diagnosis\"]\n", " diag=diag.assign(**{response:1})\n", " controls_for_this_experiment=our_controls.assign(**{response:0})\n", " traintestset=pd.concat([diag,controls_for_this_experiment],axis=0)\n", " unclear_set=df[~df[\"sample\"].isin(traintestset[\"sample\"])]\n", " unclear_set=unclear_set.assign(**{response:np.nan})\n", " name=\"diagnostic_EwS_vs_healthy\"\n", " run_classification(name=name,traintestset=traintestset,\n", " unclear_set=unclear_set,response=response,response_name=\"Diagnostic EwS sample\",\n", " alternative_reference_for_unclears=\"is_genomic_tumor_evidence_available\",\n", " alternative_response_name=\"Genomic tumor evidence available\")\n", " os.chdir(\"..\")\n", " \n", "if False:\n", " ##### EwS vs non-EwS samples - both with genomic evidence for tumor ######\n", " response=\"is_ewing_w_gen_evidence_not_nonewingcancerinclEwslike_w_gen_evidence\"\n", " ews_genomic_evidence_yes=df[df[\"is_genomic_tumor_evidence_available\"]==1]\n", " 
ews_genomic_evidence_yes=ews_genomic_evidence_yes.assign(**{response:1})\n", " nonews_cancer=df_all[(df_all[\"sample\"].isin(non_ews_cancers))].assign(**{response:0})\n", " nonews_genomic_evidence_yes=nonews_cancer[nonews_cancer[\"is_genomic_tumor_evidence_available\"]==1]\n", " nonews_genomic_evidence_yes=nonews_genomic_evidence_yes.assign(**{response:0})\n", " traintestset=pd.concat([ews_genomic_evidence_yes,\n", " nonews_genomic_evidence_yes],axis=0)\n", " unclear_set=df[~df[\"sample\"].isin(traintestset[\"sample\"])]\n", " unclear_set=unclear_set.assign(**{response:np.nan})\n", " unclear_set=unclear_set.assign(**{\"dummy\":np.nan})\n", " name=\"ewing_w_gen_evidence_not_nonewingcancerinclEwslike_w_gen_evidence\"\n", " run_classification(name=name,traintestset=traintestset,\n", " unclear_set=unclear_set,response=response,response_name=\"Is EwS sample w. tumor ev., not other cancer w. tumor ev.\",\n", " alternative_reference_for_unclears=\"dummy\",\n", " alternative_response_name=\"No information available\")" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "## To summarize the performance of different classifiers in one plot:\n", "\n", "import sklearn\n", "from matplotlib import pyplot as plt\n", "from matplotlib.font_manager import FontProperties\n", "from sklearn.metrics import roc_curve, precision_recall_curve, auc,average_precision_score\n", "import pandas as pd\n", "from scipy import interp\n", "import numpy as np\n", "from collections import defaultdict\n", "import sys\n", "import glob\n", "from matplotlib import rc\n", "rc('font',**{'sans-serif':['Arial']})\n", "np.seterr(all='raise')\n", "\n", "\n", "def plot_ROC_curves(outname, response, csvname,n_bootstrap_its, featureset_paths_and_names,combine_controlsets=True,\n", "use_only_ulz_ctrls=False,\n", "use_only_cristiano_ctrls=False,\n", "use_only_our_ctrls=False,sort_by_AUC=True,restrict_to_these_testset_ews_samples=None):\n", " \n", " 
plt.gcf().set_size_inches(4,4)\n", "\n", " table_text=[]\n", " rownames=[]\n", " colors=[]\n", " tabledict={}\n", "\n", " color_base=[\"cadetblue\",\"coral\",\"mediumseagreen\",\"firebrick\",\"#9467bd\"]\n", " def get_color(name):\n", " if \"Global\" in name:\n", " return color_base[0]\n", " if \"DHS\" in name:\n", " return color_base[1]\n", " if \"depth\" in name:\n", " return color_base[2]\n", " if \"Regional\" in name:\n", " return color_base[3]\n", " if \"Meta\" in name:\n", " return color_base[4]\n", " else:\n", " return \"black\"\n", "\n", " for j in range(0,len(featureset_paths_and_names),2): # for every feature-set (folder name and label)\n", " tprs = []\n", " aucs = []\n", " mean_fpr = np.linspace(0, 1, 10000)\n", " mean_sensitivity_at_100spec=[]\n", " mean_sensitivity_at_95spec=[]\n", " name=(featureset_paths_and_names[j+1]).replace(\"__\",\"\\n\")\n", "\n", " parentname=csvname.replace(\"bestmodel_classification_out_of_sample_predictions_\",\"\").replace(\".csv\",\"\")\n", "\n", " print(featureset_paths_and_names[j])\n", " if combine_controlsets==False and use_only_our_ctrls==False:\n", " df_our=pd.DataFrame()\n", " else:\n", " df_our=pd.read_csv(\"our_ctrls_only/\"+parentname+\"/\"+featureset_paths_and_names[j]+\"/%s\"%(csvname))[[response,\"sample\"]+[\"%s_p1\"%(idx) for idx in range(n_bootstrap_its)]]\n", " if combine_controlsets==False and use_only_cristiano_ctrls==False:\n", " df_crist=pd.DataFrame()\n", " else:\n", " df_crist=pd.read_csv(\"crist_ctrl_only/\"+parentname+\"/\"+featureset_paths_and_names[j]+\"/%s\"%(csvname))[[response,\"sample\"]+[\"%s_p1\"%(idx) for idx in range(n_bootstrap_its)]]\n", " if combine_controlsets==False and use_only_ulz_ctrls==False:\n", " df_ulz=pd.DataFrame()\n", " else:\n", " df_ulz=pd.read_csv(\"ulz_ctrl_only/\"+parentname+\"/\"+featureset_paths_and_names[j]+\"/%s\"%(csvname))[[response,\"sample\"]+[\"%s_p1\"%(idx) for idx in range(n_bootstrap_its)]]\n", "\n", " if combine_controlsets==True: # here, the results from 
the different control sets are averaged in a meta-analysis approach\n", "\n", " # simply rename the columns of the cristiano and ulz datasets to higher iteration numbers to keep them apart from the other datasets\n", " df_crist.columns=[response,\"sample\"]+[\"%s_p1\"%(idx) for idx in range(n_bootstrap_its,n_bootstrap_its*2)]\n", " df_ulz.columns=[response,\"sample\"]+[\"%s_p1\"%(idx) for idx in range(n_bootstrap_its*2,n_bootstrap_its*3)]\n", " sampleset=set()\n", " for i in range(n_bootstrap_its*3):\n", " if i>=n_bootstrap_its*2:\n", " df=df_ulz\n", " elif i>=n_bootstrap_its:\n", " df=df_crist\n", " else:\n", " df=df_our\n", " testset=df.dropna(subset=[str(i)+\"_p1\"],axis=0) # keep only samples that were in the testset in this fold\n", " if restrict_to_these_testset_ews_samples:\n", " testset=testset[(testset[response]==0) | (testset[\"sample\"].isin(restrict_to_these_testset_ews_samples))]\n", " sampleset.update(set(testset[testset[response]==1][\"sample\"].values))\n", "\n", " # Get the roc curve and auc of this fold:\n", " #https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc_crossval.html#sphx-glr-auto-examples-model-selection-plot-roc-crossval-py\n", " fpr, tpr, thresholds = roc_curve([int(x) for x in testset[response].values], testset[str(i)+\"_p1\"].values)\n", " tprs.append(interp(mean_fpr, fpr, tpr))\n", " mean_sensitivity_at_100spec.append(interp(mean_fpr, fpr, tpr)[0])\n", " mean_sensitivity_at_95spec.append(max([tpr_val for tpr_val,fpr_val in zip(tpr,fpr) if fpr_val<0.05]))# interp(mean_fpr, fpr, tpr)[0])\n", "\n", " tprs[-1][0] = 0.0\n", " roc_auc = auc(fpr, tpr)\n", " aucs.append(roc_auc)\n", " elif combine_controlsets==False: # Here, the results from only on control set are used\n", " sampleset=set()\n", " df=df_our.append(df_crist,sort=True)\n", " df=df.append(df_ulz,sort=True)\n", " for i in range(n_bootstrap_its):\n", " testset=df.dropna(subset=[str(i)+\"_p1\"],axis=0) # keep only samples that were in the testset in this 
fold\n", " if restrict_to_these_testset_ews_samples:\n", " testset=testset[(testset[response]==0) | (testset[\"sample\"].isin(restrict_to_these_testset_ews_samples))]\n", " sampleset.update(set(testset[testset[response]==1][\"sample\"].values))\n", "\n", " # Get the roc curve and auc of this fold:\n", " fpr, tpr, thresholds = roc_curve([int(x) for x in testset[response].values], testset[str(i)+\"_p1\"].values)\n", " tprs.append(interp(mean_fpr, fpr, tpr))\n", " mean_sensitivity_at_100spec.append(interp(mean_fpr, fpr, tpr)[0])\n", " mean_sensitivity_at_95spec.append(max([tpr_val for tpr_val,fpr_val in zip(tpr,fpr) if fpr_val<0.05]))# interp(mean_fpr, fpr, tpr)[0])\n", " tprs[-1][0] = 0.0\n", " roc_auc = auc(fpr, tpr)\n", " aucs.append(roc_auc)\n", "\n", " # Average over all folds:\n", " mean_tpr = np.mean(tprs, axis=0)\n", " mean_tpr[-1] = 1.0\n", " mean_auc = auc(mean_fpr, mean_tpr)\n", " std_auc = np.std(aucs)\n", " CI_auc = \"%0.2f-%0.2f\"%(np.percentile(aucs,2.5),np.percentile(aucs,97.5))\n", " CI_sens_100_spec= \"%0.2f-%0.2f\"%(np.percentile(mean_sensitivity_at_100spec,2.5),np.percentile(mean_sensitivity_at_100spec,97.5))\n", "\n", " # plot the averaged ROC curve\n", " p=plt.plot(mean_fpr,mean_tpr, lw=2, color=get_color(name), alpha=.7)\n", " # and add statistics to the table\n", " this_table_text=[\"%0.2f (%s)\"%(mean_auc,CI_auc),\"%0.2f (%s)\"%(np.mean(mean_sensitivity_at_100spec), CI_sens_100_spec)]\n", " rownames.append(name)\n", " colors.append(get_color(name))\n", " tabledict[name]={\"text\":this_table_text,\"color\":get_color(name),\"rank\":1-mean_auc}\n", " print(np.mean(mean_sensitivity_at_95spec))\n", "\n", "\n", " plt.plot([0, 1], [0, 1], linestyle='--', lw=1.5, color='grey',\n", " label='Chance', alpha=.8)\n", " colwidth=0.2\n", " bbox=[0.51,0.02,0.537,0.4]\n", " if sort_by_AUC:\n", " tabledict={k:v for k,v in sorted(tabledict.items(),key=lambda item: item[1][\"rank\"])}\n", " table_text=[v[\"text\"] for v in tabledict.values()]\n", " 
rownames=list(tabledict.keys())\n", " colors=[v[\"color\"] for v in tabledict.values()]\n", "\n", " table=plt.table(cellText=table_text,rowLabels=[\"—\" for x in rownames],colLabels=[\"ROC\\nAUC\\n\\nmean (CI) \",\"Sens. at\\n100% spec.\\n\\nmean (CI) \"],rowColours=colors,rowLoc=\"center\",cellLoc=\"center\",bbox=bbox,colWidths=[0.25,0.25])\n", "\n", " table.auto_set_font_size(False)\n", " table.set_fontsize(7)\n", "\n", " cellDict= table.get_celld()\n", "\n", " # columns\n", " for i in [0,1]:\n", "\n", " # column labels\n", " cellDict[(0,i)].set_color(\"white\")\n", " cellDict[(0,i)].set_edgecolor(None)\n", " cellDict[(0,i)].set_linewidth(2)\n", " cellDict[(0,i)].set_alpha(1)\n", " cellDict[(0,i)].set_height(0.067)\n", " cellDict[(0,i)].set_text_props(weight=\"bold\",color=\"black\")\n", "\n", "\n", " for j in range(1,len(range(0,len(featureset_paths_and_names),2))+1): # rows\n", " # row labels:\n", " cellDict[(j,-1)].set_alpha(1)\n", " cellDict[(j,-1)].set_text_props(weight=\"bold\",color=\"white\")\n", " cellDict[(j,-1)].set_edgecolor(None)\n", " cellDict[(j,-1)].set_linewidth(0)\n", " cellDict[(j,-1)].set_color(\"white\")#(\"#f2f2f2\")\n", " cellDict[(j,-1)].set_height(0.03)\n", " cellDict[(j,-1)].set_text_props(weight=1000,color=colors[j-1],fontproperties=FontProperties(size=15))\n", "\n", " # entries\n", " cellDict[(j,i)].set_color(\"white\")#(\"#f2f2f2\")\n", " cellDict[(j,i)].set_edgecolor(None)\n", " cellDict[(j,i)].set_linewidth(0)\n", " cellDict[(j,i)].set_height(0.03)\n", " cellDict[(j,i)].set_alpha(1)#(0.9)\n", " cellDict[(j,i)].set_text_props(fontproperties=FontProperties(size=7))\n", "\n", " for cell in table._cells:\n", " table._cells[cell].set_edgecolor(None)\n", " table._cells[cell].set_linewidth(0)\n", "\n", "\n", " plt.xlim([-.01, 1])\n", " plt.ylim([-0, 1])\n", " plt.xlabel('False-positive fraction')\n", " plt.ylabel('True-positive fraction')\n", " plt.gca().spines['top'].set_visible(False)\n", " 
plt.gca().spines['right'].set_visible(False)\n", " plt.gca().set_aspect('equal')\n", " plt.tight_layout()\n", " plt.gcf().savefig(outname)\n", " plt.show()" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "coverage_at_EwS_DHS\n", "0.7235142118863049\n", "global_fragment_size\n", "0.8863049095607235\n", "read_depth_5mb\n", "0.9397071490094746\n", "regional_fragmentation_5mb\n", "0.9698535745047372\n", "METALEARNER_fullx\n", "0.9698535745047372\n" ] }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
    "# Example: ROC curves of the four feature sets and the metalearner, evaluated with the controls in \"our_ctrls_only\".\n",
    "# Only 2 bootstrap iterations are used here to keep the calculation quick.\n",
    "# Every feature set is given as a pair (directory, label); \"__\" in a label is replaced by a line break.\n",
    "plot_ROC_curves(outname=\"test.pdf\",\n",
    "                response='clinical data indicating presence of tumor (PET-SCAN, MRI, CT)',\n",
    "                csvname='bestmodel_classification_out_of_sample_predictions_Clinical_evidence_for_tumor_YES__vs__healthy_CTRLs.csv',\n",
    "                n_bootstrap_its=2,\n",
    "                featureset_paths_and_names=[entry for pair in [\n",
    "                    (\"coverage_at_EwS_DHS\",'Coverage at__Ews-specifc DHSs'),\n",
    "                    (\"global_fragment_size\",\"Global fragment__size distribution\"),\n",
    "                    (\"read_depth_5mb\",'Read depth__in 5 Mb bins'),\n",
    "                    (\"regional_fragmentation_5mb\",'Regional fragmentation__patterns'),\n",
    "                    (\"METALEARNER_fullx\", \"Metalearner\"),\n",
    "                ] for entry in pair],\n",
    "                combine_controlsets=False,\n",
    "                use_only_our_ctrls=True)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "ews_cfdna",
   "language": "python",
   "name": "ews_cfdna"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}