Figure 2: Global fragment size distribution
We start with creating the heatmap and histogram shown in Fig 2a. For this, we use the output files of Picard's CollectInsertSizeMetrics.
%%bash
wget http://medical-epigenomics.org/papers/peneder2020_f17c4e3befc643ffbb31e69f43630748/data/fragment_size.tar.gz #if needed, update with the path of the downloaded data directory on your system
tar -xzf fragment_size.tar.gz
We also need to load details about the samples, both clinical and genome-based:
import pandas as pd
import glob
from matplotlib import pyplot as plt
import numpy as np
clinical=pd.read_excel("../tables/Supplementary Table 1_Patient cohort.xlsx","Patient clinical data")
genomic=pd.read_excel("../tables/Supplementary Table 2_ctDNA quantification.xlsx","Genomic ctDNA quantification")
# note that in these two tables and the files other provided on this website, Rhabdomyosarcoma samples are encoded as "Rha" instead of "RMS"
non_ews_cancers=list(clinical[clinical["Sample type"]=="Non-EwS sarcoma"]["Sample"].values)
fixation_excluded_samples=list(genomic[genomic["Excluded due to fixation (1=yes)"]==1]["Sample"].values)
size_selected=list(genomic[genomic["Excluded due to size selection prior to library preparation (1=yes)"]==1]["Sample"].values)
controls=list(clinical[clinical["Sample type"]=="Healthy control"]["Sample"].values)
ews_samples_to_plot=genomic[~genomic["Sample"].isin(size_selected+fixation_excluded_samples+non_ews_cancers+controls)]
zero_perc=list(ews_samples_to_plot[ews_samples_to_plot["Combined quantification: % ctDNA"]==0]["Sample"].values)
upto20_perc=list(ews_samples_to_plot[(ews_samples_to_plot["Combined quantification: % ctDNA"]>0) & (ews_samples_to_plot["Combined quantification: % ctDNA"]<=20)]["Sample"].values)
morethan20_perc=list(ews_samples_to_plot[(ews_samples_to_plot["Combined quantification: % ctDNA"]>20)]["Sample"].values)
non_ews_cancers_to_plot=[x for x in non_ews_cancers if not x in size_selected+fixation_excluded_samples]
# Turns Picard's output into a python dict
def get_frag_distro(fraglen_histo,cumul, start_from):
fdict={}
total=0
with open(fraglen_histo) as f:
counter=False
for line in f:
if line.startswith("insert_size"):
counter=True
continue
if counter and line.replace("\n",""):
size,nr=line.split()
if int(size)>=start_from:
total+=int(nr)
if cumul:
fdict[int(size)]=int(total)
else:
fdict[int(size)]=int(nr)
return fdict
# A few basic settings:
insertfolder="fragment_size"
relative_over_zoom_only=True
zoomed_area_start=100
zoomed_area_end=450
log_vs_ctrl=True
samplenamelist=[sample.replace("_insertsizemetrics.txt","").split("/")[-1] for sample in glob.glob(insertfolder+"/*insertsizemetrics.txt")]
distrofilelist=glob.glob(insertfolder+"/*insertsizemetrics.txt")
#this stores the fragment size distribution of every sample:
sampledict={}
for distrofile in distrofilelist:
samplename=distrofile.replace("_insertsizemetrics.txt","").split("/")[-1]
if samplename not in list(ews_samples_to_plot["Sample"].values)+non_ews_cancers_to_plot+controls:
continue
fragdict=get_frag_distro(distrofile,False,0)
# In case a value is missing in the provided histogram, fill it with 0 or the previous value
fragdict_full={}
for i in range(0,801):
if i in fragdict.keys():
fragdict_full[i]=fragdict[i]
else:
fragdict_full[i]=0
fragdict=fragdict_full
# devide by the total if relative
if relative_over_zoom_only:
allcounts=sum([value for key,value in fragdict.items() if zoomed_area_start<=key<=zoomed_area_end])
else:
allcounts=sum([value for value in fragdict.values()])
fragdict={key:100*value/allcounts for key, value in fragdict.items()}
fragweights_for_plot=list(fragdict.values())
sampledict[samplename]=fragweights_for_plot[zoomed_area_start:zoomed_area_end+1]
for sample,col in zip(["EwS_7_1","EwS_7_5","EwS_7_2"],["red","hotpink","coral"]):
plt.plot(list(range(zoomed_area_start,zoomed_area_end+1)),sampledict[sample],alpha=0.6,label="%s"%(sample),color=col)
# Plot range of healthy controls as grey area
ctrl_y_vals=[]
for sample in sampledict.keys():
if not "Ctrl" in sample:
continue
ctrl_y_vals.append(sampledict[sample])
min_ctrls=[]
max_ctrls=[]
for i in range(len(ctrl_y_vals[0])):
min_ctrls.append(min([yvallist[i] for yvallist in ctrl_y_vals]))
max_ctrls.append(max([yvallist[i] for yvallist in ctrl_y_vals]))
plt.fill_between(list(range(zoomed_area_start,zoomed_area_end+1)),min_ctrls,max_ctrls,alpha=0.15, label="Range of the "+str(len(ctrl_y_vals))+" controls",color="grey")
plt.legend(fontsize=6)
plt.axvline(167*2,linewidth=1, color="black", alpha=1,zorder=10,linestyle="--")
plt.axvline(167,linewidth=1, color="black", alpha=1,linestyle="--",zorder=10)
plt.ylabel("Density in range 100-450bp (%)")
plt.xlabel("Fragment size (bp)")
plt.show()
# Now, we create a heatmap of all relevant samples:
df=pd.DataFrame(sampledict)
df=df.transpose()
df.columns=list(range(zoomed_area_start,zoomed_area_end+1))
# Using the difference to the median of controls makes the signal more clearly visible
if log_vs_ctrl:
median_ctrl=df.loc[controls].median()
df_log=df/median_ctrl
df_log=np.log2(df_log)
df=df_log
# Max and min of the colorbar
mymax=3
mymin=-3
for sampleset,samplesetname in zip([controls,zero_perc,upto20_perc,morethan20_perc,non_ews_cancers_to_plot],["Ctrl","EwS, 0%","EwS, <20%,>0%% ","EwS, >=20%","Non-Ews cancers"]):
# within each group, order by tumor content
samplesetdf=df.loc[sampleset]
samplesetdf["Combined quantification: % ctDNA"]=samplesetdf.apply(lambda row: genomic[genomic["Sample"]==row.name]["Combined quantification: % ctDNA"].values[0],axis=1)
samplesetdf=samplesetdf.sort_values(by="Combined quantification: % ctDNA")
samplesetdf=samplesetdf.drop("Combined quantification: % ctDNA",axis=1)
ax1=plt.gca()
# set size of plots according to the nr. of samples in the group
plt.gcf().set_size_inches(5,len(sampleset)/10)
plt.imshow(samplesetdf,aspect='auto',vmin=mymin,vmax=mymax)#
plt.colorbar()
plt.xticks([0,167-zoomed_area_start,334-zoomed_area_start,zoomed_area_end-zoomed_area_start],[0,167,334])
plt.yticks([],[])
plt.xlabel("Fragment size [bp]")
plt.subplots_adjust(bottom=0.35)
plt.axvline(167*2-zoomed_area_start,linewidth=1, color="black", alpha=1,zorder=10,linestyle="--")
plt.axvline(167-zoomed_area_start,linewidth=1, color="black", alpha=1,linestyle="--",zorder=10)
plt.title(samplesetname)
plt.show()
samplesetdf["local max"]=samplesetdf.apply(lambda row: zoomed_area_start+list(row.values).index(row.max()),axis=1)
samplesetdf.to_csv(samplesetname.replace(" ","_")+".csv")
Finally, we produce a table of features of global fragmentation, which can then easily be compared to results from Mouliere et al, and used for machine learning:
insertfolder="fragment_size"
zoomed_area_start=20 # for plots only
zoomed_area_end=450 # for plots only
plot=False # Set to True to generate plots for all samples
samplenamelist=[sample.replace("_insertsizemetrics.txt","").split("/")[-1] for sample in glob.glob(insertfolder+"/*insertsizemetrics.txt")]
distrofilelist=glob.glob(insertfolder+"/*insertsizemetrics.txt")
#this stores the fragment size distribution of every sample:
distrodict={}
# Stores fragmentation-based features of all samples:
featuredict={}
for distrofile in distrofilelist:
sample_featuredict={}
samplename=distrofile.replace("_insertsizemetrics.txt","").split("/")[-1]
if samplename not in list(ews_samples_to_plot["Sample"].values)+non_ews_cancers_to_plot+controls:
continue
fragdict=get_frag_distro(distrofile,False,0)
# In case a value is missing in the provided histogram, fill it with 0 or the previous value
fragdict_full={}
for i in range(0,801):
if i in fragdict.keys():
fragdict_full[i]=fragdict[i]
else:
fragdict_full[i]=0
fragdict=fragdict_full
allcounts=sum([value for value in fragdict.values()])
fragdict={key:value/allcounts for key, value in fragdict.items()}
fragweights_for_plot=list(fragdict.values())
distrodict[samplename]=fragweights_for_plot[zoomed_area_start:zoomed_area_end+1]
# Fragment based features, mainly from Mouliere et al.
for start,end in [(20,150),(100,150),(125,180),(160,180),(180,220),(250,320),]:
sample_featuredict["P("+str(start)+"_"+str(end)+")"]=sum(fragweights_for_plot[start:end])
sample_featuredict["P(160_180)/P(180_220"]=sum(fragweights_for_plot[160:180])/sum(fragweights_for_plot[180:220])
sample_featuredict["P(20_150)/P(160_180)"]=(sum(fragweights_for_plot[20:150])/sum(fragweights_for_plot[160:180]))
sample_featuredict["P(100_150)/P(163_169)"]=(sum(fragweights_for_plot[100:150])/sum(fragweights_for_plot[163:169]))
sample_featuredict["P(20_150)/P(180_220)"]=(sum(fragweights_for_plot[20:150])/sum(fragweights_for_plot[180:220]))
# From Mouliere et al.
local_minima=[84, 96, 106, 116, 126, 137, 148]
local_maxima=[81, 92, 102, 112, 122, 134, 144]
# to test whether Mouliere et al.'s positions local minima and maxima are similar to ours:
"""
my_local_max=[75+x for x in argrelextrema(np.array(fragweights_for_plot[75:150]), np.greater)]
my_local_min=[75+x for x in argrelextrema(np.array(fragweights_for_plot[75:150]), np.less)]
my_local_max_smooth=[75+x for x in argrelextrema(np.array(ndimage.gaussian_filter(fragweights_for_plot[75:150],mode="nearest",sigma=1)), np.greater)]
my_local_min_smooth=[75+x for x in argrelextrema(np.array(ndimage.gaussian_filter(fragweights_for_plot[75:150],mode="nearest",sigma=1)), np.less)]
print("##### MIN/MAX ###")
print(my_local_min)
print(my_local_max)
print("##### MIN/MAX SMOOTH ###")
print(my_local_min_smooth)
print(my_local_max_smooth)
"""
periodicity=sum([fragweights_for_plot[x] for x in local_maxima])-sum([fragweights_for_plot[x] for x in local_minima])
sample_featuredict["amplitude_10bp"]=periodicity
featuredict[samplename]=sample_featuredict
df=pd.DataFrame(featuredict)
df.transpose().to_csv("global_fragmentation_features.csv")
if plot:
# Range of healthy controls as grey area
ctrl_y_vals=[]
for sample in distrodict.keys():
if not "Ctrl" in sample:
continue
ctrl_y_vals.append(distrodict[sample])
min_ctrls=[]
max_ctrls=[]
for i in range(len(ctrl_y_vals[0])):
min_ctrls.append(min([yvallist[i] for yvallist in ctrl_y_vals]))
max_ctrls.append(max([yvallist[i] for yvallist in ctrl_y_vals]))
for sample in distrodict.keys():
plt.plot(list(range(zoomed_area_start,zoomed_area_end+1)),distrodict[sample],alpha=0.6,label="%s"%(sample))
plt.fill_between(list(range(zoomed_area_start,zoomed_area_end+1)),min_ctrls,max_ctrls,alpha=0.15, label="Range of the "+str(len(ctrl_y_vals))+" controls",color="grey")
plt.legend(fontsize=6)
plt.axvline(167*2,linewidth=1, color="black", alpha=1,zorder=10,linestyle="--")
plt.axvline(167,linewidth=1, color="black", alpha=1,linestyle="--",zorder=10)
plt.ylabel("Density")
plt.xlabel("Fragment size (bp)")
plt.title(sample)
plt.show()