Figure 4 - Regional fragment size distribution along the genome

To count the numbers of short and long fragments in 100 kb bins, the script in the following cell was run on the mapped .bam files. Because these files are large and access to them is legally restricted, we instead provide the output tables (GC-corrected numbers of short and long fragments per bin) on our website and continue working with those. You can therefore skip the next cell and continue below.
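
Throughout this figure, "short" fragments are defined as 100-150 bp and "long" fragments as 151-220 bp (see the minFragmentLength/maxFragmentLength settings in the cell below). As a minimal, pipeline-independent illustration of this classification (the actual counting is done with deeptools):

In [ ]:
# Illustration only, not part of the pipeline: the fragment-size classes used
# throughout this figure, matching the deeptools settings in the next cell.
def classify_fragment(template_length):
    size=abs(template_length) # template length is negative for reverse reads
    if 100<=size<=150:
        return "short"
    if 151<=size<=220:
        return "long"
    return None # outside both size windows

print(classify_fragment(-140),classify_fragment(180),classify_fragment(60)) # short long None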

In [ ]:
### SKIP THIS CELL - REQUIRES .BAM FILES ###

import deeptools.countReadsPerBin as crpb
import deeptools.utilities
import glob
import matplotlib.pyplot as plt
import numpy as np
import py2bit
import pandas as pd
import statsmodels.nonparametric.smoothers_lowess
import sys

bamfiles=list(glob.glob(sys.argv[1]+"/*.bam"))

samplenames=[x.split("/")[-1].replace(".bam","") for x in bamfiles]
twobit="/home/peter/ctDNA/data/reference_data/hg38/hg38.2bit"
outdir="."
tb=py2bit.open(twobit)

for short_or_long in ["long","short"]:

    # Count reads per bin (100 kb bins here; 5 Mb bins were used for the ML analysis)
    cr = crpb.CountReadsPerBin(list(bamfiles), binLength=100000, stepSize=100000,
                               minMappingQuality=30,
                               ignoreDuplicates=False, # not necessary since the input file is already duplicate-filtered
                               numberOfProcessors=10, verbose=False,
                               genomeChunkSize=100000*100,
                               samFlag_exclude=256, # exclude if not primary alignment
                               extendReads=True,minFragmentLength=100 if short_or_long=="short" else 151,
                               maxFragmentLength=150 if short_or_long=="short" else 220,
                               out_file_for_raw_data=outdir+"/%s_rawdata_countreadsperbin.tsv"%(short_or_long),
                               blackListFileName="/home/peter/ctDNA/data/reference_data/hg38/blacklist/hg38.blacklist2.0_and_gaps.merged.bed")
    countobject=cr.run()

    # Get GC content for each bin
    gc_content_per_bin=[]
    with open(outdir+"/%s_rawdata_countreadsperbin.tsv"%(short_or_long)) as infile:
        for line in infile:
            chrom,start,end=line.split()[:3]
            start=int(start)
            end=int(end)
            try:
                gc_content_per_bin.append(deeptools.utilities.getGC_content(tb,chrom,start,end))
            except Exception: # e.g. contigs that are missing from the 2bit reference
                gc_content_per_bin.append(np.nan)

    # store both in a dataframe
    df=pd.read_csv(outdir+"/%s_rawdata_countreadsperbin.tsv"%(short_or_long),delimiter="\t",header=None)
    df["GC"]=gc_content_per_bin
    df.columns=["chrom","start","end"]+samplenames+["GC"]
    df=df.assign(**{"mean_readcount_over_all_samples":df[samplenames].mean(axis="columns")})

    # filter out the 10% of bins with lowest read counts:
    # Note that this step was skipped for the 5 Mb bins used in the ML analysis
    df=df[df["mean_readcount_over_all_samples"]>df["mean_readcount_over_all_samples"].quantile(q=0.1)]

    # LOESS fitting and correction per sample
    df=df.sort_values(by="GC")
    for sample in samplenames:
        x=df["GC"].values
        y=df[sample].values

        yout=statsmodels.nonparametric.smoothers_lowess.lowess(endog=y,exog=x,frac=0.75,is_sorted=True,return_sorted=False)
        xout=x

        df=df.assign(**{sample+"_corrected":y-yout+np.median(y)})

        plt.clf()
        plt.plot(x, y, 'rx', label='Original data',alpha=0.1)
        plt.plot(xout, yout, 'b', linewidth=0.1, label='LOESS')
        plt.legend()
        plt.gcf().savefig(outdir+"/%s_%s_loess_before.pdf"%(short_or_long,sample))
        plt.close("all")

        plt.plot(df["GC"],df[sample+"_corrected"].values, 'rx', label='Corrected data',alpha=0.1)
        plt.plot(xout, yout, 'b', linewidth=0.1, label='LOESS')
        plt.legend()
        plt.gcf().savefig(outdir+"/%s_%s_loess_after.pdf"%(short_or_long,sample))
        plt.close("all")
        print("Finished sample %s"%(sample))

    df.to_csv(outdir+"/%s_readcounts_corr_and_uncorr_and_GC.csv"%(short_or_long))

    df=df.assign(**{"bin":df["chrom"]+"_"+df["start"].map(str)+"_"+df["end"].map(str)})
    df=df.sort_values(by=["chrom","start"])
    df_out=df[["bin"]+[x+"_corrected" for x in samplenames]]
    df_out.columns=["bin"]+samplenames
    df_out=df_out.set_index("bin")
    df_out=df_out.transpose()
    df_out.to_csv(outdir+"/%s_corr_readcounts.csv"%(short_or_long)) # these are the files we continue working with

Instead of running the code above, we simply download its output:

In [3]:
%%bash
wget http://medical-epigenomics.org/papers/peneder2020_f17c4e3befc643ffbb31e69f43630748/data/long_corr_readcounts.csv
wget http://medical-epigenomics.org/papers/peneder2020_f17c4e3befc643ffbb31e69f43630748/data/short_corr_readcounts.csv
wget http://medical-epigenomics.org/papers/peneder2020_f17c4e3befc643ffbb31e69f43630748/data/ichorCNA_output.tar.gz # CNA information
tar -xzf ichorCNA_output.tar.gz
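
A quick sanity check of the downloaded tables (a sketch; as produced by the export code above, rows are samples and columns are 100 kb bins named "chrom_start_end"):

In [ ]:
import pandas as pd

short_readcounts=pd.read_csv("short_corr_readcounts.csv")
long_readcounts=pd.read_csv("long_corr_readcounts.csv")
print(short_readcounts.shape,long_readcounts.shape)
print(short_readcounts.iloc[:3,:4]) # the first column holds the sample names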

Note that ichorCNA (git commit 1d54a1f) was used to create the downloaded files as follows:

In [ ]:
%%bash

### SKIP THIS CELL - REQUIRES .BAM FILES ###

### Readcounter
### (BAMFILES, BAMFILE_SUFFIX, ICHORCNA, and ICHORCNA_BASEDIR are assumed to be defined in the environment)
for((i=1;i<=${#BAMFILES[@]};i++));
do
    bamfile=${BAMFILES[i-1]}
    samplename=$(basename ${bamfile} $BAMFILE_SUFFIX)
    readCounter --window 500000 --quality 20 --chromosome \
    "chr1,chr2,chr3,chr4,chr5,chr6,chr7,chr8,chr9,chr10,chr11,chr12,chr13,chr14,chr15,chr16,chr17,chr18,chr19,chr20,chr21,chr22,chrX,chrY"\
    $bamfile > ${samplename}.readCounter_500kb_out.wig
done

### ichorCNA 
for((i=1;i<=${#BAMFILES[@]};i++));
do
    bamfile=${BAMFILES[i-1]}
    samplename=$(basename ${bamfile} $BAMFILE_SUFFIX)
    Rscript $ICHORCNA --id $samplename --WIG ${samplename}.readCounter_500kb_out.wig\
    --normal "c(0.5,0.6,0.7,0.8,0.9,0.95)" --ploidy "c(2,3)" --maxCN 5 \
    --gcWig ${ICHORCNA_BASEDIR}/inst/extdata/gc_hg38_500kb.wig \
    --mapWig ${ICHORCNA_BASEDIR}/inst/extdata/map_hg38_500kb.wig \
    --centromere ${ICHORCNA_BASEDIR}/inst/extdata/GRCh38.GCA_000001405.2_centromere_acen.txt \
    --includeHOMD False --chrs 'c(1:22,"X")' --chrTrain "c(1:22)" \
    --estimateNormal True --estimatePloidy True --estimateScPrevalence True \
    --scStates "c(1,3)" --txnE 0.9999 --txnStrength 10000 --outDir $(pwd) \
    --normal "c(0.5,0.6,0.7,0.8,0.9,0.95)" --genomeStyle=UCSC \
    --normalPanel ${ICHORCNA_BASEDIR}/inst/extdata/HD_ULP_PoN_hg38_500kb_median_normAutosome_median.rds 
done
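
To get a feeling for the ichorCNA output, one of the segment files can be inspected as follows (a sketch; the sample name is hypothetical, while the path pattern and the "chr"/"start"/"end"/"event" columns are the ones used in the next cell):

In [ ]:
import pandas as pd

seg=pd.read_csv("ichorCNA_output/ichorCNA_90to150_ss_all_samples/Ctrl_1_90to150.seg",delimiter="\t") # hypothetical sample name
print(seg[["chr","start","end","event"]].head())
print(seg["event"].value_counts()) # "NEUT" marks copy-number-neutral segments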

Now we are ready to identify the bins that are significantly different from the controls and to determine their z-scores. We also use the ichorCNA output to detect CNA-affected bins. The script produces a number of output files per sample, which we can summarize, plot, and use as input for LOLA later on.
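
Before the full script, here is the per-bin statistic in isolation, with toy numbers (a sketch; the actual computation is in the cell below):

In [ ]:
# z = (sample's log2(S/L) - mean of the controls' log2(S/L)) / std of the controls' log2(S/L),
# with a two-sided p-value from the standard normal distribution.
import numpy as np
from scipy import stats

log2_ratio_controls=np.array([0.02,-0.01,0.00,0.03,-0.02]) # toy control values for one bin
log2_ratio_sample=-0.15                                    # toy value for the sample of interest

z=(log2_ratio_sample-np.mean(log2_ratio_controls))/np.std(log2_ratio_controls)
pval=stats.norm.sf(abs(z))*2
print(z,pval)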

In [ ]:
import pandas as pd
import numpy as np
import math
import os
import scipy
from scipy import stats
from statsmodels.stats import multitest as mt
from matplotlib import pyplot as plt
import subprocess

def check_region_overlap(cn_event_region_list,cn_noevent_region_list,chrom,start,end):
    """Return True if the bin overlaps an ichorCNA copy-number event segment, False if it
    overlaps a copy-number-neutral segment, and NaN if ichorCNA made no call for the bin."""

    for regiontuple in cn_event_region_list:
        cv_chrom,cv_start,cv_end=regiontuple
        if cv_chrom==chrom and ((cv_start<start and cv_end>start) or (cv_end>start and cv_start<end)):
            return True

    for regiontuple in cn_noevent_region_list:
        cv_chrom,cv_start,cv_end=regiontuple
        if cv_chrom==chrom and ((cv_start<start and cv_end>start) or (cv_end>start and cv_start<end)):
            return False

    return np.nan

# create output dir
os.makedirs("significant_bins",exist_ok=True)

# Read in data
short_readcounts=pd.read_csv("short_corr_readcounts.csv")
long_readcounts=pd.read_csv("long_corr_readcounts.csv")

short_readcounts.columns=["sample"]+list(short_readcounts.columns[1:])
long_readcounts.columns=["sample"]+list(long_readcounts.columns[1:])

# Limit data to bins that are present both in the short and long read file
common_bins=[value for value in short_readcounts.columns if ((value in long_readcounts.columns) and ("chr" in value or "sample" in value) and ("random" not in value) and ("Un" not in value) )]

# Limit data to bins that have at least 100 short and long fragments in the controls:
short_df_NONnorm_cntrls=short_readcounts[(short_readcounts["sample"].str.contains("Ctrl"))]
short_df_NONnorm_cntrls=short_df_NONnorm_cntrls[short_readcounts.columns[1:]]
short_min=list(short_df_NONnorm_cntrls.min().values)
okay_bins_short=[binname for binname,minval  in zip(short_df_NONnorm_cntrls.columns,short_min) if minval>99]
long_df_NONnorm_cntrls=long_readcounts[(long_readcounts["sample"].str.contains("Ctrl"))]
long_df_NONnorm_cntrls=long_df_NONnorm_cntrls[long_readcounts.columns[1:]]
long_min=list(long_df_NONnorm_cntrls.min().values)
okay_bins_long=[binname for binname,minval  in zip(long_df_NONnorm_cntrls.columns,long_min) if minval>99]
common_bins=[x for x in common_bins if (x in okay_bins_long and x in okay_bins_short) or x=="sample"]
common_bins=[x for x in common_bins if not "chrY" in x]

print("Common bins:",len(common_bins))
print("Total bins:",short_readcounts.shape[1],",",long_readcounts.shape[1])
short_readcounts=short_readcounts[common_bins]
long_readcounts=long_readcounts[common_bins]

features=common_bins.copy()
features.remove("sample")

# Create column with the ratio short/long
short_long_df=short_readcounts[features].div(long_readcounts[features])
short_long_df["sample"]=short_readcounts["sample"]
short_long_df=short_long_df[["sample"]+features]
features.sort(key=lambda x: (25 if "X" in x else int(x.replace("chr","").split("_")[0]),int(x.split("_")[1]))) # sort bins by genomic position (chromosome, then start)

for sample in short_long_df["sample"].values:
    if os.path.isfile("significant_bins/%s_vs_ctrls_pvals_etc_INCL_CNVaffected.json"%(sample,)):
        continue # skip samples that have already been processed
    print("Treating sample %s"%(sample,))

    # Prepare the dataframe to take reference values from. Note that if the sample currently being
    # processed is itself a control, it is excluded from the reference frame
    # (then nr_controls=21, otherwise nr_controls=22)
    short_long_df_NONnorm_cntrls=short_long_df[(short_long_df["sample"].str.contains("Ctrl")) & (short_long_df["sample"]!=sample)] 
    nr_controls=len(short_long_df_NONnorm_cntrls["sample"].values)

    # Get tuples of regions, with the info if ichor marked the corresponding segment as CNV-affected
    ichor_segments=pd.read_csv("ichorCNA_output/ichorCNA_90to150_ss_all_samples/%s_90to150.seg"%(sample,),delimiter="\t")
    ichor_segments_nonneut=ichor_segments[ichor_segments["event"]!="NEUT"]
    ichor_segments_neut=ichor_segments[ichor_segments["event"]=="NEUT"]
    cn_event_region_list=[(chrom,int(start),int(end)) for (chrom,start,end) in zip(list(ichor_segments_nonneut["chr"].values),
                                                                                   list(ichor_segments_nonneut["start"].values),
                                                                                   list(ichor_segments_nonneut["end"].values))]
    cn_noevent_region_list=[(chrom,int(start),int(end)) for (chrom,start,end) in zip(list(ichor_segments_neut["chr"].values),
                                                                                     list(ichor_segments_neut["start"].values),
                                                                                     list(ichor_segments_neut["end"].values))]

    # Prepare a dataframe that contains only allowed bins, and the information of this sample only
    df_onlyonesample=short_long_df[short_long_df["sample"]==sample]
    df_onlyonesample_shortreadcounts=short_readcounts[short_readcounts["sample"]==sample]
    df_onlyonesample_longreadcounts=long_readcounts[long_readcounts["sample"]==sample]
    # Filter out unwanted chromosomes
    common_bins=[value for value in df_onlyonesample_shortreadcounts.columns if ((value in df_onlyonesample_longreadcounts.columns) and ("chr" in value or "sample" in value) and ("random" not in value) and ("Un" not in value) )]
    df_onlyonesample=df_onlyonesample[common_bins]
    df_onlyonesample_shortreadcounts=df_onlyonesample_shortreadcounts[common_bins]
    df_onlyonesample_longreadcounts=df_onlyonesample_longreadcounts[common_bins]
    assert list(df_onlyonesample_longreadcounts.columns)==list(df_onlyonesample_shortreadcounts.columns)==list(df_onlyonesample.columns)

    df_onlyonesample=df_onlyonesample.set_index("sample").transpose()
    df_onlyonesample=df_onlyonesample.reset_index()
    df_onlyonesample.columns=["bin name","FC(short/long) - unnormalized"]

    df_onlyonesample_shortreadcounts=df_onlyonesample_shortreadcounts.set_index("sample").transpose()
    df_onlyonesample_shortreadcounts=df_onlyonesample_shortreadcounts.reset_index()
    df_onlyonesample_shortreadcounts.columns=["bin name","short"]

    df_onlyonesample_longreadcounts=df_onlyonesample_longreadcounts.set_index("sample").transpose()
    df_onlyonesample_longreadcounts=df_onlyonesample_longreadcounts.reset_index()
    df_onlyonesample_longreadcounts.columns=["bin name","long"]
    
    # Include info about position and CNA status 
    df_onlyonesample=df_onlyonesample.assign(**{"chromosome":[x.split("_")[0] for x in df_onlyonesample["bin name"].values],
                                                "start":[int(x.split("_")[1]) for x in df_onlyonesample["bin name"].values],
                                                "end":[int(x.split("_")[2]) for x in df_onlyonesample["bin name"].values],
                                                "gene":np.nan,
                                                "short":df_onlyonesample_shortreadcounts["short"],
                                                "long":df_onlyonesample_longreadcounts["long"],
                                                "weight":1})
    df_onlyonesample=df_onlyonesample.assign(**{"is_in_gain_or_loss_region":[check_region_overlap(cn_event_region_list,cn_noevent_region_list,chrom,start,end)
                                                                             for (chrom,start,end) in zip(list(df_onlyonesample["chromosome"].values),
                                                                                                          list(df_onlyonesample["start"].values),
                                                                                                          list(df_onlyonesample["end"].values))]})

    # kick out bins with 100 or fewer short or long reads:
    df_onlyonesample=df_onlyonesample[(df_onlyonesample["short"]>100) & (df_onlyonesample["long"]>100)]

    # kick out all except chr 1-22:
    df_onlyonesample=df_onlyonesample[df_onlyonesample["chromosome"].isin(["chr"+str(x) for x in (range(1,23))])]

    # Normalize by genome wide mean - ignoring CNV-affected regions - to correct for overall fragment size distribution
    genomewide_mean_fc=df_onlyonesample[df_onlyonesample["is_in_gain_or_loss_region"]==False]["FC(short/long) - unnormalized"].mean()
    df_onlyonesample=df_onlyonesample.assign(**{"FC(short/long)":[x/genomewide_mean_fc for x in df_onlyonesample["FC(short/long) - unnormalized"]]})

    df_onlyonesample=df_onlyonesample.assign(**{"UNNORMALIZED log2":[np.log2(x) for x in df_onlyonesample["FC(short/long) - unnormalized"].values]})
    genomewide_mean_logfc=df_onlyonesample[df_onlyonesample["is_in_gain_or_loss_region"]==False]["UNNORMALIZED log2"].mean()
    df_onlyonesample=df_onlyonesample.assign(**{"log2":[x-genomewide_mean_logfc for x in df_onlyonesample["UNNORMALIZED log2"]]})

    # Normalize the controls by the genome wide mean, and use the same bins for mean-calculation as for the tumor sample
    short_to_long_per_ctrl=[[singlebinlist[control_idx] for singlebinlist in [short_long_df_NONnorm_cntrls[x].values for x in df_onlyonesample[df_onlyonesample["is_in_gain_or_loss_region"]==False]["bin name"].values]] for control_idx in range(nr_controls)]
    genome_wide_mean_per_ctrl_fc=[np.mean(x) for x in short_to_long_per_ctrl]
    genome_wide_mean_per_ctrl_logfc=[np.mean(np.log2(x)) for x in short_to_long_per_ctrl]
    df_onlyonesample=df_onlyonesample.assign(**{"reference log2":[np.log2(y)-genome_wide_mean_per_ctrl_logfc for y in [short_long_df_NONnorm_cntrls[x].values for x in df_onlyonesample["bin name"].values]]})
    df_onlyonesample=df_onlyonesample.assign(**{"reference FC":[y-genome_wide_mean_per_ctrl_fc for y in [short_long_df_NONnorm_cntrls[x].values for x in df_onlyonesample["bin name"].values]]})
    df_onlyonesample=df_onlyonesample.assign(**{"UNNORMALIZED reference log2":[np.log2(y) for y in [short_long_df_NONnorm_cntrls[x].values for x in df_onlyonesample["bin name"].values]]})

    # Calculate various ways of quantifying strength of change vs ctrl
    df_onlyonesample=df_onlyonesample.assign(**{"zscore vs reference":[(x-np.mean(ref))/np.std(ref) for x,ref in zip(df_onlyonesample["log2"].values,df_onlyonesample["reference log2"].values)]})
    df_onlyonesample=df_onlyonesample.assign(**{"pval vs reference":[stats.norm.sf(abs(x))*2 for x in df_onlyonesample["zscore vs reference"].values]})
    # Benjamini-Hochberg FDR correction
    fdr_corr=mt.multipletests(pvals=df_onlyonesample["pval vs reference"].values, alpha=0.05, method="fdr_bh")
    df_onlyonesample=df_onlyonesample.assign(**{"FDR vs reference":fdr_corr[1]})
    df_onlyonesample=df_onlyonesample.assign(**{"sum of reads":df_onlyonesample["short"]+df_onlyonesample["long"]})
    df_onlyonesample=df_onlyonesample.assign(**{"direction of change compared to reference":["longer frags than ref" if z<0 else "shorter frags than ref" for z in df_onlyonesample["zscore vs reference"]]})
    df_onlyonesample=df_onlyonesample.assign(**{"abs(log2)":[abs(x) for x in df_onlyonesample["log2"]]})
    df_onlyonesample=df_onlyonesample.assign(**{"FDR rank":df_onlyonesample["FDR vs reference"].rank(),
                                                "log2 rank":df_onlyonesample["abs(log2)"].rank(),
                                                "sum of reads rank":df_onlyonesample["sum of reads"].rank(ascending=False)})
    df_onlyonesample=df_onlyonesample.assign(**{"worst rank":[max([x,y,z]) for x,y,z in zip(df_onlyonesample["log2 rank"],
                                                                                           df_onlyonesample["sum of reads rank"],
                                                                                           df_onlyonesample["FDR rank"])]})
    df_onlyonesample=df_onlyonesample.assign(**{"sample":sample})
    df_onlyonesample=df_onlyonesample.sample(frac=1) # shuffle rows so that ties are broken randomly when sorting
    df_onlyonesample=df_onlyonesample.sort_values(by="FDR vs reference")

    # Save the information we have gathered as JSON
    df_onlyonesample.to_json("significant_bins/%s_vs_ctrls_pvals_etc_INCL_CNVaffected.json"%(sample,))

    # Filter out regions that are affected by copy numbers, and regions that are not analyzed by ichorCNA
    df_onlyonesample=df_onlyonesample[df_onlyonesample["is_in_gain_or_loss_region"]==False]
    
    # this will be the LOLA universe:
    df_onlyonesample[["chromosome","start","end","sample","FDR vs reference"]].to_csv("significant_bins/%s_nonCNVaffected_bins.bed"%(sample,),sep="\t",header=False,index=False)

    df_onlyonesample_shorter=df_onlyonesample[df_onlyonesample["direction of change compared to reference"]=="shorter frags than ref"]
    df_onlyonesample_longer=df_onlyonesample[df_onlyonesample["direction of change compared to reference"]=="longer frags than ref"]
    
    # And these 2 sets will be the main LOLA input
    df_onlyonesample_shorter[df_onlyonesample_shorter["FDR vs reference"]<0.05][["chromosome","start","end","sample","FDR vs reference"]].to_csv("significant_bins/%s_significant_FDR_shorterfrag_bins.bed"%(sample,),sep="\t",header=False,index=False)
    df_onlyonesample_longer[df_onlyonesample_longer["FDR vs reference"]<0.05][["chromosome","start","end","sample","FDR vs reference"]].to_csv("significant_bins/%s_significant_FDR_longerfrag_bins.bed"%(sample,),sep="\t",header=False,index=False)
    
    # These 2 sets of random CNA neutral bins serve as controls
    df_onlyonesample[["chromosome","start","end","sample","FDR vs reference"]].sample(n=df_onlyonesample_shorter[df_onlyonesample_shorter["FDR vs reference"]<0.05].shape[0]).to_csv("significant_bins/%s_random_nonCNVaffected_bins_as_many_as_sign_shorter.bed"%(sample,),sep="\t",header=False,index=False)
    df_onlyonesample[["chromosome","start","end","sample","FDR vs reference"]].sample(n=df_onlyonesample_longer[df_onlyonesample_longer["FDR vs reference"]<0.05].shape[0]).to_csv("significant_bins/%s_random_nonCNVaffected_bins_as_many_as_sign_longer.bed"%(sample,),sep="\t",header=False,index=False)

Let's summarize the data we have collected so far:

In [ ]:
import pandas as pd
import glob
from scipy import stats

df_nr_total=pd.DataFrame()
df_nr_short=pd.DataFrame()
df_zscore_CNVnonaffected=pd.DataFrame()
df_log2_CNVnonaffected=pd.DataFrame()
df_zscore=pd.DataFrame()

for file in glob.glob("significant_bins/*vs_ctrls_pvals_etc_INCL_CNVaffected.json"):
    sample=file.split("/")[-1].replace("_vs_ctrls_pvals_etc_INCL_CNVaffected.json","")
    print("Summarizing", sample)
    sample_df=pd.read_json(file)
    sample_df_nr_total=sample_df[["bin name","sum of reads"]]
    sample_df_nr_short=sample_df[["bin name","short"]]
    sample_df_zscore=sample_df[~sample_df["is_in_gain_or_loss_region"].isna()][["bin name","zscore vs reference"]]
    sample_df_zscore_CNVnonaffected=sample_df[sample_df["is_in_gain_or_loss_region"]==False][["bin name","zscore vs reference"]]
    sample_df_log2_CNVnonaffected=sample_df[sample_df["is_in_gain_or_loss_region"]==False][["bin name","log2"]]
    sample_df=sample_df[["bin name","log2"]]

    sample_df_nr_total.columns=["bin name",sample]
    sample_df_nr_short.columns=["bin name",sample]
    sample_df_zscore.columns=["bin name",sample]
    sample_df_zscore_CNVnonaffected.columns=["bin name",sample]
    sample_df_log2_CNVnonaffected.columns=["bin name",sample]

    if df_nr_total.empty:
        df_nr_total=sample_df_nr_total
        df_nr_short=sample_df_nr_short
        df_zscore=sample_df_zscore
        df_zscore_CNVnonaffected=sample_df_zscore_CNVnonaffected
        df_log2_CNVnonaffected=sample_df_log2_CNVnonaffected

    else:
        df_nr_total=pd.merge(sample_df_nr_total,df_nr_total,left_on="bin name",right_on="bin name",how="inner")
        df_nr_short=pd.merge(sample_df_nr_short,df_nr_short,left_on="bin name",right_on="bin name",how="inner")
        df_zscore_CNVnonaffected=pd.merge(sample_df_zscore_CNVnonaffected,df_zscore_CNVnonaffected,left_on="bin name",right_on="bin name",how="outer")
        df_zscore=pd.merge(sample_df_zscore,df_zscore,left_on="bin name",right_on="bin name",how="outer")
        df_log2_CNVnonaffected=pd.merge(sample_df_log2_CNVnonaffected,df_log2_CNVnonaffected,left_on="bin name",right_on="bin name",how="outer")

df_nr_total=df_nr_total.set_index("bin name")
df_nr_total=df_nr_total.T
df_nr_total=df_nr_total.reset_index()
df_nr_total.columns=["sample"]+list(df_nr_total.columns)[1:]
df_nr_total.to_csv("Sumofreads_all_samples_all_bins.csv",index=False)
chroms=[x for x in df_nr_total.columns if "chr" in x]
df_nr_total_zscored=pd.DataFrame(stats.zscore(df_nr_total[chroms],axis=1))
df_nr_total_zscored.columns=df_nr_total[chroms].columns
df_nr_total_zscored=df_nr_total_zscored.assign(sample=df_nr_total["sample"])
df_nr_total_zscored.to_csv("Sumofreads_all_samples_all_bins_ZSCORED.csv",index=False) # this will be used in the ML part

df_nr_short=df_nr_short.set_index("bin name")
df_nr_short=df_nr_short.T
df_nr_short=df_nr_short.reset_index()
df_nr_short.columns=["sample"]+list(df_nr_short.columns)[1:]
df_nr_short.to_csv("Nr_short_reads_all_samples_all_bins.csv",index=False)
chroms=[x for x in df_nr_short.columns if "chr" in x]
df_nr_short_zscored=pd.DataFrame(stats.zscore(df_nr_short[chroms],axis=1))
df_nr_short_zscored.columns=df_nr_short[chroms].columns
df_nr_short_zscored=df_nr_short_zscored.assign(sample=df_nr_short["sample"])
df_nr_short_zscored.to_csv("Nr_short_reads_all_samples_all_bins_ZSCORED.csv",index=False) # this will be used in the ML part

df_zscore=df_zscore.set_index("bin name")
df_zscore=df_zscore.T
df_zscore=df_zscore.reset_index()
df_zscore.columns=["sample"]+list(df_zscore.columns)[1:]
df_zscore.to_csv("zscore_all_samples.csv",index=False) # this will be used to generate the heatmap

df_zscore_CNVnonaffected=df_zscore_CNVnonaffected.set_index("bin name")
df_zscore_CNVnonaffected=df_zscore_CNVnonaffected.T
df_zscore_CNVnonaffected=df_zscore_CNVnonaffected.reset_index()
df_zscore_CNVnonaffected.columns=["sample"]+list(df_zscore_CNVnonaffected.columns)[1:]
df_zscore_CNVnonaffected.to_csv("zscore_CNVnonaffected_all_samples.csv",index=False) # this will be used in Fig 5 (Tumor evolution & progression)

df_log2_CNVnonaffected=df_log2_CNVnonaffected.set_index("bin name")
df_log2_CNVnonaffected=df_log2_CNVnonaffected.T
df_log2_CNVnonaffected=df_log2_CNVnonaffected.reset_index()
df_log2_CNVnonaffected.columns=["sample"]+list(df_log2_CNVnonaffected.columns)[1:]
df_log2_CNVnonaffected.to_csv("log2_CNVnonaffected_all_samples.csv",index=False) # this will be used in Fig 5 (Tumor evolution & progression)
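
As a quick check, each summary table should now contain one row per sample and one column per bin:

In [ ]:
import pandas as pd

z=pd.read_csv("zscore_all_samples.csv")
print(z.shape)           # (number of samples, number of bins + 1 "sample" column)
print(z["sample"].head())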

We can now visualize the z-scored S/L ratios as a heatmap:

In [8]:
import glob
from matplotlib import pyplot as plt
import sys
import pandas as pd
import seaborn as sns
import matplotlib.patches as patches
import numpy as np

df=pd.read_csv("zscore_all_samples.csv",index_col="sample")

# Define the sample-sets:
clinical=pd.read_excel("../tables/Supplementary Table 1_Patient cohort.xlsx","Patient clinical data")
genomic=pd.read_excel("../tables/Supplementary Table 2_ctDNA quantification.xlsx","Genomic ctDNA quantification")
# note that in these two tables, and in the other files provided on this website, rhabdomyosarcoma samples are encoded as "Rha" instead of "RMS"
non_ews_cancers=list(clinical[clinical["Sample type"]=="Non-EwS sarcoma"]["Sample"].values)
fixation_excluded_samples=list(genomic[genomic["Excluded due to fixation (1=yes)"]==1]["Sample"].values)
size_selected=list(genomic[genomic["Excluded due to size selection prior to library preparation (1=yes)"]==1]["Sample"].values)
controls=list(clinical[clinical["Sample type"]=="Healthy control"]["Sample"].values)

ews_samples_to_plot=genomic[~genomic["Sample"].isin(size_selected+fixation_excluded_samples+non_ews_cancers+controls)]
zero_perc=list(ews_samples_to_plot[ews_samples_to_plot["Combined quantification: % ctDNA"]==0]["Sample"].values)
upto20_perc=list(ews_samples_to_plot[(ews_samples_to_plot["Combined quantification: % ctDNA"]>0) & (ews_samples_to_plot["Combined quantification: % ctDNA"]<=20)]["Sample"].values)
morethan20_perc=list(ews_samples_to_plot[(ews_samples_to_plot["Combined quantification: % ctDNA"]>20)]["Sample"].values)
non_ews_cancers_to_plot=[x for x in non_ews_cancers if not x in size_selected+fixation_excluded_samples]

# Order the bins by their genomic location
bins=list(df.columns)
bins_s=[(int(x.split("_")[0].replace("chr","")),int(x.split("_")[1]),int(x.split("_")[2])) for x in bins]
# sort by chromosome, then by start
bins_s=sorted(bins_s,key=lambda x: x[1])
bins_s=sorted(bins_s,key=lambda x: x[0])
bins=["chr%s_%s_%s"%x for x in bins_s]
df=df[bins]
df.columns=["chr%s_%s"%(x[0],x[1]) for x in bins_s]

# Only consider bins for which we have data for every sample. This excludes e.g. bins for which ichorCNA gave no CNA status in one or more samples
df=df.dropna(axis=1)


chr_borders=[]
for chrom in range(1,23):
    for idx,bin in enumerate(df.columns):
        if "chr%s"%(chrom) in bin:
            chr_borders.append(idx)
            break

for sampleset,samplesetname in zip([controls,zero_perc,upto20_perc,morethan20_perc,non_ews_cancers_to_plot],["Ctrl","EwS, 0%","EwS, >0% and <=20%","EwS, >=20%","Non-EwS cancers"]):

    # within each group, sort by tumor content
    samplesetdf=df.loc[sampleset].copy() # copy so that the added column does not modify a view of df
    samplesetdf["Combined quantification: % ctDNA"]=samplesetdf.apply(lambda row: genomic[genomic["Sample"]==row.name]["Combined quantification: % ctDNA"].values[0],axis=1)
    samplesetdf=samplesetdf.sort_values(by="Combined quantification: % ctDNA")
    samplesetdf=samplesetdf.drop("Combined quantification: % ctDNA",axis=1)

    ax=sns.heatmap(samplesetdf,robust=True,cmap="viridis",vmin=-3,vmax=3,cbar=True,linewidths=0)
 
    # set size of plots according to the nr. of samples in the group
    plt.gcf().set_size_inches(16,len(sampleset)/10)
    
    plt.title(samplesetname)
    plt.xticks([],[])
    ax.set_xticks(chr_borders[1:])

    plt.yticks([],[])

    plt.ylabel("")
    plt.show()