import pandas as pd import matplotlib import matplotlib.pyplot as plt import numpy as np from scipy import stats import sys mastertable=sys.argv[1] df=pd.read_csv(mastertable, compression='gzip') samplename=mastertable.split("/")[-2] # The follwing creates a heatmap for GC vs corrected / uncorrected coverage, and fits a trend line #https://stackoverflow.com/questions/893657/how-do-i-calculate-r-squared-using-python-and-numpy def polyfit(x, y, degree): results = {} coeffs = np.polyfit(x, y, degree) # Polynomial Coefficients results['polynomial'] = coeffs.tolist() # r-squared p = np.poly1d(coeffs) # fit values, and mean yhat = p(x) # or [p(z) for z in x] ybar = np.sum(y)/len(y) # or sum(y)/len(y) ssreg = np.sum((yhat-ybar)**2) # or sum([ (yihat - ybar)**2 for yihat in yhat]) sstot = np.sum((y - ybar)**2) # or sum([ (yi - ybar)**2 for yi in y]) results['determination'] = ssreg / sstot return results print(df.shape) for (x,y) in [("GCcont","coverage"),("GCcont","corr_coverage")]: if x=="GCcont": heatmap, xedges, yedges = np.histogram2d(df[x]*100,df[y], bins=(100,100),range=[[0,100],[0,2]] if y=="coverage" else [[0,100],[-1,1]]) else: heatmap, xedges, yedges = np.histogram2d(df[x]*100,df[y], bins=(100,100),range=[[95,100],[0,2]] if y=="coverage" else [[95,100],[-1.5,1]]) extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]] plt.clf() myvmin=0 myvmax=1700 if "GC" in x else 100 plt.imshow(heatmap.T, extent=extent, origin='lower', aspect='auto',vmin=myvmin, vmax=myvmax) plt.ylabel("Uncorrected Coverage" if y=="coverage" else "Corrected Coverage") plt.xlabel("% GC content " if x=="GCcont" else "% Mappability") cbar=plt.colorbar() cbar.set_label('# of bins', rotation=270,labelpad=13) slope, intercept, r_value, p_value, std_err = stats.linregress(df[x],df[y]) plt.title("before correction" if y=="coverage" else "after correction") y_lim=plt.gca().get_ylim() z = np.polyfit(df[x]*100, df[y], 2) f = np.poly1d(z) if x=="GCcont": x_new = np.linspace(0,100, 50) else: x_new = np.linspace(95,100, 50) y_new = f(x_new) plt.plot(x_new, y_new, color="firebrick", linewidth=0.5, ) plt.ylim(y_lim) r2=round(polyfit(df[x],df[y],2)["determination"],3) texty=0.75 if y=="corr_coverage" else 1.75 plt.text(x=70 if "GC" in x else 96,y=texty, s=r'$R^{2}$ = %s'%(r2,),color="white") plt.locator_params(nbins=2) plt.savefig("%s_%s_vs_%s.png"%(samplename,x,y)) plt.show() plt.close()