import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
import sys

mastertable=sys.argv[1]
df=pd.read_csv(mastertable, compression='gzip')
samplename=mastertable.split("/")[-2]

# The follwing creates a heatmap for GC vs corrected / uncorrected coverage, and fits a trend line

#https://stackoverflow.com/questions/893657/how-do-i-calculate-r-squared-using-python-and-numpy
def polyfit(x, y, degree):
    results = {}

    coeffs = np.polyfit(x, y, degree)

    # Polynomial Coefficients
    results['polynomial'] = coeffs.tolist()

    # r-squared
    p = np.poly1d(coeffs)
    # fit values, and mean
    yhat = p(x)                         # or [p(z) for z in x]
    ybar = np.sum(y)/len(y)          # or sum(y)/len(y)
    ssreg = np.sum((yhat-ybar)**2)   # or sum([ (yihat - ybar)**2 for yihat in yhat])
    sstot = np.sum((y - ybar)**2)    # or sum([ (yi - ybar)**2 for yi in y])
    results['determination'] = ssreg / sstot
    return results

print(df.shape)
for (x,y) in [("GCcont","coverage"),("GCcont","corr_coverage")]:

    if x=="GCcont":
        heatmap, xedges, yedges = np.histogram2d(df[x]*100,df[y], bins=(100,100),range=[[0,100],[0,2]] if y=="coverage" else [[0,100],[-1,1]])
    else:
        heatmap, xedges, yedges = np.histogram2d(df[x]*100,df[y], bins=(100,100),range=[[95,100],[0,2]] if y=="coverage" else [[95,100],[-1.5,1]])
    extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
    plt.clf()
    myvmin=0
    myvmax=1700 if "GC" in x else 100
    plt.imshow(heatmap.T, extent=extent, origin='lower', aspect='auto',vmin=myvmin, vmax=myvmax)
    plt.ylabel("Uncorrected Coverage" if y=="coverage" else "Corrected Coverage")
    plt.xlabel("% GC content " if x=="GCcont" else "% Mappability")
    cbar=plt.colorbar()
    cbar.set_label('# of bins', rotation=270,labelpad=13)
    slope, intercept, r_value, p_value, std_err = stats.linregress(df[x],df[y])
    plt.title("before correction" if y=="coverage" else "after correction")


    y_lim=plt.gca().get_ylim()


    z = np.polyfit(df[x]*100, df[y], 2)
    f = np.poly1d(z)
    if x=="GCcont":
        x_new = np.linspace(0,100, 50)
    else:
        x_new = np.linspace(95,100, 50)
    y_new = f(x_new)
    plt.plot(x_new, y_new, color="firebrick", linewidth=0.5, )
    plt.ylim(y_lim)

    r2=round(polyfit(df[x],df[y],2)["determination"],3)
    texty=0.75 if y=="corr_coverage" else 1.75
    plt.text(x=70 if "GC" in x else 96,y=texty, s=r'$R^{2}$ = %s'%(r2,),color="white")
    plt.locator_params(nbins=2)
    plt.savefig("%s_%s_vs_%s.png"%(samplename,x,y))
    plt.show()
    plt.close()