library(MASS)
library(cqn)
library(scales)
library(RColorBrewer)
library(edgeR)

#load in the data. Takes the following form:
#chr    start   stop    full_name   exon_length gc_content  strand...
#cols 8-52 are read counts per individual, as indicated by ind_id below
data=read.table('/HGDP_RNAseq/scripts/expression_normalization/supplemental_script_input_files/exon_counts_all_gc_uniq.txt', header=F) #all exons with unique start and stop positions

counts=data[,8:48]
ind_id=c("HGDP00854","HGDP00855","HGDP00856","HGDP00857","HGDP00858","HGDP00868","HGDP00877","HGDP00948","HGDP00950","HGDP00955","HGDP00959","HGDP00964","HGDP00967","HGDP00711","HGDP00712","HGDP00713","HGDP00715","HGDP00716","HGDP00720","HGDP00721","HGDP00213","HGDP00222","HGDP00232","HGDP00237","HGDP00239","HGDP00247","HGDP00258","HGDP01258","HGDP01259","HGDP01262","HGDP01264","HGDP01274","HGDP01275","HGDP01277","HGDP00449","HGDP00456","HGDP00462","HGDP00467","HGDP00471","HGDP00476","HGDP01081")
colnames(counts)=ind_id
counts=data.frame(counts)
covs=data.frame(length=data[,5], gccontent=data[,6]) #make a covariates data frame
rownames(covs)=data[,4] #counts and covariates are in the same order, so give them the same row names
rownames(counts)=data[,4]

#total_reads come from final picard mapped pairs... should be consistent for all analyses (transcript, exon, gene)
total_reads=c(39275189, 17202597, 22598945, 39247971, 41307705, 27323525, 26629931, 18116664, 22685944, 31631509, 35815108, 26659324, 28186520, 36848816, 29573044, 33008344, 44825592, 40771740, 23401330, 23456356, 35469906, 17613885, 35707350, 20848854, 32352307, 24613091, 22864566, 12076284, 25682694, 33715502, 37801519, 28123051, 37081363, 29665178, 36222536, 28897707, 19436014, 34953807, 42358732, 20939621, 27040536)
names(total_reads)=c("HGDP00854","HGDP00855","HGDP00856","HGDP00857","HGDP00858","HGDP00868","HGDP00877","HGDP00948","HGDP00950","HGDP00955","HGDP00959","HGDP00964","HGDP00967","HGDP00711","HGDP00712","HGDP00713","HGDP00715","HGDP00716","HGDP00720","HGDP00721","HGDP00213","HGDP00222","HGDP00232","HGDP00237","HGDP00239","HGDP00247","HGDP00258","HGDP01258","HGDP01259","HGDP01262","HGDP01264","HGDP01274","HGDP01275","HGDP01277","HGDP00449","HGDP00456","HGDP00462","HGDP00467","HGDP00471","HGDP00476","HGDP01081")
nhets <- c(1592701, 1514683, 1608281, 1459374, 1562783, 1703872, 1667675, 1739368, 1728192, 1697423, 1643000, 1737060, 1713637, 1754825, 1693280, 1798134, 1787883, 1751286, 1809665, 1718554, 1914989, 1939435, 1901192, 1893994, 1894682, 1787932, 1836691, 1797353, 1959229, 2079354, 2034194, 1947736, 2063233, 2012364, 1926280, 2272609, 2317831, 2398421, 2338970, 2255041, 2347218)

#run conditional quantile normalization
date(); cqn_hgdp <- cqn(counts, lengths = covs$length,x = covs$gccontent, sizeFactors = total_reads,verbose = TRUE); date()

#make color vector for plotting
brewerVector=c(colors()[c(466)],"red","orange","yellow","darkgreen","turquoise")
colorPop<-c(rep(brewerVector[1],7),rep(brewerVector[2],6),rep(brewerVector[3],7),rep(brewerVector[4],7),rep(brewerVector[5],7),rep(brewerVector[6],7))

#plot GC bias and exon length bias (quantile regression fits that estimate f_i,j, which are sample-dependent systematic biases... I believe these are eigenvalues)
#see eqn in section 4. Methods from cqn paper for further explanation
pdf('gc_length_exon_biases.pdf', onefile=T)
par(mar=c(7,5,7,7))
cqnplot(cqn_hgdp, n=1, xlab="GC content", lty=1, ylim=c(0,6), col=colorPop, main='GC Content Effect', cex.lab=1.4, cex.axis=1.5, cex.main=2, bty='n')
legend('topright', col=brewerVector, lty=1, lwd=2, c('Maya', 'Yakut', 'Cambodian', 'Pathan', 'Mozabite', 'Mbuti'), bty='n')
cqnplot(cqn_hgdp, n=2, xlab="Exon length", lty=1, ylim=c(-2,7), col=colorPop, main='Exon Length Effect',cex.lab=1.4, cex.axis=1.5, cex.main=2, bty='n')
legend('topleft', col=brewerVector, lty=1, lwd=2, c('Maya', 'Yakut', 'Cambodian', 'Pathan', 'Mozabite', 'Mbuti'), bty='n')
#plot exon lengths at knots, where knots are quantiles at 2.5%, 25%, 50%, 75%, and 97.5% (I can't figure out the rest of the scale yet... log2 based)
for(i in 1:5) {
    text(cqn_hgdp$knots2[i], -1.8, round(quantile(covs$length, c(0.025, .25, .5, .75, .975)))[i], cex=0.8)
}
dev.off()

#normalized expression values:
FPKM_cqn_hgdp=cqn_hgdp$y + cqn_hgdp$offset
#standard FPKM calculation:
reads_per_million = sweep(log2(counts + 1), 2, log2(total_reads/10^6))
FPKM_std_hgdp = sweep(reads_per_million, 1, log2(covs$length / 10^3))

maya=c("HGDP00854","HGDP00855","HGDP00856","HGDP00857","HGDP00858","HGDP00868","HGDP00877")
yakut=c("HGDP00948","HGDP00950","HGDP00955","HGDP00959","HGDP00964","HGDP00967")
cambodian=c("HGDP00711","HGDP00712","HGDP00713","HGDP00715","HGDP00716","HGDP00720","HGDP00721")
pathan=c("HGDP00213","HGDP00222","HGDP00232","HGDP00237","HGDP00239","HGDP00247","HGDP00258")
mozabite=c("HGDP01258","HGDP01259","HGDP01262","HGDP01264","HGDP01274","HGDP01275","HGDP01277")
mbuti=c("HGDP00449","HGDP00456","HGDP00462","HGDP00467","HGDP00471","HGDP00476","HGDP01081")

#MA plotting function
plot_group=function(group1, group2, name1, name2) {
    whGenes <- which(rowMeans(FPKM_std_hgdp) >= 2 & covs$length >= 100)
    M.std <- rowMeans(FPKM_std_hgdp[whGenes, group1]) - rowMeans(FPKM_std_hgdp[whGenes, group2])
    A.std <- rowMeans(FPKM_std_hgdp[whGenes,])
    M.cqn <- rowMeans(FPKM_cqn_hgdp[whGenes, group1]) - rowMeans(FPKM_cqn_hgdp[whGenes, group2])
    A.cqn <- rowMeans(FPKM_cqn_hgdp[whGenes,])
    
    pdf(paste('MA_', name1, '_', name2, '.pdf', sep=''), onefile=T)
    plot(A.std, M.std, cex = 0.5, pch = 16, xlab = "A (average expression)", ylab = "M (distribution of over/underexpression)", main = "Standard FPKM", ylim = c(-4,4), xlim = c(0,12), col = alpha("black", 0.25))
    mtext(paste('Comparison: ', name1, ', ', name2, sep=''))
    abline(h=0, col='blue')
    fit=loess(M.std~A.std)
    lines(predict(fit), col='red')
    plot(A.cqn, M.cqn, cex = 0.5, pch = 16, xlab = "A (average expression)", ylab = "M (distribution of over/underexpression)", main = "CQN normalized FPKM", ylim = c(-4,4), xlim = c(0,12), col = alpha("black", 0.25))
    mtext(paste('Comparison: ', name1, ', ', name2, sep=''))
    abline(h=0, col='blue')
    fit=loess(M.cqn~A.cqn)
    lines(predict(fit), col='red')
    dev.off()
}

#plot MA plots for all 21 pairwise combos
plot_group(maya, yakut, 'Maya', 'Yakut')
plot_group(yakut, cambodian, 'Yakut', 'Cambodian')
plot_group(cambodian, pathan, 'Cambodian', 'Pathan')
plot_group(pathan, mozabite, 'Pathan', 'Mozabite')
plot_group(mozabite, mbuti, 'Mozabite', 'Mbuti')
#plot_group(mbuti, san, 'Mbuti', 'San')
plot_group(maya, cambodian, 'Maya', 'Cambodian')
plot_group(yakut, pathan, 'Yakut', 'Pathan')
plot_group(cambodian, mozabite, 'Cambodian', 'Mozabite')
plot_group(pathan, mbuti, 'Pathan', 'Mbuti')
#plot_group(mozabite, san, 'Mozabite', 'San')
plot_group(maya, pathan, 'Maya', 'Pathan')
plot_group(yakut, mozabite, 'Yakut', 'Mozabite')
plot_group(cambodian, mbuti, 'Cambodian', 'Mbuti')
#plot_group(pathan, san, 'Pathan', 'San')
plot_group(maya, mozabite, 'Maya', 'Mozabite')
plot_group(yakut, mbuti, 'Yakut', 'Mbuti')
#plot_group(cambodian, san, 'Cambodian', 'San')
plot_group(maya, mbuti, 'Maya', 'Mbuti')
#plot_group(yakut, san, 'Yakut', 'San')
#plot_group(maya, san, 'Maya', 'San')

#get limits for standard FPKM and cqn expression distribution plots
ymax=0
xmin=0
xmax=0
for(i in 1:41) {
    ymax=max(ymax, density(FPKM_std_hgdp[whGenes,i])$y)
    xmin=min(xmin, density(FPKM_std_hgdp[whGenes,i])$x)
    xmax=max(xmax, density(FPKM_std_hgdp[whGenes,i])$x)
}

#figure 3a and 3b, raw (standard) and cqn distributions
pdf('raw_exon_distribution.pdf')
par(mar=c(7,5,7,7))
plot(0,0, type='n', xlim=c(xmin, xmax), ylim=c(0, ymax), xlab=expression(log[2]~"(Expression (FPKM))"), ylab='Density', main='Raw Expression Distribution', cex.lab=1.4, cex.axis=1.5, cex.main=2, bty='n')
for(i in 1:41) {
    lines(density(FPKM_std_hgdp[whGenes,i]), col=colorPop[i])
}
legend('topright', col=brewerVector, lty=1, lwd=2, c('Maya', 'Yakut', 'Cambodian', 'Pathan', 'Mozabite', 'Mbuti'), bty='n')
dev.off()

pdf('cqn_exon_distribution.pdf')
par(mar=c(7,5,7,7))
plot(0,0, type='n', xlim=c(xmin, xmax), ylim=c(0, ymax), xlab=expression(log[2]~"(Expression (CQN FPKM))"), ylab='Density', main='CQN Expression Distribution', cex.lab=1.4, cex.axis=1.5, cex.main=2, bty='n')
for(i in 1:41) {
    lines(density(FPKM_cqn_hgdp[whGenes,i]), col=colorPop[i])
}
legend('topright', col=brewerVector, lty=1, lwd=2, c('Maya', 'Yakut', 'Cambodian', 'Pathan', 'Mozabite', 'Mbuti'), bty='n')
dev.off()

#figure 3e and 3f, plot randomized pair comparisons when using standard and cqn expression distributions
#because these are randomized, we expect an overall 0 fold change
rand_grp=sample(ind_id)

grp1=rand_grp[1:22]
grp2=rand_grp[23:44]
whGenes <- which(rowMeans(FPKM_std_hgdp) >= 2 & covs$length >= 100)
M.std <- rowMeans(FPKM_std_hgdp[whGenes, grp1]) - rowMeans(FPKM_std_hgdp[whGenes, grp2])
A.std <- rowMeans(FPKM_std_hgdp[whGenes,])
gc.std <- covs[whGenes,2]
M.cqn <- rowMeans(FPKM_cqn_hgdp[whGenes, grp1]) - rowMeans(FPKM_cqn_hgdp[whGenes, grp2])
A.cqn <- rowMeans(FPKM_cqn_hgdp[whGenes,])

png('fold_change_vs_gc_std.png')
plot(gc.std, M.std, cex = 0.5, pch = 16, xlab = "GC content", ylab =expression(log[2]~"(Fold Change)"), main = "Standard FPKM", ylim = c(min(M.std),max(M.std)), xlim = c(min(gc.std),max(gc.std)), col = alpha("black", 0.25), cex.lab=1.4, cex.axis=1.5, cex.main=2, bty='n')
mtext('Random sampling of group 1 vs group 2')
abline(h=0, col='blue')
dev.off()

png('fold_change_vs_gc_cqn.png')
plot(gc.std, M.cqn, cex = 0.5, pch = 16, xlab = "GC content", ylab =expression(log[2]~"(Fold Change)"), main = "CQN FPKM", ylim = c(min(M.cqn),max(M.cqn)), xlim = c(min(gc.std),max(gc.std)), col = alpha("black", 0.25), cex.lab=1.4, cex.axis=1.5, cex.main=2, bty='n')
mtext('Random sampling of group 1 vs group 2')
abline(h=0, col='blue')
dev.off()

#figure 1a, raw counts distributions
ymax=0
xmax=0
for(i in 1:41) {
    current=hist(log2(counts[,i] + 1))
    ymax=max(ymax,current$counts)
    xmax=max(xmax,current$breaks)
}

pdf('count_distribution.pdf')
plot(0,0, 'n', ylim=c(0, ymax), xlim=c(0,xmax), xlab=expression(log[2]~"(Number of reads)"), ylab='Frequency', main='Count distribution')
for(i in 1:41) {
    current=hist(log2(counts[,i] + 1), plot=F)
    lines(current$breaks[1:length(current$breaks)-1], current$counts, col=colorPop[i])
}
legend('topright', col=brewerVector, lty=1, lwd=2, c('Maya', 'Yakut', 'Cambodian', 'Pathan', 'Mozabite', 'Mbuti'))
dev.off()

##############################################
################edgeR#########################

pop<-factor(c(rep("Maya",7),rep("Yakut",6),rep("Cambodia",7),rep("Pathan",7),rep("Mozabite",7),rep("Mbuti",7)), levels=c("Maya", "Yakut", "Cambodia", "Pathan", "Mozabite", "Mbuti"))

d <- DGEList(counts = counts[whGenes,], lib.size = total_reads, group=pop, genes=covs[whGenes,])
design <- model.matrix(~ d$sample$group) #study design matrix
#estimate dispersion: 1 common dispersion per individual, a trended dispersion, and a per exon/gene/transcript/tag dispersion
d_common <- estimateGLMCommonDisp(d, design = design, offset = cqn_hgdp$offset[whGenes,])
d_trended <- estimateGLMTrendedDisp(d_common, design = design, offset = cqn_hgdp$offset[whGenes,])
d_tagwise <- estimateGLMTagwiseDisp(d_trended, design = design, offset = cqn_hgdp$offset[whGenes,])
#d_tagwise2 <- estimateGLMTagwiseDisp(d_common, design = design, offset = cqn_hgdp$offset[whGenes,])
#fit a GLM to the data using each dispersion estimate, using the cqn normalization offset
fit.common <- glmFit(d_common, design = design, offset = cqn_hgdp$offset[whGenes,])
fit.trended <- glmFit(d_trended, design = design, offset = cqn_hgdp$offset[whGenes,])
fit.tagwise <- glmFit(d_tagwise, design = design, offset = cqn_hgdp$offset[whGenes,])

lrt.all_pops <- glmLRT(fit.tagwise, coef=ncol(fit.tagwise$design))

write.table(lrt.all_pops$table, 'lrt_all_pops.txt', row.names=T, col.names=T, quote=F, sep='\t')
ordered_table <- topTags(lrt.all_pops, dim(lrt.all_pops$table)[1])$table
write.table(subset(ordered_table, ordered_table$FDR < 0.05), 'top_tags_lrt_all_pops_fdr05.txt', row.names=T, col.names=T, quote=F, sep='\t')

subset(ordered_table, ordered_table[,4] < bonferoni)
library(qvalue)
fdr_01 <- qvalue(ordered_table[,4])

bonferoni <- .05/length(whGenes)
#qqplots for each dispersion estiamte
pdf('qqnorm_dispersion.pdf', onefile=T)
qqnorm((fit.common$deviance - mean(fit.common$deviance))/sd(fit.common$deviance), bty='n', cex.lab=1.4, cex.axis=1.5, cex.main=2, main='Common')
abline(0,1, col='red')

qqnorm((fit.trended$deviance - mean(fit.trended$deviance))/sd(fit.trended$deviance), bty='n', cex.lab=1.4, cex.axis=1.5, cex.main=2, main='Trended')
abline(0,1, col='red')

qqnorm((fit.tagwise$deviance - mean(fit.tagwise$deviance))/sd(fit.tagwise$deviance), bty='n', cex.lab=1.4, cex.axis=1.5, cex.main=2, main='Tagwise')
abline(0,1, col='red')
dev.off()

#order is alphabetical: Cambodian, Maya, Mbuti, Mozabite, Pathan, San, Yakut
et_yak_maya <- exactTest(d_tagwise, pair=c('Yakut', 'Maya'))
et_cam_yak<- exactTest(d_tagwise, pair=c('Cambodia', 'Yakut'))
et_path_cam<- exactTest(d_tagwise, pair=c('Pathan', 'Cambodia'))
et_moz_path<- exactTest(d_tagwise, pair=c('Mozabite', 'Pathan'))
et_mbu_moz<- exactTest(d_tagwise, pair=c('Mbuti', 'Mozabite'))
#et_san_mbu <- exactTest(d_tagwise, pair=c('San', 'Mbuti'))

et_cam_maya <- exactTest(d_tagwise, pair=c('Cambodia', 'Maya'))
et_path_yak <- exactTest(d_tagwise, pair=c('Pathan', 'Yakut'))
et_moz_cam <- exactTest(d_tagwise, pair=c('Mozabite', 'Cambodia'))
et_mbu_path <- exactTest(d_tagwise, pair=c('Mbuti', 'Pathan'))
#et_san_moz <- exactTest(d_tagwise, pair=c('San', 'Mozabite'))

et_path_maya <- exactTest(d_tagwise, pair=c('Pathan', 'Maya'))
et_moz_yak <- exactTest(d_tagwise, pair=c('Mozabite', 'Yakut'))
et_mbu_cam <- exactTest(d_tagwise, pair=c('Mbuti', 'Cambodia'))
#et_san_path <- exactTest(d_tagwise, pair=c('San', 'Pathan'))

et_moz_maya <- exactTest(d_tagwise, pair=c('Mozabite', 'Maya'))
et_mbu_yak <- exactTest(d_tagwise, pair=c('Mbuti', 'Yakut'))
#et_san_cam <- exactTest(d_tagwise, pair=c('San', 'Cambodia'))

et_mbu_maya <- exactTest(d_tagwise, pair=c('Mbuti', 'Maya'))
#et_san_yak <- exactTest(d_tagwise, pair=c('San', 'Yakut'))

#et_san_maya <- exactTest(d_tagwise, pair=c('San', 'Maya'))

subset(et_yak_maya$table, et_yak_maya$table[,3] < bonferoni)
subset(et_cam_yak$table, et_cam_yak$table[,3] < bonferoni)
subset(et_path_cam$table, et_path_cam$table[,3] < bonferoni)
subset(et_moz_path$table, et_moz_path$table[,3] < bonferoni)
subset(et_mbu_moz$table, et_mbu_moz$table[,3] < bonferoni)
subset(et_cam_maya$table, et_cam_maya$table[,3] < bonferoni)
subset(et_path_yak$table, et_path_yak$table[,3] < bonferoni)
subset(et_moz_cam$table, et_moz_cam$table[,3] < bonferoni)
subset(et_mbu_path$table, et_mbu_path$table[,3] < bonferoni)
subset(et_path_maya$table, et_path_maya$table[,3] < bonferoni)
subset(et_moz_yak$table, et_moz_yak$table[,3] < bonferoni)
subset(et_mbu_cam$table, et_mbu_cam$table[,3] < bonferoni)
subset(et_moz_maya$table, et_moz_maya$table[,3] < bonferoni)
subset(et_mbu_yak$table, et_mbu_yak$table[,3] < bonferoni)
subset(et_mbu_maya$table, et_mbu_maya$table[,3] < bonferoni)

[order(lrt.mbu_maya$table[,4]),]

save.image('cqn_no_san.Rdata')
topTags(lrt.maya_yakut)
summary(decideTestsDGE(lrt.maya_yakut))

#correlate expression (spearman), mds, determine whether population structure exists in normalized expression data
pdf('src_nonmetric_mds_expression_no_san.pdf')
correlation <- cor(FPKM_cqn_hgdp[,1:41])
d <- dist(correlation)
mds <- isoMDS(d, k=2)
par(mar=c(7,5,7,7))
plot(mds$points[,1], mds$points[,2], xlab='Coordinate 1', ylab='Coordinate 2', main='Expression MDS', pch=21, cex=2, bg=colorPop[1:41], col="black", bty='n', cex.main=2, cex.lab=1.5, cex.axis=1.5)
legend('topright', c('Maya', 'Yakut', 'Cambodian', 'Pathan', 'Mozabite', 'Mbuti'), col="black", pt.bg=unique(colorPop)[1:6], pch=21, bty='n')
dev.off()

cor.test(mds$points[,1], nhets[1:41], method='spearman')