library(MASS) library(cqn) library(scales) library(RColorBrewer) library(edgeR) #load in the data. Takes the following form: #chr start stop full_name exon_length gc_content strand... #cols 8-52 are read counts per individual, as indicated by ind_id below data=read.table('/HGDP_RNAseq/scripts/expression_normalization/supplemental_script_input_files/exon_counts_all_gc_uniq.txt', header=F) #all exons with unique start and stop positions counts=data[,8:48] ind_id=c("HGDP00854","HGDP00855","HGDP00856","HGDP00857","HGDP00858","HGDP00868","HGDP00877","HGDP00948","HGDP00950","HGDP00955","HGDP00959","HGDP00964","HGDP00967","HGDP00711","HGDP00712","HGDP00713","HGDP00715","HGDP00716","HGDP00720","HGDP00721","HGDP00213","HGDP00222","HGDP00232","HGDP00237","HGDP00239","HGDP00247","HGDP00258","HGDP01258","HGDP01259","HGDP01262","HGDP01264","HGDP01274","HGDP01275","HGDP01277","HGDP00449","HGDP00456","HGDP00462","HGDP00467","HGDP00471","HGDP00476","HGDP01081") colnames(counts)=ind_id counts=data.frame(counts) covs=data.frame(length=data[,5], gccontent=data[,6]) #make a covariates data frame rownames(covs)=data[,4] #counts and covariates are in the same order, so give them the same row names rownames(counts)=data[,4] #total_reads come from final picard mapped pairs... should be consistent for all analyses (transcript, exon, gene) total_reads=c(39275189, 17202597, 22598945, 39247971, 41307705, 27323525, 26629931, 18116664, 22685944, 31631509, 35815108, 26659324, 28186520, 36848816, 29573044, 33008344, 44825592, 40771740, 23401330, 23456356, 35469906, 17613885, 35707350, 20848854, 32352307, 24613091, 22864566, 12076284, 25682694, 33715502, 37801519, 28123051, 37081363, 29665178, 36222536, 28897707, 19436014, 34953807, 42358732, 20939621, 27040536) names(total_reads)=c("HGDP00854","HGDP00855","HGDP00856","HGDP00857","HGDP00858","HGDP00868","HGDP00877","HGDP00948","HGDP00950","HGDP00955","HGDP00959","HGDP00964","HGDP00967","HGDP00711","HGDP00712","HGDP00713","HGDP00715","HGDP00716","HGDP00720","HGDP00721","HGDP00213","HGDP00222","HGDP00232","HGDP00237","HGDP00239","HGDP00247","HGDP00258","HGDP01258","HGDP01259","HGDP01262","HGDP01264","HGDP01274","HGDP01275","HGDP01277","HGDP00449","HGDP00456","HGDP00462","HGDP00467","HGDP00471","HGDP00476","HGDP01081") nhets <- c(1592701, 1514683, 1608281, 1459374, 1562783, 1703872, 1667675, 1739368, 1728192, 1697423, 1643000, 1737060, 1713637, 1754825, 1693280, 1798134, 1787883, 1751286, 1809665, 1718554, 1914989, 1939435, 1901192, 1893994, 1894682, 1787932, 1836691, 1797353, 1959229, 2079354, 2034194, 1947736, 2063233, 2012364, 1926280, 2272609, 2317831, 2398421, 2338970, 2255041, 2347218) #run conditional quantile normalization date(); cqn_hgdp <- cqn(counts, lengths = covs$length,x = covs$gccontent, sizeFactors = total_reads,verbose = TRUE); date() #make color vector for plotting brewerVector=c(colors()[c(466)],"red","orange","yellow","darkgreen","turquoise") colorPop<-c(rep(brewerVector[1],7),rep(brewerVector[2],6),rep(brewerVector[3],7),rep(brewerVector[4],7),rep(brewerVector[5],7),rep(brewerVector[6],7)) #plot GC bias and exon length bias (quantile regression fits that estimate f_i,j, which are sample-dependent systematic biases... I believe these are eigenvalues) #see eqn in section 4. Methods from cqn paper for further explanation pdf('gc_length_exon_biases.pdf', onefile=T) par(mar=c(7,5,7,7)) cqnplot(cqn_hgdp, n=1, xlab="GC content", lty=1, ylim=c(0,6), col=colorPop, main='GC Content Effect', cex.lab=1.4, cex.axis=1.5, cex.main=2, bty='n') legend('topright', col=brewerVector, lty=1, lwd=2, c('Maya', 'Yakut', 'Cambodian', 'Pathan', 'Mozabite', 'Mbuti'), bty='n') cqnplot(cqn_hgdp, n=2, xlab="Exon length", lty=1, ylim=c(-2,7), col=colorPop, main='Exon Length Effect',cex.lab=1.4, cex.axis=1.5, cex.main=2, bty='n') legend('topleft', col=brewerVector, lty=1, lwd=2, c('Maya', 'Yakut', 'Cambodian', 'Pathan', 'Mozabite', 'Mbuti'), bty='n') #plot exon lengths at knots, where knots are quantiles at 2.5%, 25%, 50%, 75%, and 97.5% (I can't figure out the rest of the scale yet... log2 based) for(i in 1:5) { text(cqn_hgdp$knots2[i], -1.8, round(quantile(covs$length, c(0.025, .25, .5, .75, .975)))[i], cex=0.8) } dev.off() #normalized expression values: FPKM_cqn_hgdp=cqn_hgdp$y + cqn_hgdp$offset #standard FPKM calculation: reads_per_million = sweep(log2(counts + 1), 2, log2(total_reads/10^6)) FPKM_std_hgdp = sweep(reads_per_million, 1, log2(covs$length / 10^3)) maya=c("HGDP00854","HGDP00855","HGDP00856","HGDP00857","HGDP00858","HGDP00868","HGDP00877") yakut=c("HGDP00948","HGDP00950","HGDP00955","HGDP00959","HGDP00964","HGDP00967") cambodian=c("HGDP00711","HGDP00712","HGDP00713","HGDP00715","HGDP00716","HGDP00720","HGDP00721") pathan=c("HGDP00213","HGDP00222","HGDP00232","HGDP00237","HGDP00239","HGDP00247","HGDP00258") mozabite=c("HGDP01258","HGDP01259","HGDP01262","HGDP01264","HGDP01274","HGDP01275","HGDP01277") mbuti=c("HGDP00449","HGDP00456","HGDP00462","HGDP00467","HGDP00471","HGDP00476","HGDP01081") #MA plotting function plot_group=function(group1, group2, name1, name2) { whGenes <- which(rowMeans(FPKM_std_hgdp) >= 2 & covs$length >= 100) M.std <- rowMeans(FPKM_std_hgdp[whGenes, group1]) - rowMeans(FPKM_std_hgdp[whGenes, group2]) A.std <- rowMeans(FPKM_std_hgdp[whGenes,]) M.cqn <- rowMeans(FPKM_cqn_hgdp[whGenes, group1]) - rowMeans(FPKM_cqn_hgdp[whGenes, group2]) A.cqn <- rowMeans(FPKM_cqn_hgdp[whGenes,]) pdf(paste('MA_', name1, '_', name2, '.pdf', sep=''), onefile=T) plot(A.std, M.std, cex = 0.5, pch = 16, xlab = "A (average expression)", ylab = "M (distribution of over/underexpression)", main = "Standard FPKM", ylim = c(-4,4), xlim = c(0,12), col = alpha("black", 0.25)) mtext(paste('Comparison: ', name1, ', ', name2, sep='')) abline(h=0, col='blue') fit=loess(M.std~A.std) lines(predict(fit), col='red') plot(A.cqn, M.cqn, cex = 0.5, pch = 16, xlab = "A (average expression)", ylab = "M (distribution of over/underexpression)", main = "CQN normalized FPKM", ylim = c(-4,4), xlim = c(0,12), col = alpha("black", 0.25)) mtext(paste('Comparison: ', name1, ', ', name2, sep='')) abline(h=0, col='blue') fit=loess(M.cqn~A.cqn) lines(predict(fit), col='red') dev.off() } #plot MA plots for all 21 pairwise combos plot_group(maya, yakut, 'Maya', 'Yakut') plot_group(yakut, cambodian, 'Yakut', 'Cambodian') plot_group(cambodian, pathan, 'Cambodian', 'Pathan') plot_group(pathan, mozabite, 'Pathan', 'Mozabite') plot_group(mozabite, mbuti, 'Mozabite', 'Mbuti') #plot_group(mbuti, san, 'Mbuti', 'San') plot_group(maya, cambodian, 'Maya', 'Cambodian') plot_group(yakut, pathan, 'Yakut', 'Pathan') plot_group(cambodian, mozabite, 'Cambodian', 'Mozabite') plot_group(pathan, mbuti, 'Pathan', 'Mbuti') #plot_group(mozabite, san, 'Mozabite', 'San') plot_group(maya, pathan, 'Maya', 'Pathan') plot_group(yakut, mozabite, 'Yakut', 'Mozabite') plot_group(cambodian, mbuti, 'Cambodian', 'Mbuti') #plot_group(pathan, san, 'Pathan', 'San') plot_group(maya, mozabite, 'Maya', 'Mozabite') plot_group(yakut, mbuti, 'Yakut', 'Mbuti') #plot_group(cambodian, san, 'Cambodian', 'San') plot_group(maya, mbuti, 'Maya', 'Mbuti') #plot_group(yakut, san, 'Yakut', 'San') #plot_group(maya, san, 'Maya', 'San') #get limits for standard FPKM and cqn expression distribution plots ymax=0 xmin=0 xmax=0 for(i in 1:41) { ymax=max(ymax, density(FPKM_std_hgdp[whGenes,i])$y) xmin=min(xmin, density(FPKM_std_hgdp[whGenes,i])$x) xmax=max(xmax, density(FPKM_std_hgdp[whGenes,i])$x) } #figure 3a and 3b, raw (standard) and cqn distributions pdf('raw_exon_distribution.pdf') par(mar=c(7,5,7,7)) plot(0,0, type='n', xlim=c(xmin, xmax), ylim=c(0, ymax), xlab=expression(log[2]~"(Expression (FPKM))"), ylab='Density', main='Raw Expression Distribution', cex.lab=1.4, cex.axis=1.5, cex.main=2, bty='n') for(i in 1:41) { lines(density(FPKM_std_hgdp[whGenes,i]), col=colorPop[i]) } legend('topright', col=brewerVector, lty=1, lwd=2, c('Maya', 'Yakut', 'Cambodian', 'Pathan', 'Mozabite', 'Mbuti'), bty='n') dev.off() pdf('cqn_exon_distribution.pdf') par(mar=c(7,5,7,7)) plot(0,0, type='n', xlim=c(xmin, xmax), ylim=c(0, ymax), xlab=expression(log[2]~"(Expression (CQN FPKM))"), ylab='Density', main='CQN Expression Distribution', cex.lab=1.4, cex.axis=1.5, cex.main=2, bty='n') for(i in 1:41) { lines(density(FPKM_cqn_hgdp[whGenes,i]), col=colorPop[i]) } legend('topright', col=brewerVector, lty=1, lwd=2, c('Maya', 'Yakut', 'Cambodian', 'Pathan', 'Mozabite', 'Mbuti'), bty='n') dev.off() #figure 3e and 3f, plot randomized pair comparisons when using standard and cqn expression distributions #because these are randomized, we expect an overall 0 fold change rand_grp=sample(ind_id) grp1=rand_grp[1:22] grp2=rand_grp[23:44] whGenes <- which(rowMeans(FPKM_std_hgdp) >= 2 & covs$length >= 100) M.std <- rowMeans(FPKM_std_hgdp[whGenes, grp1]) - rowMeans(FPKM_std_hgdp[whGenes, grp2]) A.std <- rowMeans(FPKM_std_hgdp[whGenes,]) gc.std <- covs[whGenes,2] M.cqn <- rowMeans(FPKM_cqn_hgdp[whGenes, grp1]) - rowMeans(FPKM_cqn_hgdp[whGenes, grp2]) A.cqn <- rowMeans(FPKM_cqn_hgdp[whGenes,]) png('fold_change_vs_gc_std.png') plot(gc.std, M.std, cex = 0.5, pch = 16, xlab = "GC content", ylab =expression(log[2]~"(Fold Change)"), main = "Standard FPKM", ylim = c(min(M.std),max(M.std)), xlim = c(min(gc.std),max(gc.std)), col = alpha("black", 0.25), cex.lab=1.4, cex.axis=1.5, cex.main=2, bty='n') mtext('Random sampling of group 1 vs group 2') abline(h=0, col='blue') dev.off() png('fold_change_vs_gc_cqn.png') plot(gc.std, M.cqn, cex = 0.5, pch = 16, xlab = "GC content", ylab =expression(log[2]~"(Fold Change)"), main = "CQN FPKM", ylim = c(min(M.cqn),max(M.cqn)), xlim = c(min(gc.std),max(gc.std)), col = alpha("black", 0.25), cex.lab=1.4, cex.axis=1.5, cex.main=2, bty='n') mtext('Random sampling of group 1 vs group 2') abline(h=0, col='blue') dev.off() #figure 1a, raw counts distributions ymax=0 xmax=0 for(i in 1:41) { current=hist(log2(counts[,i] + 1)) ymax=max(ymax,current$counts) xmax=max(xmax,current$breaks) } pdf('count_distribution.pdf') plot(0,0, 'n', ylim=c(0, ymax), xlim=c(0,xmax), xlab=expression(log[2]~"(Number of reads)"), ylab='Frequency', main='Count distribution') for(i in 1:41) { current=hist(log2(counts[,i] + 1), plot=F) lines(current$breaks[1:length(current$breaks)-1], current$counts, col=colorPop[i]) } legend('topright', col=brewerVector, lty=1, lwd=2, c('Maya', 'Yakut', 'Cambodian', 'Pathan', 'Mozabite', 'Mbuti')) dev.off() ############################################## ################edgeR######################### pop<-factor(c(rep("Maya",7),rep("Yakut",6),rep("Cambodia",7),rep("Pathan",7),rep("Mozabite",7),rep("Mbuti",7)), levels=c("Maya", "Yakut", "Cambodia", "Pathan", "Mozabite", "Mbuti")) d <- DGEList(counts = counts[whGenes,], lib.size = total_reads, group=pop, genes=covs[whGenes,]) design <- model.matrix(~ d$sample$group) #study design matrix #estimate dispersion: 1 common dispersion per individual, a trended dispersion, and a per exon/gene/transcript/tag dispersion d_common <- estimateGLMCommonDisp(d, design = design, offset = cqn_hgdp$offset[whGenes,]) d_trended <- estimateGLMTrendedDisp(d_common, design = design, offset = cqn_hgdp$offset[whGenes,]) d_tagwise <- estimateGLMTagwiseDisp(d_trended, design = design, offset = cqn_hgdp$offset[whGenes,]) #d_tagwise2 <- estimateGLMTagwiseDisp(d_common, design = design, offset = cqn_hgdp$offset[whGenes,]) #fit a GLM to the data using each dispersion estimate, using the cqn normalization offset fit.common <- glmFit(d_common, design = design, offset = cqn_hgdp$offset[whGenes,]) fit.trended <- glmFit(d_trended, design = design, offset = cqn_hgdp$offset[whGenes,]) fit.tagwise <- glmFit(d_tagwise, design = design, offset = cqn_hgdp$offset[whGenes,]) lrt.all_pops <- glmLRT(fit.tagwise, coef=ncol(fit.tagwise$design)) write.table(lrt.all_pops$table, 'lrt_all_pops.txt', row.names=T, col.names=T, quote=F, sep='\t') ordered_table <- topTags(lrt.all_pops, dim(lrt.all_pops$table)[1])$table write.table(subset(ordered_table, ordered_table$FDR < 0.05), 'top_tags_lrt_all_pops_fdr05.txt', row.names=T, col.names=T, quote=F, sep='\t') subset(ordered_table, ordered_table[,4] < bonferoni) library(qvalue) fdr_01 <- qvalue(ordered_table[,4]) bonferoni <- .05/length(whGenes) #qqplots for each dispersion estiamte pdf('qqnorm_dispersion.pdf', onefile=T) qqnorm((fit.common$deviance - mean(fit.common$deviance))/sd(fit.common$deviance), bty='n', cex.lab=1.4, cex.axis=1.5, cex.main=2, main='Common') abline(0,1, col='red') qqnorm((fit.trended$deviance - mean(fit.trended$deviance))/sd(fit.trended$deviance), bty='n', cex.lab=1.4, cex.axis=1.5, cex.main=2, main='Trended') abline(0,1, col='red') qqnorm((fit.tagwise$deviance - mean(fit.tagwise$deviance))/sd(fit.tagwise$deviance), bty='n', cex.lab=1.4, cex.axis=1.5, cex.main=2, main='Tagwise') abline(0,1, col='red') dev.off() #order is alphabetical: Cambodian, Maya, Mbuti, Mozabite, Pathan, San, Yakut et_yak_maya <- exactTest(d_tagwise, pair=c('Yakut', 'Maya')) et_cam_yak<- exactTest(d_tagwise, pair=c('Cambodia', 'Yakut')) et_path_cam<- exactTest(d_tagwise, pair=c('Pathan', 'Cambodia')) et_moz_path<- exactTest(d_tagwise, pair=c('Mozabite', 'Pathan')) et_mbu_moz<- exactTest(d_tagwise, pair=c('Mbuti', 'Mozabite')) #et_san_mbu <- exactTest(d_tagwise, pair=c('San', 'Mbuti')) et_cam_maya <- exactTest(d_tagwise, pair=c('Cambodia', 'Maya')) et_path_yak <- exactTest(d_tagwise, pair=c('Pathan', 'Yakut')) et_moz_cam <- exactTest(d_tagwise, pair=c('Mozabite', 'Cambodia')) et_mbu_path <- exactTest(d_tagwise, pair=c('Mbuti', 'Pathan')) #et_san_moz <- exactTest(d_tagwise, pair=c('San', 'Mozabite')) et_path_maya <- exactTest(d_tagwise, pair=c('Pathan', 'Maya')) et_moz_yak <- exactTest(d_tagwise, pair=c('Mozabite', 'Yakut')) et_mbu_cam <- exactTest(d_tagwise, pair=c('Mbuti', 'Cambodia')) #et_san_path <- exactTest(d_tagwise, pair=c('San', 'Pathan')) et_moz_maya <- exactTest(d_tagwise, pair=c('Mozabite', 'Maya')) et_mbu_yak <- exactTest(d_tagwise, pair=c('Mbuti', 'Yakut')) #et_san_cam <- exactTest(d_tagwise, pair=c('San', 'Cambodia')) et_mbu_maya <- exactTest(d_tagwise, pair=c('Mbuti', 'Maya')) #et_san_yak <- exactTest(d_tagwise, pair=c('San', 'Yakut')) #et_san_maya <- exactTest(d_tagwise, pair=c('San', 'Maya')) subset(et_yak_maya$table, et_yak_maya$table[,3] < bonferoni) subset(et_cam_yak$table, et_cam_yak$table[,3] < bonferoni) subset(et_path_cam$table, et_path_cam$table[,3] < bonferoni) subset(et_moz_path$table, et_moz_path$table[,3] < bonferoni) subset(et_mbu_moz$table, et_mbu_moz$table[,3] < bonferoni) subset(et_cam_maya$table, et_cam_maya$table[,3] < bonferoni) subset(et_path_yak$table, et_path_yak$table[,3] < bonferoni) subset(et_moz_cam$table, et_moz_cam$table[,3] < bonferoni) subset(et_mbu_path$table, et_mbu_path$table[,3] < bonferoni) subset(et_path_maya$table, et_path_maya$table[,3] < bonferoni) subset(et_moz_yak$table, et_moz_yak$table[,3] < bonferoni) subset(et_mbu_cam$table, et_mbu_cam$table[,3] < bonferoni) subset(et_moz_maya$table, et_moz_maya$table[,3] < bonferoni) subset(et_mbu_yak$table, et_mbu_yak$table[,3] < bonferoni) subset(et_mbu_maya$table, et_mbu_maya$table[,3] < bonferoni) [order(lrt.mbu_maya$table[,4]),] save.image('cqn_no_san.Rdata') topTags(lrt.maya_yakut) summary(decideTestsDGE(lrt.maya_yakut)) #correlate expression (spearman), mds, determine whether population structure exists in normalized expression data pdf('src_nonmetric_mds_expression_no_san.pdf') correlation <- cor(FPKM_cqn_hgdp[,1:41]) d <- dist(correlation) mds <- isoMDS(d, k=2) par(mar=c(7,5,7,7)) plot(mds$points[,1], mds$points[,2], xlab='Coordinate 1', ylab='Coordinate 2', main='Expression MDS', pch=21, cex=2, bg=colorPop[1:41], col="black", bty='n', cex.main=2, cex.lab=1.5, cex.axis=1.5) legend('topright', c('Maya', 'Yakut', 'Cambodian', 'Pathan', 'Mozabite', 'Mbuti'), col="black", pt.bg=unique(colorPop)[1:6], pch=21, bty='n') dev.off() cor.test(mds$points[,1], nhets[1:41], method='spearman')