#this file has ase site calls from the exome data and has the correct individual IDs, has ref_ratios outside 0.4-0.6 removed, and has unique IDs for RSID ase <- read.csv('/HGDP_RNAseq/scripts/allele_specific_expression/supplemental_script_input_files/ase_total30.csv', header=T) ase <- read.csv('/HGDP_RNAseq/scripts/allele_specific_expression/supplemental_script_input_files/ase_total30_sample30.csv', header=T) #some odd processing to deal with issues reading in the file #new_col <- c(colnames(ase)[2:21]) #ase <- ase[,1:20] #colnames(ase) <- new_col #Determine # of ASE sites shared between any two individuals pop <- c('Pathan','Pathan','Pathan','Pathan','Pathan','Pathan','Pathan','Mbuti_Pygmy','Mbuti_Pygmy','Mbuti_Pygmy','Mbuti_Pygmy','Mbuti_Pygmy','Mbuti_Pygmy','Cambodian','Cambodian','Cambodian','Cambodian','Cambodian','Cambodian','Cambodian','Maya','Maya','Maya','Maya','Maya','Maya','Maya','Yakut','Yakut','Yakut','Yakut','Yakut','Yakut','San','San','San','San','Mbuti_Pygmy','Mozabite','Mozabite','Mozabite','Mozabite','Mozabite','Mozabite','Mozabite') ids <- c('HGDP00213','HGDP00222','HGDP00232','HGDP00237','HGDP00239','HGDP00247','HGDP00258','HGDP00449','HGDP00456','HGDP00462','HGDP00467','HGDP00471','HGDP00476','HGDP00711','HGDP00712','HGDP00713','HGDP00715','HGDP00716','HGDP00720','HGDP00721','HGDP00854','HGDP00855','HGDP00856','HGDP00857','HGDP00858','HGDP00868','HGDP00877','HGDP00948','HGDP00950','HGDP00955','HGDP00959','HGDP00964','HGDP00967','HGDP00987','HGDP00992','HGDP01029','HGDP01036','HGDP01081','HGDP01258','HGDP01259','HGDP01262','HGDP01264','HGDP01274','HGDP01275','HGDP01277') nhets<-c(1914989,1939435,1901192,1893994,1894682,1787932,1836691,1926280,2272609,2317831,2398421,2338970,2255041,1754825,1693280,1798134,1787883,1751286,1809665,1718554,1592701,1514683,1608281,1459374,1562783,1703872,1667675,1739368,1728192,1697423,1643000,1737060,1713637,2301549,2534965,2494405,2450468,2347218,1797353,1959229,2079354,2034194,1947736,2063233,2012364) sig_ase <- subset(ase, ase$sample30_p <= 0.05) insig_ase <- subset(ase, ase$sample30_p> 0.05) plot(density(insig_ase$ref_count/insig_ase$total_count), col='red', xlim=c(0,1), main='Ref ratio', bty='n') lines(density(new_sig_ase$ref_count/new_sig_ase$total_count)) legend('topleft', c('No significant ASE', 'Significant ASE'), col=c('red', 'black'), lty='solid', bty='n') pdf('ase_sig_insig_dist.pdf') par(mar=c(5,5,3,3)) hist(insig_ase$ref_count/insig_ase$total_count,xlim=c(0,1), col=rgb(0, 0, 1,0.5), main='Reference Ratio', xlab='Reference ratio', cex.main=2, cex.lab=1.5, cex.axis=1.5) hist(sig_ase$ref_count/sig_ase$total_count,col= rgb(1, 0, 0,0.5),add=T) legend('topleft', c('No significant ASE', 'Significant ASE'), col=c(rgb(0, 0, 1,0.5), rgb(1, 0, 0,0.5)), lty='solid', bty='n', lwd=4) dev.off() insig = hist(log2(insig_ase$total_count), plot=F) sig = hist(log2(sig_ase$total_count), plot=F) xmin = min(insig$breaks, sig$breaks) xmax = max(insig$breaks, sig$breaks) pdf('ase_sig_insig_coverage.pdf') par(mar=c(5,5,3,3)) hist(log2(insig_ase$total_count), col=rgb(0, 0, 1,0.5), xlim=c(xmin, xmax), main='Coverage', xlab=expression(log[2](coverage)), cex.main=2, cex.lab=1.5, cex.axis=1.5) hist(log2(sig_ase$total_count),col= rgb(1, 0, 0,0.5),add=T) legend('topright', c('No significant ASE', 'Significant ASE'), col=c(rgb(0, 0, 1,0.5), rgb(1, 0, 0,0.5)), lty='solid', bty='n', lwd=4) dev.off() #Filter all ASE sites: >= 30 reads, >=2% ref, >=2% nonref, p=0.05 newsig_ase <- subset(ase, ase$sample30_p <= 0.05 & ase$ref_count >= .02 * ase$total_count & ase$nonref_count >= .02 * ase$total_count) newinsig_ase <- subset(ase, ase$sample30_p > 0.05 & ase$ref_count >= .02 * ase$total_count & ase$nonref_count >= .02 * ase$total_count) pdf('ase_newsig_insig_dist.pdf') par(mar=c(5,5,3,3)) hist(insig_ase$ref_count/insig_ase$total_count,xlim=c(0,1), col=rgb(0, 0, 1,0.5), main='Reference Ratio', xlab='Reference ratio', cex.main=2, cex.lab=1.5, cex.axis=1.5) hist(newsig_ase$ref_count/newsig_ase$total_count,col= rgb(1, 0, 0,0.5),add=T) legend('topleft', c('No significant ASE', 'Significant ASE'), col=c(rgb(0, 0, 1,0.5), rgb(1, 0, 0,0.5)), lty='solid', bty='n', lwd=4) dev.off() #look at all measured sites to know whether there's more ASE sharing than expected based on heterozygosity alone maya_bkgd <- subset(ase, newinsig_ase$individual=="HGDP00854" | newinsig_ase$individual=="HGDP00855" | newinsig_ase$individual=="HGDP00856" | newinsig_ase$individual=="HGDP00857" | newinsig_ase$individual=="HGDP00858" | newinsig_ase$individual=="HGDP00868" | newinsig_ase$individual=="HGDP00877") yak_bkgd <- subset(ase, newinsig_ase$individual=="HGDP00948"| newinsig_ase$individual=="HGDP00950"| newinsig_ase$individual=="HGDP00955"| newinsig_ase$individual=="HGDP00959"| newinsig_ase$individual=="HGDP00964"| newinsig_ase$individual=="HGDP00967") cam_bkgd <- subset(ase, newinsig_ase$individual=="HGDP00711"| newinsig_ase$individual=="HGDP00712"| newinsig_ase$individual=="HGDP00713"| newinsig_ase$individual=="HGDP00715"| newinsig_ase$individual=="HGDP00716"| newinsig_ase$individual=="HGDP00720"| newinsig_ase$individual=="HGDP00721") path_bkgd <- subset(ase, newinsig_ase$individual=="HGDP00213"| newinsig_ase$individual=="HGDP00222"| newinsig_ase$individual=="HGDP00232"| newinsig_ase$individual=="HGDP00237"| newinsig_ase$individual=="HGDP00239"| newinsig_ase$individual=="HGDP00247"| newinsig_ase$individual=="HGDP00258") moz_bkgd <- subset(ase, newinsig_ase$individual=="HGDP01258"| newinsig_ase$individual=="HGDP01259"| newinsig_ase$individual=="HGDP01262"| newinsig_ase$individual=="HGDP01264"| newinsig_ase$individual=="HGDP01274"| newinsig_ase$individual=="HGDP01275"| newinsig_ase$individual=="HGDP01277") mbu_bkgd <- subset(ase, newinsig_ase$individual=="HGDP00449"| newinsig_ase$individual=="HGDP00456"| newinsig_ase$individual=="HGDP00462"| newinsig_ase$individual=="HGDP00467"| newinsig_ase$individual=="HGDP00471"| newinsig_ase$individual=="HGDP00476"| newinsig_ase$individual=="HGDP01081") #Filter by pop for significant ASE (p=0.05) with total counts ³30 and at least 2% of non_ref and ref reads maya_filt <- subset(newsig_ase, newsig_ase$individual=="HGDP00854" | newsig_ase$individual=="HGDP00855" | newsig_ase$individual=="HGDP00856" | newsig_ase$individual=="HGDP00857" | newsig_ase$individual=="HGDP00858" | newsig_ase$individual=="HGDP00868" | newsig_ase$individual=="HGDP00877") yak_filt <- subset(newsig_ase, newsig_ase$individual=="HGDP00948"| newsig_ase$individual=="HGDP00950"| newsig_ase$individual=="HGDP00955"| newsig_ase$individual=="HGDP00959"| newsig_ase$individual=="HGDP00964"| newsig_ase$individual=="HGDP00967") cam_filt <- subset(newsig_ase, newsig_ase$individual=="HGDP00711"| newsig_ase$individual=="HGDP00712"| newsig_ase$individual=="HGDP00713"| newsig_ase$individual=="HGDP00715"| newsig_ase$individual=="HGDP00716"| newsig_ase$individual=="HGDP00720"| newsig_ase$individual=="HGDP00721") path_filt <- subset(newsig_ase, newsig_ase$individual=="HGDP00213"| newsig_ase$individual=="HGDP00222"| newsig_ase$individual=="HGDP00232"| newsig_ase$individual=="HGDP00237"| newsig_ase$individual=="HGDP00239"| newsig_ase$individual=="HGDP00247"| newsig_ase$individual=="HGDP00258") moz_filt <- subset(newsig_ase, newsig_ase$individual=="HGDP01258"| newsig_ase$individual=="HGDP01259"| newsig_ase$individual=="HGDP01262"| newsig_ase$individual=="HGDP01264"| newsig_ase$individual=="HGDP01274"| newsig_ase$individual=="HGDP01275"| newsig_ase$individual=="HGDP01277") mbu_filt <- subset(newsig_ase, newsig_ase$individual=="HGDP00449"| newsig_ase$individual=="HGDP00456"| newsig_ase$individual=="HGDP00462"| newsig_ase$individual=="HGDP00467"| newsig_ase$individual=="HGDP00471"| newsig_ase$individual=="HGDP00476"| newsig_ase$individual=="HGDP01081") ase_sites_seen = function(ind_filt, ind_bkgd) { filt=c() bkgd=c() for(i in 1:length(unique(ind_filt$individual))) { filt <- c(filt, (dim(subset(ind_filt, ind_filt$individual == unique(ind_filt$individual)[i]))[1])) bkgd <- c(bkgd, (dim(subset(ind_bkgd, ind_bkgd$individual == unique(ind_bkgd$individual)[i]))[1])) } num = data.frame(filt = filt, bkgd = bkgd) return(num) } ase_sites_seen2 = function(ind_filt, ind_bkgd) { filt=c() bkgd=c() for(i in 1:length(unique(ind_filt$individual))) { for(j in 1:length(unique(ind_filt$individual))) { if(i > j) { ind1 <- subset(ind_filt, ind_filt$individual == unique(ind_filt$individual)[i]) ind2 <- subset(ind_filt, ind_filt$individual == unique(ind_filt$individual)[j]) #find sites in both filt <- c(filt, sum(ind2$RSID %in% ind1$RSID)) ind1 <- subset(ind_bkgd, ind_bkgd$individual == unique(ind_bkgd$individual)[i]) ind2 <- subset(ind_bkgd, ind_bkgd$individual == unique(ind_bkgd$individual)[j]) #find sites in both bkgd <- c(bkgd, sum(ind2$RSID %in% ind1$RSID)) } } } num = data.frame(filt = filt, bkgd = bkgd) return(num) } ase_sites_seen3 = function(ind_filt, ind_bkgd) { filt=c() bkgd=c() for(i in 1:length(unique(ind_filt$individual))) { for(j in 1:length(unique(ind_filt$individual))) { for(k in 1:length(unique(ind_filt$individual))) { if(i > j & j > k) { ind1 <- subset(ind_filt, ind_filt$individual == unique(ind_filt$individual)[i]) ind2 <- subset(ind_filt, ind_filt$individual == unique(ind_filt$individual)[j]) ind3 <- subset(ind_filt, ind_filt$individual == unique(ind_filt$individual)[k]) #find sites in both filt <- c(filt, sum(ind2$RSID %in% ind1$RSID & ind3$RSID %in% ind2$RSID)) ind1 <- subset(ind_bkgd, ind_bkgd$individual == unique(ind_bkgd$individual)[i]) ind2 <- subset(ind_bkgd, ind_bkgd$individual == unique(ind_bkgd$individual)[j]) ind3 <- subset(ind_bkgd, ind_bkgd$individual == unique(ind_bkgd$individual)[k]) #find sites in both bkgd <- c(bkgd, sum(ind2$RSID %in% ind1$RSID & ind3$RSID %in% ind2$RSID)) } } } } num = data.frame(filt = filt, bkgd = bkgd) return(num) } ase_sites_seen4 = function(ind_filt, ind_bkgd) { filt=c() bkgd=c() for(i in 1:length(unique(ind_filt$individual))) { for(j in 1:length(unique(ind_filt$individual))) { for(k in 1:length(unique(ind_filt$individual))) { for(l in 1:length(unique(ind_filt$individual))) { if(i > j & j > k & k > l) { ind1 <- subset(ind_filt, ind_filt$individual == unique(ind_filt$individual)[i]) ind2 <- subset(ind_filt, ind_filt$individual == unique(ind_filt$individual)[j]) ind3 <- subset(ind_filt, ind_filt$individual == unique(ind_filt$individual)[k]) ind4 <- subset(ind_filt, ind_filt$individual == unique(ind_filt$individual)[l]) #find sites in both filt <- c(filt, sum(ind2$RSID %in% ind1$RSID & ind3$RSID %in% ind2$RSID & ind4$RSID %in% ind3$RSID)) ind1 <- subset(ind_bkgd, ind_bkgd$individual == unique(ind_bkgd$individual)[i]) ind2 <- subset(ind_bkgd, ind_bkgd$individual == unique(ind_bkgd$individual)[j]) ind3 <- subset(ind_bkgd, ind_bkgd$individual == unique(ind_bkgd$individual)[k]) ind4 <- subset(ind_bkgd, ind_bkgd$individual == unique(ind_bkgd$individual)[l]) #find sites in both bkgd <- c(bkgd, sum(ind2$RSID %in% ind1$RSID & ind3$RSID %in% ind2$RSID & ind4$RSID %in% ind3$RSID)) } } } } } num = data.frame(filt = filt, bkgd = bkgd) return(num) } ase_sites_seen5 = function(ind_filt, ind_bkgd) { filt=c() bkgd=c() for(i in 1:length(unique(ind_filt$individual))) { for(j in 1:length(unique(ind_filt$individual))) { for(k in 1:length(unique(ind_filt$individual))) { for(l in 1:length(unique(ind_filt$individual))) { for(m in 1:length(unique(ind_filt$individual))) { if(i > j & j > k & k > l & l > m) { ind1 <- subset(ind_filt, ind_filt$individual == unique(ind_filt$individual)[i]) ind2 <- subset(ind_filt, ind_filt$individual == unique(ind_filt$individual)[j]) ind3 <- subset(ind_filt, ind_filt$individual == unique(ind_filt$individual)[k]) ind4 <- subset(ind_filt, ind_filt$individual == unique(ind_filt$individual)[l]) ind5 <- subset(ind_filt, ind_filt$individual == unique(ind_filt$individual)[m]) #find sites in both filt <- c(filt, sum(ind2$RSID %in% ind1$RSID & ind3$RSID %in% ind2$RSID & ind4$RSID %in% ind3$RSID & ind5$RSID %in% ind4$RSID)) ind1 <- subset(ind_bkgd, ind_bkgd$individual == unique(ind_bkgd$individual)[i]) ind2 <- subset(ind_bkgd, ind_bkgd$individual == unique(ind_bkgd$individual)[j]) ind3 <- subset(ind_bkgd, ind_bkgd$individual == unique(ind_bkgd$individual)[k]) ind4 <- subset(ind_bkgd, ind_bkgd$individual == unique(ind_bkgd$individual)[l]) ind5 <- subset(ind_bkgd, ind_bkgd$individual == unique(ind_bkgd$individual)[m]) #find sites in both bkgd <- c(bkgd, sum(ind2$RSID %in% ind1$RSID & ind3$RSID %in% ind2$RSID & ind4$RSID %in% ind3$RSID & ind5$RSID %in% ind4$RSID)) } } } } } } num = data.frame(filt = filt, bkgd = bkgd) return(num) } ase_sites_seen6 = function(ind_filt, ind_bkgd) { filt=c() bkgd=c() for(i in 1:length(unique(ind_filt$individual))) { for(j in 1:length(unique(ind_filt$individual))) { for(k in 1:length(unique(ind_filt$individual))) { for(l in 1:length(unique(ind_filt$individual))) { for(m in 1:length(unique(ind_filt$individual))) { for(n in 1:length(unique(ind_filt$individual))) { if(i > j & j > k & k > l & l > m & m > n) { ind1 <- subset(ind_filt, ind_filt$individual == unique(ind_filt$individual)[i]) ind2 <- subset(ind_filt, ind_filt$individual == unique(ind_filt$individual)[j]) ind3 <- subset(ind_filt, ind_filt$individual == unique(ind_filt$individual)[k]) ind4 <- subset(ind_filt, ind_filt$individual == unique(ind_filt$individual)[l]) ind5 <- subset(ind_filt, ind_filt$individual == unique(ind_filt$individual)[m]) ind6 <- subset(ind_filt, ind_filt$individual == unique(ind_filt$individual)[n]) #find sites in both filt <- c(filt, sum(ind2$RSID %in% ind1$RSID & ind3$RSID %in% ind2$RSID & ind4$RSID %in% ind3$RSID & ind5$RSID %in% ind4$RSID & ind5$RSID %in% ind6$RSID)) ind1 <- subset(ind_bkgd, ind_bkgd$individual == unique(ind_bkgd$individual)[i]) ind2 <- subset(ind_bkgd, ind_bkgd$individual == unique(ind_bkgd$individual)[j]) ind3 <- subset(ind_bkgd, ind_bkgd$individual == unique(ind_bkgd$individual)[k]) ind4 <- subset(ind_bkgd, ind_bkgd$individual == unique(ind_bkgd$individual)[l]) ind5 <- subset(ind_bkgd, ind_bkgd$individual == unique(ind_bkgd$individual)[m]) ind6 <- subset(ind_bkgd, ind_bkgd$individual == unique(ind_bkgd$individual)[n]) #find sites in both bkgd <- c(bkgd, sum(ind2$RSID %in% ind1$RSID & ind3$RSID %in% ind2$RSID & ind4$RSID %in% ind3$RSID & ind5$RSID %in% ind4$RSID & ind5$RSID %in% ind6$RSID)) } } } } } } } num = data.frame(filt = filt, bkgd = bkgd) return(num) } ase_sites_seen7 = function(ind_filt, ind_bkgd) { filt=c() bkgd=c() for(i in 1:length(unique(ind_filt$individual))) { for(j in 1:length(unique(ind_filt$individual))) { for(k in 1:length(unique(ind_filt$individual))) { for(l in 1:length(unique(ind_filt$individual))) { for(m in 1:length(unique(ind_filt$individual))) { for(n in 1:length(unique(ind_filt$individual))) { for(o in 1:length(unique(ind_filt$individual))) { if(i > j & j > k & k > l & l > m & m > n & n > o) { ind1 <- subset(ind_filt, ind_filt$individual == unique(ind_filt$individual)[i]) ind2 <- subset(ind_filt, ind_filt$individual == unique(ind_filt$individual)[j]) ind3 <- subset(ind_filt, ind_filt$individual == unique(ind_filt$individual)[k]) ind4 <- subset(ind_filt, ind_filt$individual == unique(ind_filt$individual)[l]) ind5 <- subset(ind_filt, ind_filt$individual == unique(ind_filt$individual)[m]) ind6 <- subset(ind_filt, ind_filt$individual == unique(ind_filt$individual)[n]) ind7 <- subset(ind_filt, ind_filt$individual == unique(ind_filt$individual)[o]) #find sites in both filt <- c(filt, sum(ind2$RSID %in% ind1$RSID & ind3$RSID %in% ind2$RSID & ind4$RSID %in% ind3$RSID & ind5$RSID %in% ind4$RSID & ind5$RSID %in% ind6$RSID & ind7$RSID %in% ind6$RSID)) ind1 <- subset(ind_bkgd, ind_bkgd$individual == unique(ind_bkgd$individual)[i]) ind2 <- subset(ind_bkgd, ind_bkgd$individual == unique(ind_bkgd$individual)[j]) ind3 <- subset(ind_bkgd, ind_bkgd$individual == unique(ind_bkgd$individual)[k]) ind4 <- subset(ind_bkgd, ind_bkgd$individual == unique(ind_bkgd$individual)[l]) ind5 <- subset(ind_bkgd, ind_bkgd$individual == unique(ind_bkgd$individual)[m]) ind6 <- subset(ind_bkgd, ind_bkgd$individual == unique(ind_bkgd$individual)[n]) ind7 <- subset(ind_bkgd, ind_bkgd$individual == unique(ind_bkgd$individual)[o]) #find sites in both bkgd <- c(bkgd, sum(ind2$RSID %in% ind1$RSID & ind3$RSID %in% ind2$RSID & ind4$RSID %in% ind3$RSID & ind5$RSID %in% ind4$RSID & ind5$RSID %in% ind6$RSID & ind7$RSID %in% ind6$RSID)) } } } } } } } } num = data.frame(filt = filt, bkgd = bkgd) return(num) } maya <- ase_sites_seen(maya_filt, maya_bkgd)$filt/ase_sites_seen(maya_filt, maya_bkgd)$bkgd yakut <- ase_sites_seen(yak_filt, yak_bkgd)$filt/ase_sites_seen(yak_filt, yak_bkgd)$bkgd cam <- ase_sites_seen(cam_filt, cam_bkgd)$filt/ase_sites_seen(cam_filt, cam_bkgd)$bkgd path <- ase_sites_seen(path_filt, path_bkgd)$filt/ase_sites_seen(path_filt, path_bkgd)$bkgd moz <- ase_sites_seen(moz_filt, moz_bkgd)$filt/ase_sites_seen(moz_filt, moz_bkgd)$bkgd mbu <- ase_sites_seen(mbu_filt, mbu_bkgd)$filt/ase_sites_seen(mbu_filt, mbu_bkgd)$bkgd maya <- c(maya, ase_sites_seen2(maya_filt, maya_bkgd)$filt/ase_sites_seen2(maya_filt, maya_bkgd)$bkgd) yakut <- c(yakut, ase_sites_seen2(yak_filt, yak_bkgd)$filt/ase_sites_seen2(yak_filt, yak_bkgd)$bkgd) cam <- c(cam, ase_sites_seen2(cam_filt, cam_bkgd)$filt/ase_sites_seen2(cam_filt, cam_bkgd)$bkgd) path <- c(path, ase_sites_seen2(path_filt, path_bkgd)$filt/ase_sites_seen2(path_filt, path_bkgd)$bkgd) moz <- c(moz, ase_sites_seen2(moz_filt, moz_bkgd)$filt/ase_sites_seen2(moz_filt, moz_bkgd)$bkgd) mbu <- c(mbu, ase_sites_seen2(mbu_filt, mbu_bkgd)$filt/ase_sites_seen2(mbu_filt, mbu_bkgd)$bkgd) maya <- c(maya, ase_sites_seen3(maya_filt, maya_bkgd)$filt/ase_sites_seen3(maya_filt, maya_bkgd)$bkgd) yakut <- c(yakut, ase_sites_seen3(yak_filt, yak_bkgd)$filt/ase_sites_seen3(yak_filt, yak_bkgd)$bkgd) cam <- c(cam, ase_sites_seen3(cam_filt, cam_bkgd)$filt/ase_sites_seen3(cam_filt, cam_bkgd)$bkgd) path <- c(path, ase_sites_seen3(path_filt, path_bkgd)$filt/ase_sites_seen3(path_filt, path_bkgd)$bkgd) moz <- c(moz, ase_sites_seen3(moz_filt, moz_bkgd)$filt/ase_sites_seen3(moz_filt, moz_bkgd)$bkgd) mbu <- c(mbu, ase_sites_seen3(mbu_filt, mbu_bkgd)$filt/ase_sites_seen3(mbu_filt, mbu_bkgd)$bkgd) maya <- c(maya, ase_sites_seen4(maya_filt, maya_bkgd)$filt/ase_sites_seen4(maya_filt, maya_bkgd)$bkgd) yakut <- c(yakut, ase_sites_seen4(yak_filt, yak_bkgd)$filt/ase_sites_seen4(yak_filt, yak_bkgd)$bkgd) cam <- c(cam, ase_sites_seen4(cam_filt, cam_bkgd)$filt/ase_sites_seen4(cam_filt, cam_bkgd)$bkgd) path <- c(path, ase_sites_seen4(path_filt, path_bkgd)$filt/ase_sites_seen4(path_filt, path_bkgd)$bkgd) moz <- c(moz, ase_sites_seen4(moz_filt, moz_bkgd)$filt/ase_sites_seen4(moz_filt, moz_bkgd)$bkgd) mbu <- c(mbu, ase_sites_seen4(mbu_filt, mbu_bkgd)$filt/ase_sites_seen4(mbu_filt, mbu_bkgd)$bkgd) maya <- c(maya, ase_sites_seen5(maya_filt, maya_bkgd)$filt/ase_sites_seen5(maya_filt, maya_bkgd)$bkgd) yakut <- c(yakut, ase_sites_seen5(yak_filt, yak_bkgd)$filt/ase_sites_seen5(yak_filt, yak_bkgd)$bkgd) cam <- c(cam, ase_sites_seen5(cam_filt, cam_bkgd)$filt/ase_sites_seen5(cam_filt, cam_bkgd)$bkgd) path <- c(path, ase_sites_seen5(path_filt, path_bkgd)$filt/ase_sites_seen5(path_filt, path_bkgd)$bkgd) moz <- c(moz, ase_sites_seen5(moz_filt, moz_bkgd)$filt/ase_sites_seen5(moz_filt, moz_bkgd)$bkgd) mbu <- c(mbu, ase_sites_seen5(mbu_filt, mbu_bkgd)$filt/ase_sites_seen5(mbu_filt, mbu_bkgd)$bkgd) maya <- c(maya, ase_sites_seen6(maya_filt, maya_bkgd)$filt/ase_sites_seen6(maya_filt, maya_bkgd)$bkgd) yakut <- c(yakut, ase_sites_seen6(yak_filt, yak_bkgd)$filt/ase_sites_seen6(yak_filt, yak_bkgd)$bkgd) cam <- c(cam, ase_sites_seen6(cam_filt, cam_bkgd)$filt/ase_sites_seen6(cam_filt, cam_bkgd)$bkgd) path <- c(path, ase_sites_seen6(path_filt, path_bkgd)$filt/ase_sites_seen6(path_filt, path_bkgd)$bkgd) moz <- c(moz, ase_sites_seen6(moz_filt, moz_bkgd)$filt/ase_sites_seen6(moz_filt, moz_bkgd)$bkgd) mbu <- c(mbu, ase_sites_seen6(mbu_filt, mbu_bkgd)$filt/ase_sites_seen6(mbu_filt, mbu_bkgd)$bkgd) maya <- c(maya, ase_sites_seen7(maya_filt, maya_bkgd)$filt/ase_sites_seen7(maya_filt, maya_bkgd)$bkgd) cam <- c(cam, ase_sites_seen7(cam_filt, cam_bkgd)$filt/ase_sites_seen7(cam_filt, cam_bkgd)$bkgd) path <- c(path, ase_sites_seen7(path_filt, path_bkgd)$filt/ase_sites_seen7(path_filt, path_bkgd)$bkgd) moz <- c(moz, ase_sites_seen7(moz_filt, moz_bkgd)$filt/ase_sites_seen7(moz_filt, moz_bkgd)$bkgd) mbu <- c(mbu, ase_sites_seen7(mbu_filt, mbu_bkgd)$filt/ase_sites_seen7(mbu_filt, mbu_bkgd)$bkgd) sevens_num <- c(rep(1, choose(7,1)), rep(2, choose(7, 2)), rep(3, choose(7,3)), rep(4, choose(7, 4)), rep(5, choose(7, 5)), rep(6, choose(7,6)), rep(7, choose(7,7))) six_num <- c(rep(1, choose(6,1)), rep(2, choose(6, 2)), rep(3, choose(6,3)), rep(4, choose(6, 4)), rep(5, choose(6, 5)), rep(6, choose(6,6))) brewerVector=rev(c(colors()[c(466)],"blue","turquoise","darkgreen","yellow","orange","red")) maya.lo <- loess(maya~sevens_num) maya.pred <- predict(maya.lo, se=T) yakut.lo <- loess(yakut~six_num) yakut.pred <- predict(yakut.lo, se=T) cam.lo <- loess(cam~sevens_num) cam.pred <- predict(cam.lo, se=T) path.lo <- loess(path~sevens_num) path.pred <- predict(path.lo, se=T) moz.lo <- loess(moz~sevens_num) moz.pred <- predict(moz.lo, se=T) mbu.lo <- loess(mbu~sevens_num) mbu.pred <- predict(mbu.lo, se=T) maya_num <- sevens_num[!is.na(maya)] yakut_num <- six_num[!is.na(yakut)] cam_num <- sevens_num[!is.na(cam)] path_num <- sevens_num[!is.na(path)] moz_num <- sevens_num[!is.na(moz)] mbu_num <- sevens_num[!is.na(mbu)] pdf('ase_sharing_percent_within_sample30.pdf') par(mar=c(5,5,3,3)) plot(maya_num, maya[!is.na(maya)], ylab='Percent significant/measured ASE sites shared', xlab='Number of individuals', main='ASE sharing in a population', col=brewerVector[1], bty='n', ylim=c(0, max(maya[!is.na(maya)], yakut[!is.na(yakut)], cam[!is.na(cam)], path[!is.na(path)], moz[!is.na(moz)], mbu[!is.na(mbu)])), cex.main=2, cex.axis=1.5, cex.lab=1.5) lines(maya_num, maya.pred$fit, col=brewerVector[1], lwd=2) points(yakut_num, yakut[!is.na(yakut)], col=brewerVector[2]) lines(yakut_num, yakut.pred$fit, col=brewerVector[2], lwd=2) points(cam_num, cam[!is.na(cam)], col=brewerVector[3]) lines(cam_num, cam.pred$fit, col=brewerVector[3], lwd=2) points(path_num, path[!is.na(path)], col=brewerVector[4]) lines(path_num, path.pred$fit, col=brewerVector[4], lwd=2) points(moz_num, moz[!is.na(moz)], col=brewerVector[5]) lines(moz_num, moz.pred$fit, col=brewerVector[5], lwd=2) points(mbu_num, mbu[!is.na(mbu)], col=brewerVector[6]) lines(mbu_num, mbu.pred$fit, col=brewerVector[6], lwd=2) legend('topright', c('Maya', 'Yakut', 'Cambodian', 'Pathan', 'Mozabite', 'Mbuti'), lty='solid', lwd=2, col=brewerVector, bty='n') dev.off() path <- c('HGDP00213','HGDP00222','HGDP00232','HGDP00237','HGDP00239','HGDP00247','HGDP00258') mbu <- c('HGDP00449','HGDP00456','HGDP00462','HGDP00467','HGDP00471','HGDP00476', 'HGDP01081') cam <- c('HGDP00711','HGDP00712','HGDP00713','HGDP00715','HGDP00716','HGDP00720','HGDP00721') maya <- c('HGDP00854','HGDP00855','HGDP00856','HGDP00857','HGDP00858','HGDP00868','HGDP00877') yak <- c('HGDP00948','HGDP00950','HGDP00955','HGDP00959','HGDP00964','HGDP00967') moz <- c('HGDP01258','HGDP01259','HGDP01262','HGDP01264','HGDP01274','HGDP01275','HGDP01277') # # #ase_sites_seen2 = function(ind_filt) { # num=c() # for(i in 1:length(unique(ind_filt$individual))) { # for(j in 1:length(unique(ind_filt$individual))) { # if(i > j) { # ind1 <- subset(ind_filt, ind_filt$individual == unique(ind_filt$individual)[i]) # ind2 <- subset(ind_filt, ind_filt$individual == unique(ind_filt$individual)[j]) # #find sites in both # num <- c(num, sum(ind2$RSID %in% ind1$RSID)) # } # } # } # return(num) #} #get significant number of ASE sites per individual ase_ind <- unique(newsig_ase$individual) may_yak <- c() may_cam <- c() may_pat <- c() may_moz <- c() may_mbu <- c() yak_cam <- c() yak_pat <- c() yak_moz <- c() yak_mbu <- c() cam_pat <- c() cam_moz <- c() cam_mbu <- c() pat_moz <- c() pat_mbu <- c() moz_mbu <- c() may_may <- c() yak_yak <- c() cam_cam <- c() pat_pat <- c() moz_moz <- c() mbu_mbu <- c() ase_sites_seen2 = function(ind_filt, ind_bkgd) { filt=c() bkgd=c() for(i in 1:length(unique(ind_filt$individual))) { for(j in 1:length(unique(ind_filt$individual))) { if(i > j) { ind1 <- subset(ind_filt, ind_filt$individual == unique(ind_filt$individual)[i]) ind2 <- subset(ind_filt, ind_filt$individual == unique(ind_filt$individual)[j]) #find sites in both filt <- c(filt, sum(ind2$RSID %in% ind1$RSID)) ind1 <- subset(ind_bkgd, ind_bkgd$individual == unique(ind_bkgd$individual)[i]) ind2 <- subset(ind_bkgd, ind_bkgd$individual == unique(ind_bkgd$individual)[j]) #find sites in both bkgd <- c(bkgd, sum(ind2$RSID %in% ind1$RSID)) } } } num = filt/bkgd return(num) } for(i in 1:100) { may_sample <- sample(maya, 1) yak_sample <- sample(yak, 1) cam_sample <- sample(cam, 1) pat_sample <- sample(path, 1) moz_sample <- sample(moz, 1) mbu_sample <- sample(mbu, 1) may_yak <- c(may_yak, ase_sites_seen2(subset(newsig_ase, newsig_ase$individual==may_sample | newsig_ase$individual==yak_sample), subset(newinsig_ase, newinsig_ase$individual==may_sample | ase$individual==yak_sample))) may_cam <- c(may_cam, ase_sites_seen2(subset(newsig_ase, newsig_ase$individual==may_sample | newsig_ase$individual==cam_sample), subset(newinsig_ase, newinsig_ase$individual==may_sample | ase$individual==cam_sample))) may_pat <- c(may_pat, ase_sites_seen2(subset(newsig_ase, newsig_ase$individual==may_sample | newsig_ase$individual==pat_sample), subset(newinsig_ase, newinsig_ase$individual==may_sample | ase$individual==pat_sample))) may_moz <- c(may_moz, ase_sites_seen2(subset(newsig_ase, newsig_ase$individual==may_sample | newsig_ase$individual==moz_sample), subset(newinsig_ase, newinsig_ase$individual==may_sample | ase$individual==moz_sample))) may_mbu <- c(may_mbu, ase_sites_seen2(subset(newsig_ase, newsig_ase$individual==may_sample | newsig_ase$individual==mbu_sample), subset(newinsig_ase, newinsig_ase$individual==may_sample | ase$individual==mbu_sample))) yak_cam <- c(yak_cam, ase_sites_seen2(subset(newsig_ase, newsig_ase$individual==yak_sample | newsig_ase$individual==cam_sample), subset(newinsig_ase, newinsig_ase$individual==yak_sample | ase$individual==cam_sample))) yak_pat <- c(yak_pat, ase_sites_seen2(subset(newsig_ase, newsig_ase$individual==yak_sample | newsig_ase$individual==pat_sample), subset(newinsig_ase, newinsig_ase$individual==yak_sample | ase$individual==pat_sample))) yak_moz <- c(yak_moz, ase_sites_seen2(subset(newsig_ase, newsig_ase$individual==yak_sample | newsig_ase$individual==moz_sample), subset(newinsig_ase, newinsig_ase$individual==yak_sample | ase$individual==moz_sample))) yak_mbu <- c(yak_mbu, ase_sites_seen2(subset(newsig_ase, newsig_ase$individual==yak_sample | newsig_ase$individual==mbu_sample), subset(newinsig_ase, newinsig_ase$individual==yak_sample | ase$individual==mbu_sample))) cam_pat <- c(cam_pat, ase_sites_seen2(subset(newsig_ase, newsig_ase$individual==cam_sample | newsig_ase$individual==pat_sample), subset(newinsig_ase, newinsig_ase$individual==cam_sample | ase$individual==pat_sample))) cam_moz <- c(cam_moz, ase_sites_seen2(subset(newsig_ase, newsig_ase$individual==cam_sample | newsig_ase$individual==moz_sample), subset(newinsig_ase, newinsig_ase$individual==cam_sample | ase$individual==moz_sample))) cam_mbu <- c(cam_mbu, ase_sites_seen2(subset(newsig_ase, newsig_ase$individual==cam_sample | newsig_ase$individual==mbu_sample), subset(newinsig_ase, newinsig_ase$individual==cam_sample | ase$individual==mbu_sample))) pat_moz <- c(pat_moz, ase_sites_seen2(subset(newsig_ase, newsig_ase$individual==pat_sample | newsig_ase$individual==moz_sample), subset(newinsig_ase, newinsig_ase$individual==pat_sample | ase$individual==moz_sample))) pat_mbu <- c(pat_mbu, ase_sites_seen2(subset(newsig_ase, newsig_ase$individual==pat_sample | newsig_ase$individual==mbu_sample), subset(newinsig_ase, newinsig_ase$individual==pat_sample | ase$individual==mbu_sample))) moz_mbu <- c(moz_mbu, ase_sites_seen2(subset(newsig_ase, newsig_ase$individual==moz_sample | newsig_ase$individual==mbu_sample), subset(newinsig_ase, newinsig_ase$individual==moz_sample | ase$individual==mbu_sample))) may.samp <- sample(maya, 2) may_may <- c(may_may, ase_sites_seen2(subset(newsig_ase, newsig_ase$individual==may.samp[1] | newsig_ase$individual==may.samp[2]), subset(newinsig_ase, newinsig_ase$individual==may.samp[1] | ase$individual==may.samp[2]))) yak.samp <- sample(yak, 2) yak_yak <- c(yak_yak, ase_sites_seen2(subset(newsig_ase, newsig_ase$individual==yak.samp[1] | newsig_ase$individual==yak.samp[2]), subset(newinsig_ase, newinsig_ase$individual==yak.samp[1] | ase$individual==yak.samp[2]))) cam.samp <- sample(cam, 2) cam_cam <- c(cam_cam, ase_sites_seen2(subset(newsig_ase, newsig_ase$individual==cam.samp[1] | newsig_ase$individual==cam.samp[2]), subset(newinsig_ase, newinsig_ase$individual==cam.samp[1] | ase$individual==cam.samp[2]))) pat.samp <- sample(path, 2) pat_pat <- c(pat_pat, ase_sites_seen2(subset(newsig_ase, newsig_ase$individual==pat.samp[1] | newsig_ase$individual==pat.samp[2]), subset(newinsig_ase, newinsig_ase$individual==pat.samp[1] | ase$individual==pat.samp[2]))) moz.samp <- sample(moz, 2) moz_moz <- c(moz_moz, ase_sites_seen2(subset(newsig_ase, newsig_ase$individual==moz.samp[1] | newsig_ase$individual==moz.samp[2]), subset(newinsig_ase, newinsig_ase$individual==moz.samp[1] | ase$individual==moz.samp[2]))) mbu.samp <- sample(mbu, 2) mbu_mbu <- c(mbu_mbu, ase_sites_seen2(subset(newsig_ase, newsig_ase$individual==mbu.samp[1] | newsig_ase$individual==mbu.samp[2]), subset(newinsig_ase, newinsig_ase$individual==mbu.samp[1] | ase$individual==mbu.samp[2]))) } may_yak_summ <- summary(may_yak[may_yak < 10 & !is.na(may_yak)]) may_cam_summ <- summary(may_cam[may_cam < 10 & !is.na(may_cam)]) may_pat_summ <- summary(may_pat[may_pat < 10 & !is.na(may_pat)]) may_moz_summ <- summary(may_moz[may_moz < 10 & !is.na(may_moz)]) may_mbu_summ <- summary(may_mbu[may_mbu < 10 & !is.na(may_mbu)]) yak_cam_summ <- summary(yak_cam[yak_cam < 10 & !is.na(yak_cam)]) yak_pat_summ <- summary(yak_pat[yak_pat < 10 & !is.na(yak_pat)]) yak_moz_summ <- summary(yak_moz[yak_moz < 10 & !is.na(yak_moz)]) yak_mbu_summ <- summary(yak_mbu[yak_mbu < 10 & !is.na(yak_mbu)]) cam_pat_summ <- summary(cam_pat[cam_pat < 10 & !is.na(cam_pat)]) cam_moz_summ <- summary(cam_moz[cam_moz < 10 & !is.na(cam_moz)]) cam_mbu_summ <- summary(cam_mbu[cam_mbu < 10 & !is.na(cam_mbu)]) pat_moz_summ <- summary(pat_moz[pat_moz < 10 & !is.na(pat_moz)]) pat_mbu_summ <- summary(pat_mbu[pat_mbu < 10 & !is.na(pat_mbu)]) moz_mbu_summ <- summary(moz_mbu[moz_mbu < 10 & !is.na(moz_mbu)]) may_may_summ <- summary(may_may[may_may < 10 & !is.na(may_may)]) yak_yak_summ <- summary(yak_yak[yak_yak < 10 & !is.na(yak_yak)]) cam_cam_summ <- summary(cam_cam[cam_cam < 10 & !is.na(cam_cam)]) pat_pat_summ <- summary(pat_pat[pat_pat < 10 & !is.na(pat_pat)]) moz_moz_summ <- summary(moz_moz[moz_moz < 10 & !is.na(moz_moz)]) mbu_mbu_summ <- summary(mbu_mbu[mbu_mbu < 10 & !is.na(mbu_mbu)]) library(plotrix) pdf('arc_plot_percent.pdf') plot(1:10, type='n', xlim=c(-1,11), ylim=c(-1,10), bty='n', xlab='Population', main='Pairwise ASE sharing', xaxt='n', ylab='', yaxt='n') #first level draw.arc(1,0, 1, deg2=180, col='grey') draw.arc(3,0, 1, deg2=180, col='grey') draw.arc(5,0, 1, deg2=180, col='grey') draw.arc(7,0, 1, deg2=180, col='grey') draw.arc(9,0, 1, deg2=180, col='grey') #second level draw.arc(2,0, 2, deg2=180, col='grey') draw.arc(4,0, 2, deg2=180, col='grey') draw.arc(6,0, 2, deg2=180, col='grey') draw.arc(8,0, 2, deg2=180, col='grey') #third level draw.arc(3,0, 3, deg2=180, col='grey') draw.arc(5,0, 3, deg2=180, col='grey') draw.arc(7,0, 3, deg2=180, col='grey') #fourth level draw.arc(4,0, 4, deg2=180, col='grey') draw.arc(6,0, 4, deg2=180, col='grey') #fifth level draw.arc(5,0, 5, deg2=180, col='grey') text(1,1.25, as.character(round(median(may_yak_summ), 3))) #may_yak text(3,1.25, as.character(round(median(yak_cam_summ), 3))) text(5,1.25, as.character(round(median(cam_pat_summ), 3))) text(7,1.25, as.character(round(median(pat_moz_summ), 3))) text(9,1.25, as.character(round(median(moz_mbu_summ), 3))) text(2,2.25, as.character(round(median(may_cam_summ), 3))) text(4,2.25, as.character(round(median(yak_pat_summ), 3))) text(6,2.25, as.character(round(median(cam_moz_summ), 3))) text(8,2.25, as.character(round(median(moz_mbu_summ), 3))) text(3,3.25, as.character(round(median(may_pat_summ), 3))) text(5,3.25, as.character(round(median(yak_moz_summ), 3))) text(7,3.25, as.character(round(median(cam_mbu_summ), 3))) text(4,4.25, as.character(round(median(may_moz_summ), 3))) text(6,4.25, as.character(round(median(may_mbu_summ), 3))) text(5,5.25, as.character(round(median(may_mbu_summ), 3))) text(0, -0.25, as.character(round(median(may_may_summ), 3))) text(2, -0.25, as.character(round(median(yak_yak_summ), 3))) text(4, -0.25, as.character(round(median(cam_cam_summ), 3))) text(6, -0.25, as.character(round(median(pat_pat_summ), 3))) text(8, -0.25, as.character(round(median(moz_moz_summ), 3))) text(10, -0.25, as.character(round(median(mbu_mbu_summ), 3))) text(11, -0.25, as.character(round(median(c(may_may, yak_yak, cam_cam, pat_pat, moz_moz, mbu_mbu)), 3))) text(11, 1.25, as.character(round(median(c(may_yak, yak_cam, cam_pat, pat_moz, moz_mbu)), 3))) text(11, 2.25, as.character(round(median(c(may_cam, yak_pat, cam_moz, pat_mbu)), 3))) text(11, 3.25, as.character(round(median(c(may_pat, yak_moz, cam_mbu)), 3))) text(11, 4.25, as.character(round(median(c(may_moz, yak_mbu)), 3))) text(11, 5.25, as.character(round(median(c(may_mbu)), 3))) text(0,-1, 'Maya', cex=1.2) text(2,-1, 'Yakut', cex=1.2) text(4,-1, 'Cambodia', cex=1.2) text(6,-1, 'Pathan', cex=1.2) text(8,-1, 'Mozabite', cex=1.2) text(10,-1, 'Mbuti', cex=1.2) dev.off() #all by all pairwise ase sharing path <- c('HGDP00213','HGDP00222','HGDP00232','HGDP00237','HGDP00239','HGDP00247','HGDP00258') mbu <- c('HGDP00449','HGDP00456','HGDP00462','HGDP00467','HGDP00471','HGDP00476', 'HGDP01081') cam <- c('HGDP00711','HGDP00712','HGDP00713','HGDP00715','HGDP00716','HGDP00720','HGDP00721') maya <- c('HGDP00854','HGDP00855','HGDP00856','HGDP00857','HGDP00858','HGDP00868','HGDP00877') yak <- c('HGDP00948','HGDP00950','HGDP00955','HGDP00959','HGDP00964','HGDP00967') moz <- c('HGDP01258','HGDP01259','HGDP01262','HGDP01264','HGDP01274','HGDP01275','HGDP01277') ase_ind <- c(maya, yak, cam, path, moz, mbu) all_by_all <- matrix(NA, nrow=41, ncol=41) colnames(all_by_all) = ase_ind rownames(all_by_all) = ase_ind for(i in 1:41) { for(j in 1:41) { if(i != j) { all_by_all[i,j]<- ase_sites_seen2(subset(newsig_ase, newsig_ase$individual==ase_ind[i]| newsig_ase$individual==ase_ind[j]), subset(ase, ase$individual==ase_ind[i] | ase$individual==ase_ind[j])) } } } library(gplots) colorPop<-c(rep(brewerVector[1],7),rep(brewerVector[2],6),rep(brewerVector[3],7),rep(brewerVector[4],7),rep(brewerVector[5],7),rep(brewerVector[6],7)) pdf('ase_sharing_sample30.pdf') par(mar=c(7,3,7,7)) heatmap.2(all_by_all, trace="none",scale="row", labRow=ase_ind, labCol=ase_ind, RowSideColors=colorPop, ColSideColors=colorPop, dendrogram='column', key='T', density.info='none') legend('bottomleft', c('Maya', 'Yakut', 'Cambodian', 'Pathan', 'Mozabite', 'Mbuti'), col=brewerVector, lty='solid', bty='n', lwd=4, cex=0.75) dev.off()