setwd('./') library(ops) #################################### #assess technical reproducibility # #################################### run1_gene <- read.table('/HGDP_RNAseq/scripts/technical_replicates/supplemental_script_input_files/gene_fpkm_table_run1.txt', sep='\t', header=T) run2_gene <- read.table('/HGDP_RNAseq/scripts/technical_replicates/supplemental_script_input_files/gene_fpkm_table_run2.txt', header=T) merge12_gene <- merge(run1_gene, run2_gene, by='tracking_id')[,c(5:50, 55:100)] tiff('gene_replicates.tif', compression='lzw', res=300, height=2100, width=2100) par(mar=c(2,2,2,1), mfrow=c(7,7), cex.axis=1, cex.lab=1, mgp=c(3,0.5,0)) #take out rows where either was zero (this was done in the C't Hoen Nat Biotech paper) gene_concordance <- c() for(i in c(1:4,6:46)) { merged_no_zeros <- merge12_gene[apply(merge12_gene[,c(i,46+i)], 1, min)!=0,][,c(i,46+i)] p <- findP(merged_no_zeros)$maxIQR #looks for the optimal scaling power merged_ops <- merged_no_zeros^p #transforms the expression data to a normal distribution plot(merged_ops, xlab='', ylab='') mtext(strsplit(colnames(merged_no_zeros, 1)[1], '.', fixed=T)[[1]][1], 3, line=0.8, cex=0.8) #mtext(colnames(merged_no_zeros, 1)[2], 2, line=1.5, cex=0.8) abline(lm(merged_ops[,2]~merged_ops[,1]), col='red', lty='dashed') print(paste(p, colnames(merged_no_zeros, 1)[1], colnames(merged_no_zeros, 1)[2], cor.test(merged_ops[,1], merged_ops[,2])$estimate, sep=' ')) gene_concordance <- c(gene_concordance, cor.test(merged_ops[,1], merged_ops[,2])$estimate) } dev.off() pdf('gene_correlation.pdf') hist(gene_concordance, main='Correlation between run1 and run2', xlab='Correlation', ylab='Frequency', xlim=c(0,1)) dev.off() #################################### #look for outliers # #################################### gene <- read.table('/HGDP_RNAseq/expression_fpkm_tables/merged_sample_gene_fpkm_table.txt', header=T) transcript <- read.table('/HGDP_RNAseq/expression_fpkm_tables/merged_sample_isoform_fpkm_table.txt', header=T) transform_quantifications <- function(fpkm, columns) { #remove zeros fpkm_no_zeros <- fpkm[apply(fpkm[,columns],1,min)!=0,] #remove NA's fpkm_no_zeros <- fpkm_no_zeros[complete.cases(fpkm_no_zeros),] fpkm_p <- findP(fpkm_no_zeros[,columns])$maxIQR print(paste('OPS p: ', fpkm_p, sep='')) fpkm_ops <- fpkm_no_zeros[,columns]^fpkm_p #transforms the expression data to a normal distribution cor_ops <- cor(fpkm_ops) median_cor <- rep(NA, 45) names(median_cor) <- colnames(cor_ops) for(i in 1:45) { median_cor[i] <- median(cor_ops[,i][-i]) } return(median_cor) } gene_dstat <- transform_quantifications(gene, 5:49) transcript_dstat <- transform_quantifications(transcript, 5:49) pdf('dstat_gene_transcript.pdf') par(mar=c(5,5,3,3), mfrow=c(2,1), cex.axis=1.5, cex.lab=1.5) gene_hist <- hist(gene_dstat, xlab='D-statistic (genes)', ylab='Number of samples', xlim=c(0.55,1), breaks=10, main='') text(gene_hist$mids[1], 8, 'HGDP01029', srt=90, cex=1.2) text(gene_hist$mids[5], 8, 'HGDP00992', srt=90, cex=1.2) transcript_hist <- hist(transcript_dstat, xlab='D-statistic (transcripts)', ylab='Number of samples', xlim=c(0.55, 1), breaks=20, main='') text(transcript_hist$mids[1], 9, 'HGDP01029', srt=90, cex=1.2) text(transcript_hist$mids[4], 9, 'HGDP00992', srt=90, cex=1.2) dev.off() #gene outliers #HGDP00992 HGDP01029 #0.8637673 0.8273650 #transcript outliers #HGDP00992 HGDP01029 #0.6384332 0.5705611