image.png

In [104]:
options(stringsAsFactors=FALSE)
library(GenomicFeatures)
library(ChIPseeker)
library(RColorBrewer)
library(ggsci)
library(ggplot2)
library(ggpubr)
library(networkD3)
library(grid)
library(HiveR)
library(scatterpie)
library(ggExtra)
library(treemap)
library(pheatmap)
library(reshape)
library(ggalluvial)
library(ggmosaic)
library(ggtern)
library(data.table)
library(pals)

Fig.3a

In [ ]:
chipseq <- read.delim('ChIPseq.experiment.table.txt')
atacseq <- read.delim('DHS.experiment.table.txt')
chipexp <- read.delim('metrics/experiment.metrics.table.out', comment.char = '#')
chipsam <- read.delim('metrics/sample.metrics.table.out', comment.char = '#')
atacexp <- read.delim('metrics/DHS.experiment.metrics.table.out', comment.char = '#')
atacsam <- read.delim('metrics/DHS.sample.metrics.table.out', comment.char = '#')
chippik <- read.delim('metrics/peak.metrics.table.out', comment.char = '#')
atacpik <- read.delim('metrics/DHS.peak.metrics.table.out', comment.char = '#')
chippik <- chippik[which(chippik$IDR>50),]
atacpik <- atacpik[which(atacpik$IDR>1000),]
allpeak <- rbind(chippik, atacpik)
allpeak$ID <- sprintf("%s/%s/%s",allpeak$genome,allpeak$project,allpeak$experiment)
allpeak <- allpeak[!duplicated(allpeak$ID),]
rownames(allpeak) <- allpeak$ID 
qc1 <- unique(rbind(chipseq,atacseq)[, c("result_dir","project","main_group","analysis_name",'SPOT')]) %>%
  dplyr::filter(main_group %in% c("DNA-binding protein","open chromatin","histone related")) %>%
  dplyr::filter(!is.na(SPOT)) %>% dplyr::select(result_dir, main_group, SPOT)

chipsam2 <- chipsam[!duplicated(chipsam$sample) & !is.na(chipsam$sample),]
rownames(chipsam2) <- chipsam2$sample
qc2 <- chipseq[grep("_control",chipseq$replicate),c('result_dir','sample_id')] %>%
  dplyr::mutate(SPOT=chipsam2[sample_id,'SPOT'], main_group="control") %>%
  dplyr::filter(!is.na(SPOT)) %>% 
  dplyr::select(result_dir, main_group, SPOT)

pltd <- rbind(qc1, qc2) %>% dplyr::mutate(main_group=factor(main_group))
filtered <- expall[expall$main_group %in% c("DNA-binding protein", "histone related", "open chromatin") & 
                     (!expall$factor %in% c("FLAG", "CenH3", "dCas9", "Heterochromatin", "H1", "H2A", "H2AW", "H2AX", "H2AZ", "H2B", "H3", "H4", "H3.1", "H3.3", "IgG", "NRPE1", "NRPD1")) & 
                     !grepl("PolII",expall$group_list, ignore.case=T) & !grepl("RNA polymerase",expall$group_list, ignore.case=T) & 
                     !grepl("Unknown",expall$group_list, ignore.case=T) & !grepl("H2Bub1",expall$group_list, ignore.case=T) & 
                     !grepl("AGO",expall$group_list, ignore.case=T) & !grepl("DCL1",expall$group_list, ignore.case=T) & 
                     !grepl("lysinebutyrylation",expall$group_list, ignore.case=T) & !grepl("crotonylation",expall$group_list, ignore.case=T) & 
                     !grepl("phosphorylation",expall$group_list, ignore.case=T) & !grepl("ubiquitination",expall$group_list, ignore.case=T), ] ## , "H3K4me1", "H3K36me3"
filtered <- filtered[ifelse(filtered$result_dir=="arabidopsis_thaliana", ifelse(grepl("histone related",filtered$group_list), grepl("H3K4me1|H3K4me2|H3K4me3|H3K9me1|H3K9me2|H3K9me3|H3K9ac|H3K27ac|H3K27me1|H3K27me2|H3K27me3|H3K36me1|H3K36me2|H3K36me3|H3K79me2|H3K79me3|H3K56ac|H4K16ac|H3K23ac|H4K12ac|H3K4ac",filtered$group_list) , TRUE), TRUE), ]

filtered$idr <- ifelse(filtered$main_group == "open chromatin", sprintf("/public/workspace/chend/chiphub/ATACseq/analysis/%s/hammock/%s.peak.idr.hammock.gz",filtered$result_dir,filtered$analysis_name),
                       sprintf("/public/workspace/chend/chiphub/ChIPip/analysis/%s/%s/hammock/%s.peak.idr.hammock.gz",filtered$result_dir,filtered$project,filtered$analysis_name))

filtered$peak <- ifelse(filtered$main_group == "open chromatin", sprintf("/public/workspace/chend/chiphub/ATACseq/analysis/%s/peak/macs2/%s_pooled.narrowPeak.bed",filtered$result_dir,filtered$analysis_name),
                        ifelse(grepl("H3K36me3|H3K20me1|H3K4me1|H3K79me2|H3K79me3|H3K27me3|H3K9me3|H3K9me1",filtered$group_list), 
                               sprintf("/public/workspace/chend/chiphub/ChIPip/analysis/%s/%s/broad/macs2/naive/%s.broadPeak.bed",filtered$result_dir,filtered$project,filtered$analysis_name), 
                               sprintf("/public/workspace/chend/chiphub/ChIPip/analysis/%s/%s/peak/macs2/optimal/%s.narrowPeak.bed",filtered$result_dir,filtered$project,filtered$analysis_name)))

filtered <- filtered[file.exists(filtered$idr), ]

filtered$subgroup <- unlist(lapply(as.character(filtered$group_list), function(x){tail(unlist(strsplit(x," // ")),n=1)}))

athchip$peak <- peak[rownames(athchip),'IDR']
athchipOK <- athchip[athchip$OK & athchip$peak>50,]
athchipOK <- athchipOK[with(athchipOK, order(factor)), ]
pkfiles <- athchipOK$idr
athTFs <- unique(athchip$factor)
coTFs <- matrix(0, nrow = length(athTFs), ncol = length(athTFs))
colnames(coTFs) <- rownames(coTFs) <- athTFs
for(i in athTFs){
  for(j in athTFs){
    coTFs[i,j] <- max(jaccardmat[which(athchip$factor==i),which(athchip$factor==j)])
  }
}

PPI <- read.delim('ath.PPI.txt')
rownames(PPI) <- sprintf("%s-%s", PPI[,1], PPI[,2])
intct <- which(coTFs>median(coTFs), arr.ind=T)
cobTFs <- data.frame(from=athTFs[intct[,1]], to=athTFs[intct[,2]]) %>%
  mutate(fg=factors[from,'V1'], tg=factors[to,'V1'])
In [47]:
phtm <- pheatmap(log1p(coTFs), silent = T)
htmd <- log1p(coTFs)[phtm$tree_row$labels[phtm$tree_row$order], 
                     phtm$tree_row$labels[phtm$tree_row$order]]
diag(htmd) <- htmd[upper.tri(htmd)] <- -0.001
annotation_row2 <- data.frame(subgroup=factors[rownames(htmd),'V3']) %>% 
  mutate(subgroup=ifelse(subgroup %in% tx, subgroup, 'Others'))
rownames(annotation_row2) <- rownames(htmd)
labels_row <- rep("", nrow(htmd))
# fcols <- colorRampPalette(brewer.pal(n=8, name="Set1"))(length(unique(annotation_row$fam)))
# names(fcols) <- unique(annotation_row$fam)
# fcols['Others'] <- '#999999'
# ann_colors <- list(fam=fcols)
ann_colors2 <- ann_colors
ann_colors2$subgroup <- c(ann_colors2$subgroup, "Unknown"="#DEDEDE")
## ComplexHeatmap::
pheatmap(htmd, color=c(NA,brewer.ylorrd(255)), cluster_rows = F, cluster_cols = F, 
         cellwidth=3, cellheight=3, fontsize=3, border_color = NA, 
         annotation_colors=ann_colors2, 
         annotation_row=annotation_row2, labels_row = labels_row)

Fig.3b

Fig.3c

In [92]:
options(repr.plot.width = 16, repr.plot.height = 16, repr.plot.res = 100)
library(igraph)
set.seed(123456)
V(network)$size <- V(network)$degree <- log1p(degree(network)+1)*2
V(network)$size[V(network)$size > 5] <- 5
layout.fr <- layout.fruchterman.reingold(network)
plot(network, layout=layout.fr, vertex.label.font=3, 
     vertex.frame.color=NA, edge.curved=0)

Fig.3d

In [58]:
sankeyData <- list() 
fams <- unique(c(paste0("x|",regfam[miRTFSub$from]), miRfam, paste0("z|",regfam[miRTarSub$V2])))
sankeyData$nodes <- data.frame(name=fams)
ids <- as.integer(rownames(sankeyData$nodes)) - 1
names(ids) <- sankeyData$nodes$name
mir2tf <- data.frame(source=miRfam[miRTarSub$V1], target=regfam[miRTarSub$V2])
mir2tf <- aggregate(mir2tf$source, mir2tf, length) 
tf2mir <- data.frame(source=regfam[miRTFSub$from], target=miRfam[miRTFSub$to])
tf2mir <- aggregate(tf2mir$source, tf2mir, length) 

connects <- rbind(data.frame(source=ids[paste0("x|",tf2mir$source)], target=ids[tf2mir$target], value=tf2mir$x), data.frame(source=ids[mir2tf$source], target=ids[paste0("z|",mir2tf$target)], value=mir2tf$x))
sankeyData$links <- connects

rownames(miRTFSub) <- paste(miRTFSub[,1], miRTFSub[,2])
rownames(miRTarSub) <- paste(miRTarSub[,1], miRTarSub[,2])
rownames(TFTarSub) <- paste(TFTarSub[,1], TFTarSub[,2])
xData <- expand.grid(master=masters, miRNA=miRs, target=TFs, stringsAsFactors=FALSE) 
xData <- xData[which(paste(xData$master, xData$miRNA) %in% rownames(miRTFSub) & paste(xData$miRNA, xData$target) %in% rownames(miRTarSub) & paste(xData$master, xData$target) %in% rownames(TFTarSub)), ]
## check 
# apply(xData,1,function(x){
# x[1] %in% miRTFSub$from & x[2] %in% miRTFSub$to & x[2] %in% miRTarSub$V1 & x[3] %in% miRTarSub$V2 & x[1] %in% TFTarSub$from & x[3] %in% TFTarSub$to
# })

alluvialData <- data.frame(master=regfam[xData$master], miRNA=miRfam[xData$miRNA], target=paste0("z|",regfam[xData$target]))
alluvialData <- aggregate(alluvialData$master, alluvialData, length)
colnames(alluvialData)[4] <- "freq"

## pltd <- alluvialData[alluvialData$freq > 1,]
pltd <- alluvialData

length(unique(xData$miRNA))
length(unique(xData$master))
length(unique(xData$target))
134
117
462
In [61]:
library(VennDiagram)
library(gridExtra)
ggplot(pltd, aes(y=freq, axis1=master, axis2=miRNA, axis3=target)) +
  geom_alluvium(aes(fill=miRNA, color=miRNA), width=0, knot.pos=0, reverse=FALSE) +
  guides(fill=FALSE,color=FALSE) + geom_stratum(width=1/10, reverse=FALSE) +
  geom_text(stat="stratum", check_overlap=TRUE, label.strata=TRUE, reverse=FALSE) +
  scale_x_continuous(breaks=1:3, labels=c("Master TFs", "miRNAs", "Target TFs")) + theme_void() + ggtitle("miRNA-mediated GRN") ## + coord_flip() 

vennData <- list(Chen=sprintf("%s/%s/%s",floralFFLs[,1],floralFFLs[,2],floralFFLs[,3]), New=sprintf("%s/%s/%s",xData[,1],xData[,2],xData[,3]))
Warning message:
“`guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> = "none")` instead.”
Warning message:
“Computation failed in `stat_stratum()`:
The parameter `label.strata` is defunct.
use `aes(label = after_stat(stratum))`.”

Fig.3e

In [64]:
options(repr.plot.width = 8, repr.plot.height = 8, repr.plot.res = 100)
vplot <- venn.diagram(vennData, fill=c("#00AFBB", "#FC4E07"), alpha=0.5, cex=1.5, cat.fontface=2, cat.cex=1, lty=1, margin=0.1, filename=NULL, main="FFLs")
grid.arrange(gTree(children=vplot))

Fig.4b

Fig.4c

In [91]:
options(repr.plot.width = 6, repr.plot.height = 6, repr.plot.res = 100)
specific_score <- read.table('JS_specificity.bed')
colnames(specific_score) <- c("chr","start","end","type","JS")
ggdensity(specific_score, x="JS", color="type",
                palette=c("#FC4E07","#00AFBB"))+
  theme(legend.position="none") +xlab("Tissue specificity score") + ylab("Density")

Fig.4d

In [72]:
library(dplyr)
library(ggplot2)
library(ggpubr)
setwd("/public/workspace202011/encode/ChIPhub_related")
x <- read.delim('specificity.stat.bed', header=F) 
x <- x %>% mutate(V4=factor(V4, levels=c("promoter","enhancer")), V6=factor(V6, levels=c("Low","High")))
my_comparisons <- list( c("promoter", "enhancer") )
p <- ggviolin(x, x = "V4", y = "V7", fill = "V6",
         palette = c("#00AFBB", "#FC4E07"),
         add = "boxplot", add.params = list(fill = "white"))+
  stat_compare_means(comparisons = my_comparisons, label = "p.signif")
p <- facet(p, facet.by = "V6")
p

Fig.5a

In [73]:
hs <- x %>% filter(V6=="High") %>% 
    mutate(CRE=sprintf("%s_%s_%s", V1, V2, V3))
rownames(hs) <- hs$CRE
library(preprocessCore) 
matx <- read.table('DHS.quant.matrix', head=TRUE, sep="\t", stringsAsFactors=FALSE)
qdat <- normalize.quantiles(as.matrix(matx[,-c(1:3)]))
matx[,-c(1:3)] <- qdat
colnames(matx)[1:3] <- c('chr', 'bp1', 'bp2')
rownames(matx) <- sprintf("%s_%s_%s", matx$chr, matx$bp1, matx$bp2)

ecols <- read.delim("enhancer.tissue", quote = "", header = F, row.names = 1)
tord <- read.delim("tissue.order", quote = "", header = F, row.names = 1)
rownames(tord) <- gsub(" cell","",rownames(tord))
emeta <- read.delim("enhancer.meta", header = F, row.names = 2)
targets <- read.delim("DHS.target.bed", header = F)
colnames(targets) <- c('chr', 'bp1', 'bp2', 'gene', 'name', 'dist')
rownames(targets) <- sprintf("%s_%s_%s", targets$chr, targets$bp1, targets$bp2)
targets[targets$dist > 10000, 'name'] <- ''
TFs <- read.delim("ath.TFs.txt", header = F, row.names = 1)

matx <- matx[,intersect(rownames(emeta), colnames(matx))]

out <- cbind(x[,1:5], matx[sprintf("%s_%s_%s", x$V1, x$V2, x$V3),])
write.table(out, "table.S4.txt", sep="\t", quote=F, row.names=F)

tis <- rownames(emeta)[grep('control', emeta$V1, invert=T)]
maty <- matx[hs$CRE, tis]
mato <- t(apply(maty, 1, function(x){
        o <- aggregate(x=x, by=list(tissue=emeta[tis,]), FUN = mean)
        setNames(o[,2], o[,1])
}))

es <- hs$CRE[hs$V4=="enhancer"]
ps <- hs$CRE[hs$V4=="promoter"]

mato <- mato[,intersect(emeta[,1], colnames(mato))]
mato <- mato[, rownames(tord)]
pltd <- t(apply(mato, 1, scale))
colnames(pltd) <- colnames(mato)
library(ComplexHeatmap)
annotation_col <- data.frame(tissue=colnames(mato))
ann_colors <- list(tissue=setNames(ecols[,1],rownames(ecols)))
========================================
ComplexHeatmap version 2.9.4
Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
Github page: https://github.com/jokergoo/ComplexHeatmap
Documentation: http://jokergoo.github.io/ComplexHeatmap-reference

If you use it in published research, please cite:
Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional 
  genomic data. Bioinformatics 2016.

The new InteractiveComplexHeatmap package can directly export static 
complex heatmaps into an interactive Shiny app with zero effort. Have a try!

This message can be suppressed by:
  suppressPackageStartupMessages(library(ComplexHeatmap))
========================================
! pheatmap() has been masked by ComplexHeatmap::pheatmap(). Most of the arguments
   in the original pheatmap() are identically supported in the new function. You 
   can still use the original function by explicitly calling pheatmap::pheatmap().



Attaching package: ‘ComplexHeatmap’


The following object is masked from ‘package:pheatmap’:

    pheatmap


In [75]:
options(repr.plot.width = 6, repr.plot.height = 16, repr.plot.res = 100)
labels_gene <- targets[rownames(pltd), c('gene','name')]
labels_row <- ifelse(labels_gene$gene %in% rownames(TFs),labels_gene$name,'')
labels_row[rowSums(pltd > 0) > 3] <- ''
labels_row[duplicated(labels_row)] <- ''
names(labels_row) <- rownames(pltd)
mapal <- rev(pals::brewer.spectral(100))

pout <- pheatmap::pheatmap(pltd, color=mapal, annotation_col=annotation_col, annotation_colors=ann_colors, cluster_cols=F, border_color=NA, labels_row=labels_row, fontsize=3, cutree_rows=10)

Fig.5b

Fig.6b

In [101]:
rigg <- read.table('/public/workspace202011/encode/zhutao/analysis/chip-hub/all_species_enhancer_promoter_num.csv',head=T)
ggplot(rigg, aes(fill=type, y= factor(species,levels=rev(aa)), x=num))+geom_bar(position="stack", stat="identity")+
 scale_fill_manual(values=c("#FC4E07","#00AFBB"))+theme_pubr()

Fig.6c

In [100]:
library(ggridges)
all_species_tss <- read.table('/public/workspace202011/encode/zhutao/analysis/chip-hub/peaks/peak_version2/peak_summit_new/all_final_peak_dis_v2',head=T)
all_species_tss <- all_species_tss[all_species_tss$dis <1000000 & all_species_tss$dis >=0,]
all_species_tss$dis <- all_species_tss$dis + 1
all_species_tss$dis <- as.numeric(all_species_tss$dis)
options(repr.plot.width = 6, repr.plot.height = 12, repr.plot.res = 100)

ggplot(all_species_tss, aes(x = dis, y = factor(species,levels=rev(aa)),fill = factor(stat(quantile)))) +
  stat_density_ridges(geom = "density_ridges_gradient", calc_ecdf = TRUE,quantiles = c(0.55), quantile_lines = F) +
  scale_fill_manual(name = "Probability", values = c("#00AFBB","#FC4E07"),labels = c("Promoter", "Enhancer")) +
scale_x_log10(breaks = trans_breaks("log10", function(x) 10^x),
                labels = trans_format("log10", math_format(10^.x))) +
theme_pubr() +labs(x = "Distance to closest gene (bp)", y = '')
Picking joint bandwidth of 0.103

Fig.6e

In [95]:
library(paletteer)
library(ggrepel)
new <-read.table('/public/workspace202011/encode/zhutao/analysis/chip-hub/result/promoter_enhancer_16_species_num_chr.csv',head=T)
new$loc <- 1:nrow(new) 
data <- new %>% mutate( is_annotate=ifelse((num>9) | (num<-9), "yes", "no")) 
options(repr.plot.width = 8, repr.plot.height = 6, repr.plot.res = 100)
ggplot(new, aes(x=loc, y=num)) +
    geom_point( aes(color=as.factor(chr_x)),size=0.1)+ scale_color_manual(values = paletteer_d("ggthemes::gdoc")[2:6])+
theme_pubr()+labs(x = "", y = 'Number of species')+geom_label_repel(data=subset(new, anno=="yes"), aes(label=ID), size=3,box.padding = 0.2,label.size=NA)
Warning message:
“ggrepel: 23 unlabeled data points (too many overlaps). Consider increasing max.overlaps”

Fig.6f

In [96]:
library(paletteer)
library(ggplot2)
library(scales)
library(ggpubr)
library(dplyr)
options(repr.plot.width = 10, repr.plot.height = 6, repr.plot.res = 100)
conserved_sp <- read.table("/public/workspace202011/encode/zhutao/analysis/chip-hub/final_species_promoter_enhancer_v2.csv",head=T)
options(repr.plot.width = 10, repr.plot.height = 8, repr.plot.res = 100)

aa = c("brachypodium_distachyon","oryza_sativa","setaria_italica","zea_mays","sorghum_bicolor","arabidopsis_thaliana",
       "solanum_tuberosum","solanum_lycopersicum","vitis_vinifera","populus_trichocarpa","gossypium_raimondii",
      "malus_domestica","prunus_persica","phaseolus_vulgaris","citrullus_lanatus","cucumis_melo","cucumis_sativus")

ggplot(conserved_sp, aes(fill=factor(label, levels=c('Promoter_7-16','Promoter_4-6','Promoter_1-3',
                                                          'Promoter_0','Enhancer_0', 'Enhancer_1-3','Enhancer_4-6','Enhancer_7-16')), y=percentage, x=factor(species,levels=aa)))+
scale_fill_manual(values=c("#313695","#4575B4","#74ADD1","#ABD9E9","#FDAE61","#F46D43","#D73027","#A50026"))+
geom_bar(position="stack", stat="identity")+
theme_pubr()+theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))+
 labs(x = "", y = 'Number of peaks')+guides(fill=guide_legend(title=""))

Fig.6h

In [103]:
library(foreach)
library(enrichplot)
library(clusterProfiler)
library(org.At.tair.db)
library(pheatmap)
library(ggplot2)
library(ggpubr)
library(pheatmap)
library(RColorBrewer)
library(dplyr)
library(ggthemes)
library(pals)
library(ComplexHeatmap)
e1_e4 <- read.table('/public/workspace202011/encode/zhutao/analysis/chip-hub/for_GO.csv',head=T)
e1_e4$new <- paste(e1_e4$type,'_',e1_e4$group)
terms <- NULL
for(i in levels(as.factor(e1_e4$new))){
go <- enrichGO(OrgDb=org.At.tair.db,
                 gene = e1_e4[e1_e4$new==i,'gene'],
                 pvalueCutoff = 0.01,
                 qvalueCutoff = 0.01,
                 keyType = 'TAIR',
                 pAdjustMethod = 'fdr',
                 ont = "BP")
  if(is.null(terms)){
    terms <- data.frame(organ=i, go@result)
  }else{
    terms <- rbind(terms, data.frame(organ=i, go@result))
  }
}
go_res <- terms[terms$pvalue <0.01,]
go_res$score <- -log10(go_res$pvalue)
aa <- go_res %>% group_by(organ) %>% slice_max(order_by = score, n = 4)
select <- go_res[go_res$Description %in% aa$Description,]
library(reshape)
res <- cast(select, Description ~ organ, value="score")
res[is.na(res)] <- 0
rownames(res) <- res$Description
res <- subset(res,select=-c(Description))
final <- as.data.frame(t(scale(t(as.matrix(res)),center=F)))
colnames(final) <- colnames(res)
rownames(final) <- rownames(res)
final <- final[,c(5,6,7,8,1,2,3,4)]
annotation_col=data.frame(level=colnames(final))
rownames(annotation_col) <- annotation_col$level
annotation_col$state <- c(rep("Promoter",4),rep("Enhancer",4))
ann_colors <- list(state=c("Promoter"="#00AFBB","Enhancer"="#FC4E07"),
                   level=setNames(c("#ABD9E9","#74ADD1","#4575B4","#313695","#FDAE61","#F46D43","#D73027","#A50026"),
                                  unique(annotation_col$level)))
cols <- colorRampPalette(c("white", "#F4511E"))(100)
options(repr.plot.width = 8, repr.plot.height = 10, repr.plot.res = 100)
ComplexHeatmap::pheatmap(final,color=cols,cluster_cols=F,annotation_colors=ann_colors, annotation_col=annotation_col)
Warning message:
“The input is a data frame, convert it to the matrix.”
In [ ]: