问题
I want to retrieve the genes that are present within a series of regions. Say I have a BED file with query positions such as:
1 2665697 4665777 MIR201
1 10391435 12391516 MIR500
1 15106831 17106911 MIR122
1 23436535 25436616 MIR234
1 23436575 25436656 MIR488
I would like to get the genes that fall within those regions.
I have tried using biomaRt and bedtools intersect, but the output I get is a single list of genes for all the regions combined, not one list per region. The desired output would have the genes within each region reported on that region's own row, as if I had queried one region at a time. Basically, I want to know which genes fall within each region, while still being able to identify which genes fall in which region.
What I am doing is, starting from the region of a detected miRNA, expanding the genomic region upstream and downstream so that I get the genes neighboring this miRNA. I am using a window of 1 million bases on each side. This would work for just one query, but how can I run many queries with biomaRt, or many intersections with bedtools, so that I get something like:
1 2665697 4665777 MIR201 GENEX, GENEY, GENEZ...
1 10391435 12391516 MIR500 GENEA, GENEB, GENEC...
1 15106831 17106911 MIR122
1 23436535 25436616 MIR234
1 23436575 25436656 MIR488
Meaning that GENEX, GENEY and GENEZ fall within 1:2665697-4665777, with MIR201 placed in the middle, as this region is calculated by subtracting 1 million bp from the start position and adding 1 million bp to the end position.
I am somewhat determining the neighboring genes from each miRNA, to compare within species, but I do not get how to query multiple regions individually using biomaRt or bedtools.
Any help?
回答1:
Same approach as @Jimbou without tidyverse:
library(biomaRt)

# Example query regions, one per row. read.table() names the columns
# V1..V4: chromosome, start, end, miRNA label.
d <- read.table(text = "1 2665697 4665777 MIR201
1 10391435 12391516 MIR500
1 15106831 17106911 MIR122
1 23436535 25436616 MIR234
1 23436575 25436656 MIR488")

# Specify the database.
ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")

# Query biomaRt for a single region and return the gene names found there
# as one comma-separated string.
genes_in_region <- function(chrom, start, end) {
  x <- getBM(
    attributes = "external_gene_name",
    filters = c("chromosome_name", "start", "end"),
    values = list(chrom, start, end),
    mart = ensembl
  )
  # Keeping only 3 genes, as the output is too long.
  # In your case remove the line below.
  x <- head(x, 3)
  paste(x$external_gene_name, collapse = ",")
}

# Run one query per row and bind the result back onto d.
# Rows are visited by index rather than with apply(d, 1, ...): apply()
# first converts the data frame to a character matrix, which turns the
# coordinates into space-padded strings before they reach getBM().
res <- cbind(
  d,
  genes = vapply(
    seq_len(nrow(d)),
    function(i) genes_in_region(d[[1]][i], d[[2]][i], d[[3]][i]),
    character(1)
  )
)
res
# V1 V2 V3 V4 genes
# 1 1 2665697 4665777 MIR201 TTC34,AC242022.1,AL592464.2
# 2 1 10391435 12391516 MIR500 AL139424.2,PGD,AL139424.1
# 3 1 15106831 17106911 MIR122 KAZN,TMEM51-AS1,TMEM51
# 4 1 23436535 25436616 MIR234 ASAP3,E2F2,AL021154.1
# 5 1 23436575 25436656 MIR488 ASAP3,E2F2,AL021154.1
回答2:
You can try a biomaRt & tidyverse solution:
library(biomaRt)
library(tidyverse)

# Specify the database.
ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")

# One biomaRt query per row of d. The data frame is split into one-row
# pieces (list names "1", "2", ...) and each piece is queried on its own,
# so every result stays tied to the region that produced it.
# seq_len() is used instead of 1:nrow(.) so a zero-row d yields an empty
# list rather than the bogus index c(1, 0).
res <- d %>%
  split(seq_len(nrow(.))) %>%
  map(~ getBM(
    attributes = c("external_gene_name", "chromosome_name",
                   "start_position", "end_position"),
    filters = c("chromosome_name", "start", "end"),
    values = list(.x$V1, .x$V2, .x$V3),
    mart = ensembl
  ))
# Plot the results for the first element to check the overlapping genes.
# The red line at the top is the query region; each black horizontal line
# below it is one returned gene, drawn from its start to its end position.
first_hits <- res$`1`
n_hits <- nrow(first_hits)
plot(data.frame(unlist(d[1, 2:3]), n_hits), type = "l", col = 2, lwd = 3,
     ylim = c(0, n_hits),
     xlim = unlist(d[1, 2:3]) + c(-100000, 100000))
# Genes are stacked in gene-name order. Start and end are taken from the
# same result row, so genes that share a name (or have an empty name) are
# still drawn with their own coordinates.
first_hits <- first_hits[order(first_hits$external_gene_name), ]
for (i in seq_len(n_hits)) {
  lines(
    c(first_hits$start_position[i], first_hits$end_position[i]),
    c(i, i),
    type = "l", lwd = 3
  )
}
# Transform the data into the expected data.frame: one row per query
# region, with that region's gene names collapsed into a single
# comma-separated string in column `new`.
# map_chr() returns exactly one string per element of res (an empty string
# when a region has no genes), so the new column always lines up with d.
d %>%
  mutate(
    new = unname(map_chr(res, ~ paste(.x$external_gene_name, collapse = ",")))
  ) %>%
  as_tibble()
# A tibble: 5 x 5
V1 V2 V3 V4 new
<int> <int> <int> <fct> <chr>
1 1 2665697 4665777 MIR201 TTC34,AC242022.1,AL592464.2,AL592464.1,AL589702.1,ACTRT2,LINC00982,PRDM16,MIR4251,AL008733.1,AL512383.1,AL590438.~
2 1 10391435 12391516 MIR500 AL139424.2,PGD,AL139424.1,CENPS-CORT,CENPS,CORT,DFFA,AL354956.1,PEX14,RN7SL614P,CASZ1,AL139423.1,HSPE1P24,C1orf12~
3 1 15106831 17106911 MIR122 KAZN,TMEM51-AS1,TMEM51,C1orf195,AL035405.1,AL391094.1,FHAD1,AL031283.2,AL031283.3,AL031283.1,EFHD2,CTRC,CELA2A,CE~
4 1 23436535 25436616 MIR234 ASAP3,E2F2,AL021154.1,ID3,MDS2,AL451000.1,RPL11,ELOA,ELOA-AS1,PITHD1,LYPLA2,GALE,HMGCL,FUCA1,CNR2,BTBD6P1,AL59060~
5 1 23436575 25436656 MIR488 ASAP3,E2F2,AL021154.1,ID3,MDS2,AL451000.1,RPL11,ELOA,ELOA-AS1,PITHD1,LYPLA2,GALE,HMGCL,FUCA1,CNR2,BTBD6P1,AL59060~
And if you need all the data, you can try a purrr solution as well. Advantage: the biomaRt output is stored in a list-column and is not lost.
# Nest the coordinates of each miRNA (V4) into a list-column, query biomaRt
# once per miRNA, and keep the full biomaRt result next to the collapsed
# gene names.
d %>%
  nest(data = -V4) %>%
  mutate(
    # Full biomaRt result for the region, kept as a list-column.
    biomart = map(data, ~ getBM(
      attributes = c("external_gene_name", "chromosome_name",
                     "start_position", "end_position"),
      filters = c("chromosome_name", "start", "end"),
      values = list(.x$V1, .x$V2, .x$V3),
      mart = ensembl
    )),
    # Gene names of the region as one comma-separated string.
    Genes = map_chr(biomart, ~ paste(.x$external_gene_name, collapse = ","))
  ) %>%
  # Put the coordinates (V1, V2, V3) back as regular columns.
  unnest(cols = data) %>%
  select(V4, biomart, Genes, everything())
# A tibble: 5 x 6
V4 biomart Genes V1 V2 V3
<fct> <list> <chr> <int> <int> <int>
1 MIR201 <data.frame [43 x 4]> TTC34,AC242022.1,AL592464.2,AL592464.1,AL589702.1,ACTRT2,~ 1 2.67e6 4.67e6
2 MIR500 <data.frame [72 x 4]> AL139424.2,PGD,AL139424.1,CENPS-CORT,CENPS,CORT,DFFA,AL35~ 1 1.04e7 1.24e7
3 MIR122 <data.frame [101 x 4]> KAZN,TMEM51-AS1,TMEM51,C1orf195,AL035405.1,AL391094.1,FHA~ 1 1.51e7 1.71e7
4 MIR234 <data.frame [62 x 4]> ASAP3,E2F2,AL021154.1,ID3,MDS2,AL451000.1,RPL11,ELOA,ELOA~ 1 2.34e7 2.54e7
5 MIR488 <data.frame [62 x 4]> ASAP3,E2F2,AL021154.1,ID3,MDS2,AL451000.1,RPL11,ELOA,ELOA~ 1 2.34e7 2.54e7
来源:https://stackoverflow.com/questions/50136262/query-genes-within-regions