How can I prune these column ranges given another file in R?

后端 未结 1 2009
时光说笑
时光说笑 2020-12-22 06:57

I have a data.frame1 like:

1    bin chrom chromStart  chromEnd    name score
2     12  chr1   29123222  29454711 -5.7648   599
3    116  chr1   45799118  45986770 -4.8403   473


        
相关标签:
1条回答
  • 2020-12-22 07:20

    +1 for suggesting IRanges::findOverlaps.

    Here's a solution using findOverlaps and GenomicRanges:

    library(GenomicRanges);
    
    # Example feature table (the question's data.frame1): six intervals on chr1,
    # each carrying a bin id, a numeric "name" value and a score.
    df1 <- data.frame(
      bin = c(12, 116, 117, 121, 133, 147),
      chrom = rep("chr1", 6),
      chromStart = c(
        29123222, 45799118, 46327104, 50780759, 63634657, 77825305
      ),
      chromEnd = c(
        29454711, 45986770, 46490961, 51008404, 63864734, 78062178
      ),
      name = c(-5.7648, -4.8403, -5.3036, -4.4165, -4.8096, -5.4671),
      score = c(599, 473, 536, 415, 469, 559)
    )
    
    # Example window table (the second file): five chr1 windows, each with a
    # count column N. Features in df1 are trimmed against these windows.
    df2 <- data.frame(
      chrom = rep("chr1", 5),
      chromStart = c(63600000, 45800000, 29100000, 50400000, 46500000),
      chromEnd = c(63700000, 45900000, 29400000, 50500000, 46600000),
      N = rep(1566, 5)
    )
    
    # Convert both data frames to GRanges objects. The chromosome becomes the
    # sequence name, start/end become the ranges, and every remaining column is
    # carried along as a metadata column.
    gr1 <- GRanges(
      seqnames = df1$chrom,
      ranges = IRanges(start = df1$chromStart, end = df1$chromEnd),
      bin = df1$bin,
      name = df1$name,
      score = df1$score
    )

    gr2 <- GRanges(
      seqnames = df2$chrom,
      ranges = IRanges(start = df2$chromStart, end = df2$chromEnd),
      N = df2$N
    )
    
    # Find every (gr1, gr2) pair of overlapping features. queryHits() indexes
    # into gr1 and subjectHits() into gr2, with one entry per overlapping pair.
    hits <- findOverlaps(query = gr1, subject = gr2)
    idx1 <- queryHits(hits)
    idx2 <- subjectHits(hits)

    # Line up each overlapping gr1 feature with the gr2 feature it hit
    gr <- gr1[idx1]
    gr2_hits <- gr2[idx2]

    # Keep only the intersecting part of each pair: the later of the two starts
    # and the earlier of the two ends. pmax()/pmin() are the vectorised form of
    # the former ifelse() comparisons and, unlike ifelse(), still return an
    # integer vector (not logical(0)) when there are no overlaps at all.
    start(gr) <- pmax(start(gr), start(gr2_hits))
    end(gr) <- pmin(end(gr), end(gr2_hits))

    print(gr)
    
    GRanges object with 3 ranges and 3 metadata columns:
          seqnames               ranges strand |       bin      name     score
             <Rle>            <IRanges>  <Rle> | <numeric> <numeric> <numeric>
      [1]     chr1 [29123222, 29400000]      * |        12   -5.7648       599
      [2]     chr1 [45800000, 45900000]      * |       116   -4.8403       473
      [3]     chr1 [63634657, 63700000]      * |       133   -4.8096       469
      -------
      seqinfo: 1 sequence from an unspecified genome; no seqlengths
    
    # Flatten the clipped GRanges back into a plain data frame with the same
    # column layout as df1, then show it.
    df <- data.frame(
      bin = mcols(gr)[["bin"]],
      chrom = seqnames(gr),
      chromStart = start(gr),
      chromEnd = end(gr),
      name = mcols(gr)[["name"]],
      score = mcols(gr)[["score"]]
    )
    print(df)
    
    0 讨论(0)
提交回复
热议问题