awk to find overlaps

后端 未结 1 1397
没有蜡笔的小新
没有蜡笔的小新 2021-01-26 06:09

I have a file with columns as shown below.

Group   Start        End
chr1    117132092    118875009
chr1    117027758    119458215
chr1    103756473    104864582


        
相关标签:
1条回答
  • 2021-01-26 06:39

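    Using GNU Awk version 4 or later (the script relies on arrays of arrays, which older versions and other awks do not support), you could try: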

    gawk -f a.awk file file
    

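    where a.awk is shown below (the input file is passed twice because the script makes two passes over it: the first pass loads every interval, the second pass compares each interval against the loaded ones):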

    # First pass (NR==FNR holds only while reading the first file argument):
    # remember every interval, keyed by group and a running index.
    #   a[group][i] = start column, b[group][i] = end column
    # The counter i is global and shared by all groups, so indices are unique
    # across the whole file.
    NR==FNR {
        # Line 1 is the column header; it carries no interval.
        if (FNR==1)
            next
        ++i
        a[$1][i]=$2
        b[$1][i]=$3
        # Never fall through to the second-pass rules during the first pass.
        next
    }
    # Header line of the second pass (the first-pass rule ends with `next`,
    # so this rule is never reached while the first file argument is read).
    # Defines the shared output format in the global fmt and prints the
    # column titles with it.
    FNR==1 {
        fmt="%-7s%-10s%-10s%-10s%-10s\n"
        printf(fmt, "Group", "Start", "End", "NewStart", "NewEnd")
    }
    # Second pass, data rows: for the current interval ($2..$3) in group $1,
    # examine every other interval of the same group that checkInside()
    # reports as touching it. A candidate is merged when the intersection
    # covers more than 50% of BOTH intervals; the merged result is the
    # smallest start / largest end over all merged candidates and is written
    # to columns 4 and 5. Without any merge, columns 4 and 5 repeat 2 and 3.
    #
    # Globals used: a, b (interval tables from the first pass), R (candidate
    # indices filled by checkInside), i1/i2 (set by getIntersect), r1/r2 (set
    # by getLargest), fmt (set by the header rule).
    FNR>1{
        # Default: the new interval is the original one.
        $4=$2; $5=$3
        n=checkInside($1,$2,$3)
        if (n>0) {
            ff=0; x=$2; y=$3
            curLen=$3-$2
            for (i=1; i<=n; i++) {
                ar=a[$1][R[i]]; br=b[$1][R[i]];
                othLen=br-ar
                # A zero-length interval has no defined overlap percentage, and
                # dividing by its length would abort gawk with a fatal
                # "division by zero" error, so such pairs are never merged.
                if (curLen==0 || othLen==0)
                    continue
                getIntersect($2,$3,ar,br)
                getLargest($2,$3,ar,br)
                # Overlap as a percentage of the current (ovl) and of the
                # other (ovr) interval.
                ovl=((i2-i1)/curLen)*100;
                ovr=((i2-i1)/othLen)*100;
                if (ovl>50 && ovr>50) {
                    # Grow the running union with this candidate.
                    if (r1<x) x=r1
                    if (r2>y) y=r2
                    ff=1
                }
            }
            if (ff) {
                $4=x; $5=y
            }
        }
        printf fmt,$1,$2,$3,$4,$5
    }
    
    # Compute the span that covers both [x1,y1] and [x2,y2].
    # Results are returned through globals:
    #   r1 = the smaller of the two starts
    #   r2 = the larger of the two ends
    function getLargest(x1,y1,x2,y2) {
        r1=x2
        if (x1<=x2)
            r1=x1
        r2=y2
        if (y1>=y2)
            r2=y1
    }
    
    # Compute the intersection of [x1,y1] and [x2,y2].
    # Results are returned through globals:
    #   i1 = start of the intersection (the larger of the two starts)
    #   i2 = end of the intersection (the smaller of the two ends)
    # When the intervals do not overlap, i2 is clamped to i1 so that the
    # intersection length (i2-i1) is 0. The previous version reported the
    # whole second interval as the intersection when the first interval lay
    # entirely after it, and a negative length when it lay entirely before.
    function getIntersect(x1,y1,x2,y2) {
        i1=(x1>=x2)?x1:x2
        i2=(y1<=y2)?y1:y2
        # Empty intersection: never report a negative or bogus length.
        if (i2<i1)
            i2=i1
    }
    
    # Collect the stored intervals of group g that the interval [x,y] reaches
    # into, i.e. those for which x or y lies within the stored start..end.
    # An interval with exactly the same start and end as [x,y] is ignored
    # (this excludes the row itself).
    #
    # The indices of the matches are stored in the global array R as
    # R[1], R[2], ...; the number of matches is returned (the return value is
    # unset, which compares equal to 0, when nothing matched).
    # i, j, x1 and y1 are local variables, not real parameters.
    function checkInside(g,x,y,i,j,x1,y1) {
        # Touch R with a dummy key so it exists as an array even with no match.
        R["x"]=0
        for (i in a[g]) {
            x1=a[g][i]
            y1=b[g][i]
            # Identical interval: skip.
            if (x==x1 && y==y1)
                continue
            if (x>=x1 && x<=y1)
                R[++j]=i
            else if (y>=x1 && y<=y1)
                R[++j]=i
        }
        return j
    }
    

    Output:

    Group  Start     End       NewStart  NewEnd    
    chr1   117132092 118875009 117027758 119458215 
    chr1   117027758 119458215 117027758 119458215 
    chr1   103756473 104864582 103354114 104864582 
    chr1   105093795 106219211 105093795 106219211 
    chr1   103354114 104747251 102741437 105235140 
    chr1   102741437 105235140 102741437 105235140 
    chr1   100090254 101094139 100090254 101614730 
    chr1   100426977 101614730 100090254 101614730 
    chr2   86644663  87767193  86644663  87767193  
    chr2   82473711  83636545  82473711  83636545  
    chr2   83896702  85079032  83876122  85091910  
    chr2   83876122  85091910  83876122  85091910  
    chr2   82943211  84350917  82943211  84350917  
    chr3   89410051  90485635  89405753  90485635  
    chr3   89405753  90485635  89405753  90485635  
    chr3   86491492  87593215  86491492  87593215  
    chr3   82507157  83738004  82507157  83738004  
    chr3   85059618  86362254  85059618  86362254  
    
    0 讨论(0)
提交回复
热议问题