I have a file with columns as shown below.
Group Start End
chr1 117132092 118875009
chr1 117027758 119458215
chr1 103756473 104864582
Using GNU Awk version 4 or later (the script relies on arrays of arrays), you could try:
gawk -f a.awk file file
where a.awk is shown below (the input file is given twice because the script makes two passes over it):
# Pass 1 (first reading of the file): remember every interval.
# a[group][k] holds the start and b[group][k] the end of the k-th data row;
# k is a single running counter shared by all groups.
NR == FNR {
    if (FNR != 1) {          # row 1 is the column header, not data
        i++
        a[$1][i] = $2
        b[$1][i] = $3
    }
    next                     # nothing below runs during pass 1
}
# Pass 2, header row: define the row layout shared with the data rows
# (global fmt) and print the column titles.
FNR == 1 {
    fmt = "%-7s%-10s%-10s%-10s%-10s\n"
    printf(fmt, "Group", "Start", "End", "NewStart", "NewEnd")
}
# Pass 2, data rows: print each interval together with its merged interval.
#
# For the current row, checkInside() lists (in R[1..n]) the stored intervals
# of the same group that the row has an endpoint inside.  Every candidate whose
# intersection covers more than 50% of BOTH the current interval and the
# candidate (reciprocal overlap) widens the new interval to the union of the
# two.  Rows with no qualifying candidate keep their own Start/End.
#
# Globals read:    a, b (pass-1 data), R (from checkInside), i1/i2 (from
#                  getIntersect), r1/r2 (from getLargest), fmt (header rule).
FNR > 1 {
    # Default: the new interval is the row's own interval.
    $4 = $2
    $5 = $3

    n = checkInside($1, $2, $3)
    len = $3 - $2            # loop-invariant length of the current interval

    # A zero-length (or reversed) interval has no meaningful percentage
    # overlap; skip the merge instead of dividing by zero (fatal in gawk).
    if (n > 0 && len > 0) {
        merged = 0
        x = $2
        y = $3
        # k, not i: leave the pass-1 row counter untouched.
        for (k = 1; k <= n; k++) {
            ar = a[$1][R[k]]
            br = b[$1][R[k]]
            # Same guard for the candidate, which is the other divisor below.
            if (br - ar <= 0)
                continue
            getIntersect($2, $3, ar, br)   # sets i1, i2
            getLargest($2, $3, ar, br)     # sets r1, r2
            ovl = ((i2 - i1) / len) * 100          # % of current interval covered
            ovr = ((i2 - i1) / (br - ar)) * 100    # % of candidate covered
            if (ovl > 50 && ovr > 50) {
                if (r1 < x) x = r1
                if (r2 > y) y = r2
                merged = 1
            }
        }
        if (merged) {
            $4 = x
            $5 = y
        }
    }
    printf fmt, $1, $2, $3, $4, $5
}
# Compute the smallest span enclosing intervals [x1,y1] and [x2,y2].
# Results are returned through the globals r1 (lower of the two starts)
# and r2 (higher of the two ends).
function getLargest(x1, y1, x2, y2) {
    if (x1 <= x2)
        r1 = x1
    else
        r1 = x2

    if (y1 >= y2)
        r2 = y1
    else
        r2 = y2
}
# Compute the intersection of intervals [x1,y1] and [x2,y2].
# Results are returned through the globals i1 (start) and i2 (end):
#   i1 is x1 when x1 lies within [x2,y2], otherwise x2;
#   i2 is the smaller of the two ends.
function getIntersect(x1, y1, x2, y2) {
    i1 = (x1 < x2 || x1 > y2) ? x2 : x1

    if (y1 <= y2)
        i2 = y1
    else
        i2 = y2
}
# Collect the stored intervals of group g that interval [x,y] has an endpoint
# inside, ignoring any stored interval identical to [x,y].
#
# The matching row indices (keys of a[g]/b[g]) are written to the global
# array R as R[1..count]; the count is returned (left unset when nothing
# matches).  i, j, x1 and y1 are locals.
function checkInside(g, x, y, i, j, x1, y1) {
    R["x"] = 0
    for (i in a[g]) {
        x1 = a[g][i]
        y1 = b[g][i]
        # An exact duplicate of the query interval is not a candidate.
        if (x == x1 && y == y1)
            continue
        # Keep it when either endpoint of [x,y] falls within [x1,y1].
        if ((x1 <= x && x <= y1) || (x1 <= y && y <= y1))
            R[++j] = i
    }
    return j
}
Output:
Group Start End NewStart NewEnd
chr1 117132092 118875009 117027758 119458215
chr1 117027758 119458215 117027758 119458215
chr1 103756473 104864582 103354114 104864582
chr1 105093795 106219211 105093795 106219211
chr1 103354114 104747251 102741437 105235140
chr1 102741437 105235140 102741437 105235140
chr1 100090254 101094139 100090254 101614730
chr1 100426977 101614730 100090254 101614730
chr2 86644663 87767193 86644663 87767193
chr2 82473711 83636545 82473711 83636545
chr2 83896702 85079032 83876122 85091910
chr2 83876122 85091910 83876122 85091910
chr2 82943211 84350917 82943211 84350917
chr3 89410051 90485635 89405753 90485635
chr3 89405753 90485635 89405753 90485635
chr3 86491492 87593215 86491492 87593215
chr3 82507157 83738004 82507157 83738004
chr3 85059618 86362254 85059618 86362254