I would like to ask for some hints on how to merge rows that share the same ID into a single row with comma-separated values. Any hints in Perl, sed or awk are greatly appreciated.
This i
Using awk
Input
$ cat file
protein_id go_id
4102 GO:0003676
4125 GO:0003676
4125 GO:0008270
4139 GO:0008270
Output (if order doesn't matter)
$ awk 'FNR==1{print;next}{if($1 in M)M[$1]=M[$1]", "$2;else M[$1]=$2}END{for(k in M)print k,M[k]}' file
protein_id go_id
4139 GO:0008270
4102 GO:0003676
4125 GO:0003676, GO:0008270
More readable version
awk '
# Header line: pass it through untouched and skip the merge logic.
FNR == 1 {
    print
    next
}

# Data line: file the second column under its first-column key,
# joining repeated keys with ", ".
{
    if ($1 in merged)
        merged[$1] = merged[$1] ", " $2
    else
        merged[$1] = $2
}

# After the last line, emit one row per key.
# A for-in scan visits keys in an unspecified order.
END {
    for (key in merged)
        print key, merged[key]
}
' file
Output (if order is important)
$ awk 'FNR==1{print;next}{if($1 in V)V[$1]=V[$1]", "$2;else{K[++n]=$1;V[$1]=$2}}END{for(j=1;j<=n;j++)print K[j],V[K[j]]}' file
protein_id go_id
4102 GO:0003676
4125 GO:0003676, GO:0008270
4139 GO:0008270
More readable version
awk '
# Header line: pass it through untouched and skip the merge logic.
FNR == 1 {
    print
    next
}

# Data line: append to the list of a known key, or register a new key
# together with its arrival position so input order can be replayed.
{
    if ($1 in values) {
        values[$1] = values[$1] ", " $2
    } else {
        order[++count] = $1
        values[$1] = $2
    }
}

# After the last line, emit the keys in first-seen order.
END {
    for (pos = 1; pos <= count; pos++)
        print order[pos], values[order[pos]]
}
' file