Compare two text files by key and print the differing values in a bash shell script

前端未结

关注

 2  1387

Using a bash shell script: I have two large files (around 1.2 GB of data) containing keys and values. I need to compare the two files by key and store the differences in the values in a third file.

相关标签:

2条回答

一个人的身影

2021-01-29 14:56

Could you please try the following; it was written and tested with the shown samples (and assumes that the lines of your Input_file(s) do not start with spaces).

awk '
# Purpose: for every line of Input_file1 whose key ($1) also occurs in
# Input_file2, print the key followed by the values that Input_file2 does not
# list for that same key. Lines whose key is unknown to Input_file2 are printed
# unchanged. Input_file2 is read first and kept in memory.
BEGIN{
  OFS=";"                        # separator used when rebuilding value lists
}
# Runs for every line of both files: copy everything after the first space
# into "line" (the ;-separated value list) and drop its leading spaces.
{
  match($0,/ .*/)
  line=substr($0,RSTART,RLENGTH)
  sub(/^ +/,"",line)
}
# First file on the command line (Input_file2): remember its values per key.
# A key that appears on several lines accumulates all of its values.
FNR==NR{
  num=split(line,array,";")
  for(i=1;i<=num;i++){
    arrayfromFile2[$1]=(arrayfromFile2[$1]?arrayfromFile2[$1] OFS:"")array[i]
  }
  delete array
  next
}
# Second file (Input_file1), key also present in Input_file2.
($1 in arrayfromFile2){
  # Start from an empty lookup set for every key. Without this reset the
  # values seen under earlier keys stay in the set, so a value that
  # Input_file2 only lists under a different key would be reported as present.
  delete arrayChkFile2
  num=split(arrayfromFile2[$1],temparrayChkFile2,";")
  for(i=1;i<=num;i++){
    arrayChkFile2[temparrayChkFile2[i]]   # a bare reference creates the element
  }
  # Collect the values of this line that Input_file2 lacks for this key.
  num=split(line,array,";")
  for(i=1;i<=num;i++){
    if(!(array[i] in arrayChkFile2)){
       val=(val?val OFS:"")array[i]
    }
  }
  print $1" "val
  val=""
  next
}
# Key not present in Input_file2 at all: print the line as it is.
1
'  Input_file2  Input_file1

0 讨论(0)

佛祖请我去吃肉

2021-01-29 15:01

If your input files are too large to fit in memory then you could create a set of tag-value pairs from each tag-values line in each input file, e.g.:

$ awk 'BEGIN{FS=OFS=";"} {tag=$0; sub(/ [^; ]+(;.*|$)/,"",tag); sub(/[^;]+ /,""); for (i=1;i<=NF;i++) print tag, $i}' file2
test1;polo
test1;angus
test2;mike
test4;bob
test4;janet
1332240_44557576_CONTI Mazed & Micro kjd $353.50_30062020_lsdf3_some-rule;232324L
1332240_44557576_CONTI Mazed & Micro kjd $353.50_30062020_lsdf3_some-rule;343223432H

and then use standard UNIX tools like sort and comm to get the differences you want and then recombine with awk into the original tag-values. Here's how the whole thing could work:

$ cat tst.sh
#!/usr/bin/env bash

#######################################
# Explode "tag value1;value2;..." lines into one "tag;value" pair per line
# and sort the pairs.
# The tag may itself contain spaces; the value list is taken to start at the
# first space-preceded word that runs up to a ";" or the end of the line.
# Lines in which no such value list is found (for example a bare key with no
# values) are skipped; previously they were emitted as "line;line", which
# reported the key as one of its own values.
# Arguments: input files; reads stdin when none are given.
# Outputs:   sorted "tag;value" lines on stdout.
#######################################
separate() {
    awk '
        BEGIN { FS=OFS=";" }
        {
            tag = $0
            # Cut the value list off the copy so that only the tag remains.
            # sub() returns 0 when nothing matched: no value list, skip it.
            if (!sub(/ [^; ]+(;.*|$)/,"",tag)) {
                next
            }
            # Remove the tag and its trailing space from the record itself;
            # modifying $0 re-splits it, leaving one value per field.
            sub(/[^;]+ /,"")
            for (i=1; i<=NF; i++) {
                print tag, $i
            }
        }
    ' "${@:--}" | sort
}

#######################################
# Fold "tag;value" pairs back into "tag value1;value2;..." lines.
# Consecutive pairs with the same tag are merged into one output line, so the
# input is expected to have identical tags next to each other.
# Arguments: files of "tag;value" pairs; reads stdin when none are given.
# Outputs:   one line per run of identical tags on stdout; nothing at all
#            for empty input.
#######################################
combine() {
    awk '
        BEGIN { FS=OFS=";" }
        # A new tag starts on the first record and whenever the tag text
        # changes. Appending "" forces a string comparison: a plain
        # $1 != prev compares numerically when both sides look like numbers,
        # which merged tags such as 1 and 01 and never printed a leading
        # tag of 0 (equal to the still unset prev).
        NR == 1 || ($1 "") != prev {
            printf "%s%s", ors, $1      # ors is empty before the first tag
            prev = $1 ""
            ors = ORS
            ofs = " "                   # a space separates tag and first value
        }
        {
            printf "%s%s", ofs, $2
            ofs = OFS                   # later values are joined with ";"
        }
        END {
            printf "%s", ors            # final newline only if anything was printed
        }
    ' "${@:--}"
}

# comm needs both inputs ordered by the same collation it uses itself, and
# combine needs all pairs of one tag to be adjacent. In some locales the
# collation rules can interleave different tags, so force plain byte order
# for sort and comm alike (this is also faster on large inputs).
export LC_ALL=C
# Keep only the tag-value pairs that occur in the first file but not in the
# second, then regroup them per tag.
comm -23 <(separate "$1") <(separate "$2") | combine

$ ./tst.sh file1 file2
1332239_44557576_CONTI Lased & Micro kjd $353.50_30062020_lsdf3_no-rule 343323H;343343432H;343434311H;454656556H
1332240_44557576_CONTI Mazed & Micro kjd $353.50_30062020_lsdf3_some-rule 2226556H
test1 marco
test2 liza;zen
test3 alan;harry;tom
test4 june

If in the future you want to find the tag-value pairs that are in file2 but not in file1, or the pairs that are in both files, just change comm -23 to comm -13 or comm -12, respectively.

0 讨论(0)