How do I robustly parse malformed CSV?

后端 未结 3 2072
盖世英雄少女心
盖世英雄少女心 2020-12-08 16:29

I'm processing data from government sources (FEC, state voter databases, etc.). It's inconsistently malformed, which breaks my CSV parser in all sorts of delightful ways.

相关标签:
3条回答
  • 2020-12-08 17:03

    I made an app to reformat CSV files: it doubles any stray double quotes inside fields and replaces the newlines inside fields with a literal string like '\n'.

    Once the data is inside the database, we can convert the '\n' placeholders back into real newlines.

    I needed to do this because the apps I had for processing CSV do not deal correctly with newlines inside fields.

    Feel free to use and change.

    In python:

    import sys
    
    def ProcessCSV(filename):
        """Rewrite a quoted CSV file so that every record fits on one line.

        Reads ``filename`` and writes the result to ``filename + '.out'``.
        Fields are expected to be wrapped in double quotes and separated by
        the three-character sequence quote-comma-quote.  For each field:

        * a line break inside the field is replaced by the two characters
          backslash and ``n``, so the record no longer spans several lines;
        * every double quote remaining inside the field is doubled.

        A last field that does not end with a double quote is treated as
        continuing on the next physical line.  Lines that are empty after
        the line ending is stripped produce no output.

        The code is valid on both Python 2.7 and Python 3.
        """
        separator = '","'
        filename2 = filename + '.out'
        # 'with' guarantees both files are closed even if an error occurs
        # halfway through the conversion.
        with open(filename, 'r') as file1, open(filename2, 'w') as file2:
            # Format the message explicitly; passing the names as extra print
            # arguments would leave the {0}/{1} placeholders unsubstituted.
            print('Reformatting {0} to {1}...'.format(filename, filename2))
            line1 = file1.readline()
            while len(line1) > 0:
                line1 = line1.rstrip('\r\n')
                line2 = ''
                count = 0
                lastField = (len(line1) == 0)
                while not lastField:
                    # Peel the next field off the front of the line.
                    lastField = (line1.find(separator) == -1)
                    field, _, line1 = line1.partition(separator)
                    count = count + 1
                    hasStart = False
                    hasEnd = False

                    if count == 1 and field[:1] == '"':
                        # Opening quote of the first field.
                        field = field[1:]
                        hasStart = True
                    elif count > 1:
                        # The opening quote was consumed with the separator.
                        hasStart = True

                    while True:
                        if lastField and field[-1:] == '"':
                            # Closing quote of the record's final field.
                            field = field[:-1]
                            hasEnd = True
                        elif not lastField:
                            # The closing quote was consumed with the separator.
                            hasEnd = True

                        if lastField and not hasEnd:
                            # The field is still open: it continues on the
                            # next physical line.
                            line1 = file1.readline()
                            if len(line1) == 0:
                                break  # end of file inside an open field
                            line1 = line1.rstrip('\r\n')
                            lastField = (line1.find(separator) == -1)
                            head, _, line1 = line1.partition(separator)
                            # Join with a literal backslash-n placeholder.
                            field = field + '\\n' + head
                        else:
                            break

                    field = field.replace('"', '""')

                    line2 = (line2
                             + (',' if count > 1 else '')
                             + ('"' if hasStart else '')
                             + field
                             + ('"' if hasEnd else ''))

                if len(line2) > 0:
                    file2.write(line2)
                    file2.write('\n')

                line1 = file1.readline()

        print('Done')
    
    def iif(st, v1, v2):
        """Return ``v1`` when ``st`` is truthy, otherwise ``v2``."""
        return v1 if st else v2
    
    # Script entry point: the input file is the first command-line argument.
    # Reading sys.argv[1] directly raises IndexError when no argument is
    # given, so fall back to an empty name and let the check below report it.
    filename = sys.argv[1] if len(sys.argv) > 1 else ''
    if len(filename) == 0:
        print('You must specify the input file')
    else:
        ProcessCSV(filename)
    

    In VB.net:

    Module Module1

    ' Entry point: takes the input file name from the command line and
    ' reformats it, or prints a usage message when no name was given.
    Sub Main()
        Dim FileName As String
        FileName = Command()
        If FileName.Length = 0 Then
            Console.WriteLine("You must specify the input file")
        Else
            ProcessCSV(FileName)
        End If
    End Sub

    ' Rewrites the quoted CSV file FileName into FileName & ".out" so that
    ' every record fits on one physical line.
    '
    ' Fields are expected to be wrapped in double quotes and separated by the
    ' three-character sequence quote-comma-quote. Inside each field, a line
    ' break is replaced by the two characters backslash and n, and every
    ' remaining double quote is doubled. A last field that does not end with
    ' a double quote is treated as continuing on the next line of the input.
    ' Input lines that are empty produce no output line.
    '
    ' Any runtime error is reported on the console; the files are closed in
    ' that case as well.
    Sub ProcessCSV(ByVal FileName As String)
        Dim File1 As Integer, File2 As Integer
        Dim Line1 As String, Line2 As String
        Dim Field As String, Count As Long
        Dim HasStart As Boolean, HasEnd As Boolean
        Dim FileName2 As String, LastField As Boolean
        On Error GoTo locError

        File1 = FreeFile()
        FileOpen(File1, FileName, OpenMode.Input, OpenAccess.Read)

        FileName2 = FileName & ".out"
        File2 = FreeFile()
        FileOpen(File2, FileName2, OpenMode.Output)

        Console.WriteLine("Reformatting {0} to {1}...", FileName, FileName2)

        Do Until EOF(File1)
            Line1 = LineInput(File1)
            ' Line2 accumulates the rewritten record; Count is the field index.
            Line2 = ""
            Count = 0
            LastField = (Len(Line1) = 0)
            Do Until LastField
                ' Peel the next field off the front of Line1.
                LastField = (InStr(Line1, """,""") = 0)
                Field = Strip(Line1, """,""")
                Count = Count + 1
                HasStart = False
                HasEnd = False
                ' The first field carries its own opening quote; for later
                ' fields the opening quote was consumed with the separator.
                If (Count = 1) And (Left$(Field, 1) = """") Then
                    Field = Mid$(Field, 2)
                    HasStart = True
                ElseIf Count > 1 Then
                    HasStart = True
                End If
                '
    locFinal:
                ' The final field carries its own closing quote; for earlier
                ' fields the closing quote was consumed with the separator.
                If (LastField) And (Right$(Field, 1) = """") Then
                    Field = Left$(Field, Len(Field) - 1)
                    HasEnd = True
                ElseIf Not LastField Then
                    HasEnd = True
                End If
                ' Field still open: it continues on the next input line, so
                ' append that line's first piece after a literal backslash-n
                ' placeholder and re-check for the closing quote.
                If LastField And Not HasEnd And Not EOF(File1) Then
                    Line1 = LineInput(File1)
                    LastField = (InStr(Line1, """,""") = 0)
                    Field = Field & "\n" & Strip(Line1, """,""")
                    GoTo locFinal
                End If
                ' Double every quote that remains inside the field.
                Field = Replace(Field, """", """""")
                ' Re-emit the field with its separator and surrounding quotes.
                Line2 = Line2 & IIf(Count > 1, ",", "") & IIf(HasStart, """", "") & Field & IIf(HasEnd, """", "")
            Loop
            ' Skip records that produced no text (empty input lines).
            If Len(Line2) > 0 Then
                PrintLine(File2, Line2)
            End If
        Loop

        FileClose(File1, File2)
        Console.WriteLine("Done")

        Exit Sub
    locError:
        Console.WriteLine("Error: " & Err.Description)
        ' Close whatever was opened with FileOpen so a failure halfway through
        ' does not leave the input or output file handle open.
        FileClose()
    End Sub

    ' Returns the part of Text that precedes the first occurrence of
    ' Separator and removes that part, together with the separator, from
    ' Text (which is passed ByRef). When Separator does not occur, the whole
    ' of Text is returned and Text becomes empty.
    Function Strip(ByRef Text As String, ByRef Separator As String) As String
        Dim nPos As Long
        nPos = InStr(Text, Separator)
        If nPos > 0 Then
            Strip = Left$(Text, nPos - 1)
            Text = Mid$(Text, nPos + Len(Separator))
        Else
            Strip = Text
            Text = ""
        End If
    End Function

    End Module
    
    0 讨论(0)
  • 2020-12-08 17:08

    First, here is a rather naive attempt: http://rubular.com/r/gvh3BJaNTc

    /"(.*?)"(?=[\r\n,]|$)|([^,"\s].*?)(?=[\r\n,]|$)/m
    

    The assumptions here are:

    • A field may start with quotes. In which case, it should end with a quote that is either:
      • before a comma
      • before a new line (if it is last field on its line)
      • before the end of the file (if it is last field on the last line)
    • Or, its first character is not a quote, so it contains characters until the same condition as before is met.

    This almost does what you want, but fails on these fields:

    1 comma and
    linebreaks"
    

    As TC had pointed out in the comments, your text is ambiguous. I'm sure you already know it, but for completeness:

    • "a" - is that a or "a"? How do you represent a value that you want to be wrapped in quotes?
    • "1","2" - might be parsed as 1,2, or as 1","2 - both are legal.
    • ,1 \n 2, - End of line, or newline in the value? You cannot tell, especially if this is supposed to be the last value of its line.
    • 1 \n 2 \n 3 - One value with newlines? Two values (1\n2,3 or 1,2\n3)? Three values?

    You may be able to get some clues if you examine the first value on each row, which, as you have said, should tell you the number of columns and their types - this can give you the additional information you are missing to parse the file (for example, if you know there should be another field in this line, then all newlines belong in the current value). Even then though, it looks like there are serious problems here...

    0 讨论(0)
  • 2020-12-08 17:15

    It is possible to subclass Ruby's File to process each line of the CSV file before it is passed to Ruby's CSV parser. For example, here's how I used this trick to replace non-standard backslash-escaped quotes \" with standard doubled double-quotes "":

    # File subclass that rewrites backslash-escaped quotes (\") as doubled
    # quotes ("") in everything returned by #gets.
    class MyFile < File
      # Reads via File#gets with the same arguments, fixes the escaped quotes
      # in place, and returns the result (nil is passed through untouched).
      def gets(*args)
        text = super
        text.gsub!('\\"', '""') unless text.nil?  # fix the \" that would otherwise cause a parse error
        text
      end
    end
    
    require 'csv'

    # Use the block form of open so the file is closed once all rows have
    # been read, even if processing a row raises.
    MyFile.open(filename) do |infile|
      incsv = CSV.new(infile)

      while row = incsv.shift
        # process each row here
      end
    end
    

    You could in principle do all sorts of additional processing, e.g. UTF-8 cleanups. The nice thing about this approach is you handle the file on a line by line basis, so you don't need to load it all into memory or create an intermediate file.

    0 讨论(0)
提交回复
热议问题