Handle large delimited text files in VBA

后端 未结 4 2046
醉酒成梦
醉酒成梦 2021-01-16 15:33

Using VBA, I need to \"unpivot\" data that is currently in delimited text files (as hundreds of columns by tens of thousands of rows) into a normalized form (four columns by

相关标签:
4条回答
  • 2021-01-16 16:13

    I decided to build a tiny COM-aware wrapper around TextFieldParser in VB.NET. Not ideal, but the best I can come up with at present.

    0 讨论(0)
  • 2021-01-16 16:27

    Compiled but not tested

    Sub UnpivotFile(sPath As String)
    
        Const DELIM As String = ","
        Const QUOTE As String = """"
    
        Dim FSO As New FileSystemObject
        Dim arrHeader
        Dim arrContent
        Dim lb As Integer, ub As Integer
        Dim x As Integer
        Dim inData As Boolean
        Dim l As String, fName As String
        Dim fIn As Scripting.TextStream
        Dim fOut As Scripting.TextStream
        Dim tmp As String
        Dim lineNum As Long
    
        fName = FSO.GetFileName(sPath)
    
        Set fIn = FSO.OpenTextFile(sPath, ForReading)
        Set fOut = FSO.OpenTextFile(sPath & "_out", ForWriting)
        lineNum = 0
    
        Do While Not fIn.AtEndOfStream
    
            l = fIn.ReadLine
            lineNum = lineNum + 1
            arrContent = ParseLineToArray(l, DELIM, QUOTE)
    
            If Not inData Then
                arrHeader = arrContent
                lb = LBound(arrHeader)
                ub = UBound(arrHeader)
                inData = True
            Else
                For x = lb To ub
                    fOut.WriteLine Join(Array(fName, lineNum, _
                                   QID(arrHeader(x), DELIM, QUOTE), _
                                   QID(arrContent(x), DELIM, QUOTE)), DELIM)
    
                Next x
            End If
        Loop
        fIn.Close
        fOut.Close
    End Sub
    
    'quote if delimiter found
    Function QID(s, d As String, q As String)
        QID = IIf(InStr(s, d) > -1, q & s & q, s)
    End Function
    
    
    'Split a string into an array based on a Delimiter and a Text Identifier
    Private Function ParseLineToArray(sInput As String, m_Delim As String, _
                                      m_TextIdentifier As String) As Variant
       'Dim vArr As Variant
       Dim sArr() As String
       Dim bInText As Boolean
       Dim i As Long, n As Long
       Dim sTemp As String, tmp As String
    
       If sInput = "" Or InStr(1, sInput, m_Delim) = 0 Then
          'zero length string, or delimiter not present
          'dump all input into single-element array (minus Text Identifier)
          ReDim sArr(0)
          sArr(0) = Replace(sInput, m_TextIdentifier, "")
          ParseLineToArray = sArr()
       Else
          If InStr(1, sInput, m_TextIdentifier) = 0 Then
             'no text identifier so just split and return
             sArr() = Split(sInput, m_Delim)
             ParseLineToArray = sArr()
          Else
             'found the text identifier, so do it the long way
             bInText = False
             sTemp = ""
             n = 0
    
             For i = 1 To Len(sInput)
                tmp = Mid(sInput, i, 1)
                If tmp = m_TextIdentifier Then
                   'just toggle the flag - don't add to string
                   bInText = Not bInText
                Else
                   If tmp = m_Delim Then
                      If Not bInText Then
                         'delimiter not within quoted text, so add next array member
                         ReDim Preserve sArr(n)
                         sArr(n) = sTemp
                         sTemp = ""
                         n = n + 1
                      Else
                         sTemp = sTemp & tmp
                      End If
                   Else
                      sTemp = sTemp & tmp
                   End If           'character is a delimiter
                End If              'character is a quote marker
             Next i
    
             ReDim Preserve sArr(n)
             sArr(n) = sTemp
    
             ParseLineToArray = sArr()
          End If 'has any quoted text
       End If 'parseable
    
    End Function
    
    0 讨论(0)
  • 2021-01-16 16:27

    I have personally used CSV Reader in the past to parse huge CSV files (up to 1 GB). The performance and simplicity is incredible. I highly suggest that you use it.

    Since you say you used VB.NET, I suggest that you build a simple console application that references CSV Reader. This console application would take as a command line argument the path to a csv file to "unpivot". Then, from VBA, you could use VBA.Shell to run your console application and give it the path to the CSV file as an argument.

    0 讨论(0)
  • 2021-01-16 16:39

    This should be fast enough (it takes 8 secs on a 18MB file on my machine, but I only copy data, I don't restructure it - if you don't do calculations but only reorder stuff you should get the same kind of performance). It works even if the number of rows / columns would not fit in a spreadsheet.

    TODO: it is a bit long but you should be able to (a) copy paste it (b) change the file names and (c) amend the manipulateData function to suit your needs. The rest of the code is a bunch of reusable utilities functions that you should not need to change.

    I am not sure you can get much faster using VBA - if you need faster, you should consider an alternative language. Typically, the same code in Java or C# would much shorter because they already have standard libraries to read / write files etc. and would be faster too.

    Option Explicit
    
    Public Sub doIt()
        Dim sourceFile As String
        Dim destinationFile As String
        Dim data As Variant
        Dim result As Variant
    
        sourceFile = "xxxxxxx"
        destinationFile = "xxxxxxx"
    
        data = getDataFromFile(sourceFile, ",")
        If Not isArrayEmpty(data) Then
           result = manipulateData(data)
           writeToCsv result, destinationFile, ","
        Else
           MsgBox ("Empty file")
        End If
    End Sub
    
    Function manipulateData(sourceData As Variant) As Variant
        Dim result As Variant
        Dim i As Long
        Dim j As Long
        Dim k As Long
        Dim m As Long
    
        'redim the result array to the right size - here I only copy so same size as source
        ReDim result(1 To UBound(sourceData, 1), 1 To UBound(sourceData, 2)) As Variant
    
        For i = LBound(sourceData, 1) To UBound(sourceData, 1)
            For j = LBound(sourceData, 2) To UBound(sourceData, 2)
                k = i 'k to be defined - here I only copy data
                m = j 'm to be defined - here I only copy data
                result(k, m) = sourceData(i, j)
            Next j
        Next i
    
        manipulateData = result
    End Function
    
    Public Sub writeToCsv(parData As Variant, parFileName As String, parDelimiter As String)
    
        If getArrayNumberOfDimensions(parData) <> 2 Then Exit Sub
    
        Dim i As Long
        Dim j As Long
        Dim fileNum As Long
        Dim locLine As String
        Dim locCsvString As String
    
        fileNum = FreeFile
        If Dir(parFileName) <> "" Then Kill (parFileName)
        Open parFileName For Binary Lock Read Write As #fileNum
    
        For i = LBound(parData, 1) To UBound(parData, 1)
          locLine = ""
          For j = LBound(parData, 2) To UBound(parData, 2)
            If IsError(parData(i, j)) Then
              locLine = locLine & "#N/A" & parDelimiter
            Else
              locLine = locLine & parData(i, j) & parDelimiter
            End If
          Next j
          locLine = Left(locLine, Len(locLine) - 1)
          If i <> UBound(parData, 1) Then locLine = locLine & vbCrLf
          Put #fileNum, , locLine
        Next i
    
    error_handler:
        Close #fileNum
    
    End Sub
    
    Public Function isArrayEmpty(parArray As Variant) As Boolean
    'Returns false if not an array or dynamic array that has not been initialised (ReDim) or has been erased (Erase)
    
      If IsArray(parArray) = False Then isArrayEmpty = True
      On Error Resume Next
      If UBound(parArray) < LBound(parArray) Then isArrayEmpty = True: Exit Function Else: isArrayEmpty = False
    
    End Function
    
    Public Function getArrayNumberOfDimensions(parArray As Variant) As Long
    'Returns the number of dimension of an array - 0 for an empty array.
    
        Dim i As Long
        Dim errorCheck As Long
    
        If isArrayEmpty(parArray) Then Exit Function 'returns 0
    
        On Error GoTo FinalDimension
        'Visual Basic for Applications arrays can have up to 60000 dimensions
        For i = 1 To 60001
            errorCheck = LBound(parArray, i)
        Next i
    
        'Not supposed to happen
        getArrayNumberOfDimensions = 0
        Exit Function
    
    FinalDimension:
        getArrayNumberOfDimensions = i - 1
    
    End Function
    
    Private Function getDataFromFile(parFileName As String, parDelimiter As String, Optional parExcludeCharacter As String = "") As Variant
    'parFileName is supposed to be a delimited file (csv...)
    'parDelimiter is the delimiter, "," for example in a comma delimited file
    'Returns an empty array if file is empty or can't be opened
    'number of columns based on the line with the largest number of columns, not on the first line
    'parExcludeCharacter: sometimes csv files have quotes around strings: "XXX" - if parExcludeCharacter = """" then removes the quotes
    
    
      Dim locLinesList() As Variant
      Dim locData As Variant
      Dim i As Long
      Dim j As Long
      Dim locNumRows As Long
      Dim locNumCols As Long
      Dim fso As Variant
      Dim ts As Variant
      Const REDIM_STEP = 10000
    
      Set fso = CreateObject("Scripting.FileSystemObject")
    
      On Error GoTo error_open_file
      Set ts = fso.OpenTextFile(parFileName)
      On Error GoTo unhandled_error
    
      'Counts the number of lines and the largest number of columns
      ReDim locLinesList(1 To 1) As Variant
      i = 0
      Do While Not ts.AtEndOfStream
        If i Mod REDIM_STEP = 0 Then
          ReDim Preserve locLinesList(1 To UBound(locLinesList, 1) + REDIM_STEP) As Variant
        End If
        locLinesList(i + 1) = Split(ts.ReadLine, parDelimiter)
        j = UBound(locLinesList(i + 1), 1) 'number of columns
        If locNumCols < j Then locNumCols = j
        If j = 13 Then
          j = j
        End If
        i = i + 1
      Loop
    
      ts.Close
    
      locNumRows = i
    
      If locNumRows = 0 Then Exit Function 'Empty file
    
      ReDim locData(1 To locNumRows, 1 To locNumCols + 1) As Variant
    
      'Copies the file into an array
      If parExcludeCharacter <> "" Then
    
        For i = 1 To locNumRows
          For j = 0 To UBound(locLinesList(i), 1)
            If Left(locLinesList(i)(j), 1) = parExcludeCharacter Then
              If Right(locLinesList(i)(j), 1) = parExcludeCharacter Then
                locLinesList(i)(j) = Mid(locLinesList(i)(j), 2, Len(locLinesList(i)(j)) - 2)       'If locTempArray = "", Mid returns ""
              Else
                locLinesList(i)(j) = Right(locLinesList(i)(j), Len(locLinesList(i)(j)) - 1)
              End If
            ElseIf Right(locLinesList(i)(j), 1) = parExcludeCharacter Then
              locLinesList(i)(j) = Left(locLinesList(i)(j), Len(locLinesList(i)(j)) - 1)
            End If
            locData(i, j + 1) = locLinesList(i)(j)
          Next j
        Next i
    
      Else
    
        For i = 1 To locNumRows
          For j = 0 To UBound(locLinesList(i), 1)
            locData(i, j + 1) = locLinesList(i)(j)
          Next j
        Next i
    
      End If
    
      getDataFromFile = locData
    
      Exit Function
    
    error_open_file:                 'returns empty variant
    unhandled_error:                 'returns empty variant
    
    End Function
    
    0 讨论(0)
提交回复
热议问题