Using VBA, I need to \"unpivot\" data that is currently in delimited text files (as hundreds of columns by tens of thousands of rows) into a normalized form (four columns by
I decided to build a tiny COM-aware wrapper around TextFieldParser
in VB.NET. Not ideal, but the best I can come up with at present.
Compiled but not tested
Sub UnpivotFile(sPath As String)
Const DELIM As String = ","
Const QUOTE As String = """"
Dim FSO As New FileSystemObject
Dim arrHeader
Dim arrContent
Dim lb As Integer, ub As Integer
Dim x As Integer
Dim inData As Boolean
Dim l As String, fName As String
Dim fIn As Scripting.TextStream
Dim fOut As Scripting.TextStream
Dim tmp As String
Dim lineNum As Long
fName = FSO.GetFileName(sPath)
Set fIn = FSO.OpenTextFile(sPath, ForReading)
Set fOut = FSO.OpenTextFile(sPath & "_out", ForWriting)
lineNum = 0
Do While Not fIn.AtEndOfStream
l = fIn.ReadLine
lineNum = lineNum + 1
arrContent = ParseLineToArray(l, DELIM, QUOTE)
If Not inData Then
arrHeader = arrContent
lb = LBound(arrHeader)
ub = UBound(arrHeader)
inData = True
Else
For x = lb To ub
fOut.WriteLine Join(Array(fName, lineNum, _
QID(arrHeader(x), DELIM, QUOTE), _
QID(arrContent(x), DELIM, QUOTE)), DELIM)
Next x
End If
Loop
fIn.Close
fOut.Close
End Sub
'quote if delimiter found
Function QID(s, d As String, q As String)
QID = IIf(InStr(s, d) > -1, q & s & q, s)
End Function
'Split a string into an array based on a Delimiter and a Text Identifier
Private Function ParseLineToArray(sInput As String, m_Delim As String, _
m_TextIdentifier As String) As Variant
'Dim vArr As Variant
Dim sArr() As String
Dim bInText As Boolean
Dim i As Long, n As Long
Dim sTemp As String, tmp As String
If sInput = "" Or InStr(1, sInput, m_Delim) = 0 Then
'zero length string, or delimiter not present
'dump all input into single-element array (minus Text Identifier)
ReDim sArr(0)
sArr(0) = Replace(sInput, m_TextIdentifier, "")
ParseLineToArray = sArr()
Else
If InStr(1, sInput, m_TextIdentifier) = 0 Then
'no text identifier so just split and return
sArr() = Split(sInput, m_Delim)
ParseLineToArray = sArr()
Else
'found the text identifier, so do it the long way
bInText = False
sTemp = ""
n = 0
For i = 1 To Len(sInput)
tmp = Mid(sInput, i, 1)
If tmp = m_TextIdentifier Then
'just toggle the flag - don't add to string
bInText = Not bInText
Else
If tmp = m_Delim Then
If Not bInText Then
'delimiter not within quoted text, so add next array member
ReDim Preserve sArr(n)
sArr(n) = sTemp
sTemp = ""
n = n + 1
Else
sTemp = sTemp & tmp
End If
Else
sTemp = sTemp & tmp
End If 'character is a delimiter
End If 'character is a quote marker
Next i
ReDim Preserve sArr(n)
sArr(n) = sTemp
ParseLineToArray = sArr()
End If 'has any quoted text
End If 'parseable
End Function
I have personally used CSV Reader in the past to parse huge CSV files (up to 1 GB). The performance and simplicity is incredible. I highly suggest that you use it.
Since you say you used VB.NET, I suggest that you build a simple console application that references CSV Reader. This console application would take as a command line argument the path to a csv file to "unpivot". Then, from VBA, you could use VBA.Shell to run your console application and give it the path to the CSV file as an argument.
This should be fast enough (it takes 8 secs on a 18MB file on my machine, but I only copy data, I don't restructure it - if you don't do calculations but only reorder stuff you should get the same kind of performance). It works even if the number of rows / columns would not fit in a spreadsheet.
TODO: it is a bit long but you should be able to (a) copy paste it (b) change the file names and (c) amend the manipulateData function to suit your needs. The rest of the code is a bunch of reusable utilities functions that you should not need to change.
I am not sure you can get much faster using VBA - if you need faster, you should consider an alternative language. Typically, the same code in Java or C# would much shorter because they already have standard libraries to read / write files etc. and would be faster too.
Option Explicit
Public Sub doIt()
Dim sourceFile As String
Dim destinationFile As String
Dim data As Variant
Dim result As Variant
sourceFile = "xxxxxxx"
destinationFile = "xxxxxxx"
data = getDataFromFile(sourceFile, ",")
If Not isArrayEmpty(data) Then
result = manipulateData(data)
writeToCsv result, destinationFile, ","
Else
MsgBox ("Empty file")
End If
End Sub
Function manipulateData(sourceData As Variant) As Variant
Dim result As Variant
Dim i As Long
Dim j As Long
Dim k As Long
Dim m As Long
'redim the result array to the right size - here I only copy so same size as source
ReDim result(1 To UBound(sourceData, 1), 1 To UBound(sourceData, 2)) As Variant
For i = LBound(sourceData, 1) To UBound(sourceData, 1)
For j = LBound(sourceData, 2) To UBound(sourceData, 2)
k = i 'k to be defined - here I only copy data
m = j 'm to be defined - here I only copy data
result(k, m) = sourceData(i, j)
Next j
Next i
manipulateData = result
End Function
Public Sub writeToCsv(parData As Variant, parFileName As String, parDelimiter As String)
If getArrayNumberOfDimensions(parData) <> 2 Then Exit Sub
Dim i As Long
Dim j As Long
Dim fileNum As Long
Dim locLine As String
Dim locCsvString As String
fileNum = FreeFile
If Dir(parFileName) <> "" Then Kill (parFileName)
Open parFileName For Binary Lock Read Write As #fileNum
For i = LBound(parData, 1) To UBound(parData, 1)
locLine = ""
For j = LBound(parData, 2) To UBound(parData, 2)
If IsError(parData(i, j)) Then
locLine = locLine & "#N/A" & parDelimiter
Else
locLine = locLine & parData(i, j) & parDelimiter
End If
Next j
locLine = Left(locLine, Len(locLine) - 1)
If i <> UBound(parData, 1) Then locLine = locLine & vbCrLf
Put #fileNum, , locLine
Next i
error_handler:
Close #fileNum
End Sub
Public Function isArrayEmpty(parArray As Variant) As Boolean
'Returns false if not an array or dynamic array that has not been initialised (ReDim) or has been erased (Erase)
If IsArray(parArray) = False Then isArrayEmpty = True
On Error Resume Next
If UBound(parArray) < LBound(parArray) Then isArrayEmpty = True: Exit Function Else: isArrayEmpty = False
End Function
Public Function getArrayNumberOfDimensions(parArray As Variant) As Long
'Returns the number of dimension of an array - 0 for an empty array.
Dim i As Long
Dim errorCheck As Long
If isArrayEmpty(parArray) Then Exit Function 'returns 0
On Error GoTo FinalDimension
'Visual Basic for Applications arrays can have up to 60000 dimensions
For i = 1 To 60001
errorCheck = LBound(parArray, i)
Next i
'Not supposed to happen
getArrayNumberOfDimensions = 0
Exit Function
FinalDimension:
getArrayNumberOfDimensions = i - 1
End Function
Private Function getDataFromFile(parFileName As String, parDelimiter As String, Optional parExcludeCharacter As String = "") As Variant
'parFileName is supposed to be a delimited file (csv...)
'parDelimiter is the delimiter, "," for example in a comma delimited file
'Returns an empty array if file is empty or can't be opened
'number of columns based on the line with the largest number of columns, not on the first line
'parExcludeCharacter: sometimes csv files have quotes around strings: "XXX" - if parExcludeCharacter = """" then removes the quotes
Dim locLinesList() As Variant
Dim locData As Variant
Dim i As Long
Dim j As Long
Dim locNumRows As Long
Dim locNumCols As Long
Dim fso As Variant
Dim ts As Variant
Const REDIM_STEP = 10000
Set fso = CreateObject("Scripting.FileSystemObject")
On Error GoTo error_open_file
Set ts = fso.OpenTextFile(parFileName)
On Error GoTo unhandled_error
'Counts the number of lines and the largest number of columns
ReDim locLinesList(1 To 1) As Variant
i = 0
Do While Not ts.AtEndOfStream
If i Mod REDIM_STEP = 0 Then
ReDim Preserve locLinesList(1 To UBound(locLinesList, 1) + REDIM_STEP) As Variant
End If
locLinesList(i + 1) = Split(ts.ReadLine, parDelimiter)
j = UBound(locLinesList(i + 1), 1) 'number of columns
If locNumCols < j Then locNumCols = j
If j = 13 Then
j = j
End If
i = i + 1
Loop
ts.Close
locNumRows = i
If locNumRows = 0 Then Exit Function 'Empty file
ReDim locData(1 To locNumRows, 1 To locNumCols + 1) As Variant
'Copies the file into an array
If parExcludeCharacter <> "" Then
For i = 1 To locNumRows
For j = 0 To UBound(locLinesList(i), 1)
If Left(locLinesList(i)(j), 1) = parExcludeCharacter Then
If Right(locLinesList(i)(j), 1) = parExcludeCharacter Then
locLinesList(i)(j) = Mid(locLinesList(i)(j), 2, Len(locLinesList(i)(j)) - 2) 'If locTempArray = "", Mid returns ""
Else
locLinesList(i)(j) = Right(locLinesList(i)(j), Len(locLinesList(i)(j)) - 1)
End If
ElseIf Right(locLinesList(i)(j), 1) = parExcludeCharacter Then
locLinesList(i)(j) = Left(locLinesList(i)(j), Len(locLinesList(i)(j)) - 1)
End If
locData(i, j + 1) = locLinesList(i)(j)
Next j
Next i
Else
For i = 1 To locNumRows
For j = 0 To UBound(locLinesList(i), 1)
locData(i, j + 1) = locLinesList(i)(j)
Next j
Next i
End If
getDataFromFile = locData
Exit Function
error_open_file: 'returns empty variant
unhandled_error: 'returns empty variant
End Function