'In a code module:
'Pairs a folder with a file name so both can be passed around as a single value.
'The reader class below passes strFolder / strFile to RandomAccessFile.Initialize and File.OpenInput.
Type tFolderAndFile(strFolder As String, _
strFile As String)
'Class code:
'State shared by all Subs of this buffered text / CSV reader.
Sub Class_Globals
Private mtFF As tFolderAndFile 'folder + file being read (set in Initialize, reused by InitRAF and ReadCSVList)
Private bFinalLineDone As Boolean 'so calling Sub can stop further processing of the file
Private mstrCharSet As String 'charset name passed to BytesToString when decoding a line
Private mbtEndOfLineByte As Byte 'byte that terminates a line when it is outside quotes
Private mbtEncloserByte As Byte 'byte that toggles the "inside quotes" state in ReadLine
Private mstrSeparatorChar As String 'field separator; detected from the file by ReadCSVList when still empty
Private mstrEncloser As String 'string form of the encloser byte; only assigned when that byte is not -1
Private bFirstArrayDone As Boolean 'True once ReadLine has loaded the first buffer
Private lFilePos As Long 'file offset from which the next buffer is read
Private lArrayPos As Long 'index in arrBytes where the next line starts
Private bNoEndOfLineFound As Boolean 'set when a scan reached the end of the buffer without finding a line break
Private bFinalArrayDone As Boolean 'checked by ReadLine; never set to True in the code shown here
Private mlMaxBytes As Long 'upper limit for the buffer size
Private RAF As RandomAccessFile 'file handle used for all byte reads
Private lTotalBytes As Long 'cached RAF.Size
Private iBytes As Int 'bytes to request for the next buffer, then the number actually read
Private arrBytes() As Byte 'current buffer
Private strLine As String 'most recently read line
Private miCols As Int 'number of CSV columns; 0 means "take it from the first non-empty line"
End Sub
'Initializes the reader and opens the file read-only.
'tFF:             folder and file name of the text file
'strCharset:      charset name used to decode the bytes of each line
'btEndOfLineByte: byte that ends a line (only when outside enclosers)
'btEncloserByte:  byte that encloses fields, pass -1 if the file has no enclosers
'lMaxBytes:       maximum size of the read buffer
'iColumns:        number of CSV columns, pass -1 to have it taken from the first non-empty line
Public Sub Initialize(tFF As tFolderAndFile, strCharset As String, btEndOfLineByte As Byte, btEncloserByte As Byte, lMaxBytes As Long, iColumns As Int)
	mtFF = tFF
	mstrCharSet = strCharset
	mbtEndOfLineByte = btEndOfLineByte
	mbtEncloserByte = btEncloserByte
	If btEncloserByte <> -1 Then
		mstrEncloser = Chr(btEncloserByte)
	Else
		'clear any encloser left over from an earlier Initialize on the same instance
		mstrEncloser = ""
	End If
	mlMaxBytes = lMaxBytes
	RAF.Initialize(tFF.strFolder, tFF.strFile, True)
	If iColumns = -1 Then
		miCols = 0
	Else
		'this is useful if the .csv has trailing commas in every row (as often in Emis files)
		miCols = iColumns
	End If
	lTotalBytes = RAF.Size 'using this in loops rather than RAF.Size speeds up substantially
	'Rewind all read state, so that initializing again without calling Close starts cleanly.
	'bFirstArrayDone must be reset as well, otherwise ReadLine would not load the first buffer.
	lFilePos = 0
	lArrayPos = 0
	bFirstArrayDone = False
	bNoEndOfLineFound = False
	bFinalArrayDone = False
	bFinalLineDone = False
	mstrSeparatorChar = ""
	iBytes = Min(lTotalBytes, mlMaxBytes)
End Sub
'Re-opens the file stored in mtFF read-only and refreshes the cached file size and the buffer size.
'Only RAF, lTotalBytes and iBytes are touched; positions and flags are left as they are.
Public Sub InitRAF
RAF.Initialize(mtFF.strFolder, mtFF.strFile, True)
lTotalBytes = RAF.Size
iBytes = Min(lTotalBytes, mlMaxBytes) 'never request more than the file holds or the buffer limit allows
End Sub
'Closes the file and puts the reader back into its "nothing read yet" state.
'The column count and the detected separator are forgotten as well.
Public Sub Close
	RAF.Close
	Dim arrBytes() As Byte 'clear some memory?
	'flags
	bFinalLineDone = False
	bFinalArrayDone = False
	bNoEndOfLineFound = False
	bFirstArrayDone = False
	'positions
	lArrayPos = 0
	lFilePos = 0
	'CSV settings that were derived from the file
	mstrSeparatorChar = ""
	miCols = 0
End Sub
'Returns the next line of the file, without its line break (a CR directly before the end-of-line byte is dropped too).
'End-of-line bytes inside enclosers do not end a line.
'The file is read in buffers of at most mlMaxBytes bytes; a line that is cut off by the end of the buffer
'is re-read from its first byte with the next buffer.
'bFinalLineDone is set to True when the remainder of the file (which may be empty) has been returned.
'Once that has happened every further call returns an empty string.
Public Sub ReadLine As String 'ignore
	Dim i As Long
	Dim iLen As Int
	Dim bInsideQuotes As Boolean = False
	Do While bFinalLineDone = False
		If bFinalArrayDone = False Then
			If bFirstArrayDone = False Then
				'first byte array to process
				Dim arrBytes(iBytes) As Byte
				iBytes = RAF.ReadBytes(arrBytes, 0, iBytes, 0)
				bNoEndOfLineFound = False
				bFirstArrayDone = True
			Else If bNoEndOfLineFound Then
				'the previous buffer ended in the middle of a line: load a new buffer starting at that line
				Dim arrBytes(iBytes) As Byte
				iBytes = RAF.ReadBytes(arrBytes, 0, iBytes, lFilePos)
				lArrayPos = 0
				bNoEndOfLineFound = False
			End If
		End If 'If bFinalArrayDone = False
		'every scan starts at the beginning of a line, so always outside quotes
		bInsideQuotes = False
		For i = lArrayPos To iBytes - 1
			If arrBytes(i) = mbtEncloserByte Then
				bInsideQuotes = Not(bInsideQuotes)
			End If
			If bInsideQuotes = False And arrBytes(i) = mbtEndOfLineByte Then
				iLen = i - lArrayPos
				'Drop the CR of a CR/LF pair. Only look back when the line holds at least one byte:
				'this keeps an empty line at buffer index 0 and prevents a negative length.
				If iLen > 0 And arrBytes(i - 1) = 13 Then iLen = iLen - 1
				strLine = BytesToString(arrBytes, lArrayPos, iLen, mstrCharSet)
				lArrayPos = i + 1 'to get ready for reading the next line, always starting just after a line-break
				Return strLine
			End If
		Next
		bNoEndOfLineFound = True 'means we need a new byte array
		If iBytes < mlMaxBytes Then
			'The buffer was not filled completely, so this was the last part of the file.
			'Whatever is left is the final line, also if a quote was opened and never closed.
			'this is a Public variable that can be picked up by the calling Sub to stop getting further lines
			bFinalLineDone = True
			If iBytes > lArrayPos Then
				strLine = BytesToString(arrBytes, lArrayPos, iBytes - lArrayPos, mstrCharSet)
				Return strLine
			End If
		Else
			If lArrayPos = 0 Then
				'A single line fills the whole buffer. Reading the same bytes again would never end,
				'so enlarge the buffer before the next read.
				mlMaxBytes = Max(mlMaxBytes * 2, 1024)
				iBytes = mlMaxBytes
			End If
			'continue reading at the start of the unfinished line
			lFilePos = lFilePos + lArrayPos
		End If
	Loop
	Return ""
End Sub
'Reads lines with ReadLine and returns them in a List of Strings.
'bIgnoreEmptyLines: True to leave empty lines out of the returned List
'iMaxLines:         return as soon as the List holds this many lines, 0 (or -1) for no limit
'arrEmptyLines:     arrEmptyLines(0) is increased for every empty line in the file (consecutive linebreaks),
'                   this passes the count to the calling Sub "ByRef"
'The empty remainder after a linebreak at the very end of the file is not a line and is neither added nor counted.
Public Sub ReadList(bIgnoreEmptyLines As Boolean, iMaxLines As Int, arrEmptyLines() As Int) As ResumableSub 'ignore
	Dim lstLines As List
	If iMaxLines = 0 Then iMaxLines = -1
	lstLines.Initialize
	Do While bFinalLineDone = False
		strLine = ReadLine
		If strLine.Length > 0 Then
			lstLines.Add(strLine)
		Else If bFinalLineDone = False Then
			'a real empty line; count it whether or not it is kept, same as ReadCSVList does
			arrEmptyLines(0) = arrEmptyLines(0) + 1 'to pass to calling Sub ByRef
			If bIgnoreEmptyLines = False Then
				lstLines.Add(strLine)
			End If
		End If
		If lstLines.Size = iMaxLines Then
			Return lstLines
		End If
	Loop
	Return lstLines
End Sub
'Reads lines with ReadLine, splits each one into fields and returns a List with one array per row.
'bIgnoreEmptyLines: False to add an array of miCols Null values for every empty line
'iMaxLines:         return as soon as the List holds this many rows, 0 (or -1) for no limit
'arrEmptyLines:     arrEmptyLines(0) is increased for every empty line in the file (consecutive linebreaks),
'                   this passes the count to the calling Sub "ByRef" and is just for information
'If no separator is known yet it is detected from the first 4 lines of the file.
'If the column count is still 0 it is taken from the first non-empty line.
'The empty remainder after a linebreak at the very end of the file is not a row and is neither added nor counted.
Public Sub ReadCSVList(bIgnoreEmptyLines As Boolean, iMaxLines As Int, arrEmptyLines() As Int) As ResumableSub 'ignore
	Dim c As Int
	Dim lstLines As List
	Dim bEncloser As Boolean = mstrEncloser.Length > 0
	'same convention as ReadList: 0 means no limit
	If iMaxLines = 0 Then iMaxLines = -1
	If mstrSeparatorChar.Length = 0 Then
		Dim rs3 As ResumableSub = GetSeparatorCharFromFile(mtFF, mstrCharSet, mstrEncloser, 4)
		Wait For (rs3) Complete (oSeparatorChar As Object) 'can't do as Char as oChar can be an empty string: ""
		mstrSeparatorChar = CStr(oSeparatorChar)
	End If
	lstLines.Initialize
	Do While bFinalLineDone = False
		strLine = ReadLine
		If strLine.Length > 0 Then
			If miCols = 0 Then
				miCols = CountFieldsInTextLine(strLine, mstrSeparatorChar, mstrEncloser)
			End If
			'NOTE(review): lines that are not longer than the column count are dropped without notice.
			'This looks meant for rows holding only separators, but it also drops very short rows - confirm.
			If strLine.Length > miCols Then
				If bEncloser Then
					lstLines.Add(ParseCSVLine(strLine, miCols, mstrSeparatorChar, mstrEncloser))
				Else
					lstLines.Add(ParseCSVLineNoEnclosers(strLine, miCols, mstrSeparatorChar))
				End If
			End If
		Else If bFinalLineDone = False Then
			arrEmptyLines(0) = arrEmptyLines(0) + 1 'to pass to calling Sub "ByRef", this is just for information
			If bIgnoreEmptyLines = False Then
				Dim arrNulls(miCols) As Object
				For c = 0 To miCols - 1
					arrNulls(c) = Null
				Next
				lstLines.Add(arrNulls)
			End If
		End If
		If lstLines.Size = iMaxLines Then
			Return lstLines
		End If
	Loop
	Return lstLines
End Sub
'Returns the number of fields in one line of text: the number of separators found plus 1.
'With an empty separator the whole line is 1 field.
'With a non-empty encloser, separators between a pair of enclosers are not counted.
Sub CountFieldsInTextLine(strCSVLine As String, strSeparator As String, strEncloser As String) As Int
	If strSeparator.Length = 0 Then Return 1
	Dim iSeparators As Int = 0
	Dim iPos As Int
	Dim ch As Char
	Dim bQuoted As Boolean = False
	Dim bUseEncloser As Boolean = strEncloser.Length > 0
	For iPos = 0 To strCSVLine.Length - 1
		ch = strCSVLine.CharAt(iPos)
		If bUseEncloser And ch = strEncloser Then
			'an encloser only flips the state, it is never counted itself
			bQuoted = Not(bQuoted)
		Else If bQuoted = False And ch = strSeparator Then
			iSeparators = iSeparators + 1
		End If
	Next
	Return iSeparators + 1
End Sub
'Returns a Map with a Char as key and as value how often that Char occurs in the line.
'With a non-empty encloser the enclosers themselves and everything between a pair of them are left out.
Sub GetCharCounts(strCSVLine As String, strEncloser As String) As Map
	Dim mapTally As Map
	Dim iPos As Int
	Dim iSeen As Int
	Dim ch As Char
	Dim bQuoted As Boolean
	Dim bUseEncloser As Boolean = strEncloser.Length > 0
	mapTally.Initialize
	For iPos = 0 To strCSVLine.Length - 1
		ch = strCSVLine.CharAt(iPos)
		If bUseEncloser And ch = strEncloser Then
			bQuoted = Not(bQuoted)
		Else If bQuoted = False Then
			'0 for a Char not seen before, so the first occurrence is stored as 1
			iSeen = mapTally.GetDefault(ch, 0)
			mapTally.Put(ch, iSeen + 1)
		End If
	Next
	Return mapTally
End Sub
'Works out the field separator by looking at the first lines of the file.
'iLines is the number of lines to read and analyze; if the file holds fewer lines then all of them are used.
'A Char qualifies if it occurs the same number of times in every analyzed line (counted by GetCharCounts,
'so outside enclosers) and is not a space, quote, double-quote, digit or letter A-Z / a-z.
'Returns the first Char that qualifies, or an empty String if there is none.
Public Sub GetSeparatorCharFromFile(tFF As tFolderAndFile, strCharSet As String, strEncloser As String, iLines As Int) As ResumableSub
	Dim strExclude As String = " ' "" 0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" 'includes quote and double-quote
	Dim oTR2 As TextReader
	Dim strLine As String
	Dim strKey As String
	Dim strPossibleSeparator As Char
	Dim lstMaps As List
	Dim mapTally As Map
	Dim iLinesRead As Int
	Dim iSeen As Int
	Dim iHits As Int
	lstMaps.Initialize
	mapTally.Initialize
	'1) one Map with char counts per line
	oTR2.Initialize2(File.OpenInput(tFF.strFolder, tFF.strFile), strCharSet)
	Do While True
		strLine = oTR2.ReadLine
		If strLine = Null Then
			'end of file reached early, so compare against the lines we actually have
			iLines = iLinesRead
			Exit
		End If
		lstMaps.Add(GetCharCounts(strLine, strEncloser))
		iLinesRead = iLinesRead + 1
		If iLinesRead = iLines Then Exit
	Loop
	oTR2.Close
	'2) count in how many lines each "char followed by its count" combination occurs
	For Each mapLine As Map In lstMaps
		For Each oKey As Object In mapLine.Keys
			strKey = oKey & mapLine.Get(oKey)
			iSeen = mapTally.GetDefault(strKey, 0)
			mapTally.Put(strKey, iSeen + 1)
		Next
	Next
	'3) the first combination that is present in every line and is not excluded wins
	For Each oKey As Object In mapTally.Keys
		iHits = mapTally.Get(oKey)
		If iHits <> iLines Then Continue
		strPossibleSeparator = CStr(oKey).CharAt(0) 'the key starts with the char itself
		If strExclude.Contains(strPossibleSeparator) Then Continue
		Return strPossibleSeparator
	Next
	'need to return something and can't return Null
	Return ""
End Sub
'Returns the passed value as a String.
'Used in this class to get a String from an Object (map key, result of a ResumableSub).
Sub CStr(o As String) As String
Return o
End Sub
'Splits one CSV line into an array of iCols fields, for files that use enclosers.
'Separators between a pair of enclosers do not split; the enclosers are left in the field text.
'With an empty separator the whole line goes into the first element.
'Once iCols fields are filled the rest of the line is ignored (this handles trailing separators).
'If the line has fewer fields than iCols the remaining elements are left unassigned.
Sub ParseCSVLine(strCSVLine As String, iCols As Int, strSeparator As String, strEncloser As String) As Object()
	Dim arrFields(iCols) As Object
	Dim iField As Int = 0
	Dim iFieldStart As Int = 0
	Dim iPos As Int
	Dim ch As Char
	Dim bQuoted As Boolean = False
	'1 column only, so whole line in one column
	If strSeparator.Length = 0 Then
		arrFields(0) = strCSVLine
		Return arrFields
	End If
	For iPos = 0 To strCSVLine.Length - 1
		ch = strCSVLine.CharAt(iPos)
		If ch = strEncloser Then
			bQuoted = Not(bQuoted)
		Else If bQuoted = False And ch = strSeparator Then
			arrFields(iField) = strCSVLine.SubString2(iFieldStart, iPos)
			iFieldStart = iPos + 1
			iField = iField + 1
			'all columns filled: anything after this separator is not wanted
			If iField = iCols Then Return arrFields
		End If
	Next
	'the text after the last separator is the last field
	arrFields(iField) = strCSVLine.SubString(iFieldStart)
	Return arrFields
End Sub
'Splits one CSV line into an array of iCols Strings, for files without enclosers:
'every separator starts a new field.
'With an empty separator the whole line goes into the first element.
'Once iCols fields are filled the rest of the line is ignored (this handles trailing separators).
Sub ParseCSVLineNoEnclosers(strCSVLine As String, iCols As Int, strSeparator As String) As String()
	Dim arrFields(iCols) As String
	Dim iField As Int = 0
	Dim iFieldStart As Int = 0
	Dim iPos As Int
	'1 column only, so whole line in one column
	If strSeparator.Length = 0 Then
		arrFields(0) = strCSVLine
		Return arrFields
	End If
	For iPos = 0 To strCSVLine.Length - 1
		If strCSVLine.CharAt(iPos) = strSeparator Then
			arrFields(iField) = strCSVLine.SubString2(iFieldStart, iPos)
			iFieldStart = iPos + 1
			iField = iField + 1
			'all columns filled: anything after this separator is not wanted
			If iField = iCols Then Return arrFields
		End If
	Next
	'the text after the last separator is the last field
	arrFields(iField) = strCSVLine.SubString(iFieldStart)
	Return arrFields
End Sub
'Counts the lines of a file by counting end-of-line bytes (quotes are not taken into account).
'tFF:              folder and file name of the file to count
'btEndOfLineByte:  byte that ends a line
'iMaxRows:         stop counting at this number of lines, 0 or less for no limit
'iMaxBytesInArray: size of the buffer used for reading (eg 1.000.000, could be smaller or larger)
'bCloseRAFAfter:   True to close the file when done
'Returns 0 for an empty file. A last line without a line break at its end is counted as a line.
'When iMaxRows is reached that number is returned and the rest of the file is not read.
'This Sub opens the file on the RandomAccessFile of the class. If the same class (clsReadTextFile)
'is going to be used again then do bCloseRAFAfter = True and run class.InitRAF after to reset the RAF state.
Public Sub GetLineCount(tFF As tFolderAndFile, btEndOfLineByte As Byte, _
	iMaxRows As Int, iMaxBytesInArray As Int, bCloseRAFAfter As Boolean) As Int
	'The buffer and its byte count have their own names here, so they can't be mixed up
	'with the class-level arrBytes / iBytes that ReadLine works with.
	Dim i As Int
	Dim iChunkBytes As Int = iMaxBytesInArray
	Dim lPosition As Long
	Dim lFileSize As Long
	Dim iLines As Int
	Dim btLast As Byte
	Dim bAnyBytes As Boolean = False
	Dim bMaxReached As Boolean = False
	RAF.Initialize(tFF.strFolder, tFF.strFile, True)
	lFileSize = RAF.Size 'ask once, using RAF.Size in the loop condition is slower
	Do While lPosition < lFileSize And bMaxReached = False
		Dim arrChunk(iChunkBytes) As Byte
		iChunkBytes = RAF.ReadBytes(arrChunk, 0, iChunkBytes, lPosition)
		'nothing could be read: stop, otherwise lPosition would never move on
		If iChunkBytes <= 0 Then Exit
		For i = 0 To iChunkBytes - 1
			If arrChunk(i) = btEndOfLineByte Then
				iLines = iLines + 1
				If iMaxRows > 0 And iLines = iMaxRows Then
					'Exit only leaves the For loop, the flag ends the Do While loop as well
					bMaxReached = True
					Exit
				End If
			End If
		Next
		btLast = arrChunk(iChunkBytes - 1)
		bAnyBytes = True
		lPosition = lPosition + iChunkBytes
	Loop
	If bCloseRAFAfter Then
		RAF.Close
	End If
	'stopped at the limit: the limit is the answer
	If bMaxReached Then Return iLines
	'empty (or unreadable) file
	If bAnyBytes = False Then Return 0
	If btLast = btEndOfLineByte Then
		Return iLines
	Else
		'the last line has no line break after it, but it is still a line
		Return iLines + 1
	End If
End Sub
'Property FinalLineDone (read-only).
'This will be True if the very last line has been read and returned by ReadLine,
'so the calling Sub can use it to stop asking for more lines.
Sub getFinalLineDone As Boolean
Return bFinalLineDone
End Sub
'Property TotalFileBytes (read-only).
'Returns the file size in bytes as cached from RAF.Size by Initialize or InitRAF.
Sub getTotalFileBytes As Long
Return lTotalBytes
End Sub