Hello,
I'm trying to retrieve all the text inside a PDF file and store it in a variable so that I can parse it. After an unsuccessful search for a library, I'm trying to code it myself... with only partial results.
Below is the routine I wrote to extract the text. At the moment it only recovers plain text; formatted text remains unreadable.
I have attached a couple of PDF test files: 1.PDF with RTF text, and 2.PDF with plain text. Any suggestions?
I'm trying to retrieve all the text inside a PDF file and store it in a variable so that I can parse it. After an unsuccessful search for a library, I'm trying to code it myself... with only partial results.
Below is the routine I wrote to extract the text. At the moment it only recovers plain text; formatted text remains unreadable.
B4X:
Sub Activity_Create(FirstTime As Boolean)
	' Extract the text of the sample file "1.PDF", decoding it as ISO-8859-15,
	' and show the result in a modal message box with an empty title.
	Msgbox(ExtractTextFromPDF("1.PDF", "iso-8859-15"), "")
End Sub
' Extracts the text held in literal strings "( ... )" of every zlib-compressed
' stream object found in a PDF file stored in File.DirRootExternal.
'
' FileName - name of the PDF file inside File.DirRootExternal.
' Charset  - charset used to turn the inflated stream bytes into a String
'            (for example "iso-8859-15").
' Returns the text of all decoded streams, each one preceded by CRLF.
' If the file cannot be read, the exception is logged and whatever was
' collected so far (normally an empty string) is returned.
'
' Limitations: only streams whose data starts with byte 0x78 are inflated,
' exactly as before. Hex strings "<...>" and font-specific encodings are not
' interpreted, so text written that way is still not recovered.
Sub ExtractTextFromPDF(FileName As String, Charset As String) As String
	Dim whole_deco As StringBuilder
	whole_deco.Initialize
	ProgressDialogShow2("Parsing PDF. Please wait...", False)
	DoEvents
	Try
		' Bit.InputStreamToBytes reads the complete stream and closes it,
		' so no handle is leaked and no partial read can occur.
		Dim bPDF() As Byte
		bPDF = Bit.InputStreamToBytes(File.OpenInput(File.DirRootExternal, FileName))

		Dim kwStream As String, kwEnd As String
		kwStream = "stream"
		kwEnd = "endstream"
		Dim streamKw() As Byte, endKw() As Byte
		streamKw = kwStream.GetBytes("UTF8")
		endKw = kwEnd.GetBytes("UTF8")

		Dim pos As Int
		pos = 0
		Do While pos < bPDF.Length
			Dim kwAt As Int
			kwAt = PdfIndexOf(bPDF, streamKw, pos)
			If kwAt = -1 Then Exit

			Dim dataStart As Int
			dataStart = kwAt + streamKw.Length
			' Default: resume the search right after this keyword.
			pos = dataStart

			' A "stream" preceded by "d" is the tail of "endstream": ignore it.
			Dim isEndKeyword As Boolean
			isEndKeyword = False
			If kwAt > 0 Then
				If bPDF(kwAt - 1) = 100 Then isEndKeyword = True
			End If

			If isEndKeyword = False Then
				' The keyword is followed by CR LF or LF; skip either form.
				If dataStart < bPDF.Length Then
					If bPDF(dataStart) = 13 Then dataStart = dataStart + 1
				End If
				If dataStart < bPDF.Length Then
					If bPDF(dataStart) = 10 Then dataStart = dataStart + 1
				End If

				' Only data starting with 0x78 ("x") is treated as zlib data.
				Dim endAt As Int
				endAt = -1
				If dataStart < bPDF.Length Then
					If bPDF(dataStart) = 120 Then endAt = PdfIndexOf(bPDF, endKw, dataStart)
				End If

				If endAt <> -1 Then
					' Drop one end-of-line byte placed before "endstream", but
					' never a byte that is not an EOL: it would be real data.
					Dim dataEnd As Int
					dataEnd = endAt
					If dataEnd > dataStart Then
						If bPDF(dataEnd - 1) = 10 OR bPDF(dataEnd - 1) = 13 Then dataEnd = dataEnd - 1
					End If

					Dim bPDF_stream(dataEnd - dataStart) As Byte
					Dim k As Int
					For k = 0 To bPDF_stream.Length - 1
						bPDF_stream(k) = bPDF(dataStart + k)
					Next
					Log("bPDF_stream size: " & bPDF_stream.Length)

					Dim strDeco2 As String
					strDeco2 = PdfDecodeStream(bPDF_stream, Charset)
					Log("Decoded and unescaped stream:" & CRLF & strDeco2)
					whole_deco.Append(CRLF).Append(strDeco2)

					' Continue after this stream instead of rescanning its bytes.
					pos = endAt + endKw.Length
					' Keep the UI responsive: once per stream, not once per character.
					DoEvents
				End If
			End If
		Loop
	Catch
		Log(LastException)
	End Try
	Log("Whole decoded and unescaped stream:" & CRLF & whole_deco.ToString)
	ProgressDialogHide
	Return whole_deco.ToString
End Sub

' Returns the index of the first occurrence of pattern inside data at or
' after startAt, or -1 when there is none.
Private Sub PdfIndexOf(data() As Byte, pattern() As Byte, startAt As Int) As Int
	Dim last As Int
	last = data.Length - pattern.Length
	Dim i As Int, j As Int
	For i = startAt To last
		j = 0
		Do While j < pattern.Length
			If data(i + j) <> pattern(j) Then Exit
			j = j + 1
		Loop
		If j = pattern.Length Then Return i
	Next
	Return -1
End Sub

' Inflates one zlib stream and returns the text of its literal strings.
' A stream that cannot be inflated (for example binary image data that only
' happens to start with 0x78) is logged and yields "" instead of aborting
' the extraction of the remaining streams.
Private Sub PdfDecodeStream(raw() As Byte, Charset As String) As String
	Dim compress As CompressedStreams
	Try
		Dim deco() As Byte
		deco = compress.DecompressBytes(raw, "zlib")
		Return PdfLiteralStrings(BytesToString(deco, 0, deco.Length, Charset))
	Catch
		Log(LastException)
		Return ""
	End Try
End Sub

' Concatenates the contents of every literal string "( ... )" found in
' content. Everything outside parentheses is discarded. Balanced nested
' parentheses are kept as text and backslash escapes are resolved.
Private Sub PdfLiteralStrings(content As String) As String
	Dim sb As StringBuilder
	sb.Initialize
	Dim depth As Int, i As Int
	depth = 0
	i = 0
	Do While i < content.Length
		Dim ch As Char
		ch = content.CharAt(i)
		If depth = 0 Then
			' Outside any string: only an opening parenthesis matters.
			If ch = "(" Then depth = 1
		Else If ch = "\" Then
			' The helper returns the index of the last character it consumed.
			i = PdfAppendEscape(content, i + 1, sb)
		Else If ch = "(" Then
			depth = depth + 1
			sb.Append(ch)
		Else If ch = ")" Then
			depth = depth - 1
			If depth > 0 Then sb.Append(ch)
		Else
			sb.Append(ch)
		End If
		i = i + 1
	Loop
	Return sb.ToString
End Sub

' Resolves the escape sequence whose first character (the one following the
' backslash) is at escAt, appends its value to sb and returns the index of
' the last character consumed.
Private Sub PdfAppendEscape(content As String, escAt As Int, sb As StringBuilder) As Int
	' A trailing backslash with nothing after it: nothing to append.
	If escAt >= content.Length Then Return content.Length - 1
	Dim esc As Char
	esc = content.CharAt(escAt)
	Dim code As Int
	code = Asc(esc)
	If code >= 48 AND code <= 55 Then
		' \ddd: one to three octal digits giving a character code.
		' NOTE(review): the code is mapped with Chr(), which matches Charset
		' only for Latin-1 compatible values - confirm for other charsets.
		Dim octal As Int, digits As Int
		octal = 0
		digits = 0
		Do While digits < 3 AND escAt + digits < content.Length
			Dim digit As Int
			digit = Asc(content.CharAt(escAt + digits)) - 48
			If digit < 0 OR digit > 7 Then Exit
			octal = octal * 8 + digit
			digits = digits + 1
		Loop
		sb.Append(Chr(Bit.And(octal, 255)))
		Return escAt + digits - 1
	End If
	If esc = "n" Then
		sb.Append(Chr(10))
	Else If esc = "r" Then
		sb.Append(Chr(13))
	Else If esc = "t" Then
		sb.Append(Chr(9))
	Else If esc = "b" Then
		sb.Append(Chr(8))
	Else If esc = "f" Then
		sb.Append(Chr(12))
	Else If code = 13 Then
		' Backslash + end of line is a line continuation: append nothing and
		' also swallow the LF of a CR LF pair.
		If escAt + 1 < content.Length Then
			If Asc(content.CharAt(escAt + 1)) = 10 Then Return escAt + 1
		End If
	Else If code <> 10 Then
		' "\(", "\)", "\\" and unknown escapes: keep the character itself.
		' A backslash followed by LF (code 10) appends nothing.
		sb.Append(esc)
	End If
	Return escAt
End Sub
I have attached a couple of PDF test files: 1.PDF with RTF text, and 2.PDF with plain text. Any suggestions?