Dim m As Matcher = Regex.Matcher("table class=""t2""><td class=\""", page)' originale
Do you have any examples? I'm trying with Parse or HttpJob, but I'm going crazy.
It is difficult to help you without knowing the URL and the structure of the page. You have to adapt the Regex to the expression you are interested in finding. You can use the search engine to find examples.
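For example, if the cells you want look like <td class="t2">...</td>, a pattern with a capture group pulls out just the cell text. A minimal sketch, assuming that cell layout (adapt the class name and tags to your page):

Sub ExtractCells(page As String)
    ' Match each <td class="t2">...</td> cell and capture the text between the tags
    Dim m As Matcher = Regex.Matcher($"<td class="t2">([^<]*)</td>"$, page)
    Do While m.Find
        Log("cell value: " & m.Group(1)) ' Group(1) = the captured cell text
    Loop
End Sub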
I separated out the table section that I wanted like this:
B4X:
html = html.Replace(Chr(10), "") ' HTML page with line feeds removed
Dim start As Int = html.IndexOf($"<table id="performances">"$)
Dim stop As Int = html.IndexOf2($"</table>"$, start)
If (stop > start) Then
    html = html.SubString2(start, stop + 8)
    extractList
End If
The section that I was interested in had a unique start tag (it looks like yours does too), so you grab everything up to the next </table> tag. Removing the line feeds first helps clean things up and prepares for the Regex searches. Even if you intend to use an XML parser, I think that cutting the HTML down to size in this way makes things easier to manage.
I was able to pick out the items that I wanted with a Regex search on the class="..." attributes:
B4X:
Private Sub extractPerformance(data As String, p As performance)
    Dim pattern As String = $"class=.+?>[^\<]*"$
    Dim m As Matcher = Regex.Matcher(pattern, data)
    Dim changes As String = ""
    Do While m.Find
        Dim item As String = m.Match
        If item.StartsWith($"class="association""$) Then p.association = extractData(item)
        If item.StartsWith($"class="place""$) Then p.place = extractData(item)
        If item.StartsWith($"class="address""$) Then p.address = extractData(item)
        If item.StartsWith($"class="changes""$) Then changes = extractData(item)
        If item.StartsWith($"class="title""$) Then p.title = changes & " " & extractData(item)
        If item.StartsWith($"class="footnote""$) Then p.footnote = p.footnote & " " & extractData(item)
    Loop
End Sub

' Extract the data content from a RegEx match item
Private Sub extractData(item As String) As String
    Dim result As String
    result = item.SubString(item.IndexOf(">") + 1)
    Return result
End Sub
This is very low tech but quite flexible; I am not very good at Regex. I don't expect that it will be a perfect fit for your situation, but I hope that it might give you some ideas.
Sub Process_Globals
Private fx As JFX
Private MainForm As Form
' --- Variable for Jsoup
Private js As jSoup
Private Extract01 As List
Private Extract02 As List
Private Extract03 As List
Private Extract04 As List
End Sub
Sub AppStart (Form1 As Form, Args() As String)
MainForm = Form1
' MainForm.RootPane.LoadLayout("Layout1") 'Load the layout file.
' MainForm.Show
Extract01.Initialize
Extract02.Initialize
Extract03.Initialize
Extract04.Initialize
ScrapeTable
#IF B4A
Activity.Finish
#End If
#if B4J
ExitApplication ' ends the program
#End If
End Sub
Private Sub ScrapeTable
' --- Load the url
Dim url As String = "https://www.superenalotto.com/archivio"
' --- Get the page content
Dim HTML As String = js.connect(url)
' Log (HTML)
' --- Get the date for the numbers
Dim FirstDateRow As String = js.getElementsByClass(HTML, "t1").Get(0)
' --- Get the first (top) row of the table with the t2 class
Dim FirstTableRow As String = js.getElementsByClass(HTML, "t2").Get(0)
' --- We need the t1 class for the date
Extract01 = js.selectorElementText(FirstDateRow, "a")
' --- We need only the td rows of the wanted table
Extract02 = js.getElementsByClass(FirstTableRow, "ball-24px")
Extract03 = js.getElementsByClass(FirstTableRow, "jolly-24px")
Extract04 = js.getElementsByClass(FirstTableRow, "superstar-24px")
' --- Show results during the test period
' Log("Extract01: "& Extract01)
' Dim Count As Int = Extract01.Size
' Log("Count: "& Count)
'
' Log("Extract02: "& Extract02)
' Dim Count As Int = Extract02.Size
' Log("Count: "& Count)
'
' Log("Extract03: "& Extract03)
' Dim Count As Int = Extract03.Size
' Log("Count: "& Count)
'
' Log("Extract04: "& Extract04)
' Dim Count As Int = Extract04.Size
' Log("Count: "& Count)
' --- Start with the date which is the first line
Log($"The number from ${Extract01.Get(0)} are: ${CRLF}"$)
' --- We need to scrape the rows and columns
Dim columns As List
columns.Initialize
' --- Counter used to number the logged values
Dim x As Int = 0
' --- Scrape the first 6 numbers with the same class
For i = 0 To Extract02.Size -1
columns = js.selectorElementText($"<table>${Extract02.Get(i)}</table>"$, "td")
For j = 0 To columns.Size -1
x = x + 1
Log($"Number${x}: ${columns.Get(j)}"$)
Next
Next
' --- And scrape the Jolly number with different class
For i = 0 To Extract03.Size -1
columns = js.selectorElementText($"<table>${Extract03.Get(i)}</table>"$, "td")
For j = 0 To columns.Size -1
x = x + 1
Log($"Jolly Number${x}: ${columns.Get(j)}"$)
Next
Next
' --- And scrape the super number with different class
For i = 0 To Extract04.Size -1
columns = js.selectorElementText($"<table>${Extract04.Get(i)}</table>"$, "td")
For j = 0 To columns.Size -1
x = x + 1
Log($"Super Number${x}: ${columns.Get(j)}"$)
Next
Next
End Sub
In my example, I only used the original B4A and B4J jSoup HTML Parser library from 5 years ago.
I can't load the libraries in B4A. Why?
I'm trying to implement your example, but what is "p As performance"?
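For reference: performance is presumably a custom Type that holds the fields of one table row. A minimal sketch, with the field names taken from what extractPerformance fills in (the original project may declare it differently):

Sub Process_Globals
    ' Hypothetical Type matching the fields used in extractPerformance
    Type performance (association As String, place As String, address As String, _
        title As String, footnote As String)
End Sub

' Usage sketch: fill one performance per extracted table row
Sub ShowOneRow(rowHtml As String)
    Dim p As performance
    p.Initialize
    extractPerformance(rowHtml, p) ' rowHtml = the HTML of one row (assumption)
    Log(p.place & " - " & p.title)
End Sub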
While I sort out the problem with B4A, I can see in B4J that the library works very well. Excellent job.
I attach an extract of the page. The same structure repeats many times in the page, but I only extract it once, so I need to loop over all of them,
e.g. the draws from January with their numbers and dates; at the moment I only get the last one.
' --- Get the date for the numbers
Dim FirstDateRow As String = js.getElementsByClass(HTML, "t1").Get(0)
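Getting every draw instead of only the first one basically means looping over the lists instead of calling .Get(0). A minimal sketch, assuming the i-th "t1" date row belongs to the i-th "t2" number row (I have not tested this against the live page):

Private Sub ScrapeAllDraws(HTML As String)
    Dim DateRows As List = js.getElementsByClass(HTML, "t1")
    Dim NumberRows As List = js.getElementsByClass(HTML, "t2")
    For i = 0 To Min(DateRows.Size, NumberRows.Size) - 1
        Dim DateRow As String = DateRows.Get(i)
        Dim NumberRow As String = NumberRows.Get(i)
        Dim DrawDate As String = js.selectorElementText(DateRow, "a").Get(0)
        Log($"Draw of ${DrawDate}:"$)
        ' The main numbers of this draw
        Dim Balls As List = js.getElementsByClass(NumberRow, "ball-24px")
        For j = 0 To Balls.Size - 1
            Dim columns As List = js.selectorElementText($"<table>${Balls.Get(j)}</table>"$, "td")
            For k = 0 To columns.Size - 1
                Log("Number: " & columns.Get(k))
            Next
        Next
    Next
End Sub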
Sub InterpretaTabella(WWW As String)
Dim TabellaHTML As String
TabellaHTML=WWW.SubString(WWW.ToUpperCase.IndexOf($"<TABLE CLASS="T2">"$)) ' the date works
TabellaHTML=TabellaHTML.SubString2(0,TabellaHTML.ToUpperCase.IndexOf("</TR")) ' original
Dim Righe() As String = Regex.Split("</tr>",TabellaHTML) ' split into rows
For Rg=0 To Righe.Length-1
Dim celle() As String = Regex.Split("</td>",Righe(Rg))
For Col=0 To celle.Length-1
Dim Cella As String
Cella=TogliTag(celle(Col))
'Log("Cella (" & Rg & "," & Col & ") :" & Cella)
Log (Cella.Trim)
Next
Next
End Sub
Sub TogliTag(cella As String) As String
Dim Ret As String =""
Dim Tag As Boolean = False
For i=0 To cella.Length-1
If cella.SubString2(i,i+1)="<" Then Tag=True
If Not(Tag) Then Ret=Ret & cella.SubString2(i,i+1)
If cella.SubString2(i,i+1)=">" Then Tag=False
Next
Return Ret
End Sub
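To show what TogliTag does, a quick usage example (the cell HTML here is made up):

Log(TogliTag($"<td class="ball">12</td>"$)) ' prints 12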
' --- Load the raw text file
Dim RawHTML As String = ""
Dim read1 As String="miotesto.txt"
RawHTML = File.ReadString(File.DirAssets,read1)
' Log("Raw RawHTML: " & RawHTML) 'displays the content of file
' --- Remove empty lines and reformat the layout
Dim HTML As String = js.parse_HTML(RawHTML)
Log("Clean HTML: " & HTML)
Have you tried, in B4A, re-syncing the libraries and reloading the program code to fix the problem? I'll take a look at your text file processing.