B4J Question [PyBridge] Web Scraping using BeautifulSoup

Erel

B4X founder
Staff member
Licensed User
Longtime User
To help you get started:

pip install beautifulsoup4


B4X:
Sub Class_Globals
    'Python side: the bridge itself and the imported bs4 module (set in B4XPage_Created).
    Public Py As PyBridge
    Private bs4 As PyWrapper
    'UI side: the page's root view (assigned in B4XPage_Created) and the XUI helper.
    Private Root As B4XView
    Private xui As XUI
End Sub

'Class initializer. Intentionally empty: the setup shown in this class is done in B4XPage_Created.
Public Sub Initialize
    
End Sub

'Called once, before the page becomes visible.
'Loads the layout, starts the Python process and, once connected, runs a short
'BeautifulSoup demo on a sample HTML document. Stops early (after logging an error)
'if the Python process could not be started.
Private Sub B4XPage_Created (Root1 As B4XView)
    Root = Root1
    Root.LoadLayout("MainPage")

    'Start the bridge and wait for the connection result.
    Py.Initialize(Me, "Py")
    Dim StartOptions As PyOptions = Py.CreateOptions("Python/python/python.exe")
    Py.Start(StartOptions)
    Wait For Py_Connected (Success As Boolean)
    If Not(Success) Then
        LogError("Failed to start Python process.")
        Return
    End If

    bs4 = Py.ImportModule("bs4")

    'Register a converter that turns bs4 Tag objects into maps (name / attributes / text).
    Dim TagType As PyWrapper = bs4.GetField("element").GetField("Tag")
    Dim TagToMap As PyWrapper = Py.Lambda("tag: {'name': tag.name, 'attributes': dict(tag.attrs), 'text': tag.get_text(strip=True)}")
    AddTypeConverter(TagType, TagToMap)

    'Sample document used by the calls below.
    Dim SampleHtml As String = $"<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>"$
    Dim Doc As PyWrapper = CreateSoup(SampleHtml, "html.parser")

    'Print the document and a few properties of its title and first paragraph.
    Doc.Print
    Dim Title As PyWrapper = Doc.GetField("title")
    Title.Print
    Title.GetField("name").Print
    Title.GetField("string").Print
    Title.GetField("parent").GetField("name").Print
    Doc.GetField("p").Get("class").Print

    'Example of fetching to B4J: find all <a> tags and log each fetched item.
    Dim Links As PyWrapper = Doc.Run("find_all").Arg("a")
    Wait For (Links.Fetch) Complete (Fetched As PyWrapper)
    Dim LinkMaps As List = Fetched.Value
    For Each LinkInfo As Map In LinkMaps
        Log(LinkInfo)
    Next
End Sub

'Builds a BeautifulSoup object by calling bs4.BeautifulSoup(Html, Parser) on the Python side.
'Html - the markup to parse.
'Parser - the parser name passed to BeautifulSoup (the caller in this class uses "html.parser").
'Returns a PyWrapper for the resulting object. Relies on bs4 having been imported first.
Private Sub CreateSoup(Html As String, Parser As String) As PyWrapper
    Dim Result As PyWrapper = bs4.Run("BeautifulSoup").Arg(Html).Arg(Parser)
    Return Result
End Sub

'Registers Method as the converter for PyType in the bridge's serializer.
'PyType - wrapper of the Python type to convert.
'Method - wrapper of the Python callable stored for that type.
'The entry is written to Py.Bridge -> comm -> serializator -> converters.
Private Sub AddTypeConverter(PyType As PyWrapper, Method As PyWrapper)
    Dim Comm As PyWrapper = Py.Bridge.GetField("comm")
    Dim Serializer As PyWrapper = Comm.GetField("serializator")
    Dim Registry As PyWrapper = Serializer.GetField("converters")
    Registry.Set(PyType, Method)
End Sub
 
Upvote 0
Top