Source code for selene.core.soup.element
from bs4 import BeautifulSoup
from selene.core.config import *
from selene.core.element import Element
[docs]
class ElementSoup(Element):
    """
    An element class to wrap Beautiful Soup functionality for finding and
    returning attributes from soup objects.

    Attributes
    ----------
    attrs : dict
        the attribute dictionary of the wrapped soup element (the same dict
        object, not a copy)
    text : str
        the text of the wrapped element, captured once at construction
    """

    def __init__(self, element, logger=None):
        """
        Initialise an ElementSoup instance

        Parameters
        ----------
        element : html object
            a Beautiful Soup object exposing ``attrs`` and ``get_text()``
        logger : logging.Logger
            a logger instance (see core.logger.py)
        """
        # NOTE(review): the base class is expected to store ``element`` and
        # ``logger`` on the instance and to provide ``self.log`` -- the
        # methods below read all three; confirm against selene.core.element.
        super().__init__(element, logger)
        self.attrs = element.attrs
        self.text = element.get_text()

    @classmethod
    def from_selene(cls, element_selene, logger=None):
        """
        Initialise an ElementSoup instance from an ElementSelene object.
        Allows interchangeability between selenium-based and soup-based
        elements.

        Parameters
        ----------
        element_selene : selene.core.selenium.ElementSelene
            wrapper whose ``element`` exposes ``get_attribute``
        logger : logging.Logger
            a logger instance (see core.logger.py)

        Returns
        ----------
        el : ElementSoup
            wraps the parsed ``<body>``, or the (empty) soup root when the
            inner HTML produced no body at all
        """
        html = element_selene.element.get_attribute("innerHTML")
        # A missing attribute is treated like empty markup instead of being
        # handed to the parser as None.
        soup = BeautifulSoup(html or "", "lxml")
        # The lxml builder only creates <body> when there is content to put
        # in it; for empty markup soup.body is None and __init__ would fail
        # on ``element.attrs``. Compare with ``is None`` rather than relying
        # on truthiness of the tag.
        body = soup.body
        root = soup if body is None else body
        return cls(element=root, logger=logger)

    def _wrap(self, tag):
        """Wrap a soup tag as an ElementSoup, defaulting a missing href to None."""
        # Guarantees that ``attrs["href"]`` can be read on every returned
        # element. This writes into the tag's own attribute dict.
        if not tag.has_attr("href"):
            tag.attrs["href"] = None
        return ElementSoup(tag, self.logger)

    def find(self, *args, **kwargs):
        """
        Find and return specific elements within the html

        All arguments are forwarded unchanged to the wrapped element's
        ``find``; only the positional ones appear in the log line.

        Parameters
        ----------
        element : str
            the type of html element searched for e.g. 'div'
        attributes : dict
            attributes of the searched element e.g. {"class": "text-1"}

        Returns
        ----------
        el : ElementSoup
            the first match, or an ElementSoupBlank when nothing matches
        """
        self.log(f'find: {"; ".join(str(arg) for arg in args)}')
        match = self.element.find(*args, **kwargs)
        if match is None:
            return ElementSoupBlank()
        return self._wrap(match)

    def find_all(self, *args, **kwargs):
        """
        Find and return all elements within the html that meet the given criteria

        All arguments are forwarded unchanged to the wrapped element's
        ``find_all``; only the positional ones appear in the log line.

        Parameters
        ----------
        element : str
            the type of html element searched for e.g. 'div'
        attributes : dict
            attributes of the searched element e.g. {"class": "text-1"}

        Returns
        ----------
        els : list
            all ElementSoup that meet criteria (empty when there are none)
        """
        self.log(f'find_all: {"; ".join(str(arg) for arg in args)}')
        return [self._wrap(tag) for tag in self.element.find_all(*args, **kwargs)]

    def get_text(self):
        """return text of object (the value captured at construction)"""
        return self.text

    def has_attr(self, *args, **kwargs):
        """check whether element has a given attribute"""
        return self.element.has_attr(*args, **kwargs)

    def get(self, *args, **kwargs):
        """return a given attribute of the element"""
        return self.element.get(*args, **kwargs)
[docs]
class ElementSoupBlank(ElementSoup):
    """
    A class for blank soup objects. Used in cases where another method has
    not returned anything.

    Every lookup on a blank object succeeds and yields an empty result, so
    chained calls do not need a None check. The lookup methods accept any
    positional and keyword arguments, matching the signatures on ElementSoup.

    Attributes
    ----------
    element : None
        there is no underlying soup element
    attrs : dict
        placeholder attributes; "href", "id" and "aria-label" are all None
    text : None
        a blank element has no text
    """

    def __init__(self):
        """Initialise an ElementSoupBlank object"""
        # The parent initialiser is deliberately not called: it reads
        # ``attrs`` and ``get_text()`` from a real soup element, and there
        # is none here.
        self.element = None
        self.attrs = {"href": None, "id": None, "aria-label": None}
        self.text = None

    def find(self, *args, **kwargs):
        """return another blank element, whatever the search criteria"""
        return ElementSoupBlank()

    def find_all(self, *args, **kwargs):
        """return an empty list, whatever the search criteria"""
        return []

    def has_attr(self, *args, **kwargs):
        """a blank element reports no attributes"""
        return False

    def get(self, *args, **kwargs):
        """return None for every attribute (any supplied default is ignored)"""
        return None

    def get_text(self):
        """return None, as a blank element has no text"""
        return None