Source code for selene.core.soup.element

from bs4 import BeautifulSoup

from selene.core.config import *
from selene.core.element import Element


class ElementSoup(Element):
    """
    Wrapper exposing beautiful soup search and attribute access.

    The wrapped soup object is searched via ``find`` / ``find_all`` and
    queried via ``get`` / ``has_attr`` / ``get_text``.
    """

    def __init__(self, element, logger=None):
        """
        Initialise an ElementSoup instance

        Parameters
        ----------
        element : html object
            soup object to wrap; its ``attrs`` and text are copied onto
            the instance at construction time
        logger : logging.Logger
            a logger instance (see core.logger.py)
        """
        super().__init__(element, logger)
        self.attrs = element.attrs
        self.text = element.get_text()

    @classmethod
    def from_selene(cls, element_selene, logger=None):
        """
        Build an ElementSoup from an ElementSelene object.

        Allows interchangeability between selenium-based and soup-based
        elements: the ``innerHTML`` of the selenium element is parsed
        with the "lxml" parser and the resulting ``body`` is wrapped.

        Parameters
        ----------
        element_selene : selene.core.selenium.ElementSelene
        logger : logging.Logger
            a logger instance (see core.logger.py)

        Returns
        -------
        ElementSoup
        """
        inner_html = element_selene.element.get_attribute("innerHTML")
        parsed = BeautifulSoup(inner_html, "lxml")
        return cls(element=parsed.body, logger=logger)

    def _wrap(self, tag):
        """
        Wrap a soup object found by ``find`` / ``find_all``.

        A soup object without an ``href`` attribute is given an explicit
        ``href`` of None before wrapping, so ``attrs["href"]`` can always
        be read on the result.
        """
        if not tag.has_attr("href"):
            tag.attrs["href"] = None
        return ElementSoup(tag, self.logger)

    def find(self, *args, **kwargs):
        """
        Find and return the first matching element within the html

        All arguments are forwarded unchanged to the wrapped object's
        ``find``; only the positional ones appear in the log message.

        Parameters
        ----------
        element : str
            the type of html element searched for e.g. 'div'
        attributes : dict
            attributes of the searched element e.g. {"class": "text-1"}

        Returns
        -------
        el : ElementSoup
            the match, or an ElementSoupBlank when nothing was found
        """
        searched = "; ".join(map(str, args))
        self.log(f"find: {searched}")
        match = self.element.find(*args, **kwargs)
        if match is None:
            return ElementSoupBlank()
        return self._wrap(match)

    def find_all(self, *args, **kwargs):
        """
        Find and return all elements within the html that meet the criteria

        All arguments are forwarded unchanged to the wrapped object's
        ``find_all``; only the positional ones appear in the log message.

        Parameters
        ----------
        element : str
            the type of html element searched for e.g. 'div'
        attributes : dict
            attributes of the searched element e.g. {"class": "text-1"}

        Returns
        -------
        els : list
            all ElementSoup that meet criteria (empty when none do)
        """
        searched = "; ".join(map(str, args))
        self.log(f"find_all: {searched}")
        return [self._wrap(tag) for tag in self.element.find_all(*args, **kwargs)]

    def get_text(self):
        """Return the text captured when the element was wrapped."""
        return self.text

    def has_attr(self, *args, **kwargs):
        """Check whether the wrapped element has a given attribute."""
        return self.element.has_attr(*args, **kwargs)

    def get(self, *args, **kwargs):
        """Return a given attribute of the wrapped element."""
        return self.element.get(*args, **kwargs)


class ElementSoupBlank(ElementSoup):
    """
    A class for blank soup objects.

    Used in cases where another method has not returned anything, so
    that callers can keep chaining ``find`` / ``get`` / ``get_text``
    without checking for None first. Every lookup on a blank element
    yields an "empty" answer.

    The lookup methods accept (and ignore) any positional and keyword
    arguments, mirroring the signatures of the ElementSoup methods.
    """

    def __init__(self):
        """Initialise an ElementSoupBlank object"""
        # NOTE(review): the base-class initialiser is deliberately not
        # called, matching the previous behaviour -- confirm that no
        # base-class state is required on blank elements.
        self.element = None
        # Fresh dict per instance so callers mutating attrs cannot leak
        # values between blank elements.
        self.attrs = {"href": None, "id": None, "aria-label": None}
        self.text = None

    def find(self, *args, **kwargs):
        """Return a new ElementSoupBlank; nothing can be found in a blank."""
        return ElementSoupBlank()

    def find_all(self, *args, **kwargs):
        """Return an empty list; nothing can be found in a blank."""
        return []

    def has_attr(self, *args, **kwargs):
        """Return False; a blank element reports no attributes."""
        return False

    def get(self, *args, **kwargs):
        """Return None for any requested attribute."""
        return None

    def get_text(self):
        """Return the text of the blank element (None as initialised)."""
        return self.text