Source code for selene.core.soup.page

import os
import time
import numpy as np
import requests
from datetime import datetime
from bs4 import BeautifulSoup

from ..page import *
from ..config import *

from .element import ElementSoup, ElementSoupBlank


class PageSoup(Page):
    """
    A page class to assist any workflow that requires BeautifulSoup.

    This makes Selenium WebDriver and BeautifulSoup more interchangeable:
    you can instantiate either a PageSoup or a PageSelene object, and the
    .find and .find_all methods work in similar ways.

    Inherits selene.core.page.Page
    """

    def __init__(self, url, soup, logger=None):
        """
        Initialise a PageSoup instance from existing, parsed soup.

        Parameters
        ----------
        url : str
            the url of the page
        soup : bs4.BeautifulSoup
            the parsed soup of the page
        logger : logging.Logger
            a logger instance (see core.logger.py)
        """
        Page.__init__(self, url, logger)
        self.soup = soup
    @classmethod
    def from_soup(cls, url, soup, logger=None):
        """
        Initialise a PageSoup instance from existing, parsed soup.

        Parameters
        ----------
        url : str
            the url of the page
        soup : bs4.BeautifulSoup
            the parsed soup of the page
        logger : logging.Logger
            a logger instance (see core.logger.py)
        """
        return cls(url, soup, logger)
    @classmethod
    def from_html(cls, url, html, logger=None):
        """
        Initialise a PageSoup instance from existing html source code.

        Parameters
        ----------
        url : str
            the url of the page
        html : str
            the html code to parse
        logger : logging.Logger
            a logger instance (see core.logger.py)
        """
        soup = BeautifulSoup(html, "lxml")
        return cls(url, soup, logger)
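    # Usage sketch (not in the original source; the file path is
    # hypothetical): from_html is handy when the html has already been
    # fetched or cached, e.g.
    #
    #     with open("cached/example.html") as f:
    #         page = PageSoup.from_html("https://example.com", f.read())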
    @classmethod
    def from_request(cls, url, logger=None):
        """
        Initialise a PageSoup instance by parsing a request to a web url.

        Parameters
        ----------
        url : str
            the url of the page
        logger : logging.Logger
            a logger instance (see core.logger.py)
        """
        # rotate user agents (USER_AGENTS is expected to come from ..config)
        user_agent = str(np.random.choice(USER_AGENTS))
        headers = {
            "User-Agent": user_agent,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        }
        with requests.Session() as session:
            page = session.get(url, headers=headers).content.decode()
        soup = BeautifulSoup(page, "lxml")
        return cls(url, soup, logger)
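    # Usage sketch (not in the original source): fetch and parse a live
    # page in one call; the url is illustrative.
    #
    #     page = PageSoup.from_request("https://example.com")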
    def find(self, *args, **kwargs):
        """
        Find and return a specific element within the page html.

        Arguments are passed straight through to BeautifulSoup's find,
        e.g. find('div', {"class": "text-1"}).

        Parameters
        ----------
        element : str
            the type of html element searched for e.g. 'div'
        attributes : dict
            attributes of the searched element e.g. {"class": "text-1"}

        Returns
        ----------
        el : ElementSoup
        """
        self.log(f'find: {"; ".join(str(arg) for arg in args)}')
        el = self.soup.find(*args, **kwargs)
        if el is None:
            return ElementSoupBlank()
        if not el.has_attr("href"):
            el.attrs["href"] = None
        return ElementSoup(el, self.logger)
    def find_all(self, *args, **kwargs):
        """
        Find and return all elements within the page html that meet the
        given criteria.

        Arguments are passed straight through to BeautifulSoup's find_all.

        Parameters
        ----------
        element : str
            the type of html element searched for e.g. 'div'
        attributes : dict
            attributes of the searched element e.g. {"class": "text-1"}

        Returns
        ----------
        els : list
            all ElementSoup instances that meet the criteria
        """
        self.log(f'find_all: {"; ".join(str(arg) for arg in args)}')
        els = []
        for el in self.soup.find_all(*args, **kwargs):
            if not el.has_attr("href"):
                el.attrs["href"] = None
            els.append(ElementSoup(el, self.logger))
        return els
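
# A minimal end-to-end sketch, not part of the original module. It assumes
# "https://example.com" is reachable, that the Page base class tolerates
# logger=None, and that USER_AGENTS is defined in ..config. It shows how
# PageSoup mirrors the Selenium-style .find / .find_all interface described
# in the class docstring.
if __name__ == "__main__":
    page = PageSoup.from_request("https://example.com")

    # .find returns a single ElementSoup, or an ElementSoupBlank when
    # nothing matches, so callers never have to None-check.
    heading = page.find("h1")

    # .find_all returns a list of ElementSoup wrappers; each wrapped tag is
    # guaranteed to carry an "href" key in .attrs, even if its value is None.
    links = page.find_all("a")
    print(f"found {len(links)} links")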