Source code for selene.core.soup.page

import os
import time
import numpy as np
import requests
from datetime import datetime
from bs4 import BeautifulSoup

from ..page import *
from ..config import *

from .element import ElementSoup, ElementSoupBlank


class PageSoup(Page):
    """
    A page class to assist any workflow that requires BeautifulSoup.

    This makes Selenium WebDriver and BeautifulSoup more interchangeable:
    you can instantiate either a PageSoup or a PageSelene object, and the
    .find and .find_all methods work in similar ways.

    Inherits selene.core.page.Page
    """

    def __init__(self, url, soup, logger=None):
        """
        Initialise a PageSoup instance from existing, parsed soup.

        Parameters
        ----------
        url : str
            the url of the page
        soup : bs4.BeautifulSoup
            the parsed soup of the page
        logger : logging.Logger
            a logger instance (see core.logger.py)
        """
        Page.__init__(self, url, logger)
        self.soup = soup
    @classmethod
    def from_soup(cls, url, soup, logger=None):
        """
        Initialise a PageSoup instance from existing, parsed soup.

        Parameters
        ----------
        url : str
            the url of the page
        soup : bs4.BeautifulSoup
            the parsed soup of the page
        logger : logging.Logger
            a logger instance (see core.logger.py)
        """
        return cls(url, soup, logger)
    @classmethod
    def from_html(cls, url, html, logger=None):
        """
        Initialise a PageSoup instance from existing html source code.

        Parameters
        ----------
        url : str
            the url of the page
        html : str
            the html code to parse
        logger : logging.Logger
            a logger instance (see core.logger.py)
        """
        soup = BeautifulSoup(html, "lxml")
        return cls(url, soup, logger)
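    # Usage sketch (not in the original source; the file path is
    # hypothetical): from_html is handy when the html has already been
    # fetched or cached, e.g.
    #
    #     with open("cached/example.html") as f:
    #         page = PageSoup.from_html("https://example.com", f.read())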
    @classmethod
    def from_request(cls, url, logger=None):
        """
        Initialise a PageSoup instance by parsing a request to a web url.

        Parameters
        ----------
        url : str
            the url of the page
        logger : logging.Logger
            a logger instance (see core.logger.py)
        """
        # rotate user agents (USER_AGENTS is expected to come from ..config)
        user_agent = str(np.random.choice(USER_AGENTS))
        headers = {
            "User-Agent": user_agent,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        }
        with requests.Session() as session:
            page = session.get(url, headers=headers).content.decode()
        soup = BeautifulSoup(page, "lxml")
        return cls(url, soup, logger)
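    # Usage sketch (not in the original source): fetch and parse a live
    # page in one call; the url is illustrative.
    #
    #     page = PageSoup.from_request("https://example.com")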
    def find(self, *args, **kwargs):
        """
        Find and return a specific element within the page html.

        Arguments are passed straight through to BeautifulSoup's find,
        e.g. find('div', {"class": "text-1"}).

        Parameters
        ----------
        element : str
            the type of html element searched for e.g. 'div'
        attributes : dict
            attributes of the searched element e.g. {"class": "text-1"}

        Returns
        ----------
        el : ElementSoup
        """
        self.log(f'find: {"; ".join(str(arg) for arg in args)}')
        el = self.soup.find(*args, **kwargs)
        if el is None:
            return ElementSoupBlank()
        if not el.has_attr("href"):
            el.attrs["href"] = None
        return ElementSoup(el, self.logger)
    def find_all(self, *args, **kwargs):
        """
        Find and return all elements within the page html that meet the
        given criteria.

        Arguments are passed straight through to BeautifulSoup's find_all.

        Parameters
        ----------
        element : str
            the type of html element searched for e.g. 'div'
        attributes : dict
            attributes of the searched element e.g. {"class": "text-1"}

        Returns
        ----------
        els : list
            all ElementSoup instances that meet the criteria
        """
        self.log(f'find_all: {"; ".join(str(arg) for arg in args)}')
        els = []
        for el in self.soup.find_all(*args, **kwargs):
            if not el.has_attr("href"):
                el.attrs["href"] = None
            els.append(ElementSoup(el, self.logger))
        return els
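
# A minimal end-to-end sketch, not part of the original module. It assumes
# "https://example.com" is reachable, that the Page base class tolerates
# logger=None, and that USER_AGENTS is defined in ..config. It shows how
# PageSoup mirrors the Selenium-style .find / .find_all interface described
# in the class docstring.
if __name__ == "__main__":
    page = PageSoup.from_request("https://example.com")

    # .find returns a single ElementSoup, or an ElementSoupBlank when
    # nothing matches, so callers never have to None-check.
    heading = page.find("h1")

    # .find_all returns a list of ElementSoup wrappers; each wrapped tag is
    # guaranteed to carry an "href" key in .attrs, even if its value is None.
    links = page.find_all("a")
    print(f"found {len(links)} links")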