Source code for elm.web.html_pw

# -*- coding: utf-8 -*-
"""ELM Web HTML loading with Playwright

We use Playwright so that javascript text is rendered before we scrape.
"""
import logging
from contextlib import AsyncExitStack

from rebrowser_playwright.async_api import async_playwright
from rebrowser_playwright.async_api import Error as PlaywrightError
from rebrowser_playwright.async_api import (
    TimeoutError as PlaywrightTimeoutError
)

from elm.web.utilities import pw_page


logger = logging.getLogger(__name__)


async def load_html_with_pw(  # pragma: no cover
    url, browser_semaphore=None, timeout=90_000, **pw_launch_kwargs
):
    """Extract HTML from URL using Playwright.

    This is a best-effort loader: any Playwright error (including
    timeouts) raised while loading the page is logged and an empty
    string is returned instead of propagating the exception.

    Parameters
    ----------
    url : str
        URL to pull HTML for.
    browser_semaphore : asyncio.Semaphore, optional
        Semaphore instance that can be used to limit the number of
        playwright browsers open concurrently. If ``None``, no limits
        are applied. By default, ``None``.
    timeout : int, optional
        Maximum time, in milliseconds, to wait for the page to reach
        its load state. Pass ``0`` to disable the timeout.
        By default, ``90,000``.
    **pw_launch_kwargs
        Keyword-value argument pairs to pass to
        :meth:`async_playwright.chromium.launch`.

    Returns
    -------
    str
        HTML from page, or an empty string if Playwright raised an
        error while loading the page.
    """
    try:
        text = await _load_html(
            url,
            browser_semaphore=browser_semaphore,
            timeout=timeout,
            **pw_launch_kwargs,
        )
    except (PlaywrightError, PlaywrightTimeoutError):
        # Keep the best-effort contract (empty string on failure), but
        # record why the load failed so an empty result can be diagnosed.
        logger.debug(
            "Playwright failed to load HTML from %r", url, exc_info=True
        )
        text = ""
    return text
async def _load_html(  # pragma: no cover
    url, browser_semaphore=None, timeout=90_000, **pw_launch_kwargs
):
    """Fetch the HTML content of a page via a Playwright chromium browser.

    Parameters
    ----------
    url : str
        URL to navigate to.
    browser_semaphore : asyncio.Semaphore, optional
        Async context manager entered before the browser is launched
        and held until the page content has been read. If ``None``,
        an empty :class:`contextlib.AsyncExitStack` is used in its
        place. By default, ``None``.
    timeout : int, optional
        Timeout, in milliseconds, passed to the ``"networkidle"``
        load-state wait. By default, ``90,000``.
    **pw_launch_kwargs
        Keyword arguments forwarded to the chromium ``launch`` call.

    Returns
    -------
    str
        Page content as returned by ``page.content()``.
    """
    logger.trace("`_load_html` pw_launch_kwargs=%r", pw_launch_kwargs)
    logger.trace("browser_semaphore=%r", browser_semaphore)

    # An empty ``AsyncExitStack`` has nothing registered, so entering
    # and exiting it does nothing; this lets the code below always
    # enter a "limiter" regardless of whether one was supplied.
    limiter = (
        AsyncExitStack() if browser_semaphore is None else browser_semaphore
    )

    logger.trace("Loading HTML using playwright")
    async with async_playwright() as playwright:
        async with limiter:
            logger.trace(
                "launching chromium; browser_semaphore=%r", limiter
            )
            chromium = await playwright.chromium.launch(**pw_launch_kwargs)
            async with pw_page(chromium, intercept_routes=True) as page:
                logger.trace("Navigating to: %r", url)
                await page.goto(url)
                logger.trace("Waiting for load with timeout: %r", timeout)
                await page.wait_for_load_state(
                    "networkidle", timeout=timeout
                )
                html = await page.content()

    return html