Source code for elm.web.html_pw

# -*- coding: utf-8 -*-
"""ELM Web HTML loading with Playwright

We use Playwright so that javascript text is rendered before we scrape.
"""
import logging
from contextlib import AsyncExitStack

from playwright.async_api import async_playwright
from playwright.async_api import Error as PlaywrightError
from playwright.async_api import TimeoutError as PlaywrightTimeoutError

logger = logging.getLogger(__name__)

# block pages by resource type. e.g. image, stylesheet
BLOCK_RESOURCE_TYPES = [
    "beacon",
    "csp_report",
    "font",
    "image",
    "imageset",
    "media",
    "object",
    "texttrack",
    #  can block stylsheets and scripts, though it's not recommended:
    # 'stylesheet',
    # 'script',
    # 'xhr',
]


# block popular 3rd party resources like tracking and advertisements.
BLOCK_RESOURCE_NAMES = [
    "adzerk",
    "analytics",
    "cdn.api.twitter",
    "doubleclick",
    "exelator",
    "facebook",
    "fontawesome",
    "google",
    "google-analytics",
    "googletagmanager",
    "lit.connatix",  # <- not sure about this one
]


async def _intercept_route(route):  # pragma: no cover
    """intercept all requests and abort blocked ones

    Source: https://scrapfly.io/blog/how-to-block-resources-in-playwright/
    """
    if route.request.resource_type in BLOCK_RESOURCE_TYPES:
        return await route.abort()

    if any(key in route.request.url for key in BLOCK_RESOURCE_NAMES):
        return await route.abort()

    return await route.continue_()


[docs] async def load_html_with_pw( # pragma: no cover url, browser_semaphore=None, **pw_launch_kwargs ): """Extract HTML from URL using Playwright. Parameters ---------- url : str URL to pull HTML for. browser_semaphore : asyncio.Semaphore, optional Semaphore instance that can be used to limit the number of playwright browsers open concurrently. If ``None``, no limits are applied. By default, ``None``. **pw_launch_kwargs Keyword-value argument pairs to pass to :meth:`async_playwright.chromium.launch`. Returns ------- str HTML from page. """ try: text = await _load_html(url, browser_semaphore, **pw_launch_kwargs) except (PlaywrightError, PlaywrightTimeoutError): text = "" return text
async def _load_html( # pragma: no cover url, browser_sem=None, **pw_launch_kwargs ): """Load html using playwright""" if browser_sem is None: browser_sem = AsyncExitStack() async with async_playwright() as p, browser_sem: browser = await p.chromium.launch(**pw_launch_kwargs) page = await browser.new_page() await page.route("**/*", _intercept_route) await page.goto(url) await page.wait_for_load_state("networkidle", timeout=90_000) text = await page.content() return text