sanitize_html

Sanitizes and cleans HTML content by removing unwanted elements, attributes, and whitespace. Provides fine-grained control over each cleaning operation through configurable options.

def sanitize_html(
    html: str,
    *,
    remove_scripts: bool,
    remove_styles: bool,
    remove_svgs: bool,
    remove_comments: bool,
    remove_long_attributes: bool,
    max_attribute_length: int,
    preserve_attributes: list[str] | None,
    remove_empty_tags: bool,
    preserve_empty_tags: list[str] | None,
    minify_whitespace: bool,
) -> str

Examples

from typing import TypedDict
from playwright.async_api import Page
from intuned_browser import sanitize_html
class Params(TypedDict):
    """Parameters accepted by ``automation``; this example declares no keys."""
async def automation(page: Page, params: Params, **_kwargs):
    """Return the sanitized HTML of the first list item on books.toscrape.com.

    Navigates ``page`` to the site, reads the inner HTML of the first ``li``
    under ``ol.row``, passes it to ``sanitize_html`` with no extra options,
    prints the result, and returns it. ``params`` and ``**_kwargs`` are
    accepted but not used by this example.
    """
    await page.goto("https://books.toscrape.com")

    # Narrow the search step by step: the list container first, then its items.
    listing = page.locator("ol.row")
    first_entry = listing.locator("li").first

    raw_markup = await first_entry.inner_html()
    cleaned_markup = sanitize_html(raw_markup)

    # Echo the cleaned markup so it is visible in the run output.
    print(cleaned_markup)
    return cleaned_markup

Arguments

html

str

required

The HTML content to sanitize

remove_scripts

bool

Remove all <script> elements. Defaults to True.

remove_styles

bool

Remove all <style> elements. Defaults to True.

remove_svgs

bool

Remove all <svg> elements. Defaults to True.

remove_comments

bool

Remove HTML comments. Defaults to True.

remove_long_attributes

bool

Remove attributes longer than max_attribute_length. Defaults to True.

max_attribute_length

int

Maximum length for attributes before removal. Defaults to 500.

preserve_attributes

list[str] | None

List of attribute names to always preserve. Defaults to ["class", "src"].

remove_empty_tags

bool

Remove empty tags (except preserved ones). Defaults to True.

preserve_empty_tags

list[str] | None

List of tag names to preserve even when empty. Defaults to ["img"].

minify_whitespace

bool

Remove extra whitespace between tags and empty lines. Defaults to True.

Returns: `str`

The sanitized HTML string

Introduction

TypeScript SDK

Python SDK

Examples

Arguments

Returns: `str`

Introduction

TypeScript SDK

Python SDK

Examples

Arguments

Returns: `str`

Examples

Arguments

Returns: `str`