Skip to main content
Sanitizes and cleans HTML content by removing unwanted elements, attributes, and whitespace. Provides fine-grained control over each cleaning operation through configurable options.
def sanitize_html(
    html: str,
    *,
    remove_scripts: bool,
    remove_styles: bool,
    remove_svgs: bool,
    remove_comments: bool,
    remove_long_attributes: bool,
    max_attribute_length: int,
    preserve_attributes: list[str] | None,
    remove_empty_tags: bool,
    preserve_empty_tags: list[str] | None,
    minify_whitespace: bool,
) -> str

Examples

from typing import TypedDict
from playwright.async_api import Page
from intuned_browser import sanitize_html
class Params(TypedDict):
    """Parameters accepted by the automation; this example declares no fields."""
async def automation(page: Page, params: Params, **_kwargs):
    """Fetch the first book entry from books.toscrape.com and return its sanitized HTML."""
    await page.goto("https://books.toscrape.com")
    # Narrow down to the <li> entries inside the "ol.row" list.
    book_items = page.locator("ol.row").locator("li")
    # Grab the inner HTML of the first entry only.
    raw_markup = await book_items.first.inner_html()
    # Sanitize with every option left at its default value.
    cleaned_markup = sanitize_html(raw_markup)
    # Print the result for inspection, then hand it back to the caller.
    print(cleaned_markup)
    return cleaned_markup

Arguments

html
str
required
The HTML content to sanitize
remove_scripts
bool
Remove all <script> elements. Defaults to True.
remove_styles
bool
Remove all <style> elements. Defaults to True.
remove_svgs
bool
Remove all <svg> elements. Defaults to True.
remove_comments
bool
Remove HTML comments. Defaults to True.
remove_long_attributes
bool
Remove attributes longer than max_attribute_length. Defaults to True.
max_attribute_length
int
Maximum length for attributes before removal. Defaults to 500.
preserve_attributes
list[str]
List of attribute names to always preserve. Defaults to ["class", "src"].
remove_empty_tags
bool
Remove empty tags (except preserved ones). Defaults to True.
preserve_empty_tags
list[str]
List of tag names to preserve even when empty. Defaults to ["img"].
minify_whitespace
bool
Remove extra whitespace between tags and empty lines. Defaults to True.

Returns: str

The sanitized HTML string