Sanitizes and cleans HTML content by removing unwanted elements, attributes, and whitespace.
Provides fine-grained control over each cleaning operation through configurable options.
def sanitize_html(
html: str,
*,
remove_scripts: bool,
remove_styles: bool,
remove_svgs: bool,
remove_comments: bool,
remove_long_attributes: bool,
max_attribute_length: int,
preserve_attributes: list[str] | None,
remove_empty_tags: bool,
preserve_empty_tags: list[str] | None,
minify_whitespace: bool,
) -> str
Examples
from typing import TypedDict
from playwright.async_api import Page
from intuned_browser import sanitize_html
class Params(TypedDict):
    """Parameters accepted by the example automation; no keys are declared."""

    pass
async def automation(page: Page, params: Params, **_kwargs):
    """Sanitize the HTML of the first book entry on books.toscrape.com.

    Navigates ``page`` to the site, reads the inner HTML of the first ``li``
    under ``ol.row``, passes it to ``sanitize_html`` with no option
    overrides, prints the result and returns it.

    Args:
        page: Playwright page used for navigation and element lookup.
        params: Automation parameters; not read by this example.
        **_kwargs: Extra keyword arguments; accepted and ignored.

    Returns:
        The value returned by ``sanitize_html`` for the first row's HTML.
    """
    await page.goto("https://books.toscrape.com")
    first_row = page.locator("ol.row").locator("li").first
    # Get the HTML of the first row.
    html = await first_row.inner_html()
    # Sanitize the HTML.
    sanitized_html = sanitize_html(html)
    # Log the sanitized HTML.
    print(sanitized_html)
    # Return the sanitized HTML.
    return sanitized_html
Arguments
html: The HTML content to sanitize.
remove_scripts: Remove all <script> elements. Defaults to True.
remove_styles: Remove all <style> elements. Defaults to True.
remove_svgs: Remove all <svg> elements. Defaults to True.
remove_comments: Remove HTML comments. Defaults to True.
remove_long_attributes: Remove attributes longer than max_attribute_length. Defaults to True.
max_attribute_length: Maximum length for attributes before removal. Defaults to 500.
preserve_attributes: List of attribute names to always preserve. Defaults to ["class", "src"].
remove_empty_tags: Remove empty tags (except preserved ones). Defaults to True.
preserve_empty_tags: List of tag names to preserve even when empty. Defaults to ["img"].
minify_whitespace: Remove extra whitespace between tags and empty lines. Defaults to True.
Returns: str
The sanitized HTML string