```python
async def extract_structured_data(
    *,
    content: list[ContentItem] | ContentItem,
    data_schema: type[BaseModel] | dict[str, Any],
    prompt: str | None = None,
    max_retires: int = 3,
    enable_cache: bool = True,
    model: SUPPORTED_MODELS = "claude-3-5-haiku-latest",
    api_key: str | None = None,
) -> Any
```
Extracts structured data from content items (text, images) using AI-powered analysis.

This overload provides a simplified interface for data extraction from various content types without requiring a page source or extraction strategy. It accepts text content, image buffers, or image URLs and extracts structured data according to the provided schema.

Examples
```python
from pydantic import BaseModel, Field

from intuned_browser.ai import TextContentItem, extract_structured_data

# Define schema using Pydantic
class Person(BaseModel):
    name: str = Field(description="Person's full name")
    age: int = Field(description="Person's age")
    occupation: str = Field(description="Person's job title")
    company: str = Field(description="Company name")

async def automation(page, params, **_kwargs):
    text_content: TextContentItem = {
        "type": "text",
        "data": "John Doe, age 30, works as a Software Engineer at Tech Corp",
    }
    person = await extract_structured_data(
        content=text_content,
        model="gpt-4o",
        data_schema=Person,  # Pass Pydantic model directly
        prompt="Extract person information from the text",
    )
    print(f"Found person: {person['name']}, {person['age']} years old")
```
Arguments

content
list[ContentItem] | ContentItem
required
Content to extract data from. Can be a single content item or a list of content items. Check ContentItem for more details.

data_schema
type[BaseModel] | dict[str, Any]
required
Schema defining the expected structure of the extracted data. Can be either a Pydantic BaseModel class or a JSON Schema dictionary.

prompt
str | None
default: None
Optional prompt to guide the extraction process and provide more context. Defaults to None.

max_retires
int
default: 3
Maximum number of retry attempts on failures. Failures can be validation errors, API errors, output errors, etc. Defaults to 3.

enable_cache
bool
default: True
Whether to enable caching of the extracted data. Defaults to True.

model
SUPPORTED_MODELS
default: "claude-3-5-haiku-latest"
AI model to use for extraction. See SUPPORTED_MODELS for all supported models. Defaults to "claude-3-5-haiku-latest".

api_key
str | None
default: None
Optional API key for AI extraction (if provided, usage will not be billed to your account). Defaults to None.
Returns: Any
The extracted structured data conforming to the provided schema.

Key Features & Limitations

- No DOM Matching: This overload does not support DOM matching since it doesn't operate on web pages.
- Smart Caching: Caching is based on content hash to avoid redundant API calls.
- Automatic Image Fetching: Image URLs are automatically fetched and converted to image buffers for processing (see the sketch after this list).
- Batch Processing: Multiple content items can be processed together for comprehensive extraction.
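To illustrate automatic image fetching, here is a minimal sketch. The field names of the image content item are an assumption for illustration; check ContentItem for the real shape:

```python
from intuned_browser.ai import extract_structured_data

# Hypothetical image content item; the exact fields for image items
# are an assumption here -- check ContentItem for the real shape.
image_content = {
    "type": "image",
    "data": "https://example.com/receipt.png",  # URL is fetched automatically
}

receipt_schema = {
    "type": "object",
    "properties": {
        "total": {"type": "string", "description": "Total amount on the receipt"},
        "date": {"type": "string", "description": "Receipt date"},
    },
}

async def automation(page, params, **_kwargs):
    receipt = await extract_structured_data(
        content=image_content,
        data_schema=receipt_schema,
        prompt="Extract the total and date from this receipt image",
    )
    print(receipt)
```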
```python
async def extract_structured_data(
    *,
    source: Page | Locator,
    data_schema: type[BaseModel] | dict[str, Any],
    prompt: str | None = None,
    strategy: Literal["IMAGE", "MARKDOWN", "HTML"] = "HTML",
    enable_dom_matching: bool = False,
    enable_cache: bool = True,
    max_retires: int = 3,
    model: SUPPORTED_MODELS = "claude-3-5-haiku-latest",
    api_key: str | None = None,
) -> Any
```
Extracts structured data from web pages using AI-powered content analysis.

This function provides intelligent data extraction from web pages using various strategies, including HTML parsing, image analysis, and Markdown conversion. It supports extraction from entire pages or specific elements, with built-in caching and retry mechanisms.

Examples
```python
from pydantic import BaseModel, Field

from intuned_browser.ai import extract_structured_data

# Define schema using Pydantic
class Book(BaseModel):
    name: str = Field(description="Book title")
    price: str = Field(description="Book price")
    description: str | None = Field(default=None, description="Book description")
    in_stock: bool = Field(description="Stock availability")
    rating: str | None = Field(default=None, description="Book rating")

async def automation(page, params, **_kwargs):
    await page.goto("https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html")
    product = await extract_structured_data(
        source=page,
        strategy="HTML",
        model="gpt-4o",
        data_schema=Book,  # Pass Pydantic model directly
        prompt="Extract book details from this page",
        enable_cache=True,
        max_retires=3,
    )
    print(f"Found product: {product['name']} - {product['price']}")
```
Arguments

source
Page | Locator
required
Playwright Page object to extract data from the entire page, or Locator object to extract data from a specific element.

data_schema
type[BaseModel] | dict[str, Any]
required
Schema defining the structure of the data to extract. Can be either a Pydantic BaseModel class or a JSON Schema dictionary.

prompt
str | None
default: None
Optional prompt to guide the extraction process and provide more context. Defaults to None.

strategy
Literal['IMAGE', 'MARKDOWN', 'HTML']
default: "HTML"
Type of extraction: "HTML", "IMAGE", or "MARKDOWN". Defaults to "HTML".

enable_dom_matching
bool
default: False
Whether to enable DOM element matching during extraction. Defaults to False. When set to True, all types in the schema must be strings so they can be matched with DOM elements. Extracted results are matched with DOM elements and cached in a smart fashion, so the next time the same data is extracted, the result is returned from the cache even if the DOM has minor changes.

enable_cache
bool
default: True
Whether to enable caching of the extracted data. Defaults to True.

max_retires
int
default: 3
Maximum number of retry attempts on failures. Failures can be validation errors, API errors, output errors, etc. Defaults to 3.

model
SUPPORTED_MODELS
default: "claude-3-5-haiku-latest"
AI model to use for extraction. See SUPPORTED_MODELS for all supported models. Defaults to "claude-3-5-haiku-latest".

api_key
str | None
default: None
Optional API key for AI extraction (if provided, usage will not be billed to your account). Defaults to None.
Returns: Any
The extracted structured data conforming to the provided schema.

Extraction Strategies

- HTML: Extracts simplified HTML and processes it for structured data. Best for text-heavy content with semantic markup.
- IMAGE: Captures page/element screenshots and uses vision AI for extraction. Ideal for visually complex layouts or content.
- MARKDOWN: Converts HTML content to markdown format before extraction. Great for readable content with structured formatting.
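Switching strategies only changes the `strategy` argument. A minimal sketch using the IMAGE strategy for a visually complex page; the URL and schema here are illustrative:

```python
from intuned_browser.ai import extract_structured_data

pricing_schema = {
    "type": "object",
    "properties": {
        "plan": {"type": "string", "description": "Name of the highlighted plan"},
        "price": {"type": "string", "description": "Displayed price"},
    },
}

async def automation(page, params, **_kwargs):
    # IMAGE strategy: a screenshot of the page (or element) is sent to a
    # vision-capable model instead of the page's HTML.
    await page.goto("https://example.com/pricing")  # illustrative URL
    plan = await extract_structured_data(
        source=page,
        strategy="IMAGE",
        data_schema=pricing_schema,
        prompt="Extract the highlighted pricing plan",
    )
    print(plan)
```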
Performance & Caching
All strategies implement intelligent caching based on content hash to avoid redundant API calls
Cache keys include schema, model, strategy, prompt, and page content to ensure proper invalidation
For DOM matching , all types in the schema must be strings to match with the DOM elements
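A minimal sketch of a DOM-matching-compatible schema: every field is typed as a string, as the constraint above requires. It reuses the books.toscrape.com page from the example; the string-only variant of the Book schema is introduced here for illustration:

```python
from pydantic import BaseModel, Field

from intuned_browser.ai import extract_structured_data

# All fields are strings, as DOM matching requires
class BookStrings(BaseModel):
    name: str = Field(description="Book title")
    price: str = Field(description="Book price")
    rating: str = Field(description="Book rating")

async def automation(page, params, **_kwargs):
    await page.goto("https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html")
    book = await extract_structured_data(
        source=page,
        strategy="HTML",
        data_schema=BookStrings,
        enable_dom_matching=True,  # results are matched to DOM elements and cached
    )
    print(book)
```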