Source code for web2vec.extractors.html_body_features

import re
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional
from urllib.parse import urlparse

import requests
import urllib3
from bs4 import BeautifulSoup

from web2vec.config import config
from web2vec.utils import get_domain_from_url


@dataclass
class HtmlBodyFeatures:
    """Feature record produced for a single HTML document.

    Every field up to and including ``logo_url`` is required; the remaining
    fields carry defaults. Declaration order is significant because the
    generated ``__init__`` accepts the fields positionally.
    """

    # Boolean content flags.
    contains_forms: bool
    contains_obfuscated_scripts: bool
    contains_suspicious_keywords: bool

    # Size and count metrics.
    body_length: int
    num_titles: int
    num_images: int
    num_links: int
    script_length: int
    special_characters: int

    # Ratios derived from the metrics above.
    script_to_special_chars_ratio: float
    script_to_body_ratio: float
    body_to_special_char_ratio: float

    # 0/1 indicator features.
    iframe_redirection: int
    mouse_over_effect: int
    right_click_disabled: int

    # Resources referenced over plain ``http://``.
    num_scripts_http: int
    num_styles_http: int
    num_iframes_http: int

    # Resources referenced from another domain.
    num_external_scripts: int
    num_external_styles: int
    num_external_iframes: int

    # Meta tags, forms, hidden elements, anchors and media.
    num_meta_tags: int
    num_forms: int
    num_forms_post: int
    num_forms_get: int
    num_forms_external_action: int
    num_hidden_elements: int
    num_safe_anchors: int
    num_media_http: int
    num_media_external: int
    num_email_forms: int
    num_internal_links: int

    # Branding hints; ``None`` when nothing was found.
    favicon_url: Optional[str]
    logo_url: Optional[str]

    # Raw attribute dictionaries of the matched elements.
    found_forms: List[Dict[str, Any]] = field(default_factory=list)
    found_images: List[Dict[str, Any]] = field(default_factory=list)
    found_anchors: List[Dict[str, Any]] = field(default_factory=list)
    found_media: List[Dict[str, Any]] = field(default_factory=list)
    copyright: Optional[str] = None

    # Provenance of the analysed markup.
    source_mode: str = "raw_http"
    was_js_rendered: bool = False
    likely_js_spa: bool = False
    html_snapshot_path: Optional[str] = None

    # Network activity associated with the page.
    num_network_requests: int = 0
    num_external_network_requests: int = 0
    num_api_endpoints: int = 0
    found_network_requests: List[str] = field(default_factory=list)
    found_api_endpoints: List[str] = field(default_factory=list)
def check_obfuscated_scripts(soup: BeautifulSoup) -> bool:
    """Report whether any ``<script>`` uses ``eval(`` or ``document.write(``.

    Both the ``src`` attribute and the inline script text are inspected.
    Looking only at ``src`` (a URL) would miss markers that live in the
    script body itself.

    Args:
        soup: Parsed HTML document.

    Returns:
        True if at least one script contains one of the markers, else False.
    """
    markers = ("eval(", "document.write(")
    for script in soup.find_all("script"):
        # ``src`` may be absent; fall back to an empty string for the scan.
        candidates = (script.get("src") or "", script.get_text())
        if any(marker in text for text in candidates for marker in markers):
            return True
    return False
def check_suspicious_keywords(
    soup: BeautifulSoup, keywords: Optional[List[str]] = None
) -> bool:
    """Tell whether the lower-cased page text contains any watched keyword.

    When ``keywords`` is ``None`` or empty, a built-in keyword list is used.
    Matching is a plain substring test against ``soup.get_text().lower()``.
    """
    default_terms = [
        "login",
        "update",
        "verify",
        "password",
        "bank",
        "account",
    ]
    terms = keywords if keywords else default_terms
    haystack = soup.get_text().lower()
    for term in terms:
        if term in haystack:
            return True
    return False
def body_length(soup: BeautifulSoup) -> int:
    """Return the number of characters in ``soup.get_text()``."""
    text = soup.get_text()
    return len(text)
def num_titles(soup: BeautifulSoup) -> int:
    """Count the heading elements (``<h1>`` to ``<h6>``) in the document.

    Args:
        soup: Parsed HTML document.

    Returns:
        Total number of ``h1``..``h6`` elements.
    """
    # Heading levels start at 1; ``h0`` is not an HTML element.
    heading_tags = [f"h{level}" for level in range(1, 7)]
    # Passing a list of names matches any of them in a single traversal.
    return len(soup.find_all(heading_tags))
def num_images(soup: BeautifulSoup) -> int:
    """Return how many ``<img>`` elements the document contains."""
    images = soup.find_all("img")
    return len(images)
def script_length(soup: BeautifulSoup) -> int:
    """Return the total character length of all inline script content.

    The previous implementation returned the number of ``<script>`` tags,
    which contradicted the function name and docstring (counts in this
    module use the ``num_`` prefix) and made the script-to-body ratio
    compare a tag count against a character count.

    Args:
        soup: Parsed HTML document.

    Returns:
        Sum of the text lengths of every ``<script>`` element. Scripts that
        only reference an external ``src`` contribute 0.
    """
    return sum(len(script.get_text()) for script in soup.find_all("script"))
def special_characters(soup: BeautifulSoup) -> int:
    """Count characters of the page text that are neither alphanumeric nor whitespace."""
    total = 0
    for char in soup.get_text():
        if char.isalnum() or char.isspace():
            continue
        total += 1
    return total
def script_to_special_chars_ratio(soup: BeautifulSoup) -> float:
    """Divide ``script_length(soup)`` by ``special_characters(soup)``.

    Returns 0 when the document has no special characters.
    """
    denominator = special_characters(soup)
    numerator = script_length(soup)
    if denominator > 0:
        return numerator / denominator
    return 0
def script_to_body_ratio(soup: BeautifulSoup) -> float:
    """Divide ``script_length(soup)`` by ``body_length(soup)``.

    Returns 0 when the body text is empty.
    """
    denominator = body_length(soup)
    numerator = script_length(soup)
    if denominator > 0:
        return numerator / denominator
    return 0
def body_to_special_char_ratio(soup: BeautifulSoup) -> float:
    """Divide ``body_length(soup)`` by ``special_characters(soup)``.

    Returns 0 when the document has no special characters.
    """
    numerator = body_length(soup)
    denominator = special_characters(soup)
    if denominator > 0:
        return numerator / denominator
    return 0
def iframe_redirection(soup: BeautifulSoup) -> int:
    """Return a 0/1 indicator based on the presence of iframe-related tags.

    Returns 1 when ``soup`` is falsy. Otherwise returns 0 if the document
    contains an element named ``iframe`` or ``frameborder``, and 1 if it
    contains neither.
    """
    if not soup:
        return 1
    # NOTE(review): "frameborder" is searched as a tag name here, although it
    # is normally an attribute of <iframe>; confirm the intended lookup.
    for tag_name in ("iframe", "frameborder"):
        if soup.find_all(tag_name):
            return 0
    return 1
def mouse_over_effect(soup: BeautifulSoup) -> int:
    """Return 1 if any element declares an ``onmouseover`` attribute, else 0.

    A falsy ``soup`` yields 1.
    """
    if not soup:
        return 1
    handlers = soup.find_all(onmouseover=True)
    if handlers:
        return 1
    return 0
def right_click_disabled(soup: BeautifulSoup) -> int:
    """Return a 0/1 indicator for an ``event.button == 2`` check in the markup.

    The serialized document (``str(soup)``) is searched for the JavaScript
    test ``event.button == 2`` (with an optional single space on either
    side of ``==``).

    Args:
        soup: Parsed HTML document; a falsy value short-circuits to 1.

    Returns:
        1 when ``soup`` is falsy or the pattern is absent, 0 when the
        pattern is found.
    """
    # NOTE(review): the polarity (0 when the right-click test is present) is
    # kept exactly as callers currently receive it; confirm it is intended.
    if not soup:
        return 1
    # The dot is escaped so only a literal "event.button" matches; search()
    # stops at the first hit instead of collecting every match.
    return 0 if re.search(r"event\.button ?== ?2", str(soup)) else 1
def num_scripts_http(soup: BeautifulSoup) -> int:
    """Count ``<script>`` elements whose ``src`` starts with ``http://``."""
    insecure = 0
    for script in soup.find_all("script", src=True):
        if script["src"].startswith("http://"):
            insecure += 1
    return insecure
def num_styles_http(soup: BeautifulSoup) -> int:
    """Count stylesheet links whose ``href`` starts with ``http://``.

    Args:
        soup: Parsed HTML document.

    Returns:
        Number of ``<link rel="stylesheet">`` elements loaded over plain HTTP.
    """
    # ``href=True`` skips stylesheet links without an href, which previously
    # raised KeyError on the ``style["href"]`` lookup.
    styles = soup.find_all("link", rel="stylesheet", href=True)
    return sum(1 for style in styles if style["href"].startswith("http://"))
def num_iframes_http(soup: BeautifulSoup) -> int:
    """Count ``<iframe>`` elements whose ``src`` starts with ``http://``."""
    insecure = 0
    for frame in soup.find_all("iframe", src=True):
        if frame["src"].startswith("http://"):
            insecure += 1
    return insecure
def num_external_scripts(soup: BeautifulSoup, base_domain: str) -> int:
    """Count ``<script src=...>`` elements that load from another domain.

    Args:
        soup: Parsed HTML document.
        base_domain: Domain of the page being analysed.

    Returns:
        Number of scripts for which ``is_external_url`` reports True.
    """
    # Relative sources have no host part and resolve against the page itself,
    # so they must not be counted as external (a bare ``netloc != base_domain``
    # comparison did count them).
    return sum(
        1
        for script in soup.find_all("script", src=True)
        if is_external_url(script["src"], base_domain)
    )
def num_external_styles(soup: BeautifulSoup, base_domain: str) -> int:
    """Count stylesheet links that load from another domain.

    Args:
        soup: Parsed HTML document.
        base_domain: Domain of the page being analysed.

    Returns:
        Number of ``<link rel="stylesheet">`` elements for which
        ``is_external_url`` reports True.
    """
    # ``href=True`` avoids a KeyError on stylesheet links without an href.
    styles = soup.find_all("link", rel="stylesheet", href=True)
    # Relative hrefs have no host part and are therefore not external.
    return sum(1 for style in styles if is_external_url(style["href"], base_domain))
def num_external_iframes(soup: BeautifulSoup, base_domain: str) -> int:
    """Count ``<iframe src=...>`` elements that load from another domain.

    Args:
        soup: Parsed HTML document.
        base_domain: Domain of the page being analysed.

    Returns:
        Number of iframes for which ``is_external_url`` reports True.
    """
    # Relative sources have no host part and are therefore not external.
    return sum(
        1
        for iframe in soup.find_all("iframe", src=True)
        if is_external_url(iframe["src"], base_domain)
    )
def num_meta_tags(soup: BeautifulSoup) -> int:
    """Return how many ``<meta>`` elements the document contains."""
    meta_tags = soup.find_all("meta")
    return len(meta_tags)
def num_forms(soup: BeautifulSoup) -> int:
    """Return how many ``<form>`` elements the document contains."""
    forms = soup.find_all("form")
    return len(forms)
def num_forms_post(soup: BeautifulSoup) -> int:
    """Count forms whose ``method`` attribute equals ``post`` (case-insensitive)."""
    matching = 0
    for form in soup.find_all("form"):
        method = form.get("method", "").lower()
        if method == "post":
            matching += 1
    return matching
def num_forms_get(soup: BeautifulSoup) -> int:
    """Count forms whose ``method`` attribute equals ``get`` (case-insensitive).

    Forms without a ``method`` attribute are not counted.
    """
    matching = 0
    for form in soup.find_all("form"):
        method = form.get("method", "").lower()
        if method == "get":
            matching += 1
    return matching
def num_forms_external_action(soup: BeautifulSoup, base_domain: str) -> int:
    """Count forms whose ``action`` targets a host other than ``base_domain``.

    Actions without a host part (relative URLs) are ignored.
    """
    external = 0
    for form in soup.find_all("form", action=True):
        target_host = urlparse(form["action"]).netloc
        if not target_host:
            continue
        if target_host != base_domain:
            external += 1
    return external
def hidden_elements(soup: BeautifulSoup) -> int:
    """Count elements whose inline ``style`` sets ``display: none``.

    The match tolerates whitespace around the colon and ignores letter case,
    so ``display:none``, ``display: none`` and ``DISPLAY : NONE`` all count.
    Only inline ``style`` attributes are examined.

    Args:
        soup: Parsed HTML document.

    Returns:
        Number of elements hidden through an inline ``display: none``.
    """
    display_none = re.compile(r"display\s*:\s*none", re.IGNORECASE)

    def _is_hidden(style_value: Optional[str]) -> bool:
        # The filter may receive an empty/None value; treat that as not hidden.
        return bool(style_value) and display_none.search(style_value) is not None

    return len(soup.find_all(style=_is_hidden))
def num_safe_anchors(soup: BeautifulSoup, base_domain: str) -> int:
    """Count ``<a href=...>`` elements that stay on ``base_domain``.

    An anchor counts when its href has no host part or when the host equals
    ``base_domain``.
    """
    safe = 0
    for anchor in soup.find_all("a", href=True):
        host = urlparse(anchor["href"]).netloc
        if host == base_domain or not host:
            safe += 1
    return safe
def num_media_http(soup: BeautifulSoup) -> int:
    """Count ``<img>``, ``<video>`` and ``<audio>`` elements served over ``http://``."""
    insecure = 0
    for element in soup.find_all(["img", "video", "audio"], src=True):
        if element["src"].startswith("http://"):
            insecure += 1
    return insecure
def num_media_external(soup: BeautifulSoup, base_domain: str) -> int:
    """Count ``<img>``, ``<video>`` and ``<audio>`` elements loaded from another domain.

    Args:
        soup: Parsed HTML document.
        base_domain: Domain of the page being analysed.

    Returns:
        Number of media elements for which ``is_external_url`` reports True.
    """
    media = soup.find_all(["img", "video", "audio"], src=True)
    # Relative sources have no host part and are therefore not external.
    return sum(1 for item in media if is_external_url(item["src"], base_domain))
def num_email_forms(soup: BeautifulSoup) -> int:
    """Count forms whose ``action`` begins with ``mailto:``."""
    mail_forms = 0
    for form in soup.find_all("form", action=True):
        if form["action"].startswith("mailto:"):
            mail_forms += 1
    return mail_forms
def find_favicon(soup: BeautifulSoup) -> Optional[str]:
    """Return the ``href`` of the first ``<link rel="icon">`` that has one.

    Args:
        soup: Parsed HTML document.

    Returns:
        The favicon URL as written in the markup, or ``None`` when no icon
        link with an ``href`` exists.
    """
    # ``href=True`` prevents a KeyError on icon links without an href and
    # lets a later, well-formed icon link be picked up instead.
    icon_link = soup.find("link", rel="icon", href=True)
    return icon_link["href"] if icon_link is not None else None
def detect_likely_js_spa(soup: BeautifulSoup) -> bool:
    """Guess whether the page relies on client-side JavaScript rendering.

    The page is flagged when any known application-root selector matches.
    Otherwise it is flagged only if it has at least three ``<script>``
    elements, fewer than 200 characters of stripped text, and a
    ``<noscript>`` element.
    """
    root_selectors = (
        "#root",
        "#app",
        "#__next",
        "#__nuxt",
        "[data-reactroot]",
        "[ng-app]",
    )
    for selector in root_selectors:
        if soup.select(selector):
            return True

    visible_text = soup.get_text(strip=True)
    script_tags = soup.find_all("script")
    noscript_present = bool(soup.find("noscript"))

    if len(script_tags) < 3:
        return False
    if len(visible_text) >= 200:
        return False
    return noscript_present
def is_external_url(url: str, base_domain: str) -> bool:
    """Tell whether ``url`` names a host different from ``base_domain``.

    URLs without a host part (for example relative paths) are never external.
    """
    host = urlparse(url).netloc
    return bool(host) and host != base_domain
def detect_api_endpoints(urls: List[str]) -> List[str]:
    """Pick out the URLs that resemble API or JSON endpoints.

    A URL qualifies when its lower-cased form ends with ``.json`` or matches
    one of the API-style path/query fragments. The result keeps the input
    order and drops duplicates.
    """
    api_pattern = re.compile(
        r"(/api/|/graphql|/rest/|/v\d+/|[?&]format=json)", re.I
    )

    def _looks_like_api(candidate: str) -> bool:
        lowered = candidate.lower()
        if lowered.endswith(".json"):
            return True
        return api_pattern.search(lowered) is not None

    matches = (candidate for candidate in urls if _looks_like_api(candidate))
    # dict.fromkeys de-duplicates while preserving first-seen order.
    return list(dict.fromkeys(matches))
def get_html_body_features(
    body: str,
    url: str,
    source_mode: str = "raw_http",
    was_js_rendered: bool = False,
    html_snapshot_path: Optional[str] = None,
    network_request_urls: Optional[List[str]] = None,
) -> HtmlBodyFeatures:
    """Build an ``HtmlBodyFeatures`` record from raw HTML markup.

    ``body`` is parsed with the ``html.parser`` backend and ``url`` is turned
    into the base domain via ``get_domain_from_url``. ``source_mode``,
    ``was_js_rendered`` and ``html_snapshot_path`` are copied to the result
    unchanged. ``network_request_urls`` (optional) is de-duplicated in
    first-seen order before the network-related features are derived.
    """
    soup = BeautifulSoup(body, "html.parser")
    base_domain = get_domain_from_url(url)

    # Network observations: unique URLs, the external subset, API-like subset.
    discovered_urls = list(dict.fromkeys(network_request_urls or []))
    external_discovered = []
    for request_url in discovered_urls:
        if is_external_url(request_url, base_domain):
            external_discovered.append(request_url)
    found_api_endpoints = detect_api_endpoints(discovered_urls)

    features: Dict[str, Any] = {}

    # Content flags and size metrics.
    features["contains_forms"] = bool(soup.find_all("form"))
    features["contains_obfuscated_scripts"] = check_obfuscated_scripts(soup)
    features["contains_suspicious_keywords"] = check_suspicious_keywords(soup)
    features["body_length"] = body_length(soup)
    features["num_titles"] = num_titles(soup)
    features["num_images"] = num_images(soup)
    features["num_links"] = num_links(soup)
    features["script_length"] = script_length(soup)
    features["special_characters"] = special_characters(soup)
    features["script_to_special_chars_ratio"] = script_to_special_chars_ratio(soup)
    features["script_to_body_ratio"] = script_to_body_ratio(soup)
    features["body_to_special_char_ratio"] = body_to_special_char_ratio(soup)

    # 0/1 indicators.
    features["iframe_redirection"] = iframe_redirection(soup)
    features["mouse_over_effect"] = mouse_over_effect(soup)
    features["right_click_disabled"] = right_click_disabled(soup)

    # Plain-HTTP and cross-domain resources.
    features["num_scripts_http"] = num_scripts_http(soup)
    features["num_styles_http"] = num_styles_http(soup)
    features["num_iframes_http"] = num_iframes_http(soup)
    features["num_external_scripts"] = num_external_scripts(soup, base_domain)
    features["num_external_styles"] = num_external_styles(soup, base_domain)
    features["num_external_iframes"] = num_external_iframes(soup, base_domain)

    # Meta tags, forms, hidden elements, anchors and media.
    features["num_meta_tags"] = num_meta_tags(soup)
    features["num_forms"] = num_forms(soup)
    features["num_forms_post"] = num_forms_post(soup)
    features["num_forms_get"] = num_forms_get(soup)
    features["num_forms_external_action"] = num_forms_external_action(
        soup, base_domain
    )
    features["num_hidden_elements"] = hidden_elements(soup)
    features["num_safe_anchors"] = num_safe_anchors(soup, base_domain)
    features["num_media_http"] = num_media_http(soup)
    features["num_media_external"] = num_media_external(soup, base_domain)
    features["num_email_forms"] = num_email_forms(soup)
    features["num_internal_links"] = num_internal_links(soup, base_domain)

    # Branding hints and raw element attributes.
    features["favicon_url"] = find_favicon(soup)
    features["logo_url"] = find_logo(soup)
    features["found_forms"] = [form.attrs for form in soup.find_all("form")]
    features["found_images"] = [img.attrs for img in soup.find_all("img")]
    features["found_anchors"] = [a.attrs for a in soup.find_all("a")]
    features["found_media"] = [
        m.attrs for m in soup.find_all(["img", "video", "audio"])
    ]
    features["copyright"] = find_copyright(soup)

    # Provenance passed through from the caller.
    features["source_mode"] = source_mode
    features["was_js_rendered"] = was_js_rendered
    features["likely_js_spa"] = detect_likely_js_spa(soup)
    features["html_snapshot_path"] = html_snapshot_path

    # Network-derived features.
    features["num_network_requests"] = len(discovered_urls)
    features["num_external_network_requests"] = len(external_discovered)
    features["num_api_endpoints"] = len(found_api_endpoints)
    features["found_network_requests"] = discovered_urls
    features["found_api_endpoints"] = found_api_endpoints

    return HtmlBodyFeatures(**features)
# Example usage:
if __name__ == "__main__":

    def _run_example() -> None:
        """Fetch a sample page, extract its features and print them."""
        from web2vec.crawlers.extractors import HtmlBodyExtractor

        target_url = "https://shop.volvocars.ca"

        # Silence the insecure-request warning only when TLS verification is off.
        if not config.ssl_verify:
            urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

        page_response = requests.get(
            target_url,
            allow_redirects=True,
            timeout=60,
            verify=config.ssl_verify,
        )

        body_extractor = HtmlBodyExtractor(
            enable_js_render=True,
            save_html_snapshot=True,
            render_wait_seconds=2.0,
        )
        extracted = body_extractor.extract_features(page_response)

        print(extracted)
        print(f"Snapshot path: {extracted.html_snapshot_path}")

    _run_example()