import re
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional
from urllib.parse import urlparse
import requests
import urllib3
from bs4 import BeautifulSoup
from web2vec.config import config
from web2vec.utils import get_domain_from_url
[docs]
@dataclass
class HtmlBodyFeatures:
    """Feature record describing a single parsed HTML document.

    Instances are built by ``get_html_body_features``. Fields without a
    default must always be supplied; the trailing fields fall back to empty
    or neutral defaults.
    """

    # Boolean content flags.
    contains_forms: bool
    contains_obfuscated_scripts: bool
    contains_suspicious_keywords: bool

    # Text size and element counts.
    body_length: int
    num_titles: int
    num_images: int
    num_links: int
    script_length: int
    special_characters: int

    # Ratios derived from the counts above.
    script_to_special_chars_ratio: float
    script_to_body_ratio: float
    body_to_special_char_ratio: float

    # 0/1 indicator values.
    iframe_redirection: int
    mouse_over_effect: int
    right_click_disabled: int

    # Resources referenced over plain ``http://``.
    num_scripts_http: int
    num_styles_http: int
    num_iframes_http: int

    # Resources whose host differs from the page's domain.
    num_external_scripts: int
    num_external_styles: int
    num_external_iframes: int

    # Meta tag, form, anchor and media counts.
    num_meta_tags: int
    num_forms: int
    num_forms_post: int
    num_forms_get: int
    num_forms_external_action: int
    num_hidden_elements: int
    num_safe_anchors: int
    num_media_http: int
    num_media_external: int
    num_email_forms: int
    num_internal_links: int

    # Branding-related URLs (None when nothing was found).
    favicon_url: Optional[str]
    logo_url: Optional[str]

    # Raw attribute dictionaries of the matched elements.
    found_forms: List[Dict[str, Any]] = field(default_factory=list)
    found_images: List[Dict[str, Any]] = field(default_factory=list)
    found_anchors: List[Dict[str, Any]] = field(default_factory=list)
    found_media: List[Dict[str, Any]] = field(default_factory=list)
    copyright: Optional[str] = None

    # Provenance of the analysed HTML.
    source_mode: str = "raw_http"
    was_js_rendered: bool = False
    likely_js_spa: bool = False
    html_snapshot_path: Optional[str] = None

    # Network requests associated with the page.
    num_network_requests: int = 0
    num_external_network_requests: int = 0
    num_api_endpoints: int = 0
    found_network_requests: List[str] = field(default_factory=list)
    found_api_endpoints: List[str] = field(default_factory=list)
[docs]
def check_obfuscated_scripts(soup: BeautifulSoup) -> bool:
    """Check whether any script uses ``eval(`` or ``document.write(``.

    Both the ``src`` attribute and the inline body of every ``<script>`` tag
    are inspected. Looking only at ``src`` (a URL) would never see the calls
    made by inline scripts.

    Args:
        soup: Parsed HTML document.

    Returns:
        True if at least one script contains one of the markers.
    """
    markers = ("eval(", "document.write(")
    for script in soup.find_all("script"):
        # ``.string`` is None for an empty tag, ``src`` may be missing.
        candidates = (script.get("src") or "", script.string or "")
        if any(marker in text for text in candidates for marker in markers):
            return True
    return False
[docs]
def check_suspicious_keywords(
    soup: BeautifulSoup, keywords: Optional[List[str]] = None
) -> bool:
    """Check whether the page text contains any suspicious keyword.

    Matching is a case-insensitive substring test: both the page text and
    the keywords are lower-cased before comparison.

    Args:
        soup: Parsed HTML document.
        keywords: Keywords to look for. When omitted or empty, a built-in
            default list is used.

    Returns:
        True if at least one keyword occurs in the page text.
    """
    suspicious_keywords = keywords or [
        "login",
        "update",
        "verify",
        "password",
        "bank",
        "account",
    ]
    page_content = soup.get_text().lower()
    # The text is lower-cased, so the keywords must be as well; otherwise a
    # caller-supplied "Login" could never match.
    return any(keyword.lower() in page_content for keyword in suspicious_keywords)
[docs]
def body_length(soup: BeautifulSoup) -> int:
    """Return the number of characters in the document's extracted text."""
    text = soup.get_text()
    return len(text)
[docs]
def num_titles(soup: BeautifulSoup) -> int:
    """Return the number of heading tags (``h1`` to ``h6``) in the document.

    Args:
        soup: Parsed HTML document.

    Returns:
        Total count of heading elements.
    """
    # HTML headings start at h1; there is no h0 tag.
    heading_tags = [f"h{level}" for level in range(1, 7)]
    return len(soup.find_all(heading_tags))
[docs]
def num_images(soup: BeautifulSoup) -> int:
    """Return how many ``<img>`` tags the document contains."""
    images = soup.find_all("img")
    return len(images)
[docs]
def num_links(soup: BeautifulSoup) -> int:
    """Return how many ``<a>`` tags the document contains."""
    anchors = soup.find_all("a")
    return len(anchors)
[docs]
def script_length(soup: BeautifulSoup) -> int:
    """Return the number of ``<script>`` tags in the document.

    Despite the name, this is a tag count and not a character length.
    """
    script_tags = soup.find_all("script")
    return len(script_tags)
[docs]
def special_characters(soup: BeautifulSoup) -> int:
    """Count the characters of the page text that are neither alphanumeric nor whitespace."""
    count = 0
    for char in soup.get_text():
        if char.isalnum() or char.isspace():
            continue
        count += 1
    return count
[docs]
def script_to_special_chars_ratio(soup: BeautifulSoup) -> float:
    """Return ``script_length`` divided by ``special_characters``.

    Yields 0 when the page text has no special characters.
    """
    denominator = special_characters(soup)
    numerator = script_length(soup)
    if denominator > 0:
        return numerator / denominator
    return 0
[docs]
def script_to_body_ratio(soup: BeautifulSoup) -> float:
    """Return ``script_length`` divided by ``body_length``.

    Yields 0 when the page text is empty.
    """
    denominator = body_length(soup)
    numerator = script_length(soup)
    if denominator > 0:
        return numerator / denominator
    return 0
[docs]
def body_to_special_char_ratio(soup: BeautifulSoup) -> float:
    """Return ``body_length`` divided by ``special_characters``.

    Yields 0 when the page text has no special characters.
    """
    numerator = body_length(soup)
    denominator = special_characters(soup)
    if denominator > 0:
        return numerator / denominator
    return 0
[docs]
def iframe_redirection(soup: BeautifulSoup) -> int:
    """Report whether the document embeds frames.

    Args:
        soup: Parsed HTML document (a falsy value is treated as "no frames").

    Returns:
        0 when an ``<iframe>`` tag or any element carrying a ``frameborder``
        attribute is present, 1 otherwise.
    """
    if not soup:
        return 1
    # ``frameborder`` is an attribute, not a tag name, so it has to be
    # searched as an attribute filter.
    has_frame = (
        soup.find("iframe") is not None
        or soup.find(frameborder=True) is not None
    )
    return 0 if has_frame else 1
[docs]
def mouse_over_effect(soup: BeautifulSoup) -> int:
    """Report whether any element declares an ``onmouseover`` attribute.

    Returns 1 when such an element exists or when ``soup`` is falsy,
    otherwise 0.
    """
    if not soup:
        return 1
    handlers = soup.find_all(onmouseover=True)
    return int(bool(handlers))
[docs]
def right_click_disabled(soup: BeautifulSoup) -> int:
    """Report whether the markup tests for the right mouse button.

    The serialized document is searched for a comparison of
    ``event.button`` with ``2`` using either ``==`` or ``===``.

    Args:
        soup: Parsed HTML document (a falsy value is treated as "not found").

    Returns:
        0 when such a comparison is found, 1 otherwise.
    """
    if not soup:
        return 1
    # The dot is escaped so it only matches a literal ".", and both loose
    # and strict equality operators are accepted.
    pattern = r"event\.button\s*={2,3}\s*2"
    return 0 if re.search(pattern, str(soup)) else 1
[docs]
def num_scripts_http(soup: BeautifulSoup) -> int:
    """Count ``<script>`` tags whose ``src`` starts with ``http://``."""
    total = 0
    for tag in soup.find_all("script", src=True):
        if tag["src"].startswith("http://"):
            total += 1
    return total
[docs]
def num_styles_http(soup: BeautifulSoup) -> int:
    """Count stylesheet links whose ``href`` starts with ``http://``.

    Args:
        soup: Parsed HTML document.

    Returns:
        Number of ``<link rel="stylesheet">`` tags loaded over plain HTTP.
    """
    # ``href=True`` skips links without an href, which would otherwise
    # raise KeyError on the subscript below.
    stylesheets = soup.find_all("link", rel="stylesheet", href=True)
    return len([link for link in stylesheets if link["href"].startswith("http://")])
[docs]
def num_iframes_http(soup: BeautifulSoup) -> int:
    """Count ``<iframe>`` tags whose ``src`` starts with ``http://``."""
    total = 0
    for frame in soup.find_all("iframe", src=True):
        if frame["src"].startswith("http://"):
            total += 1
    return total
[docs]
def num_external_scripts(soup: BeautifulSoup, base_domain: str) -> int:
    """Count ``<script>`` tags whose ``src`` points to another host.

    Args:
        soup: Parsed HTML document.
        base_domain: Host of the page being analysed.

    Returns:
        Number of scripts for which ``is_external_url`` is True.
    """
    scripts = soup.find_all("script", src=True)
    # Relative URLs have no host and belong to the page itself, so the
    # shared helper is used instead of a bare netloc comparison.
    return len(
        [script for script in scripts if is_external_url(script["src"], base_domain)]
    )
[docs]
def num_external_styles(soup: BeautifulSoup, base_domain: str) -> int:
    """Count stylesheet links whose ``href`` points to another host.

    Args:
        soup: Parsed HTML document.
        base_domain: Host of the page being analysed.

    Returns:
        Number of ``<link rel="stylesheet">`` tags for which
        ``is_external_url`` is True.
    """
    # ``href=True`` skips links without an href (avoids KeyError).
    styles = soup.find_all("link", rel="stylesheet", href=True)
    # Relative URLs have no host and belong to the page itself.
    return len(
        [style for style in styles if is_external_url(style["href"], base_domain)]
    )
[docs]
def num_external_iframes(soup: BeautifulSoup, base_domain: str) -> int:
    """Count ``<iframe>`` tags whose ``src`` points to another host.

    Args:
        soup: Parsed HTML document.
        base_domain: Host of the page being analysed.

    Returns:
        Number of iframes for which ``is_external_url`` is True.
    """
    iframes = soup.find_all("iframe", src=True)
    # Relative URLs have no host and belong to the page itself.
    return len(
        [iframe for iframe in iframes if is_external_url(iframe["src"], base_domain)]
    )
[docs]
def num_forms_post(soup: BeautifulSoup) -> int:
    """Count ``<form>`` tags whose ``method`` attribute is ``post`` (any case)."""
    post_forms = 0
    for form in soup.find_all("form"):
        method = form.get("method", "")
        if method.lower() == "post":
            post_forms += 1
    return post_forms
[docs]
def hidden_elements(soup: BeautifulSoup) -> int:
    """Count elements hidden through an inline ``display: none`` style.

    Args:
        soup: Parsed HTML document.

    Returns:
        Number of elements whose ``style`` attribute sets ``display`` to
        ``none``.
    """
    # Tolerate optional whitespace around the colon and any letter case;
    # a plain "display:none" substring test misses "display: none".
    hidden_style = re.compile(r"display\s*:\s*none", re.IGNORECASE)
    return len(soup.find_all(style=hidden_style))
[docs]
def num_safe_anchors(soup: BeautifulSoup, base_domain: str) -> int:
    """Count anchors that stay on the page's own domain.

    An anchor counts when its ``href`` has no host part at all or when the
    host equals ``base_domain``.
    """
    safe = 0
    for anchor in soup.find_all("a", href=True):
        host = urlparse(anchor["href"]).netloc
        if host == base_domain or not host:
            safe += 1
    return safe
[docs]
def num_internal_links(soup: BeautifulSoup, base_domain: str) -> int:
    """Count anchors whose ``href`` host equals ``base_domain``.

    Only links that carry an explicit host are counted; relative links
    (empty host) are not.
    """
    internal = 0
    for anchor in soup.find_all("a", href=True):
        if urlparse(anchor["href"]).netloc == base_domain:
            internal += 1
    return internal
[docs]
def find_favicon(soup: BeautifulSoup) -> Optional[str]:
    """Find the favicon URL in the document.

    Args:
        soup: Parsed HTML document.

    Returns:
        The ``href`` of the first ``<link rel="icon">`` that has one, or
        None when no such link exists.
    """
    # ``href=True`` skips icon links without an href instead of raising
    # KeyError on them.
    icon_link = soup.find("link", rel="icon", href=True)
    return icon_link["href"] if icon_link else None
[docs]
def find_logo(soup: BeautifulSoup) -> Optional[str]:
    """Find the logo URL in the document.

    Args:
        soup: Parsed HTML document.

    Returns:
        The ``src`` of the first ``<img>`` whose ``alt`` text contains
        "logo" (case-insensitive) and that has a ``src``, or None when no
        such image exists.
    """
    # ``src=True`` skips images without a src (e.g. lazy-loaded ones)
    # instead of raising KeyError on them.
    logo_img = soup.find("img", alt=re.compile(r"logo", re.I), src=True)
    return logo_img["src"] if logo_img else None
[docs]
def find_copyright(soup: BeautifulSoup) -> Optional[str]:
    """Find copyright information in the document.

    Meta tags are checked first: the ``content`` of the first ``<meta>``
    matching any pattern is returned as is. Otherwise the page text is
    searched pattern by pattern and a window of up to 30 characters on each
    side of the first hit is returned.

    Args:
        soup: Parsed HTML document.

    Returns:
        The matching meta content or text snippet, or None if nothing
        matches.
    """
    # Patterns are tried in this order. The "©" pattern is listed once;
    # a second identical entry could never match where the first did not.
    patterns = [
        re.compile(r"©"),
        re.compile(r"copyright", re.IGNORECASE),
        re.compile(r"All rights reserved", re.IGNORECASE),
    ]

    # Search in meta tags.
    for meta in soup.find_all("meta"):
        content = meta.get("content")
        if content and any(pattern.search(content) for pattern in patterns):
            return content

    # Search in text content.
    text = soup.get_text(separator=" ")
    for pattern in patterns:
        match = pattern.search(text)
        if match:
            start = max(0, match.start() - 30)
            end = match.end() + 30
            return text[start:end]
    return None
[docs]
def detect_likely_js_spa(soup: BeautifulSoup) -> bool:
    """Heuristically decide whether the page likely relies on JS rendering.

    The page is flagged when it contains a well-known SPA mount point, or
    when it has at least three scripts, fewer than 200 characters of
    stripped text, and a ``<noscript>`` tag.
    """
    spa_roots = (
        "#root",
        "#app",
        "#__next",
        "#__nuxt",
        "[data-reactroot]",
        "[ng-app]",
    )
    for selector in spa_roots:
        if soup.select(selector):
            return True

    # Fallback: script-heavy page with almost no text and a noscript notice.
    if len(soup.find_all("script")) < 3:
        return False
    if len(soup.get_text(strip=True)) >= 200:
        return False
    return soup.find("noscript") is not None
[docs]
def is_external_url(url: str, base_domain: str) -> bool:
    """Return True when ``url`` points outside the current page's domain.

    Args:
        url: URL to classify; may be relative.
        base_domain: Host of the page being analysed.

    Returns:
        False for URLs without a host part (relative URLs), otherwise
        whether the URL's host differs from ``base_domain``.
    """
    host = urlparse(url).netloc
    if not host:
        return False
    # Host names are case-insensitive, so compare them lower-cased.
    return host.lower() != (base_domain or "").lower()
[docs]
def detect_api_endpoints(urls: List[str]) -> List[str]:
    """Return the URLs that look like API/JSON endpoints.

    A URL qualifies when it ends in ``.json`` (with or without a trailing
    query string or fragment) or contains one of the markers ``/api/``,
    ``/graphql``, ``/rest/``, ``/v<digits>/`` or ``format=json`` as a query
    parameter.

    Args:
        urls: Candidate URLs.

    Returns:
        Matching URLs in first-seen order, without duplicates.
    """
    pattern = re.compile(r"(/api/|/graphql|/rest/|/v\d+/|[?&]format=json)", re.I)
    api_like = []
    for candidate in urls:
        lowered = candidate.lower()
        # Drop the query string/fragment so "data.json?v=1" is recognised.
        path_part = re.split(r"[?#]", lowered, maxsplit=1)[0]
        if (
            lowered.endswith(".json")
            or path_part.endswith(".json")
            or pattern.search(lowered)
        ):
            api_like.append(candidate)
    # dict.fromkeys removes duplicates while keeping insertion order.
    return list(dict.fromkeys(api_like))
[docs]
def get_html_body_features(
    body: str,
    url: str,
    source_mode: str = "raw_http",
    was_js_rendered: bool = False,
    html_snapshot_path: Optional[str] = None,
    network_request_urls: Optional[List[str]] = None,
) -> HtmlBodyFeatures:
    """Extract HTML body features from the given markup.

    Args:
        body: HTML source to analyse.
        url: URL of the page; its domain is used to classify resources as
            internal or external.
        source_mode: Stored unchanged on the result.
        was_js_rendered: Stored unchanged on the result.
        html_snapshot_path: Stored unchanged on the result.
        network_request_urls: URLs of network requests associated with the
            page; duplicates are dropped while keeping first-seen order.

    Returns:
        A populated ``HtmlBodyFeatures`` instance.
    """
    soup = BeautifulSoup(body, "html.parser")
    base_domain = get_domain_from_url(url)

    discovered_urls = list(dict.fromkeys(network_request_urls or []))
    external_discovered = [
        item for item in discovered_urls if is_external_url(item, base_domain)
    ]
    found_api_endpoints = detect_api_endpoints(discovered_urls)

    # Query once; the results feed both the flags and the "found_*" lists.
    forms = soup.find_all("form")
    images = soup.find_all("img")
    anchors = soup.find_all("a")

    # NOTE(review): num_meta_tags, num_forms, num_forms_get,
    # num_forms_external_action, num_media_http, num_media_external and
    # num_email_forms are not defined in the visible part of this module -
    # confirm they are available at runtime.
    return HtmlBodyFeatures(
        contains_forms=bool(forms),
        contains_obfuscated_scripts=check_obfuscated_scripts(soup),
        contains_suspicious_keywords=check_suspicious_keywords(soup),
        body_length=body_length(soup),
        num_titles=num_titles(soup),
        num_images=num_images(soup),
        num_links=num_links(soup),
        script_length=script_length(soup),
        special_characters=special_characters(soup),
        script_to_special_chars_ratio=script_to_special_chars_ratio(soup),
        script_to_body_ratio=script_to_body_ratio(soup),
        body_to_special_char_ratio=body_to_special_char_ratio(soup),
        iframe_redirection=iframe_redirection(soup),
        mouse_over_effect=mouse_over_effect(soup),
        right_click_disabled=right_click_disabled(soup),
        num_scripts_http=num_scripts_http(soup),
        num_styles_http=num_styles_http(soup),
        num_iframes_http=num_iframes_http(soup),
        num_external_scripts=num_external_scripts(soup, base_domain),
        num_external_styles=num_external_styles(soup, base_domain),
        num_external_iframes=num_external_iframes(soup, base_domain),
        num_meta_tags=num_meta_tags(soup),
        num_forms=num_forms(soup),
        num_forms_post=num_forms_post(soup),
        num_forms_get=num_forms_get(soup),
        num_forms_external_action=num_forms_external_action(soup, base_domain),
        num_hidden_elements=hidden_elements(soup),
        num_safe_anchors=num_safe_anchors(soup, base_domain),
        num_media_http=num_media_http(soup),
        num_media_external=num_media_external(soup, base_domain),
        num_email_forms=num_email_forms(soup),
        num_internal_links=num_internal_links(soup, base_domain),
        favicon_url=find_favicon(soup),
        logo_url=find_logo(soup),
        found_forms=[form.attrs for form in forms],
        found_images=[img.attrs for img in images],
        found_anchors=[a.attrs for a in anchors],
        found_media=[m.attrs for m in soup.find_all(["img", "video", "audio"])],
        copyright=find_copyright(soup),
        source_mode=source_mode,
        was_js_rendered=was_js_rendered,
        likely_js_spa=detect_likely_js_spa(soup),
        html_snapshot_path=html_snapshot_path,
        num_network_requests=len(discovered_urls),
        num_external_network_requests=len(external_discovered),
        num_api_endpoints=len(found_api_endpoints),
        found_network_requests=discovered_urls,
        found_api_endpoints=found_api_endpoints,
    )
# Example usage:
if __name__ == "__main__":
    from web2vec.crawlers.extractors import HtmlBodyExtractor

    target_url = "https://shop.volvocars.ca"

    # Silence urllib3's warning when certificate verification is turned off.
    if not config.ssl_verify:
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    page_response = requests.get(
        target_url,
        allow_redirects=True,
        timeout=60,
        verify=config.ssl_verify,
    )

    extractor_options = {
        "enable_js_render": True,
        "save_html_snapshot": True,
        "render_wait_seconds": 2.0,
    }
    body_extractor = HtmlBodyExtractor(**extractor_options)
    features = body_extractor.extract_features(page_response)

    print(features)
    print(f"Snapshot path: {features.html_snapshot_path}")