Source code for web2vec.extractors.html_body_features

import re
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup

from web2vec.utils import get_domain_from_url


[docs] @dataclass class HtmlBodyFeatures: contains_forms: bool contains_obfuscated_scripts: bool contains_suspicious_keywords: bool body_length: int num_titles: int num_images: int num_links: int script_length: int special_characters: int script_to_special_chars_ratio: float script_to_body_ratio: float body_to_special_char_ratio: float iframe_redirection: int mouse_over_effect: int right_click_disabled: int num_scripts_http: int num_styles_http: int num_iframes_http: int num_external_scripts: int num_external_styles: int num_external_iframes: int num_meta_tags: int num_forms: int num_forms_post: int num_forms_get: int num_forms_external_action: int num_hidden_elements: int num_safe_anchors: int num_media_http: int num_media_external: int num_email_forms: int num_internal_links: int favicon_url: Optional[str] logo_url: Optional[str] found_forms: List[Dict[str, Any]] = field(default_factory=list) found_images: List[Dict[str, Any]] = field(default_factory=list) found_anchors: List[Dict[str, Any]] = field(default_factory=list) found_media: List[Dict[str, Any]] = field(default_factory=list) copyright: Optional[str] = None
[docs] def check_obfuscated_scripts(soup: BeautifulSoup) -> bool: """Check if the response contains any obfuscated scripts.""" scripts = soup.find_all("script") for script in scripts: if script.get("src") and ( "eval(" in script["src"] or "document.write(" in script["src"] ): return True return False
[docs] def check_suspicious_keywords( soup: BeautifulSoup, keywords: Optional[List[str]] = None ) -> bool: """Check if the response contains any suspicious keywords.""" suspicious_keywords = keywords or [ "login", "update", "verify", "password", "bank", "account", ] page_content = soup.get_text().lower() return any(keyword in page_content for keyword in suspicious_keywords)
[docs] def body_length(soup: BeautifulSoup) -> int: """Get the length of the body text in the given HTML content.""" return len(soup.get_text())
[docs] def num_titles(soup: BeautifulSoup) -> int: """Get the number of titles in the given HTML content.""" titles = ["h{}".format(i) for i in range(7)] titles = [soup.find_all(tag) for tag in titles] return len([item for sublist in titles for item in sublist])
[docs] def num_images(soup: BeautifulSoup) -> int: """Get the number of images in the given HTML content.""" return len(soup.find_all("img"))
[docs] def script_length(soup: BeautifulSoup) -> int: """Get the length of the scripts in the given HTML content.""" return len(soup.find_all("script"))
[docs] def special_characters(soup: BeautifulSoup) -> int: """Get the number of special characters in the given HTML content.""" body_text = soup.get_text() return len([c for c in body_text if not c.isalnum() and not c.isspace()])
[docs] def script_to_special_chars_ratio(soup: BeautifulSoup) -> float: """Get the ratio of script length to special characters in the given HTML content.""" schars = special_characters(soup) slength = script_length(soup) return slength / schars if schars > 0 else 0
[docs] def script_to_body_ratio(soup: BeautifulSoup) -> float: """Get the ratio of script length to body length in the given HTML content.""" blength = body_length(soup) slength = script_length(soup) return slength / blength if blength > 0 else 0
[docs] def body_to_special_char_ratio(soup: BeautifulSoup) -> float: """Get the ratio of body length to special characters in the given HTML content.""" blength = body_length(soup) schars = special_characters(soup) return blength / schars if schars > 0 else 0
[docs] def iframe_redirection(soup: BeautifulSoup) -> int: """Check if the response contains any iframe redirection.""" if not soup: return 1 return 0 if soup.find_all("iframe") or soup.find_all("frameborder") else 1
[docs] def mouse_over_effect(soup: BeautifulSoup) -> int: """Check if the response contains any mouse-over effect.""" if not soup: return 1 return 1 if soup.find_all(onmouseover=True) else 0
[docs] def right_click_disabled(soup: BeautifulSoup) -> int: """Check if the response contains any right-click disabled content.""" if not soup: return 1 return 0 if re.findall(r"event.button ?== ?2", str(soup)) else 1
[docs] def num_scripts_http(soup: BeautifulSoup) -> int: """Get the number of HTTP scripts in the given HTML content.""" scripts = soup.find_all("script", src=True) return len([script for script in scripts if script["src"].startswith("http://")])
[docs] def num_styles_http(soup: BeautifulSoup) -> int: """Get the number of HTTP stylesheets in the given HTML content.""" styles = soup.find_all("link", rel="stylesheet") return len([style for style in styles if style["href"].startswith("http://")])
[docs] def num_iframes_http(soup: BeautifulSoup) -> int: """Get the number of HTTP iframes in the given HTML content.""" iframes = soup.find_all("iframe", src=True) return len([iframe for iframe in iframes if iframe["src"].startswith("http://")])
[docs] def num_external_scripts(soup: BeautifulSoup, base_domain: str) -> int: """Get the number of external scripts in the given HTML content.""" scripts = soup.find_all("script", src=True) return len( [script for script in scripts if urlparse(script["src"]).netloc != base_domain] )
[docs] def num_external_styles(soup: BeautifulSoup, base_domain: str) -> int: """Get the number of external stylesheets in the given HTML content.""" styles = soup.find_all("link", rel="stylesheet") return len( [style for style in styles if urlparse(style["href"]).netloc != base_domain] )
[docs] def num_external_iframes(soup: BeautifulSoup, base_domain: str) -> int: """Get the number of external iframes in the given HTML content.""" iframes = soup.find_all("iframe", src=True) return len( [iframe for iframe in iframes if urlparse(iframe["src"]).netloc != base_domain] )
[docs] def num_meta_tags(soup: BeautifulSoup) -> int: """Get the number of meta tags in the given HTML content.""" return len(soup.find_all("meta"))
[docs] def num_forms(soup: BeautifulSoup) -> int: """Get the number of forms in the given HTML content.""" return len(soup.find_all("form"))
[docs] def num_forms_post(soup: BeautifulSoup) -> int: """Get the number of POST forms in the given HTML content.""" return len( [ form for form in soup.find_all("form") if form.get("method", "").lower() == "post" ] )
[docs] def num_forms_get(soup: BeautifulSoup) -> int: """Get the number of GET forms in the given HTML content.""" return len( [ form for form in soup.find_all("form") if form.get("method", "").lower() == "get" ] )
[docs] def num_forms_external_action(soup: BeautifulSoup, base_domain: str) -> int: """Get the number of forms with external action in the given HTML content.""" forms = soup.find_all("form", action=True) return len( [ form for form in forms if urlparse(form["action"]).netloc and urlparse(form["action"]).netloc != base_domain ] )
[docs] def hidden_elements(soup: BeautifulSoup) -> int: """Get the number of hidden elements in the given HTML content.""" hidden_elements = soup.find_all( style=lambda value: value and "display:none" in value ) return len(hidden_elements)
[docs] def num_safe_anchors(soup: BeautifulSoup, base_domain: str) -> int: """Get the number of safe anchors in the given HTML content.""" anchors = soup.find_all("a", href=True) return len( [ anchor for anchor in anchors if urlparse(anchor["href"]).netloc == base_domain or not urlparse(anchor["href"]).netloc ] )
[docs] def num_media_http(soup: BeautifulSoup) -> int: """Get the number of HTTP media in the given HTML content.""" media = soup.find_all(["img", "video", "audio"], src=True) return len([m for m in media if m["src"].startswith("http://")])
[docs] def num_media_external(soup: BeautifulSoup, base_domain: str) -> int: """Get the number of external media in the given HTML content.""" media = soup.find_all(["img", "video", "audio"], src=True) return len([m for m in media if urlparse(m["src"]).netloc != base_domain])
[docs] def num_email_forms(soup: BeautifulSoup) -> int: """Get the number of email forms in the given HTML content.""" forms = soup.find_all("form", action=True) return len([form for form in forms if form["action"].startswith("mailto:")])
[docs] def find_favicon(soup: BeautifulSoup) -> Optional[str]: """Find the favicon URL in the given HTML content.""" icon_link = soup.find("link", rel="icon") return icon_link["href"] if icon_link else None
[docs] def get_html_body_features(body: str, url: str) -> HtmlBodyFeatures: """Extract HTML body features from the""" soup = BeautifulSoup(body, "html.parser") base_domain = get_domain_from_url(url) return HtmlBodyFeatures( contains_forms=bool(soup.find_all("form")), contains_obfuscated_scripts=check_obfuscated_scripts(soup), contains_suspicious_keywords=check_suspicious_keywords(soup), body_length=body_length(soup), num_titles=num_titles(soup), num_images=num_images(soup), num_links=num_links(soup), script_length=script_length(soup), special_characters=special_characters(soup), script_to_special_chars_ratio=script_to_special_chars_ratio(soup), script_to_body_ratio=script_to_body_ratio(soup), body_to_special_char_ratio=body_to_special_char_ratio(soup), iframe_redirection=iframe_redirection(soup), mouse_over_effect=mouse_over_effect(soup), right_click_disabled=right_click_disabled(soup), num_scripts_http=num_scripts_http(soup), num_styles_http=num_styles_http(soup), num_iframes_http=num_iframes_http(soup), num_external_scripts=num_external_scripts(soup, base_domain), num_external_styles=num_external_styles(soup, base_domain), num_external_iframes=num_external_iframes(soup, base_domain), num_meta_tags=num_meta_tags(soup), num_forms=num_forms(soup), num_forms_post=num_forms_post(soup), num_forms_get=num_forms_get(soup), num_forms_external_action=num_forms_external_action(soup, base_domain), num_hidden_elements=hidden_elements(soup), num_safe_anchors=num_safe_anchors(soup, base_domain), num_media_http=num_media_http(soup), num_media_external=num_media_external(soup, base_domain), num_email_forms=num_email_forms(soup), num_internal_links=num_internal_links(soup, base_domain), favicon_url=find_favicon(soup), logo_url=find_logo(soup), found_forms=[form.attrs for form in soup.find_all("form")], found_images=[img.attrs for img in soup.find_all("img")], found_anchors=[a.attrs for a in soup.find_all("a")], found_media=[m.attrs for m in soup.find_all(["img", "video", "audio"])], copyright=find_copyright(soup), )
# Example usage: if __name__ == "__main__": url = "https://www.example.com" response = requests.get(url, allow_redirects=True, timeout=60) html_body_features = get_html_body_features(response.text, response.url) print(html_body_features)