Source code for web2vec.extractors.http_response_features

import logging
from dataclasses import dataclass
from typing import List, Optional

import requests
from bs4 import BeautifulSoup

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)



[docs]
@dataclass
class HttpResponseFeatures:
    """Feature record describing one HTTP response.

    The first twelve fields are required; the remaining ones default to
    "empty" values so that a record can still be built when no response
    could be fetched.
    """

    # True when the response history contains at least one entry.
    redirects: bool
    # Number of entries in the response history.
    redirect_count: int
    # True when the parsed page contains at least one <form> element.
    contains_forms: bool
    # Result of the obfuscated-script heuristic (eval( / document.write().
    contains_obfuscated_scripts: bool
    # True when any suspicious keyword occurs in the lowercased page text.
    contains_suspicious_keywords: bool
    # True when the response URL starts with "https://".
    uses_https: bool
    # The missing_* flags are True when the named header is ABSENT.
    missing_x_frame_options: bool
    missing_x_xss_protection: bool
    missing_content_security_policy: bool
    missing_strict_transport_security: bool
    missing_x_content_type_options: bool
    # True when the response status code is exactly 200.
    is_live: bool
    # Raw value of the "Server" header, or None when it is not sent.
    server_version: Optional[str] = None
    # Length of the text extracted from the parsed page.
    body_length: int = 0
    # Count of heading elements found in the page.
    num_titles: int = 0
    # Count of <img> elements.
    num_images: int = 0
    # Count of <a> elements.
    num_links: int = 0
    # Count of <script> elements (an element count, not a character length).
    script_length: int = 0
    # Count of extracted-text characters that are neither alphanumeric
    # nor whitespace.
    special_characters: int = 0
    # script_length / special_characters (0 when the denominator is 0).
    script_to_special_chars_ratio: float = 0.0
    # script_length / body_length (0 when the denominator is 0).
    script_to_body_ratio: float = 0.0
    # body_length / special_characters (0 when the denominator is 0).
    body_to_special_char_ratio: float = 0.0
    # Elapsed time of the response in seconds, or None when unavailable.
    time_response: Optional[float] = None




[docs]
def check_redirects(response: requests.Response) -> bool:
    """Tell whether the response was reached through at least one redirect.

    A response object without a ``history`` attribute is treated as having
    an empty redirect chain.
    """
    redirect_chain = getattr(response, "history", [])
    return len(redirect_chain) != 0




[docs]
def count_redirects(response: requests.Response) -> int:
    """Return the number of entries in the response's redirect history.

    A response object without a ``history`` attribute counts as zero.
    """
    hops = getattr(response, "history", [])
    return len(hops)




[docs]
def check_forms(response: requests.Response) -> bool:
    """Tell whether the response body contains at least one ``<form>`` tag."""
    document = BeautifulSoup(response.text, "html.parser")
    first_form = document.find("form")
    return first_form is not None




[docs]
def check_obfuscated_scripts(response: requests.Response) -> bool:
    """Check if the response contains any obfuscated scripts.

    A script is flagged when either its ``src`` attribute or its inline
    code contains one of the markers ``eval(`` or ``document.write(``.

    Args:
        response: Response whose ``text`` is parsed as HTML.

    Returns:
        ``True`` as soon as one ``<script>`` element matches, else ``False``.
    """
    markers = ("eval(", "document.write(")
    soup = BeautifulSoup(response.text, "html.parser")
    for script in soup.find_all("script"):
        # ``src`` only holds a URL; the code itself lives in the element's
        # inline content, so both places have to be inspected.
        src = script.get("src") or ""
        inline_code = script.string or ""
        if any(marker in src or marker in inline_code for marker in markers):
            return True
    return False




[docs]
def check_suspicious_keywords(
    response: requests.Response, keywords: Optional[List[str]] = None
) -> bool:
    """Check if the response contains any suspicious keywords.

    Matching is a case-insensitive substring search over the response text.

    Args:
        response: Response whose ``text`` is searched.
        keywords: Keywords to look for. When omitted or empty, a built-in
            default list is used.

    Returns:
        ``True`` when at least one keyword occurs in the page text.
    """
    suspicious_keywords = keywords or [
        "login",
        "update",
        "verify",
        "password",
        "bank",
        "account",
    ]
    page_content = response.text.lower()
    # The page text is lowercased, so the keywords must be lowercased too;
    # otherwise a caller-supplied keyword such as "Login" could never match.
    return any(keyword.lower() in page_content for keyword in suspicious_keywords)




[docs]
def check_https(response: requests.Response) -> bool:
    """Tell whether the response's ``url`` begins with ``https://``."""
    response_url = response.url
    return response_url.startswith("https://")




[docs]
def check_header_x_frame_options(response: requests.Response) -> bool:
    """Report whether the ``X-Frame-Options`` header is absent.

    Returns:
        ``True`` when the header is NOT in ``response.headers``.
    """
    header_present = "X-Frame-Options" in response.headers
    return not header_present




[docs]
def check_header_x_xss_protection(response: requests.Response) -> bool:
    """Report whether the ``X-XSS-Protection`` header is absent.

    Returns:
        ``True`` when the header is NOT in ``response.headers``.
    """
    header_present = "X-XSS-Protection" in response.headers
    return not header_present




[docs]
def check_header_content_security_policy(response: requests.Response) -> bool:
    """Report whether the ``Content-Security-Policy`` header is absent.

    Returns:
        ``True`` when the header is NOT in ``response.headers``.
    """
    header_present = "Content-Security-Policy" in response.headers
    return not header_present




[docs]
def check_header_strict_transport_security(response: requests.Response) -> bool:
    """Report whether the ``Strict-Transport-Security`` header is absent.

    Returns:
        ``True`` when the header is NOT in ``response.headers``.
    """
    header_present = "Strict-Transport-Security" in response.headers
    return not header_present




[docs]
def check_header_x_content_type_options(response: requests.Response) -> bool:
    """Report whether the ``X-Content-Type-Options`` header is absent.

    Returns:
        ``True`` when the header is NOT in ``response.headers``.
    """
    header_present = "X-Content-Type-Options" in response.headers
    return not header_present




[docs]
def is_live(response: requests.Response) -> bool:
    """Tell whether the response status code is exactly 200."""
    ok_status = 200
    return response.status_code == ok_status




[docs]
def check_server_version(response: requests.Response) -> Optional[str]:
    """Return the value of the ``Server`` header, or ``None`` if it is unset."""
    headers = response.headers
    return headers.get("Server")




[docs]
def body_length(response: requests.Response) -> int:
    """Return the character count of the text extracted from the parsed page."""
    parsed = BeautifulSoup(response.text, "html.parser")
    extracted_text = parsed.get_text()
    return len(extracted_text)




[docs]
def num_titles(response: requests.Response) -> int:
    """Get the number of heading elements (``h1``-``h6``) in the response.

    Args:
        response: Response whose ``text`` is parsed as HTML.

    Returns:
        Total count of ``<h1>`` through ``<h6>`` elements.
    """
    soup = BeautifulSoup(response.text, "html.parser")
    # HTML heading levels run from 1 to 6; there is no ``h0`` element.
    heading_tags = [f"h{level}" for level in range(1, 7)]
    # A list of names makes find_all match any of them in a single pass.
    return len(soup.find_all(heading_tags))




[docs]
def num_images(response: requests.Response) -> int:
    """Count the ``<img>`` elements found in the response body."""
    image_tags = BeautifulSoup(response.text, "html.parser").find_all("img")
    return len(image_tags)




[docs]
def num_links(response: requests.Response) -> int:
    """Count the ``<a>`` elements found in the response body."""
    anchor_tags = BeautifulSoup(response.text, "html.parser").find_all("a")
    return len(anchor_tags)




[docs]
def script_length(response: requests.Response) -> int:
    """Count the ``<script>`` elements found in the response body.

    Note that, despite the function name, the value is the number of
    script elements, not the character length of their contents.
    """
    script_tags = BeautifulSoup(response.text, "html.parser").find_all("script")
    return len(script_tags)




[docs]
def special_characters(response: requests.Response) -> int:
    """Count the special characters in the text extracted from the response.

    A character is "special" when it is neither alphanumeric nor whitespace.
    """
    text = BeautifulSoup(response.text, "html.parser").get_text()
    return sum(1 for ch in text if not (ch.isalnum() or ch.isspace()))




[docs]
def script_to_special_chars_ratio(response: requests.Response) -> float:
    """Get the ratio of scripts to special characters in the response.

    Returns:
        ``script_length(response) / special_characters(response)``, or
        ``0.0`` when the page has no special characters.
    """
    schars = special_characters(response)
    if schars <= 0:
        # Avoid dividing by zero; return a float to match the annotation.
        return 0.0
    return script_length(response) / schars




[docs]
def script_to_body_ratio(response: requests.Response) -> float:
    """Get the ratio of scripts to body length in the response.

    Returns:
        ``script_length(response) / body_length(response)``, or ``0.0``
        when the extracted body text is empty.
    """
    blength = body_length(response)
    if blength <= 0:
        # Avoid dividing by zero; return a float to match the annotation.
        return 0.0
    return script_length(response) / blength




[docs]
def body_to_special_char_ratio(response: requests.Response) -> float:
    """Get the ratio of body length to special characters in the response.

    Returns:
        ``body_length(response) / special_characters(response)``, or
        ``0.0`` when the page has no special characters.
    """
    schars = special_characters(response)
    if schars <= 0:
        # Avoid dividing by zero; return a float to match the annotation.
        return 0.0
    return body_length(response) / schars




[docs]
def get_http_response_features(
    url: Optional[str] = None, response: Optional[requests.Response] = None
) -> HttpResponseFeatures:
    """Get the HTTP response features for a given URL or response object.

    Args:
        url: Address to fetch when no ``response`` is supplied.
        response: An already obtained response. When given, it is used
            as-is and ``url`` is not fetched.

    Returns:
        A populated ``HttpResponseFeatures``. If fetching ``url`` raises a
        ``requests.exceptions.RequestException``, the error is logged and a
        fallback record is returned (not live, no redirects, every security
        header reported as missing, remaining fields at their defaults).

    Raises:
        ValueError: If neither ``url`` nor ``response`` is provided.
    """
    # Compare against None explicitly: a requests.Response is falsy whenever
    # its status is an error (4xx/5xx), so a truthiness test would wrongly
    # treat a supplied error response as "not provided".
    if not url and response is None:
        raise ValueError("Either URL or response object must be provided.")
    if response is None:
        # Only needed when the page actually has to be fetched.
        from web2vec import fetch_url

        try:
            response = fetch_url(url)
        except requests.exceptions.RequestException as e:
            # Use a %s placeholder: passing an argument without one makes
            # the logging module report a formatting error.
            logger.error("Error fetching URL: %s", e)
            return HttpResponseFeatures(
                redirects=False,
                redirect_count=0,
                contains_forms=False,
                contains_obfuscated_scripts=False,
                contains_suspicious_keywords=False,
                uses_https=False,
                missing_x_frame_options=True,
                missing_x_xss_protection=True,
                missing_content_security_policy=True,
                missing_strict_transport_security=True,
                missing_x_content_type_options=True,
                is_live=False,
            )

    return HttpResponseFeatures(
        redirects=check_redirects(response),
        redirect_count=count_redirects(response),
        contains_forms=check_forms(response),
        contains_obfuscated_scripts=check_obfuscated_scripts(response),
        contains_suspicious_keywords=check_suspicious_keywords(response),
        uses_https=check_https(response),
        missing_x_frame_options=check_header_x_frame_options(response),
        missing_x_xss_protection=check_header_x_xss_protection(response),
        missing_content_security_policy=check_header_content_security_policy(response),
        missing_strict_transport_security=check_header_strict_transport_security(
            response
        ),
        missing_x_content_type_options=check_header_x_content_type_options(response),
        is_live=is_live(response),
        server_version=check_server_version(response),
        body_length=body_length(response),
        num_titles=num_titles(response),
        num_images=num_images(response),
        num_links=num_links(response),
        script_length=script_length(response),
        special_characters=special_characters(response),
        script_to_special_chars_ratio=script_to_special_chars_ratio(response),
        script_to_body_ratio=script_to_body_ratio(response),
        body_to_special_char_ratio=body_to_special_char_ratio(response),
        # A missing or zero ``elapsed`` value is reported as None.
        time_response=(
            response.elapsed.total_seconds()
            if getattr(response, "elapsed", None)
            else None
        ),
    )



if __name__ == "__main__":
    # Manual smoke run: extract and print the features of a sample page.
    demo_features = get_http_response_features("https://www.example.com")
    print(demo_features)