import logging
from dataclasses import dataclass
from typing import List, Optional
import requests
from bs4 import BeautifulSoup
logger = logging.getLogger(__name__)
[docs]
@dataclass
class HttpResponseFeatures:
redirects: bool
redirect_count: int
contains_forms: bool
contains_obfuscated_scripts: bool
contains_suspicious_keywords: bool
uses_https: bool
missing_x_frame_options: bool
missing_x_xss_protection: bool
missing_content_security_policy: bool
missing_strict_transport_security: bool
missing_x_content_type_options: bool
is_live: bool
server_version: Optional[str] = None
body_length: int = 0
num_titles: int = 0
num_images: int = 0
num_links: int = 0
script_length: int = 0
special_characters: int = 0
script_to_special_chars_ratio: float = 0.0
script_to_body_ratio: float = 0.0
body_to_special_char_ratio: float = 0.0
[docs]
def check_redirects(response: requests.Response) -> bool:
"""Check if the response has been redirected."""
return len(getattr(response, "history", [])) > 0
[docs]
def count_redirects(response: requests.Response) -> int:
"""Count the number of redirects in the response."""
return len(getattr(response, "history", []))
[docs]
def check_obfuscated_scripts(response: requests.Response) -> bool:
"""Check if the response contains any obfuscated scripts."""
soup = BeautifulSoup(response.text, "html.parser")
scripts = soup.find_all("script")
for script in scripts:
if script.get("src") and (
"eval(" in script["src"] or "document.write(" in script["src"]
):
return True
return False
[docs]
def check_suspicious_keywords(
response: requests.Response, keywords: Optional[List[str]] = None
) -> bool:
"""Check if the response contains any suspicious keywords."""
suspicious_keywords = keywords or [
"login",
"update",
"verify",
"password",
"bank",
"account",
]
page_content = response.text.lower()
return any(keyword in page_content for keyword in suspicious_keywords)
[docs]
def check_https(response: requests.Response) -> bool:
"""Check if the response uses HTTPS."""
return response.url.startswith("https://")
[docs]
def is_live(response: requests.Response) -> bool:
"""Check if the response is live."""
return response.status_code == 200
[docs]
def check_server_version(response: requests.Response) -> Optional[str]:
"""Check the server version of the response."""
return response.headers.get("Server")
[docs]
def body_length(response: requests.Response) -> int:
"""Get the length of the body of the response."""
soup = BeautifulSoup(response.text, "html.parser")
return len(soup.get_text())
[docs]
def num_titles(response: requests.Response) -> int:
"""Get the number of titles in the response."""
soup = BeautifulSoup(response.text, "html.parser")
titles = ["h{}".format(i) for i in range(7)]
titles = [soup.find_all(tag) for tag in titles]
return len([item for sublist in titles for item in sublist])
[docs]
def num_images(response: requests.Response) -> int:
"""Get the number of images in the response"""
soup = BeautifulSoup(response.text, "html.parser")
return len(soup.find_all("img"))
[docs]
def num_links(response: requests.Response) -> int:
"""Get the number of links in the response."""
soup = BeautifulSoup(response.text, "html.parser")
return len(soup.find_all("a"))
[docs]
def script_length(response: requests.Response) -> int:
"""Get the length of the scripts in the"""
soup = BeautifulSoup(response.text, "html.parser")
return len(soup.find_all("script"))
[docs]
def special_characters(response: requests.Response) -> int:
"""Get the number of special characters in the response."""
soup = BeautifulSoup(response.text, "html.parser")
body_text = soup.get_text()
return len([c for c in body_text if not c.isalnum() and not c.isspace()])
[docs]
def script_to_special_chars_ratio(response: requests.Response) -> float:
"""Get the ratio of scripts to special characters in the response"""
schars = special_characters(response)
slength = script_length(response)
return slength / schars if schars > 0 else 0
[docs]
def script_to_body_ratio(response: requests.Response) -> float:
"""Get the ratio of scripts to body in"""
blength = body_length(response)
slength = script_length(response)
return slength / blength if blength > 0 else 0
[docs]
def body_to_special_char_ratio(response: requests.Response) -> float:
"""Get the ratio of body to special characters in the response."""
blength = body_length(response)
schars = special_characters(response)
return blength / schars if schars > 0 else 0
[docs]
def get_http_response_features(
url: Optional[str] = None, response: Optional[requests.Response] = None
) -> HttpResponseFeatures:
"""Get the HTTP response features for a given URL or response object."""
from web2vec import fetch_url
if not url and not response:
raise ValueError("Either URL or response object must be provided.")
if not response:
try:
response = fetch_url(url)
except requests.exceptions.RequestException as e:
logger.error(f"Error fetching URL: {e}", e)
return HttpResponseFeatures(
redirects=False,
redirect_count=0,
contains_forms=False,
contains_obfuscated_scripts=False,
contains_suspicious_keywords=False,
uses_https=False,
missing_x_frame_options=True,
missing_x_xss_protection=True,
missing_content_security_policy=True,
missing_strict_transport_security=True,
missing_x_content_type_options=True,
is_live=False,
)
return HttpResponseFeatures(
redirects=check_redirects(response),
redirect_count=count_redirects(response),
contains_forms=check_forms(response),
contains_obfuscated_scripts=check_obfuscated_scripts(response),
contains_suspicious_keywords=check_suspicious_keywords(response),
uses_https=check_https(response),
missing_x_frame_options=check_header_x_frame_options(response),
missing_x_xss_protection=check_header_x_xss_protection(response),
missing_content_security_policy=check_header_content_security_policy(response),
missing_strict_transport_security=check_header_strict_transport_security(
response
),
missing_x_content_type_options=check_header_x_content_type_options(response),
is_live=is_live(response),
server_version=check_server_version(response),
body_length=body_length(response),
num_titles=num_titles(response),
num_images=num_images(response),
num_links=num_links(response),
script_length=script_length(response),
special_characters=special_characters(response),
script_to_special_chars_ratio=script_to_special_chars_ratio(response),
script_to_body_ratio=script_to_body_ratio(response),
body_to_special_char_ratio=body_to_special_char_ratio(response),
)
if __name__ == "__main__":
url = "https://www.example.com"
features = get_http_response_features(url)
print(features)