# Source code for web2vec.extractors.url_lexical_features

import re
from dataclasses import dataclass
from functools import cache
from typing import Optional
from urllib.parse import parse_qs, urlparse

import tldextract

from web2vec.utils import entropy, fetch_file_from_url_and_read, valid_ip

# Regular-expression alternation of known URL-shortener host names (dots are
# escaped so they match literally).
# NOTE(review): several alternatives are listed twice (e.g. ``bit\.ly``,
# ``goo\.gl``, ``t\.co``, ``ow\.ly``, ``is\.gd``, ``x\.co``, ``tr\.im``), and
# ``tinyurl`` is an unanchored fragment that also covers ``tinyurl\.com``.
# NOTE(review): no function visible in this part of the file references this
# constant; ``uses_shortening_service`` builds its pattern from a fetched
# list instead -- confirm whether this is still needed.
shortening_services = (
    r"bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|"
    r"yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|"
    r"short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|"
    r"doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|db\.tt|"
    r"qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|q\.gs|is\.gd|"
    r"po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|x\.co|"
    r"prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|"
    r"tr\.im|link\.zip\.net"
)


# Helper functions
def count_char(character: str, string: str) -> int:
    """Return how many non-overlapping times *character* occurs in *string*.

    Args:
        character: The substring to look for (normally a single character).
        string: The text to scan.

    Returns:
        The occurrence count as reported by ``str.count``.
    """
    occurrences = string.count(character)
    return occurrences
def count_vowels(string: str) -> int:
    """Return the number of ASCII vowels (either case) found in *string*.

    Args:
        string: The text to scan.

    Returns:
        How many characters of *string* are one of ``a e i o u`` in upper or
        lower case.
    """
    return sum(1 for _ in re.finditer(r"[aeiouAEIOU]", string))
def contains_keywords(string: str, keywords: list) -> bool:
    """Check case-insensitively whether the string contains any keyword.

    Args:
        string: The text to search in.
        keywords: Substrings to look for.

    Returns:
        True when at least one keyword occurs in *string*, ignoring case;
        False otherwise (including when *keywords* is empty).
    """
    # Lowercase the haystack once instead of once per keyword.
    haystack = string.lower()
    # Keywords are lowercased too: compared against a lowercased haystack, a
    # keyword containing an uppercase letter could otherwise never match.
    return any(keyword.lower() in haystack for keyword in keywords)
def tld_count(string: str) -> int:
    """Count how often the extracted public suffix appears in the text.

    The suffix is taken from ``tldextract.extract(string).suffix`` and the
    count is the number of occurrences of ``"." + suffix`` in the lowercased
    input.

    Args:
        string: A URL or URL fragment.

    Returns:
        The occurrence count, or 0 when no suffix could be extracted.
    """
    suffix = tldextract.extract(string).suffix
    if not suffix:
        return 0
    return string.lower().count(f".{suffix}")
def url_depth(url):
    """Return the number of non-empty segments in the URL's path.

    Args:
        url: The URL to inspect.

    Returns:
        The count of ``/``-separated path segments that are not empty, so
        leading, trailing and doubled slashes do not add depth.
    """
    path_parts = urlparse(url).path.split("/")
    return sum(1 for part in path_parts if part)
def uses_shortening_service(url) -> Optional[str]:
    """Check if the URL uses a shortening service.

    The list of shortener names is read via ``fetch_file_from_url_and_read``
    (one name per line; blank lines are ignored) and each name is matched
    literally.

    Args:
        url: The URL to inspect.

    Returns:
        The first shortener name found in *url*, or None when nothing matches
        or the fetched list contains no names.
    """
    # NOTE(review): the list is requested on every call; whether the helper
    # caches the download is not visible from this function.
    listing = fetch_file_from_url_and_read(
        "https://raw.githubusercontent.com/korlabsio/urlshortener/main/names.txt"
    )
    names = [line.strip() for line in listing.split("\n") if line.strip()]

    # An empty alternation is the empty pattern, which matches every string;
    # without this guard an empty list would flag every URL as shortened.
    if not names:
        return None

    # NOTE(review): the pattern is searched across the whole URL, not just the
    # host, so a short name can also match inside an unrelated host or path.
    pattern = "|".join(map(re.escape, names))
    match = re.search(pattern, url)
    # Return the matched text so the value agrees with the Optional[str]
    # annotation instead of leaking an re.Match object.
    return match.group(0) if match else None
def has_repeated_digits(value: str) -> bool:
    """Tell whether *value* contains the same digit three or more times in a row.

    Args:
        value: The text to scan.

    Returns:
        True when some digit is immediately repeated at least twice more
        (e.g. ``"111"``); False otherwise.
    """
    return re.search(r"(\d)\1{2,}", value) is not None
def numeric_chars_ratio(value: str) -> float:
    """Compute the share of digit characters in *value* as a percentage.

    Args:
        value: The text to analyse.

    Returns:
        A float in the range 0..100; 0.0 for an empty (falsy) input.
    """
    if not value:
        return 0.0
    digit_total = len([char for char in value if char.isdigit()])
    return (digit_total / len(value)) * 100.0
def token_count(value: str) -> int:
    """Return the number of alphanumeric tokens in *value*.

    The text is split on runs of characters other than ASCII letters and
    digits; empty pieces produced at the edges are not counted.

    Args:
        value: The text to tokenise.

    Returns:
        The number of non-empty tokens.
    """
    pieces = re.split(r"[^A-Za-z0-9]+", value)
    return sum(1 for piece in pieces if piece)
@dataclass
class URLLexicalFeatures:
    """Lexical features of a single URL, as built by ``get_url_lexical_features``.

    The ``count_<char>_<scope>`` fields hold per-character occurrence counts
    for four scopes: the whole URL, the domain (the parsed ``netloc``), the
    directory (the path without its last segment) and the parameters (the
    query string). Fields without a default must be supplied by the caller;
    the trailing fields have defaults.
    """

    # --- Whole URL: character counts, length, TLD occurrences ---
    count_dot_url: int
    count_dash_url: int
    count_underscore_url: int
    count_slash_url: int
    count_question_url: int
    count_equals_url: int
    count_at_url: int
    count_ampersand_url: int
    count_exclamation_url: int
    count_space_url: int
    count_tilde_url: int
    count_comma_url: int
    count_plus_url: int
    count_asterisk_url: int
    count_hash_url: int
    count_dollar_url: int
    count_percent_url: int
    url_length: int
    tld_amount_url: int

    # --- Domain (netloc): character counts and simple descriptors ---
    count_dot_domain: int
    count_dash_domain: int
    count_underscore_domain: int
    count_slash_domain: int
    count_question_domain: int
    count_equals_domain: int
    count_at_domain: int
    count_ampersand_domain: int
    count_exclamation_domain: int
    count_space_domain: int
    count_tilde_domain: int
    count_comma_domain: int
    count_plus_domain: int
    count_asterisk_domain: int
    count_hash_domain: int
    count_dollar_domain: int
    count_percent_domain: int
    domain_length: int
    vowel_count_domain: int
    domain_in_ip_format: bool  # True when the domain is digits once dots are removed
    domain_contains_keywords: bool  # "server" / "client" keyword check

    # --- Directory (path without the final segment) ---
    count_dot_directory: int
    count_dash_directory: int
    count_underscore_directory: int
    count_slash_directory: int
    count_question_directory: int
    count_equals_directory: int
    count_at_directory: int
    count_ampersand_directory: int
    count_exclamation_directory: int
    count_space_directory: int
    count_tilde_directory: int
    count_comma_directory: int
    count_plus_directory: int
    count_asterisk_directory: int
    count_hash_directory: int
    count_dollar_directory: int
    count_percent_directory: int
    directory_length: int

    # --- Parameters (query string) ---
    count_dot_parameters: int
    count_dash_parameters: int
    count_underscore_parameters: int
    count_slash_parameters: int
    count_question_parameters: int
    count_equals_parameters: int
    count_at_parameters: int
    count_ampersand_parameters: int
    count_exclamation_parameters: int
    count_space_parameters: int
    count_tilde_parameters: int
    count_comma_parameters: int
    count_plus_parameters: int
    count_asterisk_parameters: int
    count_hash_parameters: int
    count_dollar_parameters: int
    count_percent_parameters: int
    parameters_length: int
    tld_presence_in_arguments: int
    number_of_parameters: int

    # --- Miscellaneous required features ---
    email_present_in_url: bool
    domain_entropy: float
    url_depth: int
    # Filled from the result of the module-level uses_shortening_service().
    uses_shortening_service: Optional[str]

    # --- Optional features (defaulted) ---
    is_ip: bool = False
    number_of_subdomains: int = 0
    average_subdomain_length: float = 0.0
    having_hyphen_in_subdomain: bool = False
    having_underscore_in_subdomain: bool = False
    having_digit_in_subdomain: bool = False
    having_special_char_in_subdomain: bool = False
    # having_fragment and having_anchor are both filled from the URL fragment
    # by get_url_lexical_features.
    having_fragment: bool = False
    having_anchor: bool = False
    entropy_of_url: float = 0.0
    repeated_digits_url: bool = False
    repeated_digits_domain: bool = False
    repeated_digits_directory: bool = False
    repeated_digits_parameters: bool = False
    token_count: int = 0
    # Filled with the same value as number_of_subdomains by
    # get_url_lexical_features.
    subdomain_count: int = 0
    tld_popularity: int = 0  # 1 when the suffix is in the builder's popular-TLD set
    suspicious_file_extension: bool = False
    percentage_numeric_chars: float = 0.0
    url_shortened: bool = False
    server_client_domain: bool = False
# Characters counted in every URL component, keyed by the name fragment used
# in the matching ``count_<name>_<scope>`` field of URLLexicalFeatures.
_COUNTED_CHARACTERS = {
    "dot": ".",
    "dash": "-",
    "underscore": "_",
    "slash": "/",
    "question": "?",
    "equals": "=",
    "at": "@",
    "ampersand": "&",
    "exclamation": "!",
    "space": " ",
    "tilde": "~",
    "comma": ",",
    "plus": "+",
    "asterisk": "*",
    "hash": "#",
    "dollar": "$",
    "percent": "%",
}

# Public suffixes treated as "popular" for the tld_popularity flag.
_POPULAR_TLDS = frozenset({"com", "org", "net", "edu", "gov", "io", "co", "pl"})

# Path endings flagged by suspicious_file_extension; a tuple so it can be
# passed straight to str.endswith.
_SUSPICIOUS_EXTENSIONS = (
    ".exe",
    ".scr",
    ".bat",
    ".cmd",
    ".js",
    ".jar",
    ".vbs",
    ".ps1",
    ".zip",
    ".rar",
)


def _character_counts(text: str, scope: str) -> dict:
    """Build the ``count_<name>_<scope>`` keyword arguments for *text*."""
    return {
        f"count_{name}_{scope}": count_char(character, text)
        for name, character in _COUNTED_CHARACTERS.items()
    }


def get_url_lexical_features(url: str) -> URLLexicalFeatures:
    """Get the lexical features for the given URL.

    The URL is split with ``urlparse`` into domain (``netloc``), path and
    query; the "directory" is the path without its last segment. Character
    counts are computed for the whole URL and for each of those components,
    and the remaining features are derived from the same pieces.

    Args:
        url: The URL to analyse.

    Returns:
        A populated ``URLLexicalFeatures`` instance.
    """
    parsed_url = urlparse(url)
    # NOTE(review): ``domain`` is the raw netloc, which per urllib.parse still
    # contains any userinfo and port; the domain-level features below
    # (including domain_in_ip_format and is_ip) are computed on that string.
    domain = parsed_url.netloc
    hostname = parsed_url.hostname or domain
    path = parsed_url.path
    query = parsed_url.query
    directory = "/".join(path.split("/")[:-1])

    # Extract once and reuse for both the subdomain and the suffix.
    extracted = tldextract.extract(hostname)
    subdomain = extracted.subdomain
    tld_suffix = extracted.suffix.lower()

    subdomain_parts = [part for part in subdomain.split(".") if part]
    average_subdomain_length = (
        sum(len(part) for part in subdomain_parts) / len(subdomain_parts)
        if subdomain_parts
        else 0.0
    )

    url_shortening_match = uses_shortening_service(url)
    # These two values each feed a pair of fields, so compute them once.
    has_fragment = bool(parsed_url.fragment)
    has_server_client_keyword = contains_keywords(domain, ["server", "client"])

    return URLLexicalFeatures(
        **_character_counts(url, "url"),
        **_character_counts(domain, "domain"),
        **_character_counts(directory, "directory"),
        **_character_counts(query, "parameters"),
        url_length=len(url),
        tld_amount_url=tld_count(url),
        domain_length=len(domain),
        vowel_count_domain=count_vowels(domain),
        domain_in_ip_format=domain.replace(".", "").isdigit(),
        domain_contains_keywords=has_server_client_keyword,
        directory_length=len(directory),
        parameters_length=len(query),
        tld_presence_in_arguments=tld_count(query),
        number_of_parameters=len(parse_qs(query)),
        email_present_in_url=bool(
            re.search(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", url)
        ),
        domain_entropy=entropy(domain),
        url_depth=url_depth(url),
        uses_shortening_service=url_shortening_match,
        is_ip=valid_ip(domain),
        number_of_subdomains=len(subdomain_parts),
        average_subdomain_length=average_subdomain_length,
        having_hyphen_in_subdomain="-" in subdomain,
        having_underscore_in_subdomain="_" in subdomain,
        having_digit_in_subdomain=any(char.isdigit() for char in subdomain),
        having_special_char_in_subdomain=bool(
            re.search(r"[^A-Za-z0-9.-]", subdomain)
        ),
        having_fragment=has_fragment,
        having_anchor=has_fragment,
        entropy_of_url=entropy(url),
        repeated_digits_url=has_repeated_digits(url),
        repeated_digits_domain=has_repeated_digits(hostname),
        repeated_digits_directory=has_repeated_digits(directory),
        repeated_digits_parameters=has_repeated_digits(query),
        token_count=token_count(url),
        subdomain_count=len(subdomain_parts),
        tld_popularity=1 if tld_suffix in _POPULAR_TLDS else 0,
        suspicious_file_extension=path.lower().endswith(_SUSPICIOUS_EXTENSIONS),
        percentage_numeric_chars=numeric_chars_ratio(url),
        url_shortened=url_shortening_match is not None,
        server_client_domain=has_server_client_keyword,
    )
@cache
def get_url_lexical_features_cached(url: str) -> URLLexicalFeatures:
    """Return the lexical features for *url*, memoised per distinct URL.

    Delegates to ``get_url_lexical_features``; because of ``functools.cache``
    a repeated call with the same URL returns the previously built object.
    The cache has no size limit.

    Args:
        url: The URL to analyse.

    Returns:
        The ``URLLexicalFeatures`` computed for *url*.
    """
    features = get_url_lexical_features(url)
    return features
# Example usage: extract and print the features of a sample URL.
if __name__ == "__main__":
    sample_url = "https://192.1.10.1/path/to/file.html?arg1=val1&arg2=val2"
    print(get_url_lexical_features(sample_url))