import re
from dataclasses import dataclass
from functools import cache
from typing import Optional
from urllib.parse import parse_qs, urlparse
import tldextract
from web2vec.utils import entropy, fetch_file_from_url_and_read, valid_ip
# Regex alternation (pipe-separated, dots escaped) of URL-shortener host names.
# Each tuple entry is one group of alternatives; the groups are joined with
# "|" to form a single pattern string.
shortening_services = "|".join(
    (
        r"bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs",
        r"yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com",
        r"short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us",
        r"doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|db\.tt",
        r"qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|q\.gs|is\.gd",
        r"po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|x\.co",
        r"prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd",
        r"tr\.im|link\.zip\.net",
    )
)
# Helper functions
def count_char(character: str, string: str) -> int:
    """Return how many times ``character`` occurs in ``string``.

    Occurrences are counted without overlap, as ``str.count`` does.
    """
    occurrences = string.count(character)
    return occurrences
def count_vowels(string: str) -> int:
    """Return the number of ASCII vowels (either case) found in ``string``."""
    vowels = "aeiouAEIOU"
    return sum(1 for letter in string if letter in vowels)
def contains_keywords(string: str, keywords: list) -> bool:
    """Check case-insensitively whether the string contains any keyword.

    Args:
        string: Text to search in.
        keywords: Substrings to look for.

    Returns:
        True if at least one keyword occurs in ``string`` when both are
        compared in lower case; False otherwise, including when
        ``keywords`` is empty.
    """
    # Lower-case the haystack once instead of once per keyword.
    haystack = string.lower()
    # Keywords are lower-cased too; otherwise a keyword containing an
    # upper-case letter could never match the lower-cased haystack.
    return any(keyword.lower() in haystack for keyword in keywords)
def tld_count(string: str) -> int:
    """Return how often the extracted suffix, preceded by a dot, occurs.

    The suffix is whatever ``tldextract.extract`` reports for ``string``.
    When no suffix is found the result is 0; otherwise ``.<suffix>`` is
    counted in the lower-cased input.
    """
    suffix = tldextract.extract(string).suffix
    if not suffix:
        return 0
    return string.lower().count("." + suffix)
def url_depth(url):
    """Return the number of non-empty "/"-separated segments in the URL path."""
    path_parts = urlparse(url).path.split("/")
    # Empty parts come from leading, trailing, or doubled slashes.
    return sum(1 for part in path_parts if part)
def uses_shortening_service(url) -> Optional[str]:
    """Return the URL-shortening service name found in ``url``, if any.

    The list of known services is read through
    ``fetch_file_from_url_and_read`` (one name per line, blank lines
    ignored). Each name is matched literally, anywhere in ``url``.

    Args:
        url: The URL string to inspect.

    Returns:
        The leftmost matching service name as it appears in ``url``, or
        ``None`` when nothing matches or the service list is empty.
    """
    services_text = fetch_file_from_url_and_read(
        "https://raw.githubusercontent.com/korlabsio/urlshortener/main/names.txt"
    )
    # NOTE(review): assumes the helper returns the file content as text;
    # a falsy result is treated as an empty list.
    service_names = [
        line.strip() for line in (services_text or "").splitlines() if line.strip()
    ]
    if not service_names:
        # An empty alternation is the pattern "", which matches every URL.
        return None
    # NOTE(review): the search covers the whole URL, not only the host, so a
    # service name occurring in the path or query also counts as a match.
    pattern = "|".join(map(re.escape, service_names))
    match = re.search(pattern, url)
    # Return the matched text so the result honours the declared
    # Optional[str] return type rather than leaking an re.Match object.
    return match.group(0) if match else None
@dataclass
class URLLexicalFeatures:
    """Lexical features computed for a single URL.

    Instances are populated by ``get_url_lexical_features``. The field
    suffix names the part of the URL a value was computed from:

    * ``_url``: the full URL string.
    * ``_domain``: the ``netloc`` component returned by ``urlparse``.
    * ``_directory``: the URL path without its last "/"-separated segment.
    * ``_parameters``: the query string.

    Each ``count_<name>_<scope>`` field is the number of occurrences of one
    punctuation character in that part.
    """

    # Character counts over the full URL.
    count_dot_url: int
    count_dash_url: int
    count_underscore_url: int
    count_slash_url: int
    count_question_url: int
    count_equals_url: int
    count_at_url: int
    count_ampersand_url: int
    count_exclamation_url: int
    count_space_url: int
    count_tilde_url: int
    count_comma_url: int
    count_plus_url: int
    count_asterisk_url: int
    count_hash_url: int
    count_dollar_url: int
    count_percent_url: int
    # Length of the full URL and the result of tld_count(url).
    url_length: int
    tld_amount_url: int
    # Character counts over the domain (netloc).
    count_dot_domain: int
    count_dash_domain: int
    count_underscore_domain: int
    count_slash_domain: int
    count_question_domain: int
    count_equals_domain: int
    count_at_domain: int
    count_ampersand_domain: int
    count_exclamation_domain: int
    count_space_domain: int
    count_tilde_domain: int
    count_comma_domain: int
    count_plus_domain: int
    count_asterisk_domain: int
    count_hash_domain: int
    count_dollar_domain: int
    count_percent_domain: int
    # Other domain-level features.
    domain_length: int
    vowel_count_domain: int
    # True when the domain consists only of digits once its dots are removed.
    domain_in_ip_format: bool
    # True when the domain contains "server" or "client".
    domain_contains_keywords: bool
    # Character counts over the directory part of the path.
    count_dot_directory: int
    count_dash_directory: int
    count_underscore_directory: int
    count_slash_directory: int
    count_question_directory: int
    count_equals_directory: int
    count_at_directory: int
    count_ampersand_directory: int
    count_exclamation_directory: int
    count_space_directory: int
    count_tilde_directory: int
    count_comma_directory: int
    count_plus_directory: int
    count_asterisk_directory: int
    count_hash_directory: int
    count_dollar_directory: int
    count_percent_directory: int
    directory_length: int
    # Character counts over the query string.
    count_dot_parameters: int
    count_dash_parameters: int
    count_underscore_parameters: int
    count_slash_parameters: int
    count_question_parameters: int
    count_equals_parameters: int
    count_at_parameters: int
    count_ampersand_parameters: int
    count_exclamation_parameters: int
    count_space_parameters: int
    count_tilde_parameters: int
    count_comma_parameters: int
    count_plus_parameters: int
    count_asterisk_parameters: int
    count_hash_parameters: int
    count_dollar_parameters: int
    count_percent_parameters: int
    parameters_length: int
    # Result of tld_count(query).
    tld_presence_in_arguments: int
    # Number of distinct keys returned by parse_qs(query).
    number_of_parameters: int
    # True when an e-mail-like pattern is found anywhere in the URL.
    email_present_in_url: bool
    # Value returned by entropy(domain).
    domain_entropy: float
    # Number of non-empty segments in the URL path.
    url_depth: int
    # Result of uses_shortening_service(url).
    uses_shortening_service: Optional[str]
    # Result of valid_ip(domain); defaults to False.
    is_ip: bool = False
# Punctuation characters counted in every URL part, keyed by the name
# fragment used in the matching ``count_<name>_<scope>`` feature field.
_COUNTED_CHARACTERS = {
    "dot": ".",
    "dash": "-",
    "underscore": "_",
    "slash": "/",
    "question": "?",
    "equals": "=",
    "at": "@",
    "ampersand": "&",
    "exclamation": "!",
    "space": " ",
    "tilde": "~",
    "comma": ",",
    "plus": "+",
    "asterisk": "*",
    "hash": "#",
    "dollar": "$",
    "percent": "%",
}


def get_url_lexical_features(url: str) -> URLLexicalFeatures:
    """Build the ``URLLexicalFeatures`` record for ``url``.

    The URL is split with ``urlparse``. Character counts are taken over four
    parts: the full URL, the netloc ("domain"), the path without its last
    segment ("directory"), and the query string ("parameters"). The remaining
    features come from the module's helper functions.

    Args:
        url: The URL string to analyse.

    Returns:
        A populated ``URLLexicalFeatures`` instance.
    """
    parts = urlparse(url)
    netloc = parts.netloc
    query_string = parts.query
    # Everything before the last "/" of the path; empty if there is no "/".
    parent_path = parts.path.rpartition("/")[0]

    scopes = {
        "url": url,
        "domain": netloc,
        "directory": parent_path,
        "parameters": query_string,
    }
    # One count per (character, scope) pair, named after the dataclass field.
    char_counts = {}
    for scope, text in scopes.items():
        for label, symbol in _COUNTED_CHARACTERS.items():
            char_counts[f"count_{label}_{scope}"] = count_char(symbol, text)

    email_match = re.search(
        r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", url
    )

    return URLLexicalFeatures(
        **char_counts,
        url_length=len(url),
        tld_amount_url=tld_count(url),
        domain_length=len(netloc),
        vowel_count_domain=count_vowels(netloc),
        # Digits-and-dots check only; is_ip below uses valid_ip instead.
        domain_in_ip_format=netloc.replace(".", "").isdigit(),
        domain_contains_keywords=contains_keywords(netloc, ["server", "client"]),
        directory_length=len(parent_path),
        parameters_length=len(query_string),
        tld_presence_in_arguments=tld_count(query_string),
        number_of_parameters=len(parse_qs(query_string)),
        email_present_in_url=bool(email_match),
        domain_entropy=entropy(netloc),
        url_depth=url_depth(url),
        uses_shortening_service=uses_shortening_service(url),
        is_ip=valid_ip(netloc),
    )
@cache
def get_url_lexical_features_cached(url: str) -> URLLexicalFeatures:
    """Memoised variant of ``get_url_lexical_features``.

    Results are kept by ``functools.cache``, keyed by ``url``, so repeated
    calls with the same URL return the stored features object.
    """
    cached_features = get_url_lexical_features(url)
    return cached_features
# Example usage: print the features of a sample URL when run as a script.
if __name__ == "__main__":
    print(
        get_url_lexical_features(
            "https://192.1.10.1/path/to/file.html?arg1=val1&arg2=val2"
        )
    )