web2vec.extractors.url_lexical_features module

class web2vec.extractors.url_lexical_features.URLLexicalFeatures(count_dot_url: int, count_dash_url: int, count_underscore_url: int, count_slash_url: int, count_question_url: int, count_equals_url: int, count_at_url: int, count_ampersand_url: int, count_exclamation_url: int, count_space_url: int, count_tilde_url: int, count_comma_url: int, count_plus_url: int, count_asterisk_url: int, count_hash_url: int, count_dollar_url: int, count_percent_url: int, url_length: int, tld_amount_url: int, count_dot_domain: int, count_dash_domain: int, count_underscore_domain: int, count_slash_domain: int, count_question_domain: int, count_equals_domain: int, count_at_domain: int, count_ampersand_domain: int, count_exclamation_domain: int, count_space_domain: int, count_tilde_domain: int, count_comma_domain: int, count_plus_domain: int, count_asterisk_domain: int, count_hash_domain: int, count_dollar_domain: int, count_percent_domain: int, domain_length: int, vowel_count_domain: int, domain_in_ip_format: bool, domain_contains_keywords: bool, count_dot_directory: int, count_dash_directory: int, count_underscore_directory: int, count_slash_directory: int, count_question_directory: int, count_equals_directory: int, count_at_directory: int, count_ampersand_directory: int, count_exclamation_directory: int, count_space_directory: int, count_tilde_directory: int, count_comma_directory: int, count_plus_directory: int, count_asterisk_directory: int, count_hash_directory: int, count_dollar_directory: int, count_percent_directory: int, directory_length: int, count_dot_parameters: int, count_dash_parameters: int, count_underscore_parameters: int, count_slash_parameters: int, count_question_parameters: int, count_equals_parameters: int, count_at_parameters: int, count_ampersand_parameters: int, count_exclamation_parameters: int, count_space_parameters: int, count_tilde_parameters: int, count_comma_parameters: int, count_plus_parameters: int, count_asterisk_parameters: int, count_hash_parameters: int, count_dollar_parameters: int, count_percent_parameters: int, parameters_length: int, tld_presence_in_arguments: int, number_of_parameters: int, email_present_in_url: bool, domain_entropy: float, url_depth: int, uses_shortening_service: str | None, is_ip: bool = False, number_of_subdomains: int = 0, average_subdomain_length: float = 0.0, having_hyphen_in_subdomain: bool = False, having_underscore_in_subdomain: bool = False, having_digit_in_subdomain: bool = False, having_special_char_in_subdomain: bool = False, having_fragment: bool = False, having_anchor: bool = False, entropy_of_url: float = 0.0, repeated_digits_url: bool = False, repeated_digits_domain: bool = False, repeated_digits_directory: bool = False, repeated_digits_parameters: bool = False, token_count: int = 0, subdomain_count: int = 0, tld_popularity: int = 0, suspicious_file_extension: bool = False, percentage_numeric_chars: float = 0.0, url_shortened: bool = False, server_client_domain: bool = False)[source]

Bases: object

average_subdomain_length: float = 0.0
count_ampersand_directory: int
count_ampersand_domain: int
count_ampersand_parameters: int
count_ampersand_url: int
count_asterisk_directory: int
count_asterisk_domain: int
count_asterisk_parameters: int
count_asterisk_url: int
count_at_directory: int
count_at_domain: int
count_at_parameters: int
count_at_url: int
count_comma_directory: int
count_comma_domain: int
count_comma_parameters: int
count_comma_url: int
count_dash_directory: int
count_dash_domain: int
count_dash_parameters: int
count_dash_url: int
count_dollar_directory: int
count_dollar_domain: int
count_dollar_parameters: int
count_dollar_url: int
count_dot_directory: int
count_dot_domain: int
count_dot_parameters: int
count_dot_url: int
count_equals_directory: int
count_equals_domain: int
count_equals_parameters: int
count_equals_url: int
count_exclamation_directory: int
count_exclamation_domain: int
count_exclamation_parameters: int
count_exclamation_url: int
count_hash_directory: int
count_hash_domain: int
count_hash_parameters: int
count_hash_url: int
count_percent_directory: int
count_percent_domain: int
count_percent_parameters: int
count_percent_url: int
count_plus_directory: int
count_plus_domain: int
count_plus_parameters: int
count_plus_url: int
count_question_directory: int
count_question_domain: int
count_question_parameters: int
count_question_url: int
count_slash_directory: int
count_slash_domain: int
count_slash_parameters: int
count_slash_url: int
count_space_directory: int
count_space_domain: int
count_space_parameters: int
count_space_url: int
count_tilde_directory: int
count_tilde_domain: int
count_tilde_parameters: int
count_tilde_url: int
count_underscore_directory: int
count_underscore_domain: int
count_underscore_parameters: int
count_underscore_url: int
directory_length: int
domain_contains_keywords: bool
domain_entropy: float
domain_in_ip_format: bool
domain_length: int
email_present_in_url: bool
entropy_of_url: float = 0.0
having_anchor: bool = False
having_digit_in_subdomain: bool = False
having_fragment: bool = False
having_hyphen_in_subdomain: bool = False
having_special_char_in_subdomain: bool = False
having_underscore_in_subdomain: bool = False
is_ip: bool = False
number_of_parameters: int
number_of_subdomains: int = 0
parameters_length: int
percentage_numeric_chars: float = 0.0
repeated_digits_directory: bool = False
repeated_digits_domain: bool = False
repeated_digits_parameters: bool = False
repeated_digits_url: bool = False
server_client_domain: bool = False
subdomain_count: int = 0
suspicious_file_extension: bool = False
tld_amount_url: int
tld_popularity: int = 0
tld_presence_in_arguments: int
token_count: int = 0
url_depth: int
url_length: int
url_shortened: bool = False
uses_shortening_service: str | None
vowel_count_domain: int
web2vec.extractors.url_lexical_features.contains_keywords(string: str, keywords: list) -> bool [source]

Check if the string contains any of the keywords.

web2vec.extractors.url_lexical_features.count_char(character: str, string: str) -> int [source]

Count the number of occurrences of the character in the string.

web2vec.extractors.url_lexical_features.count_vowels(string: str) -> int [source]

Count the number of vowels in the string.

web2vec.extractors.url_lexical_features.get_url_lexical_features(url: str) -> URLLexicalFeatures [source]

Get the lexical features for the given URL.

web2vec.extractors.url_lexical_features.get_url_lexical_features_cached(url: str) -> URLLexicalFeatures [source]

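Get the lexical features for the given URL. The `_cached` suffix suggests that results are memoized per URL; confirm against the implementation.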

web2vec.extractors.url_lexical_features.has_repeated_digits(value: str) -> bool [source]

Return True when the string contains three or more repeated digits in sequence.

web2vec.extractors.url_lexical_features.numeric_chars_ratio(value: str) -> float [source]

Return the percentage of numeric characters, in the range 0..100.

web2vec.extractors.url_lexical_features.tld_count(string: str) -> int [source]

Count the number of times the TLD appears in the URL.

web2vec.extractors.url_lexical_features.token_count(value: str) -> int [source]

Count lexical tokens split by non-alphanumeric separators.

web2vec.extractors.url_lexical_features.url_depth(url)[source]

Calculate the depth of the URL.

web2vec.extractors.url_lexical_features.uses_shortening_service(url) -> str | None [source]

Check if the URL uses a shortening service.