web2vec.extractors.url_lexical_features module

class web2vec.extractors.url_lexical_features.URLLexicalFeatures(count_dot_url: int, count_dash_url: int, count_underscore_url: int, count_slash_url: int, count_question_url: int, count_equals_url: int, count_at_url: int, count_ampersand_url: int, count_exclamation_url: int, count_space_url: int, count_tilde_url: int, count_comma_url: int, count_plus_url: int, count_asterisk_url: int, count_hash_url: int, count_dollar_url: int, count_percent_url: int, url_length: int, tld_amount_url: int, count_dot_domain: int, count_dash_domain: int, count_underscore_domain: int, count_slash_domain: int, count_question_domain: int, count_equals_domain: int, count_at_domain: int, count_ampersand_domain: int, count_exclamation_domain: int, count_space_domain: int, count_tilde_domain: int, count_comma_domain: int, count_plus_domain: int, count_asterisk_domain: int, count_hash_domain: int, count_dollar_domain: int, count_percent_domain: int, domain_length: int, vowel_count_domain: int, domain_in_ip_format: bool, domain_contains_keywords: bool, count_dot_directory: int, count_dash_directory: int, count_underscore_directory: int, count_slash_directory: int, count_question_directory: int, count_equals_directory: int, count_at_directory: int, count_ampersand_directory: int, count_exclamation_directory: int, count_space_directory: int, count_tilde_directory: int, count_comma_directory: int, count_plus_directory: int, count_asterisk_directory: int, count_hash_directory: int, count_dollar_directory: int, count_percent_directory: int, directory_length: int, count_dot_parameters: int, count_dash_parameters: int, count_underscore_parameters: int, count_slash_parameters: int, count_question_parameters: int, count_equals_parameters: int, count_at_parameters: int, count_ampersand_parameters: int, count_exclamation_parameters: int, count_space_parameters: int, count_tilde_parameters: int, count_comma_parameters: int, count_plus_parameters: int, count_asterisk_parameters: int, count_hash_parameters: int, count_dollar_parameters: int, count_percent_parameters: int, parameters_length: int, tld_presence_in_arguments: int, number_of_parameters: int, email_present_in_url: bool, domain_entropy: float, url_depth: int, uses_shortening_service: str | None, is_ip: bool = False)[source]

Bases: object

count_ampersand_directory: int
count_ampersand_domain: int
count_ampersand_parameters: int
count_ampersand_url: int
count_asterisk_directory: int
count_asterisk_domain: int
count_asterisk_parameters: int
count_asterisk_url: int
count_at_directory: int
count_at_domain: int
count_at_parameters: int
count_at_url: int
count_comma_directory: int
count_comma_domain: int
count_comma_parameters: int
count_comma_url: int
count_dash_directory: int
count_dash_domain: int
count_dash_parameters: int
count_dash_url: int
count_dollar_directory: int
count_dollar_domain: int
count_dollar_parameters: int
count_dollar_url: int
count_dot_directory: int
count_dot_domain: int
count_dot_parameters: int
count_dot_url: int
count_equals_directory: int
count_equals_domain: int
count_equals_parameters: int
count_equals_url: int
count_exclamation_directory: int
count_exclamation_domain: int
count_exclamation_parameters: int
count_exclamation_url: int
count_hash_directory: int
count_hash_domain: int
count_hash_parameters: int
count_hash_url: int
count_percent_directory: int
count_percent_domain: int
count_percent_parameters: int
count_percent_url: int
count_plus_directory: int
count_plus_domain: int
count_plus_parameters: int
count_plus_url: int
count_question_directory: int
count_question_domain: int
count_question_parameters: int
count_question_url: int
count_slash_directory: int
count_slash_domain: int
count_slash_parameters: int
count_slash_url: int
count_space_directory: int
count_space_domain: int
count_space_parameters: int
count_space_url: int
count_tilde_directory: int
count_tilde_domain: int
count_tilde_parameters: int
count_tilde_url: int
count_underscore_directory: int
count_underscore_domain: int
count_underscore_parameters: int
count_underscore_url: int
directory_length: int
domain_contains_keywords: bool
domain_entropy: float
domain_in_ip_format: bool
domain_length: int
email_present_in_url: bool
is_ip: bool = False
number_of_parameters: int
parameters_length: int
tld_amount_url: int
tld_presence_in_arguments: int
url_depth: int
url_length: int
uses_shortening_service: str | None
vowel_count_domain: int
web2vec.extractors.url_lexical_features.contains_keywords(string: str, keywords: list) -> bool[source]

Check if the string contains any of the keywords.

web2vec.extractors.url_lexical_features.count_char(character: str, string: str) -> int[source]

Count the number of occurrences of the character in the string.

web2vec.extractors.url_lexical_features.count_vowels(string: str) -> int[source]

Count the number of vowels in the string.

web2vec.extractors.url_lexical_features.get_url_lexical_features(url: str) -> URLLexicalFeatures[source]

Get the lexical features for the given URL.

web2vec.extractors.url_lexical_features.get_url_lexical_features_cached(url: str) -> URLLexicalFeatures[source]

Get the lexical features for the given URL.

web2vec.extractors.url_lexical_features.tld_count(string: str) -> int[source]

Count the number of times the TLD appears in the URL.

web2vec.extractors.url_lexical_features.url_depth(url)[source]

Calculate the depth of the URL.

web2vec.extractors.url_lexical_features.uses_shortening_service(url) -> str | None[source]

Check if the URL uses a shortening service.