web2vec.crawlers.extractors module

class web2vec.crawlers.extractors.CertificateExtractor[source]

Bases: Extractor

FEATURE_CLASS

alias of CertificateFeatures

FEATURE_TYPE = 'SSL'
extract_features(response: Response | Response) -> CertificateFeatures[source]
class web2vec.crawlers.extractors.DNSExtractor[source]

Bases: Extractor

FEATURE_CLASS

alias of DNSFeatures

FEATURE_TYPE = 'DNS'
extract_features(response: Response | Response) -> DNSFeatures[source]
class web2vec.crawlers.extractors.Extractor[source]

Bases: object

FEATURE_CLASS = None
FEATURE_TYPE = None
extract_features(response: Response | Response) -> None[source]
features_name() -> str[source]
class web2vec.crawlers.extractors.GoogleIndexExtractor[source]

Bases: Extractor

FEATURE_CLASS

alias of GoogleIndexFeatures

FEATURE_TYPE = 'GOOGLE_INDEX'
extract_features(response: Response | Response) -> GoogleIndexFeatures[source]
class web2vec.crawlers.extractors.HtmlBodyExtractor(enable_js_render: bool = False, save_html_snapshot: bool = False, snapshot_output_dir: str | None = None, render_wait_seconds: float = 2.0)[source]

Bases: Extractor

FEATURE_CLASS

alias of HtmlBodyFeatures

FEATURE_TYPE = 'HTML'
extract_features(response: Response | Response) -> HtmlBodyFeatures[source]
class web2vec.crawlers.extractors.HttpResponseExtractor[source]

Bases: Extractor

FEATURE_CLASS

alias of HttpResponseFeatures

FEATURE_TYPE = 'HTTP'
extract_features(response: Response | Response) -> HttpResponseFeatures[source]
class web2vec.crawlers.extractors.OpenPageRankExtractor[source]

Bases: Extractor

FEATURE_CLASS

alias of OpenPageRankFeatures

FEATURE_TYPE = 'OPEN_PAGE_RANK'
extract_features(response: Response | Response) -> OpenPageRankFeatures[source]
class web2vec.crawlers.extractors.OpenPhishExtractor[source]

Bases: Extractor

FEATURE_CLASS

alias of OpenPhishFeatures

FEATURE_TYPE = 'OPEN_PHISH'
extract_features(response: Response | Response) -> OpenPhishFeatures[source]
class web2vec.crawlers.extractors.PhishTankExtractor[source]

Bases: Extractor

FEATURE_CLASS

alias of PhishTankFeatures

FEATURE_TYPE = 'PHISH_TANK'
extract_features(response: Response | Response) -> PhishTankFeatures[source]
class web2vec.crawlers.extractors.SimilarWebExtractor[source]

Bases: Extractor

FEATURE_CLASS

alias of SimilarWebFeatures

FEATURE_TYPE = 'SIMILAR_WEB'
extract_features(response: Response | Response) -> SimilarWebFeatures[source]
class web2vec.crawlers.extractors.UrlGeoExtractor[source]

Bases: Extractor

FEATURE_CLASS

alias of URLGeoFeatures

FEATURE_TYPE = 'GEO'
extract_features(response: Response | Response) -> URLGeoFeatures[source]
class web2vec.crawlers.extractors.UrlHausExtractor[source]

Bases: Extractor

FEATURE_CLASS

alias of URLHausFeatures

FEATURE_TYPE = 'URL_HAUS'
extract_features(response: Response | Response) -> URLHausFeatures[source]
class web2vec.crawlers.extractors.UrlLexicalExtractor[source]

Bases: Extractor

FEATURE_CLASS

alias of URLLexicalFeatures

FEATURE_TYPE = 'LEXICAL'
extract_features(response: Response | Response) -> URLLexicalFeatures[source]
class web2vec.crawlers.extractors.WhoisExtractor[source]

Bases: Extractor

FEATURE_CLASS

alias of WhoisFeatures

FEATURE_TYPE = 'WHOIS'
extract_features(response: Response | Response) -> WhoisFeatures[source]
web2vec.crawlers.extractors.process_extractors(url: str, extractors: List[Extractor], use_only_numerical: bool = False) -> dict[source]

Process a list of extractors for a given URL.