web2vec.crawlers.extractors module
- class web2vec.crawlers.extractors.CertificateExtractor[source]
Bases: Extractor
- FEATURE_CLASS
alias of CertificateFeatures
- FEATURE_TYPE = 'SSL'
- extract_features(response: Response | Response) → CertificateFeatures[source]
- class web2vec.crawlers.extractors.DNSExtractor[source]
Bases: Extractor
- FEATURE_CLASS
alias of DNSFeatures
- FEATURE_TYPE = 'DNS'
- extract_features(response: Response | Response) → DNSFeatures[source]
- class web2vec.crawlers.extractors.Extractor[source]
Bases: object
- FEATURE_CLASS = None
- FEATURE_TYPE = None
- class web2vec.crawlers.extractors.GoogleIndexExtractor[source]
Bases: Extractor
- FEATURE_CLASS
alias of GoogleIndexFeatures
- FEATURE_TYPE = 'GOOGLE_INDEX'
- extract_features(response: Response | Response) → GoogleIndexFeatures[source]
- class web2vec.crawlers.extractors.HtmlBodyExtractor(enable_js_render: bool = False, save_html_snapshot: bool = False, snapshot_output_dir: str | None = None, render_wait_seconds: float = 2.0)[source]
Bases: Extractor
- FEATURE_CLASS
alias of HtmlBodyFeatures
- FEATURE_TYPE = 'HTML'
- extract_features(response: Response | Response) → HtmlBodyFeatures[source]
- class web2vec.crawlers.extractors.HttpResponseExtractor[source]
Bases: Extractor
- FEATURE_CLASS
alias of HttpResponseFeatures
- FEATURE_TYPE = 'HTTP'
- extract_features(response: Response | Response) → HttpResponseFeatures[source]
- class web2vec.crawlers.extractors.OpenPageRankExtractor[source]
Bases: Extractor
- FEATURE_CLASS
alias of OpenPageRankFeatures
- FEATURE_TYPE = 'OPEN_PAGE_RANK'
- extract_features(response: Response | Response) → OpenPageRankFeatures[source]
- class web2vec.crawlers.extractors.OpenPhishExtractor[source]
Bases: Extractor
- FEATURE_CLASS
alias of OpenPhishFeatures
- FEATURE_TYPE = 'OPEN_PHISH'
- extract_features(response: Response | Response) → OpenPhishFeatures[source]
- class web2vec.crawlers.extractors.PhishTankExtractor[source]
Bases: Extractor
- FEATURE_CLASS
alias of PhishTankFeatures
- FEATURE_TYPE = 'PHISH_TANK'
- extract_features(response: Response | Response) → PhishTankFeatures[source]
- class web2vec.crawlers.extractors.SimilarWebExtractor[source]
Bases: Extractor
- FEATURE_CLASS
alias of SimilarWebFeatures
- FEATURE_TYPE = 'SIMILAR_WEB'
- extract_features(response: Response | Response) → SimilarWebFeatures[source]
- class web2vec.crawlers.extractors.UrlGeoExtractor[source]
Bases: Extractor
- FEATURE_CLASS
alias of URLGeoFeatures
- FEATURE_TYPE = 'GEO'
- extract_features(response: Response | Response) → URLGeoFeatures[source]
- class web2vec.crawlers.extractors.UrlHausExtractor[source]
Bases: Extractor
- FEATURE_CLASS
alias of URLHausFeatures
- FEATURE_TYPE = 'URL_HAUS'
- extract_features(response: Response | Response) → URLHausFeatures[source]
- class web2vec.crawlers.extractors.UrlLexicalExtractor[source]
Bases: Extractor
- FEATURE_CLASS
alias of URLLexicalFeatures
- FEATURE_TYPE = 'LEXICAL'
- extract_features(response: Response | Response) → URLLexicalFeatures[source]
- class web2vec.crawlers.extractors.WhoisExtractor[source]
Bases: Extractor
- FEATURE_CLASS
alias of WhoisFeatures
- FEATURE_TYPE = 'WHOIS'
- extract_features(response: Response | Response) → WhoisFeatures[source]