Source code for web2vec.extractors.external_api.similar_web_features

import logging
from dataclasses import dataclass, field
from functools import cache
from typing import List, Optional

import requests

from web2vec.config import config

logger = logging.getLogger(__name__)


@dataclass
class Engagements:
    """Visitor-engagement figures reported for a site.

    ``get_similar_web_features`` fills every field from the payload's
    ``"Engagments"`` object, coercing each value with ``float``/``int``.
    """

    BounceRate: float
    Month: int
    Year: int
    PagePerVisit: float
    Visits: int
    TimeOnSite: float
@dataclass
class TopCountryShare:
    """One entry of the payload's ``"TopCountryShares"`` list.

    Instances are created with ``TopCountryShare(**entry)``, so the field
    names must stay identical to the keys of each payload entry.
    """

    Country: int
    CountryCode: str
    Value: float
@dataclass
class EstimatedMonthlyVisit:
    """A single (date, visits) pair.

    ``get_similar_web_features`` builds one instance per item of the
    payload's ``"EstimatedMonthlyVisits"`` mapping: the key becomes
    ``date`` and the value becomes ``visits``.
    """

    date: str
    visits: int
@dataclass
class TrafficSource:
    """Per-channel traffic values taken from the payload's ``"TrafficSources"``.

    Field names match the payload keys, except ``PaidReferrals``, which
    ``get_similar_web_features`` reads from the key ``"Paid Referrals"``.
    """

    Social: float
    PaidReferrals: float
    Mail: float
    Referrals: float
    Search: float
    Direct: float
@dataclass
class TopKeyword:
    """One entry of the payload's ``"TopKeywords"`` list.

    Instances are created with ``TopKeyword(**entry)``, so the field names
    must stay identical to the keys of each payload entry.
    """

    Name: str
    EstimatedValue: int
    Volume: int
    # Annotated as optional, but there is no default: callers must pass it.
    Cpc: Optional[float]
@dataclass
class SimilarWebFeatures:
    """Aggregate of everything extracted from one SimilarWeb response.

    Scalar fields hold values read from the decoded payload; the nested
    dataclasses and lists hold its structured sections. ``RawData`` keeps
    the decoded payload itself and defaults to a fresh empty dict.
    """

    # Top-level descriptive values.
    Version: int
    SiteName: str
    Description: str
    TopCountryShares: List[TopCountryShare]
    Title: str

    # Engagement and traffic-volume sections.
    Engagements: Engagements
    EstimatedMonthlyVisits: List[EstimatedMonthlyVisit]

    # Ranking information.
    GlobalRank: int
    CountryRank: int
    CountryCode: str
    CategoryRank: str
    Category: str

    LargeScreenshot: str
    TrafficSources: TrafficSource
    TopKeywords: List[TopKeyword]

    # default_factory gives every instance its own dict, never a shared one.
    RawData: dict = field(default_factory=dict)
def get_similar_web_features(domain: str) -> Optional[SimilarWebFeatures]:
    """Get SimilarWeb features for a given domain.

    Performs a GET against the SimilarWeb data endpoint and maps the decoded
    JSON payload onto ``SimilarWebFeatures``.

    Args:
        domain: Domain name interpolated into the ``domain`` query parameter.

    Returns:
        The populated ``SimilarWebFeatures``, or ``None`` when the request
        fails with a ``requests.exceptions.RequestException`` (the error is
        logged).

    Raises:
        KeyError, TypeError, ValueError: If the payload lacks one of the
            sections that are read by direct indexing (``"Engagments"``,
            ``"TrafficSources"``, ``"GlobalRank"``, ``"CountryRank"``,
            ``"CategoryRank"``) or holds values that cannot be converted.
            These are not caught here and propagate to the caller.
    """
    url = f"https://data.similarweb.com/api/v1/data?domain={domain}"  # noqa

    # Only the network round-trip and body decoding are guarded; payload
    # mapping errors were never RequestExceptions and still propagate.
    try:
        response = requests.get(
            url, headers={"User-Agent": "Mozilla/5.0"}, timeout=config.api_timeout
        )
        response.raise_for_status()
        data = response.json()
    except requests.exceptions.RequestException as e:
        # Lazy %-style argument: the previous call passed `e` as a format
        # argument to a message without a placeholder, which made the logging
        # module report a formatting error instead of emitting the message.
        logger.error("Error fetching data: %s", e)
        return None

    top_country_shares = [
        TopCountryShare(**country) for country in data.get("TopCountryShares", [])
    ]

    # NOTE: the key really is read as "Engagments" (sic) - presumably the
    # upstream spelling; do not "correct" it without checking the API.
    raw_engagements = data["Engagments"]
    engagements = Engagements(
        BounceRate=float(raw_engagements["BounceRate"]),
        Month=int(raw_engagements["Month"]),
        Year=int(raw_engagements["Year"]),
        PagePerVisit=float(raw_engagements["PagePerVisit"]),
        Visits=int(raw_engagements["Visits"]),
        TimeOnSite=float(raw_engagements["TimeOnSite"]),
    )

    estimated_monthly_visits = [
        EstimatedMonthlyVisit(date=k, visits=v)
        for k, v in data.get("EstimatedMonthlyVisits", {}).items()
    ]

    raw_sources = data["TrafficSources"]
    traffic_sources = TrafficSource(
        Social=raw_sources["Social"],
        PaidReferrals=raw_sources["Paid Referrals"],
        Mail=raw_sources["Mail"],
        Referrals=raw_sources["Referrals"],
        Search=raw_sources["Search"],
        Direct=raw_sources["Direct"],
    )

    top_keywords = [TopKeyword(**keyword) for keyword in data.get("TopKeywords", [])]

    return SimilarWebFeatures(
        Version=data.get("Version", 0),
        SiteName=data.get("SiteName", ""),
        Description=data.get("Description", ""),
        TopCountryShares=top_country_shares,
        Title=data.get("Title", ""),
        Engagements=engagements,
        EstimatedMonthlyVisits=estimated_monthly_visits,
        GlobalRank=data["GlobalRank"]["Rank"],
        CountryRank=data["CountryRank"]["Rank"],
        CountryCode=data["CountryRank"]["CountryCode"],
        CategoryRank=data["CategoryRank"]["Rank"],
        Category=data.get("Category", ""),
        LargeScreenshot=data.get("LargeScreenshot", ""),
        TrafficSources=traffic_sources,
        TopKeywords=top_keywords,
        RawData=data,
    )
@cache
def get_similar_web_features_cached(domain: str) -> Optional[SimilarWebFeatures]:
    """Return the SimilarWeb features for ``domain``, memoised per domain.

    Delegates to ``get_similar_web_features``; ``functools.cache`` stores
    whatever that call returns (including ``None``) for repeat lookups of
    the same ``domain``.
    """
    features = get_similar_web_features(domain)
    return features
if __name__ == "__main__":
    # Manual smoke run: fetch the features for one sample domain and print them.
    domain_to_check = "down.pcclear.com"
    print(get_similar_web_features(domain_to_check))