Source code for web2vec.extractors.network_features

import json
import os
from typing import List, Optional
from urllib.parse import urljoin

import networkx as nx
from bs4 import BeautifulSoup

from web2vec.utils import get_domain_from_url


def build_graph(
    main_directory: str, allowed_domains: Optional[List] = None
) -> nx.DiGraph:
    """Build a directed graph from the crawled web pages.

    Every ``*.json`` file directly inside ``main_directory`` is loaded and is
    expected to hold an object with a ``"url"`` key (the page address) and an
    ``"html"`` key (the page markup). Each page becomes a node, and each
    ``<a href=...>`` found in its markup becomes an edge from the page to the
    resolved link target.

    Args:
        main_directory: Directory containing the crawled pages as JSON files.
        allowed_domains: If given, only links whose domain (as returned by
            ``get_domain_from_url``) is in this collection produce edges.
            ``None`` keeps every link.

    Returns:
        A ``networkx.DiGraph`` whose nodes are URLs and whose edges are
        hyperlinks between them.

    Raises:
        KeyError: If a JSON file lacks the ``"url"`` or ``"html"`` key.
        json.JSONDecodeError: If a ``.json`` file is not valid JSON.
    """
    graph = nx.DiGraph()

    for filename in os.listdir(main_directory):
        if not filename.endswith(".json"):
            continue

        filepath = os.path.join(main_directory, filename)
        with open(filepath, "r", encoding="utf-8") as f:
            data = json.load(f)

        url = data["url"]
        html_content = data["html"]

        # Add the page even if it has no outgoing links, so isolated pages
        # still appear in the graph.
        graph.add_node(url)

        soup = BeautifulSoup(html_content, "html.parser")
        for link in soup.find_all("a", href=True):
            # Resolve every href against the page URL. urljoin leaves
            # absolute URLs untouched and correctly expands all relative
            # forms ("/x", "x.html", "../x", "?q=1", "//host/x"), not just
            # the root-relative ones.
            target_url = urljoin(url, link["href"])

            if allowed_domains is not None:
                link_domain = get_domain_from_url(target_url)
                if link_domain not in allowed_domains:
                    continue

            graph.add_edge(url, target_url)

    return graph