class AmazonPhotoDataset(EasyGraphBuiltinDataset):
    r"""Amazon Electronics Photo co-purchase graph dataset.

    Nodes represent products, and edges link products frequently co-purchased.
    Node features are bag-of-words of product reviews. The task is to classify
    the product category.

    Statistics:

    - Nodes: 7,650
    - Edges: 119,081
    - Number of Classes: 8
    - Features: 745

    Parameters
    ----------
    raw_dir : str, optional
        Raw file directory to download/contains the input data directory. Default: None
    force_reload : bool, optional
        Whether to reload the dataset. Default: False
    verbose : bool, optional
        Whether to print out progress information. Default: True
    transform : callable, optional
        A transform that takes in a :class:`~easygraph.Graph` object and returns
        a transformed version. The :class:`~easygraph.Graph` object will be
        transformed before every access.

    Examples
    --------
    >>> from easygraph.datasets import AmazonPhotoDataset
    >>> dataset = AmazonPhotoDataset()
    >>> g = dataset[0]
    >>> print(g.number_of_nodes())
    >>> print(g.number_of_edges())
    >>> print(g.nodes[0]['feat'].shape)
    >>> print(g.nodes[0]['label'])
    >>> print(dataset.num_classes)
    """

    def __init__(self, raw_dir=None, force_reload=False, verbose=True, transform=None):
        # Initialize storage BEFORE super().__init__, which triggers
        # download/process: if loading fails midway, __getitem__ can then
        # raise the intended ValueError instead of an AttributeError.
        self._g = None
        self._num_classes = None
        super(AmazonPhotoDataset, self).__init__(
            name="amazon_photo",
            url="https://data.dgl.ai/dataset/amazon_co_buy_photo.zip",
            raw_dir=raw_dir,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )

    def process(self):
        """Parse the extracted ``.npz`` file into an EasyGraph graph.

        Reconstructs the CSR adjacency and attribute matrices stored in the
        archive, then attaches per-node ``feat``/``label`` attributes.
        """
        path = os.path.join(self.raw_path, "amazon_co_buy_photo.npz")
        # NpzFile keeps the underlying file handle open; use the context
        # manager so it is closed deterministically.
        with np.load(path) as data:
            adj = sp.csr_matrix(
                (data["adj_data"], data["adj_indices"], data["adj_indptr"]),
                shape=data["adj_shape"],
            )
            features = sp.csr_matrix(
                (data["attr_data"], data["attr_indices"], data["attr_indptr"]),
                shape=data["attr_shape"],
            ).todense()
            labels = data["labels"]

        g = eg.Graph()
        g.add_edges_from(list(zip(*adj.nonzero())))

        # `features` is a dense matrix; squeeze each row to a 1-D vector.
        for i in range(features.shape[0]):
            g.add_node(i, feat=np.array(features[i]).squeeze(), label=int(labels[i]))

        self._g = g
        self._num_classes = len(np.unique(labels))

        if self.verbose:
            print("Finished loading AmazonPhoto dataset.")
            print(f" NumNodes: {g.number_of_nodes()}")
            print(f" NumEdges: {g.number_of_edges()}")
            print(f" NumFeats: {features.shape[1]}")
            print(f" NumClasses: {self._num_classes}")

    def __getitem__(self, idx):
        """Return the single graph, applying the optional transform."""
        assert idx == 0, "AmazonPhotoDataset only contains one graph"
        if self._g is None:
            raise ValueError("Graph has not been loaded or processed correctly.")
        return self._g if self._transform is None else self._transform(self._g)

    def __len__(self):
        """The dataset holds exactly one graph."""
        return 1

    @property
    def num_classes(self):
        """int: number of distinct node labels."""
        return self._num_classes
class ArxivHEPTHDataset(EasyGraphBuiltinDataset):
    r"""Citation network of arXiv High Energy Physics - Theory preprints.

    A directed edge u -> v means paper u cites paper v. The graph carries no
    node features or labels.

    Parameters
    ----------
    raw_dir : str, optional
        Directory to store the raw downloaded files. Default: None
    force_reload : bool, optional
        Whether to re-download and process the dataset. Default: False
    verbose : bool, optional
        Whether to print detailed processing logs. Default: True
    transform : callable, optional
        Optional transform to apply on the graph.

    Examples
    --------
    >>> from easygraph.datasets import ArxivHEPTHDataset
    >>> dataset = ArxivHEPTHDataset()
    >>> g = dataset[0]
    >>> print("Nodes:", g.number_of_nodes())
    >>> print("Edges:", g.number_of_edges())
    """

    def __init__(self, raw_dir=None, force_reload=False, verbose=True, transform=None):
        super(ArxivHEPTHDataset, self).__init__(
            name="cit-HepTh",
            url="https://snap.stanford.edu/data/cit-HepTh.txt.gz",
            raw_dir=raw_dir,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )

    def download(self):
        r"""Fetch the gzip-compressed edge list and decompress it into raw_path."""
        gz_file = os.path.join(self.raw_dir, self.name + ".txt.gz")
        txt_file = os.path.join(self.raw_path, self.name + ".txt")

        download(self.url, path=gz_file)
        os.makedirs(self.raw_path, exist_ok=True)

        with gzip.open(gz_file, "rb") as src:
            with open(txt_file, "wb") as dst:
                shutil.copyfileobj(src, dst)

    def process(self):
        """Build the directed citation graph from the decompressed edge list."""
        g = eg.DiGraph()  # citations are directional
        txt_file = os.path.join(self.raw_path, self.name + ".txt")

        with open(txt_file, "r") as fh:
            for raw in fh:
                body = raw.strip()
                # Header/comment lines begin with '#'; blank lines carry no edge.
                if raw.startswith("#") or body == "":
                    continue
                src_id, dst_id = (int(tok) for tok in body.split())
                g.add_edge(src_id, dst_id)

        self._g = g
        self._num_nodes = g.number_of_nodes()
        self._num_edges = g.number_of_edges()

        if self.verbose:
            print("Finished loading Arxiv HEP-TH dataset.")
            print(f" NumNodes: {self._num_nodes}")
            print(f" NumEdges: {self._num_edges}")

    def __getitem__(self, idx):
        """Return the single citation graph (optionally transformed)."""
        assert idx == 0, "ArxivHEPTHDataset only contains one graph"
        if self._transform is None:
            return self._g
        return self._transform(self._g)

    def __len__(self):
        """Exactly one graph in this dataset."""
        return 1
class CoauthorCSDataset(EasyGraphBuiltinDataset):
    r"""CoauthorCS citation network dataset.

    Nodes are authors, and edges indicate co-authorship relationships. Each node
    has a bag-of-words feature vector and a label denoting the primary research field.

    Statistics:

    - Nodes: 18,333
    - Edges: 81,894
    - Feature Dim: 6,805
    - Classes: 15

    Parameters
    ----------
    raw_dir : str, optional
        Directory to store the raw downloaded files. Default: None
    force_reload : bool, optional
        Whether to re-download and process the dataset. Default: False
    verbose : bool, optional
        Whether to print detailed processing logs. Default: True
    transform : callable, optional
        Transform to apply to the graph on access.

    Examples
    --------
    >>> from easygraph.datasets import CoauthorCSDataset
    >>> dataset = CoauthorCSDataset()
    >>> g = dataset[0]
    >>> print("Nodes:", g.number_of_nodes())
    >>> print("Edges:", g.number_of_edges())
    >>> print("Feature shape:", g.nodes[0]['feat'].shape)
    >>> print("Label:", g.nodes[0]['label'])
    >>> print("Number of classes:", dataset.num_classes)
    """

    def __init__(self, raw_dir=None, force_reload=False, verbose=True, transform=None):
        # Initialize storage BEFORE super().__init__, which triggers
        # download/process: if loading fails midway, __getitem__ can then
        # raise the intended ValueError instead of an AttributeError.
        self._g = None
        self._num_classes = None
        super(CoauthorCSDataset, self).__init__(
            name="coauthor_cs",
            url="https://data.dgl.ai/dataset/coauthor_cs.zip",
            raw_dir=raw_dir,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )

    def process(self):
        """Parse the extracted ``.npz`` file into an EasyGraph graph."""
        path = os.path.join(self.raw_path, "coauthor_cs.npz")
        # NpzFile keeps the underlying file handle open; use the context
        # manager so it is closed deterministically.
        with np.load(path) as data:
            # Reconstruct adjacency matrix
            adj = sp.csr_matrix(
                (data["adj_data"], data["adj_indices"], data["adj_indptr"]),
                shape=data["adj_shape"],
            )
            # Reconstruct feature matrix
            features = sp.csr_matrix(
                (data["attr_data"], data["attr_indices"], data["attr_indptr"]),
                shape=data["attr_shape"],
            ).todense()
            labels = data["labels"]

        g = eg.Graph()
        g.add_edges_from(list(zip(*adj.nonzero())))

        for i in range(features.shape[0]):
            g.add_node(i, feat=np.array(features[i]).squeeze(), label=int(labels[i]))

        self._g = g
        self._num_classes = len(np.unique(labels))

        if self.verbose:
            print("Finished loading CoauthorCS dataset.")
            print(f" NumNodes: {g.number_of_nodes()}")
            print(f" NumEdges: {g.number_of_edges()}")
            print(f" NumFeats: {features.shape[1]}")
            print(f" NumClasses: {self._num_classes}")

    def __getitem__(self, idx):
        """Return the single graph, applying the optional transform."""
        assert idx == 0, "CoauthorCSDataset only contains one graph"
        if self._g is None:
            raise ValueError("Graph has not been loaded or processed correctly.")
        return self._g if self._transform is None else self._transform(self._g)

    def __len__(self):
        """The dataset holds exactly one graph."""
        return 1

    @property
    def num_classes(self):
        """int: number of distinct node labels."""
        return self._num_classes
class FacebookEgoNetDataset(EasyGraphBuiltinDataset):
    r"""Facebook Ego-Net social network dataset.

    Each node is a user, and edges represent friendship. The dataset
    includes 10 ego networks centered on different users, merged here into
    a single undirected graph. No node features or labels are attached.

    Parameters
    ----------
    raw_dir : str, optional
        Directory to store the raw downloaded files. Default: None
    force_reload : bool, optional
        Whether to re-download and process the dataset. Default: False
    verbose : bool, optional
        Whether to print detailed processing logs. Default: True
    transform : callable, optional
        Optional transform to apply on the graph.

    Examples
    --------
    >>> from easygraph.datasets import FacebookEgoNetDataset
    >>> dataset = FacebookEgoNetDataset()
    >>> g = dataset[0]
    >>> print("Nodes:", g.number_of_nodes())
    >>> print("Edges:", g.number_of_edges())
    """

    def __init__(self, raw_dir=None, force_reload=False, verbose=True, transform=None):
        super(FacebookEgoNetDataset, self).__init__(
            name="facebook",
            url="https://snap.stanford.edu/data/facebook.tar.gz",
            raw_dir=raw_dir,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )

    def process(self):
        """Merge every per-ego ``.edges`` file into one undirected graph."""
        parent_dir = os.path.join(self.raw_path, "facebook")
        g = eg.Graph()

        # sorted() makes node/edge insertion order deterministic across
        # filesystems (os.listdir order is unspecified).
        for filename in sorted(os.listdir(parent_dir)):
            if not filename.endswith(".edges"):
                continue
            edge_file = os.path.join(parent_dir, filename)
            with open(edge_file, "r") as f:
                for line in f:
                    parts = line.split()
                    # Skip blank/trailing lines instead of crashing on the
                    # two-value unpack.
                    if len(parts) != 2:
                        continue
                    g.add_edge(int(parts[0]), int(parts[1]))

        self._g = g
        self._num_nodes = g.number_of_nodes()
        self._num_edges = g.number_of_edges()

        if self.verbose:
            print("Finished loading Facebook Ego-Net dataset.")
            print(f" NumNodes: {self._num_nodes}")
            print(f" NumEdges: {self._num_edges}")

    def __getitem__(self, idx):
        """Return the single merged graph, applying the optional transform."""
        assert idx == 0, "FacebookEgoNetDataset only contains one merged graph"
        return self._g if self._transform is None else self._transform(self._g)

    def __len__(self):
        """The dataset holds exactly one merged graph."""
        return 1

    def download(self):
        r"""Automatically download data and extract it."""
        if self.url is not None:
            archive_path = os.path.join(self.raw_dir, self.name + ".tar.gz")
            download(self.url, path=archive_path)
            extract_archive(archive_path, self.raw_path)
class FlickrDataset(EasyGraphBuiltinDataset):
    r"""Flickr dataset for node classification.

    Nodes are images and edges represent social tags co-occurrence.
    Node features are precomputed image embeddings. Labels indicate image categories.

    Statistics:
    - Nodes: 89,250
    - Edges: 899,756
    - Classes: 7
    - Feature dim: 500

    Source: GraphSAINT (https://arxiv.org/abs/1907.04931)

    Parameters
    ----------
    raw_dir : str, optional
        Custom directory to download the dataset. Default: None (uses standard cache dir).
    force_reload : bool, optional
        Whether to re-download and reprocess. Default: False.
    verbose : bool, optional
        Whether to print loading progress. Default: False.
    transform : callable, optional
        A transform applied to the graph on access.
    reorder : bool, optional
        Whether to apply graph reordering for locality (requires torch). Default: False.
        NOTE(review): currently stored but not acted upon in process() — confirm
        whether reordering is implemented elsewhere or still pending.

    Examples
    --------
    >>> from easygraph.datasets import FlickrDataset
    >>> ds = FlickrDataset(verbose=True)
    >>> g = ds[0]
    >>> print(g.number_of_nodes(), g.number_of_edges(), ds.num_classes)
    >>> print(g.nodes[0]['feat'].shape, g.nodes[0]['label'])
    """

    def __init__(
        self,
        raw_dir=None,
        force_reload=False,
        verbose=False,
        transform=None,
        reorder=False,
    ):
        name = "flickr"
        url = self._get_dgl_url("dataset/flickr.zip")
        self._reorder = reorder
        super(FlickrDataset, self).__init__(
            name=name,
            url=url,
            raw_dir=raw_dir,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )

    def process(self):
        """Load adjacency, features, labels and role-based splits from raw_path."""
        # Load adjacency
        coo = sp.load_npz(os.path.join(self.raw_path, "adj_full.npz"))
        g = eg.Graph()
        g.add_edges_from(list(zip(*coo.nonzero())))

        # Load features
        feats = np.load(os.path.join(self.raw_path, "feats.npy"))
        # Load labels (class_map maps stringified node id -> class index)
        with open(os.path.join(self.raw_path, "class_map.json")) as f:
            class_map = json.load(f)
        labels = np.array([class_map[str(i)] for i in range(feats.shape[0])])

        # Load train/val/test splits ("tr"/"va"/"te" are node-id lists)
        with open(os.path.join(self.raw_path, "role.json")) as f:
            role = json.load(f)
        train_mask = np.zeros(feats.shape[0], dtype=bool)
        train_mask[role["tr"]] = True
        val_mask = np.zeros(feats.shape[0], dtype=bool)
        val_mask[role["va"]] = True
        test_mask = np.zeros(feats.shape[0], dtype=bool)
        test_mask[role["te"]] = True

        # Attach node data
        for i in range(feats.shape[0]):
            g.add_node(i, feat=feats[i].astype(np.float32), label=int(labels[i]))
        g.graph["train_mask"] = train_mask
        g.graph["val_mask"] = val_mask
        g.graph["test_mask"] = test_mask

        self._g = g
        self._num_classes = int(labels.max() + 1)
        if self.verbose:
            print("Loaded Flickr dataset")
            print(
                f" Nodes: {g.number_of_nodes()}, Edges: {g.number_of_edges()}, Features: {feats.shape[1]}, Classes: {self._num_classes}"
            )

    def __getitem__(self, idx):
        """Return the single graph, applying the optional transform.

        The split masks are already stored in ``g.graph`` by process();
        the previous pop-and-reassign "transfer" here was a no-op and has
        been removed.
        """
        assert idx == 0, "FlickrDataset contains only one graph"
        g = self._g
        return self._transform(g) if self._transform else g

    def __len__(self):
        """The dataset holds exactly one graph."""
        return 1

    @property
    def num_classes(self):
        """int: number of image categories."""
        return self._num_classes

    @staticmethod
    def _get_dgl_url(path):
        # Deferred import: utils may pull optional heavy dependencies.
        from .utils import _get_dgl_url

        return _get_dgl_url(path)
class GitHubUsersDataset(EasyGraphBuiltinDataset):
    r"""GitHub developers social graph (musae_git).

    Nodes are GitHub developers; a directed edge A -> B means A follows B.
    Each node carries a profile/activity feature vector and a binary label
    (``ml_target``: machine-learning vs. web developer).

    Statistics:
    - Nodes: 37,700
    - Edges: 289,003
    - Classes: 2

    Parameters
    ----------
    raw_dir : str, optional
        Directory to store raw data. Default: None
    force_reload : bool, optional
        Force re-download and processing. Default: False
    verbose : bool, optional
        Print processing information. Default: True
    transform : callable, optional
        Transform to apply to the graph on load.

    Examples
    --------
    >>> from easygraph.datasets import GitHubUsersDataset
    >>> dataset = GitHubUsersDataset()
    >>> g = dataset[0]
    >>> print("Nodes:", g.number_of_nodes())
    >>> print("Edges:", g.number_of_edges())
    >>> print("Feature shape:", g.nodes[0]['feat'].shape)
    >>> print("Label:", g.nodes[0]['label'])
    """

    def __init__(self, raw_dir=None, force_reload=False, verbose=True, transform=None):
        super(GitHubUsersDataset, self).__init__(
            name="musae_git",
            url="https://snap.stanford.edu/data/git_web_ml.zip",
            raw_dir=raw_dir,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )

    def download(self):
        """Download the zip archive and extract it into raw_path."""
        archive = os.path.join(self.raw_dir, self.name + ".zip")
        download(self.url, path=archive)
        extract_archive(archive, self.raw_path)

    def process(self):
        """Build the directed follower graph with per-node features/labels."""
        g = eg.DiGraph()
        base_path = os.path.join(self.raw_path, "git_web_ml")

        # Load node features (JSON maps stringified node id -> feature list).
        with open(os.path.join(base_path, "musae_git_features.json"), "r") as f:
            features = json.load(f)

        # Load labels
        labels = {}
        with open(os.path.join(base_path, "musae_git_target.csv"), "r") as f:
            for row in csv.DictReader(f):
                labels[int(row["id"])] = int(row["ml_target"])

        # Load edges
        with open(os.path.join(base_path, "musae_git_edges.csv"), "r") as f:
            for row in csv.DictReader(f):
                g.add_edge(int(row["id_1"]), int(row["id_2"]))

        # Attach attributes by iterating the feature file, not g.nodes:
        # this keeps isolated users (features but no edges) instead of
        # dropping them, and cannot KeyError on edge-only ids.
        feat_dim = 0
        for node_str, vec in features.items():
            feat = np.array(vec, dtype=np.float32)
            feat_dim = max(feat_dim, feat.shape[0])
            g.add_node(int(node_str), feat=feat, label=labels.get(int(node_str), -1))

        self._g = g
        self._num_classes = len(set(labels.values()))

        if self.verbose:
            print("Finished loading GitHub Users dataset.")
            print(f" NumNodes: {g.number_of_nodes()}")
            print(f" NumEdges: {g.number_of_edges()}")
            # feat_dim is computed over all nodes (the previous code read the
            # loop variable after the loop, which fails on an empty file).
            print(f" Feature dim: {feat_dim}")
            print(f" NumClasses: {self._num_classes}")

    def __getitem__(self, idx):
        """Return the single graph, applying the optional transform."""
        assert idx == 0, "GitHubUsersDataset only contains one graph"
        return self._g if self._transform is None else self._transform(self._g)

    def __len__(self):
        """The dataset holds exactly one graph."""
        return 1

    @property
    def num_classes(self):
        """int: number of distinct node labels."""
        return self._num_classes
class RedditDataset(EasyGraphBuiltinDataset):
    r"""Reddit posts graph (Sept 2014) for community (subreddit) classification.

    Statistics:
    - Nodes: ~232,965
    - Edges: ~114 million (approx.)
    - Features per node: 602
    - Classes: number of subreddit communities

    Data are split by post-day: first 20 days train, then validation (30%), test (rest).

    Parameters
    ----------
    self_loop : bool
        Add self-loop edges if True.
    raw_dir, force_reload, verbose, transform : same as EasyGraphBuiltinDataset
    """

    def __init__(
        self,
        self_loop=False,
        raw_dir=None,
        force_reload=False,
        verbose=True,
        transform=None,
    ):
        self.self_loop = self_loop
        # Set before super().__init__ (which triggers process) so a failed
        # load leaves a well-defined attribute.
        self._g = None
        super().__init__(
            name="reddit",
            url="https://data.dgl.ai/dataset/reddit.zip",
            raw_dir=raw_dir,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )

    def process(self):
        """Load node data and adjacency from the two extracted ``.npz`` files."""
        # Expect two files extracted: reddit_data.npz & reddit_graph.npz
        data = np.load(os.path.join(self.raw_path, "reddit_data.npz"))
        feat = data["feature"]  # shape [N, 602]
        labels = data["label"]  # shape [N]
        split = data["node_types"]  # 1=train, 2=val, 3=test

        # Load adjacency
        adj = sp.load_npz(os.path.join(self.raw_path, "reddit_graph.npz"))
        src, dst = adj.nonzero()
        if self.self_loop:
            self_loops = np.arange(adj.shape[0])
            src = np.concatenate([src, self_loops])
            dst = np.concatenate([dst, self_loops])
        edges = list(zip(src, dst))

        # Build graph
        g = eg.Graph()
        g.add_edges_from(edges)

        # Assign node features, labels, and masks
        for i in range(feat.shape[0]):
            g.add_node(
                i,
                feat=feat[i],
                label=int(labels[i]),
                train_mask=(split[i] == 1),
                val_mask=(split[i] == 2),
                test_mask=(split[i] == 3),
            )

        self._g = g
        self._num_classes = int(np.max(labels) + 1)

        if self.verbose:
            print("Loaded Reddit dataset:")
            print(f" NumNodes: {g.number_of_nodes()}")
            print(f" NumEdges: {g.number_of_edges()}")
            print(f" NumFeats: {feat.shape[1]}")
            print(f" NumClasses: {self._num_classes}")

    def __getitem__(self, idx):
        """Return the single graph, applying the optional transform.

        Uses ``self._transform`` — the attribute set by the base class and
        used by every sibling dataset; the previous ``self.transform``
        raised AttributeError.
        """
        assert idx == 0, "RedditDataset only contains one graph"
        return self._g if self._transform is None else self._transform(self._g)

    def __len__(self):
        """The dataset holds exactly one graph."""
        return 1

    @property
    def num_classes(self):
        """int: number of subreddit communities."""
        return self._num_classes
class RoadNetCADataset(EasyGraphBuiltinDataset):
    r"""Road network of California (RoadNet-CA).

    Nodes are road intersections and edges are the roads connecting them.
    The graph is undirected and carries no features or labels.

    Parameters
    ----------
    raw_dir : str, optional
        Directory to store the raw downloaded files. Default: None
    force_reload : bool, optional
        Whether to re-download and process the dataset. Default: False
    verbose : bool, optional
        Whether to print detailed processing logs. Default: True
    transform : callable, optional
        Optional transform to apply on the graph.

    Examples
    --------
    >>> from easygraph.datasets import RoadNetCADataset
    >>> dataset = RoadNetCADataset()
    >>> g = dataset[0]
    >>> print("Nodes:", g.number_of_nodes())
    >>> print("Edges:", g.number_of_edges())
    """

    def __init__(self, raw_dir=None, force_reload=False, verbose=True, transform=None):
        super(RoadNetCADataset, self).__init__(
            name="roadNet-CA",
            url="https://snap.stanford.edu/data/roadNet-CA.txt.gz",
            raw_dir=raw_dir,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )

    def download(self):
        r"""Download the ``.txt.gz`` edge list and gunzip it into raw_path."""
        gz_file = os.path.join(self.raw_dir, f"{self.name}.txt.gz")
        txt_file = os.path.join(self.raw_path, f"{self.name}.txt")

        download(self.url, path=gz_file)

        if not os.path.exists(self.raw_path):
            os.makedirs(self.raw_path)

        with gzip.open(gz_file, "rb") as src, open(txt_file, "wb") as dst:
            shutil.copyfileobj(src, dst)

    def process(self):
        """Parse the edge list into an undirected EasyGraph graph."""
        edge_list_path = os.path.join(self.raw_path, f"{self.name}.txt")
        g = eg.Graph()  # roads are bidirectional -> undirected graph

        with open(edge_list_path, "r") as fh:
            for raw in fh:
                content = raw.strip()
                # '#'-prefixed header lines and blank lines carry no edge.
                if raw.startswith("#") or content == "":
                    continue
                endpoints = content.split()
                g.add_edge(int(endpoints[0]), int(endpoints[1]))

        self._g = g
        self._num_nodes = g.number_of_nodes()
        self._num_edges = g.number_of_edges()

        if self.verbose:
            print("Finished loading RoadNet-CA dataset.")
            print(f" NumNodes: {self._num_nodes}")
            print(f" NumEdges: {self._num_edges}")

    def __getitem__(self, idx):
        """Return the single road network (optionally transformed)."""
        assert idx == 0, "RoadNetCADataset only contains one graph"
        return self._g if self._transform is None else self._transform(self._g)

    def __len__(self):
        """Exactly one graph in this dataset."""
        return 1
gzip +import os + +import easygraph as eg + +from easygraph.datasets.graph_dataset_base import EasyGraphBuiltinDataset +from easygraph.datasets.utils import download +from easygraph.datasets.utils import extract_archive + + +class TwitterEgoDataset(EasyGraphBuiltinDataset): + r""" + Twitter Ego Network Dataset + + The Twitter dataset was collected from public sources and contains a large ego-network of Twitter users. + The combined network includes 81K edges among 81K users. + + Source: J. McAuley and J. Leskovec, Stanford SNAP, 2012 + URL: https://snap.stanford.edu/data/egonets-Twitter.html + File used: https://snap.stanford.edu/data/twitter_combined.txt.gz + """ + + def __init__(self): + super(TwitterEgoDataset, self).__init__( + name="twitter_ego", + url="https://snap.stanford.edu/data/twitter_combined.txt.gz", + force_reload=False, + ) + + def download(self): + gz_path = os.path.join(self.raw_path, "twitter_combined.txt.gz") + download(self.url, path=gz_path) + extract_archive(gz_path, self.raw_path) + + def process(self): + import gzip + + import easygraph as eg + + gz_path = os.path.join(self.raw_path, "twitter_combined.txt.gz") + txt_path = os.path.join(self.raw_path, "twitter_combined.txt") + + if not os.path.exists(txt_path): + with gzip.open(gz_path, "rt") as f_in, open(txt_path, "w") as f_out: + f_out.writelines(f_in) + + G = eg.Graph() + edge_count = 0 + with open(txt_path, "r") as f: + for line in f: + u, v = map(int, line.strip().split()) + G.add_edge(u, v) + edge_count += 1 + + self._graphs = [G] + self._graph = G + self._processed = True + + def __getitem__(self, idx): + if self._graph is not None: + return self._graph + elif self._graphs: + return self._graphs[idx] + else: + return None diff --git a/easygraph/datasets/web_google.py b/easygraph/datasets/web_google.py new file mode 100644 index 00000000..97597299 --- /dev/null +++ b/easygraph/datasets/web_google.py @@ -0,0 +1,118 @@ +"""Web-Google Dataset + +This dataset is a web graph based on 
Google's web pages and their hyperlink +structure, as crawled by the Stanford WebBase project in 2002. + +Each node represents a web page, and a directed edge from u to v indicates +a hyperlink from page u to page v. + +Statistics: +- Nodes: 875713 +- Edges: 5105039 +- Features: None +- Labels: None + +Reference: +J. Leskovec, A. Rajaraman, J. Ullman, “Mining of Massive Datasets.” +Dataset from SNAP: https://snap.stanford.edu/data/web-Google.html +""" + +import gzip +import os +import shutil + +import easygraph as eg + +from easygraph.classes.graph import Graph + +from .graph_dataset_base import EasyGraphBuiltinDataset +from .utils import download +from .utils import extract_archive + + +class WebGoogleDataset(EasyGraphBuiltinDataset): + r"""Web-Google hyperlink network dataset. + + Parameters + ---------- + raw_dir : str, optional + Directory to store the raw downloaded files. Default: None + force_reload : bool, optional + Whether to re-download and process the dataset. Default: False + verbose : bool, optional + Whether to print detailed processing logs. Default: True + transform : callable, optional + Optional transform to apply on the graph. 
+ + Examples + -------- + >>> from easygraph.datasets import WebGoogleDataset + >>> dataset = WebGoogleDataset() + >>> g = dataset[0] + >>> print("Nodes:", g.number_of_nodes()) + >>> print("Edges:", g.number_of_edges()) + """ + + def __init__(self, raw_dir=None, force_reload=False, verbose=True, transform=None): + name = "web-Google" + url = "https://snap.stanford.edu/data/web-Google.txt.gz" + super(WebGoogleDataset, self).__init__( + name=name, + url=url, + raw_dir=raw_dir, + force_reload=force_reload, + verbose=verbose, + transform=transform, + ) + + def download(self): + r"""Download and extract .gz edge list.""" + if self.url is not None: + file_path = os.path.join(self.raw_dir, self.name + ".txt.gz") + download(self.url, path=file_path) + extract_archive(file_path, self.raw_path) + + def process(self): + graph = eg.DiGraph() # Web-Google is directed + edge_list_path = os.path.join(self.raw_path, self.name + ".txt") + + with open(edge_list_path, "r") as f: + for line in f: + if line.startswith("#") or line.strip() == "": + continue + u, v = map(int, line.strip().split()) + graph.add_edge(u, v) + + self._g = graph + self._num_nodes = graph.number_of_nodes() + self._num_edges = graph.number_of_edges() + + if self.verbose: + print("Finished loading Web-Google dataset.") + print(f" NumNodes: {self._num_nodes}") + print(f" NumEdges: {self._num_edges}") + + def __getitem__(self, idx): + assert idx == 0, "WebGoogleDataset only contains one graph" + return self._g if self._transform is None else self._transform(self._g) + + def __len__(self): + return 1 + + def download(self): + r"""Download and decompress the .txt.gz file.""" + if self.url is not None: + compressed_path = os.path.join(self.raw_dir, self.name + ".txt.gz") + extracted_path = os.path.join(self.raw_path, self.name + ".txt") + + # Download .gz file + download(self.url, path=compressed_path) + + # Ensure output directory exists + if not os.path.exists(self.raw_path): + os.makedirs(self.raw_path) + + # 
Decompress manually + with gzip.open(compressed_path, "rb") as f_in: + with open(extracted_path, "wb") as f_out: + shutil.copyfileobj(f_in, f_out) diff --git a/easygraph/datasets/wiki_topcats.py b/easygraph/datasets/wiki_topcats.py new file mode 100644 index 00000000..9c337d5f --- /dev/null +++ b/easygraph/datasets/wiki_topcats.py @@ -0,0 +1,105 @@ +"""Wikipedia Top Categories Dataset (wiki-topcats) + +This dataset is a directed graph of Wikipedia articles restricted to +top-level categories (at least 100 articles), capturing the largest +strongly connected component. + +Statistics: +- Nodes: 1,791,489 +- Edges: 28,511,807 +- Categories: 17,364 +- Overlapping labels per node + +Source: +H. Yin, A. Benson, J. Leskovec, D. Gleich. +"Local Higher-order Graph Clustering", KDD 2017 +Data: https://snap.stanford.edu/data/wiki-topcats.html +""" + +import gzip +import os + +import easygraph as eg + +from easygraph.datasets.graph_dataset_base import EasyGraphBuiltinDataset +from easygraph.datasets.utils import download +from easygraph.datasets.utils import extract_archive + + +class WikiTopCatsDataset(EasyGraphBuiltinDataset): + """Wikipedia Top Categories Snapshot from 2011 (SNAP)""" + + def __init__(self, raw_dir=None, force_reload=False, verbose=True, transform=None): + super(WikiTopCatsDataset, self).__init__( + name="wiki_topcats", + url="https://snap.stanford.edu/data/wiki-topcats.txt.gz", + raw_dir=raw_dir, + force_reload=force_reload, + verbose=verbose, + transform=transform, + ) + + def download(self): + # Download the main graph file + gz_path = os.path.join(self.raw_dir, "wiki-topcats.txt.gz") + download(self.url, path=gz_path) + + # Also download category info and page names + cat_url = "https://snap.stanford.edu/data/wiki-topcats-categories.txt.gz" + names_url = "https://snap.stanford.edu/data/wiki-topcats-page-names.txt.gz" + download( + cat_url, path=os.path.join(self.raw_dir, "wiki-topcats-categories.txt.gz") + ) + download( + names_url, 
path=os.path.join(self.raw_dir, "wiki-topcats-page-names.txt.gz") + ) + + def process(self): + raw = self.raw_dir + + # Decompress and read edges + edge_gz = os.path.join(raw, "wiki-topcats.txt.gz") + edge_txt = os.path.join(raw, "wiki-topcats.txt") + if not os.path.exists(edge_txt): + with gzip.open(edge_gz, "rt") as fin, open(edge_txt, "w") as fout: + fout.writelines(fin) + G = eg.DiGraph() + edge_count = 0 + with open(edge_txt, "r") as f: + for line in f: + u, v = map(int, line.strip().split()) + G.add_edge(u, v) + edge_count += 1 + if self.verbose: + print(f"Loaded graph: {G.number_of_nodes()} nodes, {edge_count} edges") + + # Compress node names + names_gz = os.path.join(raw, "wiki-topcats-page-names.txt.gz") + names = {} + with gzip.open(names_gz, "rt") as f: + for idx, line in enumerate(f): + names[idx] = line.strip() + + # Load categories + cats_gz = os.path.join(raw, "wiki-topcats-categories.txt.gz") + labels = {} # mapping: node -> list of category strings + with gzip.open(cats_gz, "rt") as f: + for idx, line in enumerate(f): + categories = line.strip().split(";") + categories = [cat.strip() for cat in categories if cat.strip()] + labels[idx] = categories + + # Attach node features: empty, and node labels + for n in G.nodes: + G.add_node(n, name=names.get(n, ""), label=labels.get(n, [])) + + self._graph = G + self._graphs = [G] + self._processed = True + + def __getitem__(self, idx): + assert idx == 0 + return self._graph + + def __len__(self): + return 1 diff --git a/easygraph/model/hypergraphs/hwnn.py b/easygraph/model/hypergraphs/hwnn.py index 37684c39..980bd39e 100644 --- a/easygraph/model/hypergraphs/hwnn.py +++ b/easygraph/model/hypergraphs/hwnn.py @@ -39,7 +39,7 @@ def __init__( def forward(self, X: torch.Tensor, hgs: list) -> torch.Tensor: r"""The forward function. - + Parameters: ``X`` (``torch.Tensor``): Input vertex feature matrix. Size :math:`(N, C_{in})`. ``hg`` (``eg.Hypergraph``): The hypergraph structure that contains :math:`N` vertices. 
diff --git a/easygraph/nn/convs/hypergraphs/hwnn_conv.py b/easygraph/nn/convs/hypergraphs/hwnn_conv.py index ea7ea563..7c1fa7e8 100644 --- a/easygraph/nn/convs/hypergraphs/hwnn_conv.py +++ b/easygraph/nn/convs/hypergraphs/hwnn_conv.py @@ -44,7 +44,7 @@ def init_parameters(self): def forward(self, X: torch.Tensor, hg: Hypergraph) -> torch.Tensor: r"""The forward function. - + Parameters: X (``torch.Tensor``): Input vertex feature matrix. Size :math:`(N, C_{in})`. hg (``eg.Hypergraph``): The hypergraph structure that contains :math:`N` vertices.