Source code for fedgraph.data_process

# setting of data generation

import pickle as pkl
import sys

import networkx as nx
import numpy as np
import scipy.sparse as sp
import torch
import torch_geometric
import torch_sparse


[docs] def parse_index_file(filename: str) -> list: """ Reads and parses an index file Parameters ---------- filename : str Name or path of the file to parse. Returns ------- index : list List of integers, each integer in the list represents int of the lines of the input file. """ index = [] for line in open(filename): index.append(int(line.strip())) return index
[docs] def normalize(mx: sp.csc_matrix) -> sp.csr_matrix: """ This function is to row-normalize sparse matrix for efficient computation of the graph Parameters ---------- mx : sparse matrix Input sparse matrix to row-normalize. Returns ------- mx : sparse matrix Row-normalized sparse matrix. Note ---- Row-normalizing is usually done in graph algorithms to enable equal node contributions regardless of the node's degree and to stabilize, ease numerical computations. """ rowsum = np.array(mx.sum(1)) r_inv = np.power(rowsum, -1).flatten() r_inv[np.isinf(r_inv)] = 0.0 r_mat_inv = sp.diags(r_inv) mx = r_mat_inv.dot(mx) return mx
[docs] def load_data(dataset_str: str) -> tuple: """ Loads input data from 'gcn/data' directory and processes these datasets into a format suitable for training GCN and similar models. Parameters ---------- dataset_str : Name of the dataset to be loaded. Returns ------- features : torch.Tensor Node feature matrix as a float tensor. adj : torch.Tensor or torch_sparse.tensor.SparseTensor Adjacency matrix of the graph. labels : torch.Tensor Labels of the nodes. idx_train : torch.LongTensor Indices of training nodes. idx_val : torch.LongTensor Indices of validation nodes. idx_test : torch.LongTensor Indices of test nodes. Note ---- ind.dataset_str.x => the feature vectors of the training instances as scipy.sparse.csr.csr_matrix object; ind.dataset_str.tx => the feature vectors of the test instances as scipy.sparse.csr.csr_matrix object; ind.dataset_str.allx => the feature vectors of both labeled and unlabeled training instances (a superset of ind.dataset_str.x) as scipy.sparse.csr.csr_matrix object; ind.dataset_str.y => the one-hot labels of the labeled training instances as numpy.ndarray object; ind.dataset_str.ty => the one-hot labels of the test instances as numpy.ndarray object; ind.dataset_str.ally => the labels for instances in ind.dataset_str.allx as numpy.ndarray object; ind.dataset_str.graph => a dict in the format {index: [index_of_neighbor_nodes]} as collections.defaultdict object; ind.dataset_str.test.index => the indices of test instances in graph, for the inductive setting as list object. All objects above must be saved using python pickle module. """ if dataset_str in ["cora", "citeseer", "pubmed"]: # download dataset from torch_geometric dataset = torch_geometric.datasets.Planetoid("./data", dataset_str) names = ["x", "y", "tx", "ty", "allx", "ally", "graph"] objects = [] for i in range(len(names)): with open( "data/{}/raw/ind.{}.{}".format(dataset_str, dataset_str, names[i]), "rb" ) as f: if sys.version_info > (3, 0): objects.append(pkl.load(f, encoding="latin1")) else: objects.append(pkl.load(f)) x, y, tx, ty, allx, ally, graph = tuple(objects) test_idx_reorder = parse_index_file( "data/{}/raw/ind.{}.test.index".format(dataset_str, dataset_str) ) test_idx_range = np.sort(test_idx_reorder) if dataset_str == "citeseer": # Fix citeseer dataset (there are some isolated nodes in the graph) # Find isolated nodes, add them as zero-vecs into the right position test_idx_range_full = range( min(test_idx_reorder), max(test_idx_reorder) + 1 ) tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1])) tx_extended[test_idx_range - min(test_idx_range), :] = tx tx = tx_extended ty_extended = np.zeros((len(test_idx_range_full), y.shape[1])) ty_extended[test_idx_range - min(test_idx_range), :] = ty ty = ty_extended features = sp.vstack((allx, tx)).tolil() features[test_idx_reorder, :] = features[test_idx_range, :] adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph)) labels = np.vstack((ally, ty)) labels[test_idx_reorder, :] = labels[test_idx_range, :] idx_test = torch.LongTensor(test_idx_range.tolist()) idx_train = torch.LongTensor(range(len(y))) idx_val = torch.LongTensor(range(len(y), len(y) + 500)) # features = normalize(features) # adj = normalize(adj) # no normalize adj here, normalize it in the training process features = torch.tensor(features.toarray()).float() adj = torch.tensor(adj.toarray()).float() adj = torch_sparse.tensor.SparseTensor.from_dense(adj) labels = torch.tensor(labels) labels = torch.argmax(labels, dim=1) elif dataset_str in [ "ogbn-arxiv", "ogbn-products", "ogbn-mag", "ogbn-papers100M", ]: #'ogbn-mag' is heteregeneous from ogb.nodeproppred import PygNodePropPredDataset # Download and process data at './dataset/.' dataset = PygNodePropPredDataset( name=dataset_str, transform=torch_geometric.transforms.ToSparseTensor() ) split_idx = dataset.get_idx_split() idx_train, idx_val, idx_test = ( split_idx["train"], split_idx["valid"], split_idx["test"], ) idx_train = torch.LongTensor(idx_train) idx_val = torch.LongTensor(idx_val) idx_test = torch.LongTensor(idx_test) data = dataset[0] features = data.x labels = data.y.reshape(-1) if dataset_str == "ogbn-arxiv": adj = data.adj_t.to_symmetric() else: adj = data.adj_t elif dataset_str == "reddit": from dgl.data import RedditDataset data = RedditDataset() g = data[0] adj = torch_sparse.tensor.SparseTensor.from_edge_index(g.edges()) features = g.ndata["feat"] train_mask = g.ndata["train_mask"] val_mask = g.ndata["val_mask"] test_mask = g.ndata["test_mask"] idx_train = (train_mask == True).nonzero().view(-1) idx_val = (val_mask == True).nonzero().view(-1) idx_test = (test_mask == True).nonzero().view(-1) labels = g.ndata["label"] return features.float(), adj, labels, idx_train, idx_val, idx_test