# Utilities for loading and preprocessing graph datasets.
import pickle as pkl
import sys
import networkx as nx
import numpy as np
import scipy.sparse as sp
import torch
import torch_geometric
import torch_sparse
# [docs]  (stray documentation-link marker left by extraction; not code)
def parse_index_file(filename: str) -> list:
    """
    Read an index file containing one integer per line.

    Parameters
    ----------
    filename : str
        Name or path of the file to parse.

    Returns
    -------
    index : list
        The integers found in the file, in file order. Surrounding
        whitespace on a line is ignored and blank lines are skipped.

    Raises
    ------
    ValueError
        If a non-blank line cannot be converted with ``int``.
    """
    # The context manager guarantees the handle is closed even if a line
    # fails to parse.
    with open(filename) as index_file:
        return [int(line.strip()) for line in index_file if line.strip()]
# [docs]  (stray documentation-link marker left by extraction; not code)
def normalize(mx: sp.csc_matrix) -> sp.csr_matrix:
    """
    Row-normalize a sparse matrix so that every non-empty row sums to 1.

    Parameters
    ----------
    mx : sparse matrix
        Input sparse matrix to row-normalize.

    Returns
    -------
    mx : sparse matrix
        Row-normalized sparse matrix, computed as ``diag(1 / rowsum) @ mx``.
        Rows whose sum is zero are left as all-zero rows.

    Note
    ----
    Row-normalizing is usually done in graph algorithms to enable equal node contributions
    regardless of the node's degree and to stabilize, ease numerical computations.
    """
    rowsum = np.asarray(mx.sum(1)).flatten()
    # NumPy refuses to raise integer arrays to a negative power, so integer
    # inputs (e.g. 0/1 adjacency matrices) must be promoted first. Float and
    # complex inputs keep their dtype.
    if not np.issubdtype(rowsum.dtype, np.inexact):
        rowsum = rowsum.astype(np.float64)
    # Empty rows produce 1/0 = inf; silence the warning and zero them out
    # explicitly on the next line.
    with np.errstate(divide="ignore"):
        r_inv = np.power(rowsum, -1)
    r_inv[np.isinf(r_inv)] = 0.0
    r_mat_inv = sp.diags(r_inv)
    return r_mat_inv.dot(mx)
# [docs]  (stray documentation-link marker left by extraction; not code)
def load_data(dataset_str: str) -> tuple:
    """
    Load a graph dataset and convert it into tensors for training GCN and similar models.

    Parameters
    ----------
    dataset_str : str
        Name of the dataset to be loaded. Supported values are
        ``"cora"``, ``"citeseer"``, ``"pubmed"``, ``"ogbn-arxiv"``,
        ``"ogbn-products"``, ``"ogbn-mag"``, ``"ogbn-papers100M"`` and
        ``"reddit"``.

    Returns
    -------
    features : torch.Tensor
        Node feature matrix as a float tensor.
    adj : torch.Tensor or torch_sparse.tensor.SparseTensor
        Adjacency matrix of the graph. It is not normalized here.
    labels : torch.Tensor
        Labels of the nodes.
    idx_train : torch.LongTensor
        Indices of training nodes.
    idx_val : torch.LongTensor
        Indices of validation nodes.
    idx_test : torch.LongTensor
        Indices of test nodes.

    Raises
    ------
    ValueError
        If ``dataset_str`` is not one of the supported dataset names.

    Note
    ----
    For cora / citeseer / pubmed the raw files are read from
    ``data/<dataset_str>/raw``:

    ind.dataset_str.x => the feature vectors of the training instances as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.tx => the feature vectors of the test instances as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.allx => the feature vectors of both labeled and unlabeled training instances (a superset of ind.dataset_str.x) as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.y => the one-hot labels of the labeled training instances as numpy.ndarray object;
    ind.dataset_str.ty => the one-hot labels of the test instances as numpy.ndarray object;
    ind.dataset_str.ally => the labels for instances in ind.dataset_str.allx as numpy.ndarray object;
    ind.dataset_str.graph => a dict in the format {index: [index_of_neighbor_nodes]} as collections.defaultdict object;
    ind.dataset_str.test.index => the indices of test instances in graph, for the inductive setting as list object.
    All objects above must be saved using python pickle module.
    """
    if dataset_str in ("cora", "citeseer", "pubmed"):
        loaded = _load_planetoid(dataset_str)
    elif dataset_str in (
        "ogbn-arxiv",
        "ogbn-products",
        "ogbn-mag",
        "ogbn-papers100M",
    ):
        loaded = _load_ogbn(dataset_str)
    elif dataset_str == "reddit":
        loaded = _load_reddit()
    else:
        # Without this branch an unsupported name fell through to the return
        # statement and failed with an opaque UnboundLocalError.
        raise ValueError(
            "Unknown dataset {!r}; expected one of: cora, citeseer, pubmed, "
            "ogbn-arxiv, ogbn-products, ogbn-mag, ogbn-papers100M, "
            "reddit".format(dataset_str)
        )
    features, adj, labels, idx_train, idx_val, idx_test = loaded
    return features.float(), adj, labels, idx_train, idx_val, idx_test


def _load_planetoid(dataset_str: str) -> tuple:
    """Load cora / citeseer / pubmed from the raw Planetoid pickle files."""
    # Instantiated only for its side effect: torch_geometric downloads the
    # dataset under ./data so the raw files below exist.
    torch_geometric.datasets.Planetoid("./data", dataset_str)

    raw_prefix = "data/{}/raw/ind.{}".format(dataset_str, dataset_str)
    objects = []
    for name in ("x", "y", "tx", "ty", "allx", "ally", "graph"):
        with open("{}.{}".format(raw_prefix, name), "rb") as f:
            # NOTE(security): pickle executes arbitrary code on load; only
            # use this with dataset files from a trusted source.
            # encoding="latin1" is kept from the original Python 3 code path
            # (presumably the pickles were written under Python 2 -- confirm).
            objects.append(pkl.load(f, encoding="latin1"))
    x, y, tx, ty, allx, ally, graph = objects

    test_idx_reorder = parse_index_file("{}.test.index".format(raw_prefix))
    test_idx_range = np.sort(test_idx_reorder)

    if dataset_str == "citeseer":
        # Fix citeseer dataset (there are some isolated nodes in the graph)
        # Find isolated nodes, add them as zero-vecs into the right position
        test_idx_range_full = range(
            min(test_idx_reorder), max(test_idx_reorder) + 1
        )
        tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
        tx_extended[test_idx_range - min(test_idx_range), :] = tx
        tx = tx_extended
        ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
        ty_extended[test_idx_range - min(test_idx_range), :] = ty
        ty = ty_extended

    # Stack train + test rows, then move each test row from its sorted
    # position to the position given by the test index file.
    features = sp.vstack((allx, tx)).tolil()
    features[test_idx_reorder, :] = features[test_idx_range, :]
    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))
    labels = np.vstack((ally, ty))
    labels[test_idx_reorder, :] = labels[test_idx_range, :]

    idx_test = torch.LongTensor(test_idx_range.tolist())
    idx_train = torch.LongTensor(range(len(y)))
    # The 500 nodes immediately after the labeled training nodes are used
    # for validation.
    idx_val = torch.LongTensor(range(len(y), len(y) + 500))

    # Neither features nor adj are normalized here; adj is expected to be
    # normalized in the training process.
    features = torch.tensor(features.toarray()).float()
    adj = torch.tensor(adj.toarray()).float()
    adj = torch_sparse.tensor.SparseTensor.from_dense(adj)
    # Labels are one-hot rows; collapse them to class indices.
    labels = torch.argmax(torch.tensor(labels), dim=1)
    return features, adj, labels, idx_train, idx_val, idx_test


def _load_ogbn(dataset_str: str) -> tuple:
    """Load an OGB node-property-prediction dataset via ``ogb``."""
    # Imported lazily so ogb is only required when an ogbn dataset is used.
    from ogb.nodeproppred import PygNodePropPredDataset

    # Download and process data at './dataset/.'
    # NOTE(review): 'ogbn-mag' is heterogeneous; the handling below is the
    # same as for the other datasets -- confirm it is valid for that case.
    dataset = PygNodePropPredDataset(
        name=dataset_str, transform=torch_geometric.transforms.ToSparseTensor()
    )
    split_idx = dataset.get_idx_split()
    idx_train = torch.LongTensor(split_idx["train"])
    idx_val = torch.LongTensor(split_idx["valid"])
    idx_test = torch.LongTensor(split_idx["test"])

    data = dataset[0]
    features = data.x
    labels = data.y.reshape(-1)
    # Only ogbn-arxiv has its adjacency symmetrized; the other datasets use
    # adj_t as provided.
    if dataset_str == "ogbn-arxiv":
        adj = data.adj_t.to_symmetric()
    else:
        adj = data.adj_t
    return features, adj, labels, idx_train, idx_val, idx_test


def _load_reddit() -> tuple:
    """Load the Reddit dataset via ``dgl``."""
    # Imported lazily so dgl is only required when the reddit dataset is used.
    from dgl.data import RedditDataset

    g = RedditDataset()[0]
    adj = torch_sparse.tensor.SparseTensor.from_edge_index(g.edges())
    features = g.ndata["feat"]
    labels = g.ndata["label"]

    def mask_to_index(mask):
        # Explicit comparison kept from the original code so the result does
        # not depend on the mask's dtype.
        return (mask == True).nonzero().view(-1)  # noqa: E712

    idx_train = mask_to_index(g.ndata["train_mask"])
    idx_val = mask_to_index(g.ndata["val_mask"])
    idx_test = mask_to_index(g.ndata["test_mask"])
    return features, adj, labels, idx_train, idx_val, idx_test