# Source code for pygip.models.attack.CEGA

import copy
import json
import math
import os
import pickle as pkl
import random
import sys
import time

import dgl
import dgl.function as fn
import networkx as nx
import numpy as np
import numpy.linalg as la
import pandas as pd
import scipy.sparse as sp
import torch
import torch as th
import torch.nn as nn
import torch.nn.functional as F
from dgl.data import AmazonCoBuyComputerDataset, AmazonCoBuyPhotoDataset, CoauthorCSDataset, CoauthorPhysicsDataset, \
    RedditDataset, WikiCSDataset, AmazonRatingsDataset, QuestionsDataset, RomanEmpireDataset, FlickrDataset, \
    CoraFullDataset
from dgl.data import citation_graph as citegrh
from dgl.nn.pytorch import GraphConv
from sklearn.cluster import KMeans
from sklearn.metrics import f1_score
from torch_geometric.datasets import CitationFull
from tqdm import tqdm

# Global time budget for the attack loop (presumably seconds — TODO confirm
# against the callers); not referenced anywhere in this chunk.
time_limit = 300


def get_receptive_fields_dense(cur_neighbors, selected_node, weighted_score, adj_matrix2):
    """Weighted size of the receptive field after adding ``selected_node``.

    The union of the current neighborhood indicator ``cur_neighbors`` and the
    selected node's row of ``adj_matrix2`` is reduced to a 0/1 vector, then
    scored with ``weighted_score``.
    """
    union = (cur_neighbors + adj_matrix2[selected_node]) != 0
    receptive_vector = union + 0  # bool -> 0/1 ints
    return weighted_score.dot(receptive_vector)
def get_current_neighbors_dense(cur_nodes, adj_matrix2):
    """0/1 indicator of the union of neighborhoods of ``cur_nodes``.

    Returns the scalar 0 when ``cur_nodes`` is empty (original convention).
    """
    if np.array(cur_nodes).shape[0] == 0:
        return 0
    column_sums = adj_matrix2[list(cur_nodes)].sum(axis=0)
    return (column_sums != 0) + 0
def get_current_neighbors_1(cur_nodes, adj_matrix):
    """One-hop neighborhood indicator of ``cur_nodes`` over ``adj_matrix``.

    Same contract as ``get_current_neighbors_dense`` but for the first-order
    adjacency matrix; returns the scalar 0 for an empty node list.
    """
    if np.array(cur_nodes).shape[0] == 0:
        return 0
    column_sums = adj_matrix[list(cur_nodes)].sum(axis=0)
    return (column_sums != 0) + 0
def get_entropy_contribute(npy_m1, npy_m2):
    """Return H(m1) - H(m2), entropies accumulated row by row in bits.

    NOTE(review): rows containing exact zeros yield NaN (0 * log2(0));
    callers appear to guard against this upstream — verify.
    """
    entropy_before = 0
    entropy_after = 0
    for row in range(npy_m1.shape[0]):
        entropy_before += -np.sum(npy_m1[row] * np.log2(npy_m1[row]))
        entropy_after += -np.sum(npy_m2[row] * np.log2(npy_m2[row]))
    return entropy_before - entropy_after
def get_max_info_entropy_node_set(idx_used, high_score_nodes, labels, batch_size, adj_matrix2, num_class, model_prediction):
    """Greedily select ``batch_size`` nodes that maximize expected entropy reduction.

    For each candidate, hypothetically commits the node to each class weighted
    by ``model_prediction`` and scores the induced change via
    ``get_entropy_contribute``; the best node per round is added and its soft
    prediction written into the working label matrix.
    ``idx_used`` is accepted but not read in this body.
    """
    max_info_node_set = []
    # Work on copies so the caller's pools/labels are untouched.
    high_score_nodes_ = copy.deepcopy(high_score_nodes)
    labels_ = copy.deepcopy(labels)
    for k in range(batch_size):
        score_list = np.zeros(len(high_score_nodes_))
        for i in range(len(high_score_nodes_)):
            labels_tmp = copy.deepcopy(labels_)
            node = high_score_nodes_[i]
            node_neighbors = get_current_neighbors_dense([node], adj_matrix2)
            # NOTE(review): node_neighbors is a 0/1 indicator vector, so this
            # indexes adj_matrix2 by the values 0/1, not by neighbor indices —
            # looks like a bug; confirm intended semantics before changing.
            adj_neigh = adj_matrix2[list(node_neighbors)]
            aay = np.matmul(adj_neigh, labels_)
            total_score = 0
            for j in range(num_class):
                if model_prediction[node][j] != 0:
                    # Hypothetically assign class j to this node (one-hot).
                    labels_tmp[node] = 0
                    labels_tmp[node][j] = 1
                    aay_ = np.matmul(adj_neigh, labels_tmp)
                    total_score += model_prediction[node][j] * get_entropy_contribute(aay, aay_)
            score_list[i] = total_score
        idx = np.argmax(score_list)
        max_node = high_score_nodes_[idx]
        max_info_node_set.append(max_node)
        # Commit the model's soft prediction for the chosen node.
        labels_[max_node] = model_prediction[max_node]
        high_score_nodes_.remove(max_node)
    return max_info_node_set
def get_max_nnd_node_dense(idx_used, high_score_nodes, min_distance, distance_aax,
                           num_ones, num_node, adj_matrix2, gamma=1):
    """Pick the candidate maximizing receptive-field size plus a distance bonus.

    Distances are clipped at ``min_distance`` and inverted against 1, so
    closer-than-threshold nodes contribute more; ``gamma`` balances the terms.
    """
    dmax = np.ones(num_node)
    best_node = 0
    best_score = 0
    cur_neighbors = get_current_neighbors_dense(idx_used, adj_matrix2)
    for candidate in high_score_nodes:
        receptive_field = get_receptive_fields_dense(cur_neighbors, candidate, num_ones, adj_matrix2)
        clipped = np.where(distance_aax[candidate, :] < min_distance,
                           distance_aax[candidate, :], min_distance)
        distance_score = (dmax - clipped).dot(num_ones)
        score = receptive_field / num_node + gamma * distance_score / num_node
        if score > best_score:
            best_score = score
            best_node = candidate
    return best_node
def aug_normalized_adjacency(adj):
    """Symmetric normalization D^-1/2 (A+I) D^-1/2 with self-loops added.

    NOTE(review): a second ``aug_normalized_adjacency`` defined later in this
    module (without self-loops) silently overrides this one at import time.
    """
    with_loops = sp.coo_matrix(adj + sp.eye(adj.shape[0]))
    degrees = np.array(with_loops.sum(1))
    inv_sqrt = np.power(degrees, -0.5).flatten()
    inv_sqrt[np.isinf(inv_sqrt)] = 0.
    d_half = sp.diags(inv_sqrt)
    return d_half.dot(with_loops).dot(d_half).tocoo()
def compute_distance(_i, _j, features_aax):
    """Euclidean distance between rows ``_i`` and ``_j`` of ``features_aax``."""
    difference = features_aax[_i, :] - features_aax[_j, :]
    return la.norm(difference)
def parse_index_file(filename):
    """Read one stripped integer per line from ``filename`` into a list.

    Fix: the original iterated ``open(filename)`` directly and never closed
    the handle; a context manager guarantees closure.
    """
    with open(filename) as handle:
        return [int(line.strip()) for line in handle]
def normalize(mx):
    """Row-normalize sparse matrix"""
    row_totals = np.array(mx.sum(1))
    inverse = np.power(row_totals, -1).flatten()
    inverse[np.isinf(inverse)] = 0.  # all-zero rows stay zero
    return sp.diags(inverse).dot(mx)
def accuracy(output, labels):
    """Fraction of rows of ``output`` whose argmax equals ``labels``."""
    predictions = output.max(1)[1].type_as(labels)
    num_correct = predictions.eq(labels).double().sum()
    return num_correct / len(labels)
def load_data_from_grain(path="./data", dataset="cora"):
    """
    ind.[:dataset].x => the feature vectors of the training instances (scipy.sparse.csr.csr_matrix)
    ind.[:dataset].y => the one-hot labels of the labeled training instances (numpy.ndarray)
    ind.[:dataset].allx => the feature vectors of both labeled and unlabeled training instances (csr_matrix)
    ind.[:dataset].ally => the labels for instances in ind.dataset_str.allx (numpy.ndarray)
    ind.[:dataset].graph => the dict in the format {index: [index of neighbor nodes]} (collections.defaultdict)
    ind.[:dataset].tx => the feature vectors of the test instances (scipy.sparse.csr.csr_matrix)
    ind.[:dataset].ty => the one-hot labels of the test instances (numpy.ndarray)
    ind.[:dataset].test.index => indices of test instances in graph, for the inductive setting
    """
    print("\n[STEP 1]: Upload {} dataset.".format(dataset))
    names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
    objects = []
    # Unpickle each Planetoid split file; latin1 handles py2-era pickles.
    for i in range(len(names)):
        with open("{}/ind.{}.{}".format(path, dataset, names[i]), 'rb') as f:
            if sys.version_info > (3, 0):
                objects.append(pkl.load(f, encoding='latin1'))
            else:
                objects.append(pkl.load(f))
    x, y, tx, ty, allx, ally, graph = tuple(objects)
    test_idx_reorder = parse_index_file("{}/ind.{}.test.index".format(path, dataset))
    test_idx_range = np.sort(test_idx_reorder)
    if dataset == 'citeseer':
        # Citeseer dataset contains some isolated nodes in the graph
        test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder) + 1)
        tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
        tx_extended[test_idx_range - min(test_idx_range), :] = tx
        tx = tx_extended
        ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
        ty_extended[test_idx_range - min(test_idx_range), :] = ty
        ty = ty_extended
    # Re-order test rows back to their positions in the graph ordering.
    features = sp.vstack((allx, tx)).tolil()
    features[test_idx_reorder, :] = features[test_idx_range, :]
    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))
    print("| # of nodes : {}".format(adj.shape[0]))
    # NOTE(review): this string was split by doc extraction; reconstructed as one line.
    print("| # of edges : {}".format(adj.sum().sum() / 2))
    features = normalize(features)
    print("| # of features : {}".format(features.shape[1]))
    print("| # of clases : {}".format(ally.shape[1]))
    features = torch.FloatTensor(np.array(features.todense()))
    sparse_mx = adj.tocoo().astype(np.float32)
    labels = np.vstack((ally, ty))
    labels[test_idx_reorder, :] = labels[test_idx_range, :]
    if dataset == 'citeseer':
        save_label = np.where(labels)[1]
    labels = torch.LongTensor(np.where(labels)[1])
    idx_train = range(len(y))
    idx_val = range(len(y), len(y) + 500)
    idx_test = test_idx_range.tolist()
    print("| # of train set : {}".format(len(idx_train)))
    print("| # of val set : {}".format(len(idx_val)))
    print("| # of test set : {}".format(len(idx_test)))
    idx_train, idx_val, idx_test = list(map(lambda x: torch.LongTensor(x), [idx_train, idx_val, idx_test]))

    def missing_elements(L):
        # Indices absent from the contiguous range spanned by L.
        start, end = L[0], L[-1]
        return sorted(set(range(start, end + 1)).difference(L))

    if dataset == 'citeseer':
        # Fill labels for citeseer's isolated test nodes with class 0.
        L = np.sort(idx_test)
        missing = missing_elements(L)
        for element in missing:
            save_label = np.insert(save_label, element, 0)
        labels = torch.LongTensor(save_label)
    return adj, features, labels, idx_train, idx_val, idx_test
def set_seed(seed):
    """Seed Python, NumPy and all torch RNGs (CPU and CUDA) for reproducibility."""
    for seeder in (random.seed, np.random.seed, torch.manual_seed,
                   torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)
def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    """Convert a scipy sparse matrix to a torch sparse tensor.

    Fix: use the supported ``torch.sparse_coo_tensor`` constructor instead of
    the long-deprecated ``torch.sparse.FloatTensor``.
    """
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    indices = torch.from_numpy(
        np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
    values = torch.from_numpy(sparse_mx.data)
    shape = torch.Size(sparse_mx.shape)
    return torch.sparse_coo_tensor(indices, values, shape)
def aug_normalized_adjacency(adj):
    """Symmetric normalization D^-1/2 A D^-1/2 WITHOUT adding self-loops.

    NOTE(review): this redefinition silently shadows the earlier
    ``aug_normalized_adjacency`` in this module, which *does* add self-loops
    (the ``+ sp.eye`` here was commented out); all callers get this variant.
    Fix: removed the dead no-op assignment ``adj = adj``.
    """
    adj = sp.coo_matrix(adj)
    row_sum = np.array(adj.sum(1))
    d_inv_sqrt = np.power(row_sum, -0.5).flatten()
    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.  # guard isolated (zero-degree) rows
    d_mat_inv_sqrt = sp.diags(d_inv_sqrt)
    return d_mat_inv_sqrt.dot(adj).dot(d_mat_inv_sqrt).tocoo()
def aug_random_walk(adj):
    """Random-walk normalization D^-1 (A+I): row-stochastic transition matrix."""
    with_loops = sp.coo_matrix(adj + sp.eye(adj.shape[0]))
    degrees = np.array(with_loops.sum(1))
    inverse_degree = sp.diags(np.power(degrees, -1.0).flatten())
    return inverse_degree.dot(with_loops).tocoo()
class GCN_drop(nn.Module):
    """Two-layer GCN with aggressive dropout (default p=0.85) on input and hidden."""

    def __init__(self, feature_number, label_number, dropout=0.85, nhid=128):
        super(GCN_drop, self).__init__()
        self.gc1 = GraphConv(feature_number, nhid, bias=True)
        self.gc2 = GraphConv(nhid, label_number, bias=True)
        self.dropout = dropout

    def forward(self, g, features):
        """dropout -> conv1 -> ReLU -> dropout -> conv2; returns raw logits."""
        hidden = F.dropout(features, self.dropout, training=self.training)
        hidden = F.relu(self.gc1(g, hidden))
        hidden = F.dropout(hidden, self.dropout, training=self.training)
        return self.gc2(g, hidden)
def convert_pyg_to_dgl(pyg_data):
    """Converts a PyTorch Geometric Data object into a DGLGraph.

    Args:
        pyg_data (torch_geometric.data.Data): PyTorch Geometric Data object.

    Returns:
        dgl.DGLGraph: The converted DGL graph.
    """
    src, dst = pyg_data.edge_index[0], pyg_data.edge_index[1]
    g = dgl.graph((src, dst), num_nodes=pyg_data.num_nodes)
    # Copy over node features, labels, and split masks when present.
    if getattr(pyg_data, 'x', None) is not None:
        g.ndata['feat'] = pyg_data.x
    if getattr(pyg_data, 'y', None) is not None:
        g.ndata['label'] = pyg_data.y
    for mask_name in ['train_mask', 'val_mask', 'test_mask']:
        mask = getattr(pyg_data, mask_name, None)
        if mask is not None:
            g.ndata[mask_name] = mask
    return g
def load_data(dataset_name):
    """Load ``dataset_name`` as a DGL graph plus features, labels and masks.

    Returns ``(g, features, labels, num_nodes, train_mask, test_mask)``.
    Fixes vs. original:
      * the 'wiki' branch never assigned ``num_nodes`` (the assignment was
        commented out), raising NameError at return — now always assigned;
      * the chain of independent ``if``s is replaced by a dispatch table.
    """
    loaders = {
        'cora': citegrh.load_cora,
        'citeseer': citegrh.load_citeseer,
        'pubmed': citegrh.load_pubmed,
        'amazoncomputer': AmazonCoBuyComputerDataset,
        'amazonphoto': AmazonCoBuyPhotoDataset,
        'coauthorCS': CoauthorCSDataset,
        'coauthorphysics': CoauthorPhysicsDataset,
        'reddit': RedditDataset,
        'wiki': WikiCSDataset,
        'amazonrating': AmazonRatingsDataset,
        'question': QuestionsDataset,
        'roman': RomanEmpireDataset,
        'flickr': FlickrDataset,
        'cora_full': CoraFullDataset,
    }
    if dataset_name == 'dblp':
        # DBLP ships as PyG data; convert to DGL.
        pyg = CitationFull(root='./data/', name='DBLP')[0]
        g = convert_pyg_to_dgl(pyg)
    else:
        data = loaders[dataset_name]()
        g = data[0]
    # Drop nodes with no incident edges at all.
    isolated_nodes = ((g.in_degrees() == 0) & (g.out_degrees() == 0)).nonzero().squeeze(1)
    g.remove_nodes(isolated_nodes)
    features = g.ndata['feat']
    labels = g.ndata['label']
    num_nodes = g.num_nodes()
    if dataset_name in ['cora', 'citeseer', 'pubmed', 'reddit', 'flickr']:
        # Datasets shipping with canonical train/test splits.
        train_mask = g.ndata['train_mask']
        test_mask = g.ndata['test_mask']
    elif dataset_name in ['wiki']:
        # Wiki only has a test mask; train on its complement.
        test_mask = g.ndata['test_mask'].bool()
        train_mask = (1 - g.ndata['test_mask']).bool()
    elif dataset_name in ['amazoncomputer', 'amazonphoto', 'coauthorCS',
                          'coauthorphysics', 'cora_full', 'dblp']:
        # No canonical split: draw a fixed 60/40 random split.
        train_mask = torch.zeros(num_nodes, dtype=torch.bool)
        test_mask = torch.zeros(num_nodes, dtype=torch.bool)
        torch.manual_seed(42)
        indices = torch.randperm(num_nodes)
        num_train = int(num_nodes * 0.6)
        train_mask[indices[:num_train]] = True
        test_mask[indices[num_train:]] = True
        assert train_mask.sum() + test_mask.sum() == num_nodes
    elif dataset_name in ['amazonrating', 'question', 'roman']:
        # These datasets store several splits; use the first column.
        train_mask = g.ndata['train_mask'][:, 0]
        test_mask = g.ndata['test_mask'][:, 0]
    return g, features, labels, num_nodes, train_mask, test_mask
def evaluate(model, g, features, labels, mask):
    """Return (accuracy, macro-F1) of ``model`` on the nodes selected by ``mask``."""
    model.eval()
    with torch.no_grad():
        masked_logits = model(g, features)[mask]
        masked_labels = labels[mask]
        _, predictions = torch.max(masked_logits, dim=1)
        num_correct = torch.sum(predictions == masked_labels)
        macro_f1 = f1_score(masked_labels.cpu().numpy(),
                            predictions.cpu().numpy(), average='macro')
        return num_correct.item() * 1.0 / len(masked_labels), macro_f1
class GcnNet(nn.Module):
    """Two-layer GCN with 16 hidden units.

    NOTE(review): this class is redefined later in the module (with a
    ``return_hidden`` option); that later definition wins at import time.
    """

    def __init__(self, feature_number, label_number):
        super(GcnNet, self).__init__()
        self.layers = nn.ModuleList()
        self.layers.append(GraphConv(feature_number, 16, activation=F.relu))
        self.layers.append(GraphConv(16, label_number))
        self.dropout = nn.Dropout(p=0.5)  # declared but never applied in forward

    def forward(self, g, features):
        # Layer 0 already applies ReLU (activation=F.relu); the outer F.relu
        # is a no-op for ReLU outputs, kept for behavioral parity.
        hidden = F.relu(self.layers[0](g, features))
        return self.layers[1](g, hidden)


# Initialization
def init_mask(C, sub_train_mask, sub_labels):
    """Pick two random training-pool nodes per class as the initial labeled set.

    Returns a flat list of 2*C node indices (class 0 first, then class 1, ...).
    """
    initial_set = []
    for label in range(C):
        candidates = [idx for idx, lab in enumerate(sub_labels)
                      if sub_train_mask[idx] == True and lab == label]
        initial_set.extend(random.sample(candidates, k=2))  # initial pool for each class
    return initial_set
# node pool ## center_rank = rank_centrality(sub_g, sub_train_mask, sub_train_init, num_center, return_rank=True) ## selected_indices_center = center_rank[:num_center] ## sub_train_init[selected_indices_center] = True # Randomly select the rest of the initial nodes ## full_true_indices = th.nonzero(sub_train_mask & ~sub_train_init).squeeze() ## selected_indices_random = random.sample(full_true_indices.tolist(), num_random) ## sub_train_init[selected_indices_random] = True # Transform the formality and return the outcome; note the output are indicators # sub_train_init = th.zeros(len(sub_train_mask), dtype=th.bool) # sub_train_init[initial_set] = True # print(sub_labels[initial_set]) # sub_train_init = th.tensor(initial_set) # return sub_train_init
def update_sub_train_mask(num_each, sub_train_mask, sub_train_mask_new):
    """Randomly choose ``num_each`` pool nodes not yet queried.

    Returns a list of indices, or implicitly None when fewer than ``num_each``
    candidates remain (original convention).
    """
    pool = th.nonzero(sub_train_mask).squeeze()
    queried = th.nonzero(sub_train_mask_new).squeeze()
    remaining = set(pool.tolist()) - set(queried.tolist())
    if len(remaining) >= num_each:
        return random.sample(list(remaining), num_each)
# Calculate the entropy
def calculate_entropy(probs):
    """Shannon entropy (nats) along the last dim; 1e-9 guards log(0)."""
    return -(probs * th.log(probs + 1e-9)).sum(dim=-1)
def rank_entropy(net, sub_g, sub_features, sub_train_mask, sub_train_mask_new, num_each, return_rank=True):
    """Rank un-queried candidates by prediction entropy, most uncertain first.

    With ``return_rank`` the full ranking tensor is returned; otherwise the top
    ``num_each`` are marked in ``sub_train_mask_new`` which is then returned.
    """
    probabilities = F.softmax(net(sub_g, sub_features), dim=-1)
    candidates = th.nonzero(sub_train_mask & ~sub_train_mask_new).squeeze()
    candidate_entropy = calculate_entropy(probabilities[candidates])
    ranked = candidates[th.argsort(candidate_entropy, descending=True)]
    if len(ranked) >= num_each:
        if return_rank:
            return ranked
        else:
            print(f"=========Update Entropy Querying Label with {num_each} Nodes==========================")
            sub_train_mask_new[ranked[:num_each]] = True
            return sub_train_mask_new
def rank_density(net, sub_g, sub_features, sub_train_mask, sub_train_mask_new, num_each, num_clusters, return_rank=True):
    """Rank un-queried candidates by embedding density (closeness to a k-means centroid).

    Density is 1/(1+d) where d is the distance to the node's assigned centroid;
    candidates are returned ordered by descending density.
    """
    pool = th.nonzero(sub_train_mask).squeeze()
    queried = th.nonzero(sub_train_mask_new).squeeze()
    missing_indices = set(pool.tolist()) - set(queried.tolist())
    # Embeddings of ALL nodes (numpy, detached from the graph).
    embedding_all = net(sub_g, sub_features, return_hidden=True).detach().numpy()
    kmeans = KMeans(n_clusters=num_clusters)
    kmeans.fit(embedding_all)
    cluster_centers = kmeans.cluster_centers_
    # Euclidean distance of each node to its assigned cluster center.
    dist = np.linalg.norm(embedding_all - cluster_centers[kmeans.labels_], axis=1)
    density_scores = th.from_numpy(1 / (1 + dist))
    # Restrict the global density ranking to the candidate set, preserving order.
    list_missing_indices = torch.tensor(list(missing_indices))
    shuffle_order = th.argsort(density_scores, descending=True)
    positions = [th.where(shuffle_order == temp)[0].item() for temp in list_missing_indices]
    list_output = list_missing_indices[th.argsort(th.tensor(positions))]
    if len(list_output) >= num_each:
        if return_rank:
            return list_output
        else:
            print(f"=========Update Entropy Querying Label with {num_each} Nodes==========================")
            sub_train_mask_new[list_output[:num_each]] = True
            return sub_train_mask_new
def rank_centrality(sub_g, sub_train_mask, sub_train_mask_new, num_each, return_rank=True):
    """Rank un-queried candidates by PageRank centrality, highest first."""
    candidates = th.nonzero(sub_train_mask & ~sub_train_mask_new).squeeze()
    candidate_scores = page_rank(sub_g)[candidates]
    ranked = candidates[th.argsort(candidate_scores, descending=True)]
    if len(ranked) >= num_each:
        if return_rank:
            return ranked
        else:
            print(f"=========Update Entropy Querying Label with {num_each} Nodes==========================")
            sub_train_mask_new[ranked[:num_each]] = True
            return sub_train_mask_new
# Hand-written pagerank score
def page_rank(graph, damping_factor=0.85, max_iter=100, tol=1e-8):
    """Power-iteration PageRank on a DGL graph; returns per-node scores.

    Iterates the standard update s = d * M s + (1-d)/n until the L1 change
    falls below ``tol`` or ``max_iter`` rounds elapse.
    """
    n = graph.number_of_nodes()
    # Uniform initialization.
    scores = torch.ones(n) / n
    graph.ndata['pagerank'] = scores
    # Out-degree normalization; clamp avoids division by zero for sink nodes.
    graph.ndata['deg'] = graph.out_degrees().float().clamp(min=1)
    for _ in range(max_iter):
        previous = scores.clone()
        graph.ndata['h'] = scores / graph.ndata['deg']
        # Each node sums the normalized scores of its in-neighbors.
        graph.update_all(fn.copy_u('h', 'm'), fn.sum('m', 'h_new'))
        scores = damping_factor * graph.ndata['h_new'] + (1 - damping_factor) / n
        if torch.abs(scores - previous).sum().item() < tol:
            break
    graph.ndata['pagerank'] = scores
    return graph.ndata['pagerank']
# ECE and Perturbation
def perturb_features(sub_features, noise_level=0.05):
    """Return ``sub_features`` plus i.i.d. Gaussian noise scaled by ``noise_level``."""
    return sub_features + th.randn_like(sub_features) * noise_level
# Take the perturbation and count the average
def perturb_avg(net, sub_g, sub_features, num_perturbations, noise_level):
    """Average softmax prediction of ``net`` over noisy copies of the features."""
    reference_logits = net(sub_g, sub_features)
    num_classes = reference_logits.size(-1)
    cumulative = th.zeros(sub_features.size(0), num_classes,
                          device=reference_logits.device)
    for _ in range(num_perturbations):
        noisy = perturb_features(sub_features, noise_level=noise_level)
        cumulative += F.softmax(net(sub_g, noisy), dim=-1)
    # Fair estimate of the predictive distribution under input noise.
    return cumulative / num_perturbations
# Try the traditional way: count the number of perturbed labels for each node
def rank_perturb(net, sub_g, sub_features, num_perturbations, sub_train_mask, sub_train_mask_new, noise_level,
                 num_each, return_rank=True):
    """Rank candidates by how often noise flips their predicted label, flakiest first."""
    baseline_logits = net(sub_g, sub_features)
    candidates = th.nonzero(sub_train_mask & ~sub_train_mask_new).squeeze()
    baseline_pred = th.argmax(baseline_logits[candidates], dim=-1)
    # Count, per candidate, how many perturbations leave the label unchanged.
    stable_counts = th.zeros_like(baseline_pred)
    for _ in range(num_perturbations):
        noisy = perturb_features(sub_features, noise_level=noise_level)
        noisy_pred = th.argmax(net(sub_g, noisy)[candidates], dim=-1)
        stable_counts += noisy_pred.eq(baseline_pred).int()
    # Ascending stability = descending sensitivity to noise.
    _, ascending = torch.sort(stable_counts)
    ranked = candidates[ascending]
    if len(ranked) >= num_each:
        if return_rank:
            return ranked
        else:
            print(f"=========Update Perturbation Querying Label with {num_each} Nodes==========================")
            sub_train_mask_new[ranked[:num_each]] = True
            return sub_train_mask_new
# Consider items in the embedding space
def rank_cluster(net, sub_g, sub_features, labels, total_sub_nodes, sub_train_mask, sub_train_mask_new, num_clusters, num_each, return_rank=True):
    """Rank candidates by distance to centroids of mis-classified queried nodes.

    K-means is fit on the embeddings of queried nodes whose prediction differs
    from the true label; un-queried candidates closest to those centroids are
    ranked first. Falls back to the raw candidate list when nothing is
    mis-classified.
    """
    # Work on missing indices
    full_true_indices = th.nonzero(sub_train_mask).squeeze()
    current_true_indices = th.nonzero(sub_train_mask_new).squeeze()
    missing_indices = set(full_true_indices.tolist()) - set(current_true_indices.tolist())
    # Work on prep of embedding
    labels_true = labels[total_sub_nodes]
    logits = net(sub_g, sub_features)
    prob = F.softmax(logits, dim=-1)
    labels_pred = th.argmax(prob, dim=-1)
    embedding_all = net(sub_g, sub_features, return_hidden=True)
    # Queried nodes whose current prediction disagrees with the true label.
    mismatches_queried = (labels_true != labels_pred) & sub_train_mask_new
    selected_embeddings = embedding_all[mismatches_queried].detach().numpy()
    # Try kmeans
    num_clusters_used = min(num_clusters, th.sum(mismatches_queried).item())
    # print(selected_embeddings)
    print("mismatches_queried:" + str(th.sum(mismatches_queried).item()))
    print("num_clusters_used:" + str(num_clusters_used))
    if num_clusters_used >= 1:
        kmeans = KMeans(n_clusters=num_clusters_used, random_state=0)
        kmeans.fit(selected_embeddings)
        cluster_centers = th.tensor(kmeans.cluster_centers_, dtype=torch.float32)
        # Get back to the original field: rank candidates by their minimal
        # distance to any mismatch centroid (closest first).
        list_missing_indices = list(missing_indices)
        embedding_pool = embedding_all[list_missing_indices]
        min_distances = find_short_dist(embedding_pool, cluster_centers)
        shuffle_order = th.argsort(min_distances)
        output_order = [list_missing_indices[i] for i in shuffle_order]
        nodes_rank_distance = torch.tensor(output_order)
    else:
        print("All nodes give the same label.")
        nodes_rank_distance = torch.tensor(list(missing_indices))
    if len(nodes_rank_distance) >= num_each:
        if return_rank:
            return nodes_rank_distance
        else:
            print(f"=========Update Cluster Querying Label with {num_each} Nodes==========================")
            # selected_indices = random.sample(list(missing_indices), num_each)
            selected_indices = nodes_rank_distance[:num_each]
            sub_train_mask_new[selected_indices] = True
            return sub_train_mask_new
# Use a separate function to write out the calculation of distance
def find_short_dist(embedding_pool, cluster_centers):
    """Per-row minimum Euclidean distance from ``embedding_pool`` to any center."""
    pairwise = torch.cdist(embedding_pool, cluster_centers)
    shortest, _ = torch.min(pairwise, dim=1)
    return shortest
# Consider Diversity; see what we can do from here.
def rank_diversity(net, sub_g, sub_features, sub_train_mask, sub_train_mask_new, num_each, num_clusters, rho, return_rank=True):
    """Rank candidates by a blend of centroid closeness and cluster rarity.

    K-means is fit on the embeddings of already-queried nodes; each node gets a
    closeness score (inverse distance to its nearest centroid) and a rarity
    score (inverse size of that centroid's queried cluster), combined with
    weight ``rho``.
    """
    full_indices = th.nonzero(sub_train_mask).squeeze()
    queried_indices = th.nonzero(sub_train_mask_new).squeeze()
    candidate_indices = set(full_indices.tolist()) - set(queried_indices.tolist())
    # Get the embeddings
    embedding_all = net(sub_g, sub_features, return_hidden=True).detach().numpy()
    embedding_queried = embedding_all[queried_indices]
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    kmeans.fit(embedding_queried)
    cluster_centers = kmeans.cluster_centers_
    node_embeddings = th.tensor(embedding_all, dtype=th.float32)
    centroids = th.tensor(cluster_centers, dtype=th.float32)
    kmeans_labels = th.tensor(kmeans.labels_, dtype=th.int32)
    # Assign every node to its nearest queried-set centroid.
    minimal_distance = th.min(th.cdist(node_embeddings, centroids, p=2), dim=1).values
    proposed_labels = th.min(th.cdist(node_embeddings, centroids, p=2), dim=1).indices
    # Closeness Scores (Distance to assigned centroid)
    close_temp = 1 / (1 + minimal_distance)
    close_normalized = (close_temp - close_temp.min()) / (close_temp.max() - close_temp.min() + 1e-10)
    # Rarity Scores (how under-represented the assigned cluster is among queried nodes)
    queried_bincount = th.bincount(kmeans_labels)
    rarity_temp = 1 / (1 + queried_bincount[proposed_labels])
    rarity_normalized = (rarity_temp - rarity_temp.min()) / (rarity_temp.max() - rarity_temp.min() + 1e-10)
    # Assemble the scores; rho is subject to tuning
    composite_scores = rho * close_normalized + (1 - rho) * rarity_normalized
    composite_scores_candidate = composite_scores[list(candidate_indices)]
    candidate_tensor = th.tensor(list(candidate_indices))
    nodes_rank_diversity = candidate_tensor[th.argsort(composite_scores_candidate, descending=True)]
    if len(nodes_rank_diversity) >= num_each:
        if return_rank:
            return nodes_rank_diversity
        else:
            print(f"=========Update Cluster Querying Label with {num_each} Nodes==========================")
            # selected_indices = random.sample(list(missing_indices), num_each)
            selected_indices = nodes_rank_diversity[:num_each]
            sub_train_mask_new[selected_indices] = True
            return sub_train_mask_new
def quantile_selection(A, B, C, index_1, index_2, index_3, sub_train_mask, sub_train_mask_new, num_each):
    """Blend three ranking tensors with weights and return the top ``num_each`` nodes.

    ``A``, ``B``, ``C`` are ranking tensors; each candidate's position in each
    ranking is combined with weights ``index_1..3`` and the candidates with the
    smallest weighted rank are selected.
    """
    elements = th.nonzero(sub_train_mask & ~sub_train_mask_new).squeeze()
    weighted_ranks = [
        index_1 * compute_rank(A, el)
        + index_2 * compute_rank(B, el)
        + index_3 * compute_rank(C, el)
        for el in elements
    ]
    # Sort elements based on weighted ranks
    order = np.argsort(weighted_ranks)
    sorted_elements = th.stack([elements[i] for i in order])
    return sorted_elements[:num_each]
def compute_rank(tensor, element):
    """Position of the first occurrence of ``element`` in ``tensor``."""
    matches = np.where(tensor == element)
    return matches[0][0]
class GcnNet(nn.Module):
    """Two-layer GCN (16 hidden units) used as the surrogate/extraction model.

    NOTE(review): this definition shadows the earlier ``GcnNet`` in the module.
    Fix: removed the unused local ``relu = nn.ReLU()`` in ``forward``.
    """

    def __init__(self, feature_number, label_number):
        super(GcnNet, self).__init__()
        self.layers = nn.ModuleList()
        self.layers.append(GraphConv(feature_number, 16, activation=F.relu))
        self.layers.append(GraphConv(16, label_number))
        self.dropout = nn.Dropout(p=0.5)  # declared but never applied in forward

    def forward(self, g, features, return_hidden=False):
        """Return logits, or the hidden 16-dim representation if ``return_hidden``."""
        # Layer 0 already applies ReLU (activation=F.relu); the outer F.relu is
        # idempotent for ReLU outputs and kept for parity with the original.
        x = F.relu(self.layers[0](g, features))
        if return_hidden:
            return x
        return self.layers[1](g, x)
## Main Function
[docs]def attack0(dataset_name, seed, cuda, attack_node_arg=0.25, file_path='', LR=1e-3, TGT_LR=1e-2, EVAL_EPOCH=1000, TGT_EPOCH=1000, WARMUP_EPOCH=400, dropout=False, model_performance=True, **kwargs): # Initialization device = th.device(cuda) set_seed(seed) metrics_df = pd.DataFrame(columns=['Num Attack Nodes', 'Method', 'Test Accuracy', 'Test Fidelity']) g, features, labels, node_number, train_mask, test_mask = load_data(dataset_name) attack_node_number = int(node_number * attack_node_arg) feature_number = features.shape[1] label_number = len(labels.unique()) C_var = label_number print('The attack node number is: ', attack_node_number) g = g.to(device) degs = g.in_degrees().float() norm = th.pow(degs, -0.5) norm[th.isinf(norm)] = 0 if cuda != None: norm = norm.cuda() g.ndata['norm'] = norm.unsqueeze(1) if dropout == True: gcn_Net = GCN_drop(feature_number, label_number) else: gcn_Net = GcnNet(feature_number, label_number) optimizer = th.optim.Adam(gcn_Net.parameters(), lr=TGT_LR, weight_decay=5e-4) dur = [] ## Send the training to cuda features = features.to(device) gcn_Net = gcn_Net.to(device) train_mask = train_mask.to(device) test_mask = test_mask.to(device) labels = labels.to(device) target_performance = { 'acc': 0, 'f1score': 0 } print("=========Target Model Generating==========================") for epoch in range(TGT_EPOCH): if epoch >= 3: t0 = time.time() gcn_Net.train() logits = gcn_Net(g, features) logp = F.log_softmax(logits, 1) loss = F.nll_loss(logp[train_mask], labels[train_mask]) optimizer.zero_grad() loss.backward() optimizer.step() if epoch >= 3: dur.append(time.time() - t0) acc, f1score = evaluate(gcn_Net, g, features, labels, test_mask) if acc > target_performance['acc']: target_performance['acc'] = acc if f1score > target_performance['f1score']: target_performance['f1score'] = f1score print("Epoch {:05d} | Loss {:.4f} | Test Acc {:.4f} | Test F1 macro {:.4f} | Time(s) {:.4f}".format( epoch, loss.item(), acc, f1score, np.mean(dur))) ## Get the 
cuda-trained data back g = g.cpu() features = features.cpu() gcn_Net = gcn_Net.cpu() train_mask = train_mask.cpu() test_mask = test_mask.cpu() labels = labels.cpu() # Generate sub-graph index alpha = 0.8 sub_graph_node_index = [] for i in range(attack_node_number): sub_graph_node_index.append(random.randint(0, node_number - 1)) sub_labels = labels[sub_graph_node_index] syn_nodes = [] g_matrix = np.asmatrix(g.adjacency_matrix().to_dense()) for node_index in sub_graph_node_index: # get nodes one_step_node_index = g_matrix[node_index, :].nonzero()[1].tolist() two_step_node_index = [] for first_order_node_index in one_step_node_index: syn_nodes.append(first_order_node_index) two_step_node_index = g_matrix[first_order_node_index, :].nonzero()[1].tolist() sub_graph_syn_node_index = list(set(syn_nodes) - set(sub_graph_node_index)) total_sub_nodes = list(set(sub_graph_syn_node_index + sub_graph_node_index)) # Generate features for SubGraph attack np_features_query = features.clone() for node_index in sub_graph_syn_node_index: # initialized as zero np_features_query[node_index] = np_features_query[node_index] * 0 # get one step and two steps nodes one_step_node_index = g_matrix[node_index, :].nonzero()[1].tolist() one_step_node_index = list(set(one_step_node_index).intersection(set(sub_graph_node_index))) total_two_step_node_index = [] num_one_step = len(one_step_node_index) for first_order_node_index in one_step_node_index: # caculate the feature: features = 0.8 * average_one + 0.8^2 * average_two # new_array = features[first_order_node_index]*0.8/num_one_step this_node_degree = len(g_matrix[first_order_node_index, :].nonzero()[1].tolist()) np_features_query[node_index] = torch.from_numpy(np.sum( [np_features_query[node_index], features[first_order_node_index] * alpha / math.sqrt(num_one_step * this_node_degree)], axis=0)) two_step_node_index = g_matrix[first_order_node_index, :].nonzero()[1].tolist() total_two_step_node_index = list( set(total_two_step_node_index + 
two_step_node_index) - set(one_step_node_index)) total_two_step_node_index = list(set(total_two_step_node_index).intersection(set(sub_graph_node_index))) num_two_step = len(total_two_step_node_index) for second_order_node_index in total_two_step_node_index: # caculate the feature: features = 0.8 * average_one + 0.8^2 * average_two this_node_second_step_nodes = [] this_node_first_step_nodes = g_matrix[second_order_node_index, :].nonzero()[1].tolist() for nodes_in_this_node in this_node_first_step_nodes: this_node_second_step_nodes = list( set(this_node_second_step_nodes + g_matrix[nodes_in_this_node, :].nonzero()[1].tolist())) this_node_second_step_nodes = list(set(this_node_second_step_nodes) - set(this_node_first_step_nodes)) this_node_second_degree = len(this_node_second_step_nodes) np_features_query[node_index] = torch.from_numpy(np.sum( [np_features_query[node_index], features[second_order_node_index] * (1 - alpha) / math.sqrt(num_two_step * this_node_second_degree)], axis=0)) features_query = th.FloatTensor(np_features_query) # generate sub-graph adj-matrix, features, labels total_sub_nodes = list(set(sub_graph_syn_node_index + sub_graph_node_index)) sub_g = np.zeros((len(total_sub_nodes), len(total_sub_nodes))) for sub_index in range(len(total_sub_nodes)): sub_g[sub_index] = g_matrix[total_sub_nodes[sub_index], total_sub_nodes] for i in range(node_number): if i in sub_graph_node_index: test_mask[i] = 0 train_mask[i] = 1 continue if i in sub_graph_syn_node_index: test_mask[i] = 1 train_mask[i] = 0 else: test_mask[i] = 1 train_mask[i] = 0 sub_train_mask = train_mask[total_sub_nodes] sub_features = features_query[total_sub_nodes] sub_labels = labels[total_sub_nodes] sub_features = th.FloatTensor(sub_features) sub_labels = th.LongTensor(sub_labels) sub_train_mask = sub_train_mask sub_test_mask = test_mask # sub_g = DGLGraph(nx.from_numpy_matrix(sub_g)) # features = th.FloatTensor(data.features) # labels = th.LongTensor(data.labels) # train_mask = 
th.ByteTensor(data.train_mask) # test_mask = th.ByteTensor(data.test_mask) # g = DGLGraph(data.graph) gcn_Net.eval() # =================Generate Label=================================================== logits_query = gcn_Net(g, features) _, labels_query = th.max(logits_query, dim=1) sub_labels_query = labels_query[total_sub_nodes] sub_g = nx.from_numpy_array(sub_g) sub_g.remove_edges_from(nx.selfloop_edges(sub_g)) sub_g.add_edges_from(zip(sub_g.nodes(), sub_g.nodes())) sub_g = dgl.from_networkx(sub_g) # sub_g = DGLGraph(sub_g) n_edges = sub_g.number_of_edges() # normalization degs = sub_g.in_degrees().float() norm = th.pow(degs, -0.5) norm[th.isinf(norm)] = 0 sub_g.ndata['norm'] = norm.unsqueeze(1) print("=========Model Extracting==========================") # hyperparameters get from kwargs # no need to change these default for now num_perturbations = kwargs.get('num_perturbations', 100) noise_level = kwargs.get('noise_level', 0.05) rho = kwargs.get('rho', 0.8) num_each = kwargs.get('num_each', 1) epochs_per_cycle = kwargs.get('epochs_per_cycle', 1) setup = kwargs.get('setup', "experiment") # This need to be relatively bigger to allow for more accurate classification if_warmup = kwargs.get('if_warmup', False) LR_CEGA = kwargs.get('LR_CEGA', 1e-2) # Tuning parameters for adaptive weight in each of the CEGA iteration # Default works for cora and amazonphoto and coauthorCS # Need specific modification for citeseer and pubmed curve = kwargs.get('curve', 0.3) init_1 = kwargs.get('init_1', 0.2) init_2 = kwargs.get('init_2', 0.2) init_3 = kwargs.get('init_3', 0.2) gap = kwargs.get('gap', 0.6) # Derivative parameters num_node = sub_features.shape[0] total_epochs = epochs_per_cycle * 18 * C_var total_num = 20 * C_var num_cycles = total_epochs // epochs_per_cycle # Set up adaptive weights: set the numbers then reweight them # For citeseer, try k = 0.5, init_1 = 0.3. 
The other parameters seem to be working fine cycles = np.linspace(0, 1, num_cycles) index_1 = init_1 + gap * np.exp(-1 * curve * cycles) index_2 = init_2 + gap * (1 - np.exp(-1 * curve * cycles)) index_3 = init_3 * (1 - np.exp(-1 * cycles)) total = index_1 + index_2 + index_3 index_1 /= total index_2 /= total index_3 /= total # Set up output data formality # data_output = pd.DataFrame(columns=['Num Attack Nodes', 'Method', 'Test Accuracy', 'Test Fidelity']) # create GCN model max_acc1 = 0 max_acc2 = 0 max_f1 = 0 dur = [] if dropout == True: net = GCN_drop(feature_number, label_number) else: net = GcnNet(feature_number, label_number) optimizer = th.optim.Adam(net.parameters(), lr=LR_CEGA, weight_decay=5e-4) ## Set up initial set which is iteratively progressive train_inits = init_mask(C_var, sub_train_mask, sub_labels) train_inits_tensor = th.tensor(train_inits) sub_train_mask_new = th.zeros(len(sub_train_mask), dtype=th.bool) sub_train_mask_new[train_inits] = True ## Record the initial nodes in torch object nodes_queried = th.tensor([], dtype=th.long) nodes_queried = th.cat((nodes_queried, train_inits_tensor)) ## Do warm up if that is ever an option if if_warmup == True: sub_train_mask_warmup = th.zeros(len(sub_train_mask), dtype=th.bool) sub_train_mask_warmup[train_inits] = True net.train() for epoch in range(WARMUP_EPOCH): logits = net(sub_g, sub_features) logp = F.log_softmax(logits, dim=1) loss = F.nll_loss(logp[sub_train_mask_warmup], sub_labels_query[sub_train_mask_warmup]) optimizer.zero_grad() loss.backward() optimizer.step() acc, f1score = evaluate(net, g, features, labels, test_mask) print("Epoch {:05d} | Loss {:.4f} | Test Acc {:.4f} | Test F1 score {:.4f}".format( epoch + 1, loss.item(), acc, f1score)) net.eval() ## Now start timing when the real cycles begin start_time = time.time() log_dir = f"{file_path}/timelogs/{dataset_name}/logtime_cega_{seed}" os.makedirs(os.path.dirname(log_dir), exist_ok=True) # Learn a node in each cycle for cycle in 
range(10): # print(f"=========Cycle {cycle + 1}==========================") # print(f"========={int(sub_train_mask_new.sum())} Selected Nodes==========================") # Train some epochs: net.train() for epoch in range(epochs_per_cycle): logits = net(sub_g, sub_features) ## Need to get new sub_train_mask logp = F.log_softmax(logits, dim=1) loss = F.nll_loss(logp[sub_train_mask_new], sub_labels_query[sub_train_mask_new]) optimizer.zero_grad() loss.backward() optimizer.step() # if epoch >= 3: # dur.append(time.time() - t0) # dur.append(time.time() - t0) acc1, _ = evaluate(net, g, features, labels_query, test_mask) acc2, f1score = evaluate(net, g, features, labels, test_mask) if acc1 > max_acc1: max_acc1 = acc1 if acc2 > max_acc2: max_acc2 = acc2 if f1score > max_f1: max_f1 = f1score # Add f1 in output print( "Cycle {:05d} | Epoch {:05d} | Loss {:.4f} | Test Acc {:.4f} | Test Fid {:.4f} | Test F1score {:.4f} ".format( cycle + 1, epoch + 1 + cycle * epochs_per_cycle, loss.item(), acc2, acc1, max_f1)) net.eval() ## Not realized here! 
# new_row = {"Epoch": epoch + 1 + cycle * epochs_per_cycle, "Loss": loss.item(), "Accuracy": acc2, "Fidelity": acc1} # data_output = data_output.append(new_row, ignore_index = True) # data_output.append(new_row) # Update the sub_train_mask using your specially-designed algorithm if sub_train_mask_new.sum() < total_num: # Random if setup == "random": print("Setup: Random") # Add the entry to the node pool nodes_queried on the supposed order node_queried = update_sub_train_mask(num_each, sub_train_mask, sub_train_mask_new) node_queried_tensor = th.tensor(node_queried) # node_queried_tensor = th.tensor(node_queried, dtype = th.long) nodes_queried = th.cat((nodes_queried, node_queried_tensor)) sub_train_mask_new[node_queried] = True elif setup == "experiment": print("Setup: Experiment") ## First: Representativeness ## Can be replaced by other centrality measurement Rank1 = rank_centrality(sub_g, sub_train_mask, sub_train_mask_new, num_each, return_rank=True) ## Second: Uncertainty Rank2 = rank_entropy(net, sub_g, sub_features, sub_train_mask, sub_train_mask_new, num_each, return_rank=True) ## Third: Diversity Rank3 = rank_diversity(net, sub_g, sub_features, sub_train_mask, sub_train_mask_new, num_each, C_var, rho, return_rank=True) if Rank1 is None: print("Completed!") selected_indices = quantile_selection(Rank1, Rank2, Rank3, index_1[cycle], index_2[cycle], index_3[cycle], sub_train_mask, sub_train_mask_new, num_each) selected_indices_tensor = selected_indices.clone().detach() # th.tensor(, dtype = th.long) nodes_queried = th.cat((nodes_queried, selected_indices_tensor)) sub_train_mask_new[selected_indices] = True elif setup == "perturbation": print("Setup: Experiment with Perturbation") Rank1 = rank_centrality(sub_g, sub_train_mask, sub_train_mask_new, num_each, return_rank=True) Rank2 = rank_perturb(net, sub_g, sub_features, num_perturbations, sub_train_mask, sub_train_mask_new, noise_level, num_each, return_rank=True) Rank3 = rank_diversity(net, sub_g, 
sub_features, sub_train_mask, sub_train_mask_new, num_each, C_var, rho, return_rank=True) if Rank1 is None: print("Completed!") selected_indices = quantile_selection(Rank1, Rank2, Rank3, index_1[cycle], index_2[cycle], index_3[cycle], sub_train_mask, sub_train_mask_new, num_each) selected_indices_tensor = selected_indices.clone().detach() nodes_queried = th.cat((nodes_queried, selected_indices_tensor)) sub_train_mask_new[selected_indices] = True else: print("Wrong Setup!") return 1 else: print("Move on with designated nodes!") sub_train_mask_new = sub_train_mask_new ## Record time for all these cycles when the loop is complete node_selection_time = time.time() - start_time with open(log_dir, 'a') as log_file: log_file.write(f"CEGA {dataset_name} {seed} ") log_file.write(f"{node_selection_time:.4f}s\n") idx_train = nodes_queried.tolist() output_data = { 'total_sub_nodes': total_sub_nodes, 'idx_train': idx_train } ## Assertation and printing assert len(idx_train) == 20 * C_var print('node selection finished') with open(f'./node_selection/CEGA_{setup}_{dataset_name}_selected_nodes_{(20 * label_number)}_{seed}.json', 'w') as f: json.dump(output_data, f) sub_g = sub_g.to(device) sub_features = sub_features.to(device) sub_labels_query = sub_labels_query.to(device) labels_query = labels_query.to(device) g = g.to(device) features = features.to(device) test_mask = test_mask.to(device) labels = labels.to(device) print('=========Model Evaluation==========================') if model_performance: for iter in range(2 * C_var, 21 * C_var, C_var): set_seed(seed) ## Create net from scratch if dropout == True: net_scratch = GCN_drop(feature_number, label_number) else: net_scratch = GcnNet(feature_number, label_number) optimizer = th.optim.Adam(net_scratch.parameters(), lr=LR, weight_decay=5e-4) ## set up training nodes and send them to device sub_train_scratch = th.zeros(sub_features.size()[0], dtype=th.bool) sub_train_scratch[idx_train[:iter]] = True sub_train_scratch = 
sub_train_scratch.to(device) net_scratch = net_scratch.to(device) ## Reset data max_acc1 = 0 max_acc2 = 0 max_f1 = 0 dur = [] for epoch in range(EVAL_EPOCH): if epoch >= 3: t0 = time.time() net_scratch.train() logits = net_scratch(sub_g, sub_features) logp = F.log_softmax(logits, dim=1) loss = F.nll_loss(logp[sub_train_scratch], sub_labels_query[sub_train_scratch]) optimizer.zero_grad() loss.backward() optimizer.step() if epoch >= 3: dur.append(time.time() - t0) acc1, _ = evaluate(net_scratch, g, features, labels_query, test_mask) acc2, f1score = evaluate(net_scratch, g, features, labels, test_mask) if acc1 > max_acc1: max_acc1 = acc1 if acc2 > max_acc2: max_acc2 = acc2 if f1score > max_f1: max_f1 = f1score # Output Epoch Scores epoch_metrics = pd.DataFrame({ 'Num Attack Nodes': [iter], 'Method': ['CEGA'], 'Test Accuracy': [max_acc2], 'Test Fidelity': [max_acc1], 'Test F1score': [max_f1], }) metrics_df = pd.concat([metrics_df, epoch_metrics], ignore_index=True) print("Test Acc {:.4f} | Test Fid {:.4f} | Test F1score {:.4f} | Time(s) {:.4f}".format( acc2, acc1, max_f1, np.mean(dur))) ## Should this be 'f1score'? 
epoch_metrics = pd.DataFrame({ 'Num Attack Nodes': [int(th.sum(train_mask))], 'Method': ['CEGA'], 'Test Accuracy': [target_performance['acc']], 'Test Fidelity': [1], 'Test F1score': [target_performance['f1score']], }) metrics_df = pd.concat([metrics_df, epoch_metrics], ignore_index=True) log_file_path = f"{file_path}/{dataset_name}/log_cega_{seed}.csv" metrics_df.to_csv(log_file_path, mode='w', header=False, index=False) # Set net_full for the next graph to be taken care of, which is expected to include all nodes if True: set_seed(seed) log_file_path = f"{file_path}/{dataset_name}/log_cega_{seed}.csv" if dropout == True: net_full = GCN_drop(feature_number, label_number) else: net_full = GcnNet(feature_number, label_number) optimizer_full = th.optim.Adam(net_full.parameters(), lr=LR, weight_decay=5e-4) net_full = net_full.to(device) net = net.to(device) perfm_attack = { 'acc': 0, 'fid': 0, 'f1score': 0 } print('========================== Model Evaluation ==========================') progress_bar = tqdm(range(EVAL_EPOCH), desc="Generating model with ALL attack nodes", ncols=100) for epoch in progress_bar: if epoch >= 3: t0 = time.time() net_full.train() logits = net_full(sub_g, sub_features) logp = F.log_softmax(logits, 1) loss = F.nll_loss(logp, sub_labels_query) # [sub_train_mask] optimizer_full.zero_grad() loss.backward() optimizer_full.step() if epoch >= 3: dur.append(time.time() - t0) acc, f1score = evaluate(net_full, g, features, labels, test_mask) fid, _ = evaluate(net_full, g, features, labels_query, test_mask) if acc > perfm_attack['acc']: perfm_attack['acc'] = acc if fid > perfm_attack['fid']: perfm_attack['fid'] = fid if f1score > perfm_attack['f1score']: perfm_attack['f1score'] = f1score progress_bar.set_postfix({ "Loss": f"{loss.item():.4f}", "Test Acc": f"{acc:.4f}", "Test F1": f"{f1score:.4f}", # "Processed %": f"{(epoch + 1) / TGT_EPOCH * 100:.2f}", # "Time(s)": f"{np.mean(dur) if dur else 0:.4f}" }) epoch_metrics = pd.DataFrame({ 'Num Attack Nodes': 
[sub_train_mask.sum().item()], 'Method': ['cega'], 'Test Accuracy': [perfm_attack['acc']], 'Test Fidelity': [perfm_attack['fid']], 'Test F1score': [perfm_attack['f1score']], }) metrics_df = pd.concat([metrics_df, epoch_metrics], ignore_index=True) log_file_path = f"{file_path}/{dataset_name}/log_cega_{seed}.csv" metrics_df.to_csv(log_file_path, mode='w', header=False, index=False)
from pygip.models.attack.base import BaseAttack from pygip.datasets import Dataset
class CEGA(BaseAttack):
    """CEGA model-extraction attack for DGL-backed graph datasets.

    Caches the victim graph, its node features/labels/masks, and the
    attack budget derived from ``attack_node_fraction``.
    """

    supported_api_types = {"dgl"}

    def __init__(self, dataset: Dataset, attack_node_fraction: float, model_path: str = None):
        """Store graph tensors and attack metadata taken from *dataset*.

        Args:
            dataset: project ``Dataset`` wrapper holding a DGL graph.
            attack_node_fraction: fraction of nodes the attacker may query.
            model_path: optional path to a pre-trained target model.
        """
        super().__init__(dataset, attack_node_fraction, model_path)

        graph_data = dataset.graph_data

        # Graph payload.  Note: only the graph itself is moved to
        # self.device; the ndata tensors below are read from the original
        # graph object (they are sent to the device later, in attack()).
        self.dataset = dataset
        self.graph = graph_data.to(self.device)
        self.features = graph_data.ndata['feat']
        self.labels = graph_data.ndata['label']
        self.train_mask = graph_data.ndata['train_mask']
        self.test_mask = graph_data.ndata['test_mask']

        # Meta data and attack budget.
        self.node_number = dataset.num_nodes
        self.feature_number = dataset.num_features
        self.label_number = dataset.num_classes
        self.attack_node_number = int(dataset.num_nodes * attack_node_fraction)
        self.attack_node_fraction = attack_node_fraction
    def attack(self, seed=1, cuda=None, LR=1e-3, TGT_LR=1e-2, EVAL_EPOCH=10, TGT_EPOCH=10, WARMUP_EPOCH=4,
               dropout=False, model_performance=True, **kwargs):
        """Run the CEGA model-extraction attack end to end.

        Phases:
          1. Train the target (victim) GCN on the real train split.
          2. Sample a random attack sub-graph and synthesize features for its
             unlabeled neighbours from one-/two-hop weighted averages.
          3. Query the target model for labels, then iteratively pick query
             nodes over ``num_cycles`` cycles (CEGA active node selection).
          4. Optionally retrain surrogate models from scratch on growing
             query budgets and collect accuracy / fidelity / F1 metrics.

        Args:
            seed: RNG seed passed to ``set_seed`` throughout.
            cuda: if not None, the degree-normalization vector is moved via
                ``.cuda()`` (legacy flag; device placement otherwise uses
                ``self.device``).
            LR: learning rate for surrogate (evaluation) models.
            TGT_LR: learning rate for the target model.
            EVAL_EPOCH: epochs per surrogate evaluation run.
            TGT_EPOCH: target-model training epochs.
            WARMUP_EPOCH: warm-up epochs used when ``if_warmup`` is set.
            dropout: use ``GCN_drop`` instead of ``GcnNet`` when True.
            model_performance: when True, run the per-budget surrogate
                evaluation loop before the final full-budget model.
            **kwargs: CEGA hyperparameters — ``setup`` ("random" /
                "experiment" / "perturbation"), ``num_perturbations``,
                ``noise_level``, ``rho``, ``num_each``, ``epochs_per_cycle``,
                ``if_warmup``, ``LR_CEGA``, ``curve``, ``init_1``, ``init_2``,
                ``init_3``, ``gap``.

        Returns:
            None on success; 1 when ``setup`` is not a recognized mode.
        """
        # Initialization
        # device = 'cpu'
        set_seed(seed)
        metrics_df = pd.DataFrame(columns=['Num Attack Nodes', 'Method', 'Test Accuracy', 'Test Fidelity'])
        # g, features, labels, node_number, train_mask, test_mask = load_data(dataset_name)
        g = self.graph
        features = self.features
        labels = self.labels
        node_number = self.node_number
        train_mask = self.train_mask
        test_mask = self.test_mask
        attack_node_arg = self.attack_node_fraction
        attack_node_number = int(node_number * attack_node_arg)
        feature_number = features.shape[1]
        label_number = len(labels.unique())
        # C_var: number of classes; all query budgets below are multiples of it.
        C_var = label_number
        print('The attack node number is: ', attack_node_number)
        g = g.to(self.device)
        # Symmetric-normalization weights D^{-1/2}; isolated nodes get 0
        # instead of inf.
        degs = g.in_degrees().float()
        norm = th.pow(degs, -0.5)
        norm[th.isinf(norm)] = 0
        if cuda != None:
            norm = norm.cuda()
        g.ndata['norm'] = norm.unsqueeze(1)
        if dropout == True:
            gcn_Net = GCN_drop(feature_number, label_number)
        else:
            gcn_Net = GcnNet(feature_number, label_number)
        optimizer = th.optim.Adam(gcn_Net.parameters(), lr=TGT_LR, weight_decay=5e-4)
        dur = []
        ## Send the training to cuda
        features = features.to(self.device)
        gcn_Net = gcn_Net.to(self.device)
        train_mask = train_mask.to(self.device)
        test_mask = test_mask.to(self.device)
        labels = labels.to(self.device)
        # Best-so-far scores of the victim model (used as the reference row
        # in the metrics output).
        target_performance = {
            'acc': 0,
            'f1score': 0
        }
        print("=========Target Model Generating==========================")
        for epoch in range(TGT_EPOCH):
            # Timing skips the first 3 epochs to exclude warm-up jitter.
            if epoch >= 3:
                t0 = time.time()
            gcn_Net.train()
            logits = gcn_Net(g, features)
            logp = F.log_softmax(logits, 1)
            loss = F.nll_loss(logp[train_mask], labels[train_mask])
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if epoch >= 3:
                dur.append(time.time() - t0)
            acc, f1score = evaluate(gcn_Net, g, features, labels, test_mask)
            if acc > target_performance['acc']:
                target_performance['acc'] = acc
            if f1score > target_performance['f1score']:
                target_performance['f1score'] = f1score
            print("Epoch {:05d} | Loss {:.4f} | Test Acc {:.4f} | Test F1 macro {:.4f} | Time(s) {:.4f}".format(
                epoch, loss.item(), acc, f1score, np.mean(dur)))
        ## Get the cuda-trained data back
        g = g.cpu()
        features = features.cpu()
        gcn_Net = gcn_Net.cpu()
        train_mask = train_mask.cpu()
        test_mask = test_mask.cpu()
        labels = labels.cpu()
        # Generate sub-graph index: sample attack nodes uniformly at random
        # (with possible repeats) from the whole graph.
        alpha = 0.8
        sub_graph_node_index = []
        for i in range(attack_node_number):
            sub_graph_node_index.append(random.randint(0, node_number - 1))
        sub_labels = labels[sub_graph_node_index]
        syn_nodes = []
        # Dense adjacency as a numpy matrix; nonzero()[1] yields neighbour
        # column indices for a row.
        g_matrix = np.asmatrix(g.adjacency_matrix().to_dense())
        for node_index in sub_graph_node_index:
            # get nodes
            one_step_node_index = g_matrix[node_index, :].nonzero()[1].tolist()
            two_step_node_index = []
            for first_order_node_index in one_step_node_index:
                syn_nodes.append(first_order_node_index)
                # NOTE(review): two_step_node_index is recomputed each
                # iteration and never consumed in this loop.
                two_step_node_index = g_matrix[first_order_node_index, :].nonzero()[1].tolist()
        # Synthetic nodes = one-hop neighbours not already in the attack set.
        sub_graph_syn_node_index = list(set(syn_nodes) - set(sub_graph_node_index))
        total_sub_nodes = list(set(sub_graph_syn_node_index + sub_graph_node_index))
        # Generate features for SubGraph attack: synthetic nodes get a
        # weighted average of their attack-set neighbours' features.
        np_features_query = features.clone()
        for node_index in sub_graph_syn_node_index:
            # initialized as zero
            np_features_query[node_index] = np_features_query[node_index] * 0
            # get one step and two steps nodes (restricted to the attack set)
            one_step_node_index = g_matrix[node_index, :].nonzero()[1].tolist()
            one_step_node_index = list(set(one_step_node_index).intersection(set(sub_graph_node_index)))
            total_two_step_node_index = []
            num_one_step = len(one_step_node_index)
            for first_order_node_index in one_step_node_index:
                # caculate the feature: features = 0.8 * average_one + 0.8^2 * average_two
                # new_array = features[first_order_node_index]*0.8/num_one_step
                this_node_degree = len(g_matrix[first_order_node_index, :].nonzero()[1].tolist())
                x1 = np_features_query[node_index]
                x2 = features[first_order_node_index] * alpha / math.sqrt(num_one_step * this_node_degree)
                np_features_query[node_index] = x1 + x2
                two_step_node_index = g_matrix[first_order_node_index, :].nonzero()[1].tolist()
                total_two_step_node_index = list(
                    set(total_two_step_node_index + two_step_node_index) - set(one_step_node_index))
            total_two_step_node_index = list(set(total_two_step_node_index).intersection(set(sub_graph_node_index)))
            num_two_step = len(total_two_step_node_index)
            for second_order_node_index in total_two_step_node_index:
                # caculate the feature: features = 0.8 * average_one + 0.8^2 * average_two
                this_node_second_step_nodes = []
                this_node_first_step_nodes = g_matrix[second_order_node_index, :].nonzero()[1].tolist()
                for nodes_in_this_node in this_node_first_step_nodes:
                    this_node_second_step_nodes = list(
                        set(this_node_second_step_nodes + g_matrix[nodes_in_this_node, :].nonzero()[1].tolist()))
                this_node_second_step_nodes = list(set(this_node_second_step_nodes) - set(this_node_first_step_nodes))
                this_node_second_degree = len(this_node_second_step_nodes)
                x1 = np_features_query[node_index]
                # NOTE(review): likely copy-paste bug — this reuses the
                # *first-order* term (first_order_node_index, alpha,
                # num_one_step, this_node_degree) inside the second-order
                # loop, leaving num_two_step and this_node_second_degree
                # unused.  Per the formula comment above, it presumably
                # should be
                #   features[second_order_node_index] * (1 - alpha)
                #       / math.sqrt(num_two_step * this_node_second_degree)
                # — confirm against the reference implementation.
                x2 = features[first_order_node_index] * alpha / math.sqrt(num_one_step * this_node_degree)
                np_features_query[node_index] = x1 + x2
        features_query = th.FloatTensor(np_features_query)
        # generate sub-graph adj-matrix, features, labels
        total_sub_nodes = list(set(sub_graph_syn_node_index + sub_graph_node_index))
        sub_g = np.zeros((len(total_sub_nodes), len(total_sub_nodes)))
        for sub_index in range(len(total_sub_nodes)):
            sub_g[sub_index] = g_matrix[total_sub_nodes[sub_index], total_sub_nodes]
        # Re-label masks: attack nodes become "train", everything else "test".
        for i in range(node_number):
            if i in sub_graph_node_index:
                test_mask[i] = 0
                train_mask[i] = 1
                continue
            if i in sub_graph_syn_node_index:
                test_mask[i] = 1
                train_mask[i] = 0
            else:
                test_mask[i] = 1
                train_mask[i] = 0
        sub_train_mask = train_mask[total_sub_nodes]
        sub_features = features_query[total_sub_nodes]
        sub_labels = labels[total_sub_nodes]
        sub_features = th.FloatTensor(sub_features)
        sub_labels = th.LongTensor(sub_labels)
        sub_train_mask = sub_train_mask
        sub_test_mask = test_mask
        gcn_Net.eval()
        # =================Generate Label===================================================
        # Query the victim model once for pseudo-labels on all nodes.
        logits_query = gcn_Net(g, features)
        _, labels_query = th.max(logits_query, dim=1)
        sub_labels_query = labels_query[total_sub_nodes]
        # Build the DGL sub-graph: drop existing self-loops, then add one
        # self-loop per node.
        sub_g = nx.from_numpy_array(sub_g)
        sub_g.remove_edges_from(nx.selfloop_edges(sub_g))
        sub_g.add_edges_from(zip(sub_g.nodes(), sub_g.nodes()))
        sub_g = dgl.from_networkx(sub_g)
        # sub_g = DGLGraph(sub_g)
        n_edges = sub_g.number_of_edges()
        # normalization
        degs = sub_g.in_degrees().float()
        norm = th.pow(degs, -0.5)
        norm[th.isinf(norm)] = 0
        sub_g.ndata['norm'] = norm.unsqueeze(1)
        print("=========Model Extracting==========================")
        # hyperparameters get from kwargs
        # no need to change these default for now
        num_perturbations = kwargs.get('num_perturbations', 100)
        noise_level = kwargs.get('noise_level', 0.05)
        rho = kwargs.get('rho', 0.8)
        num_each = kwargs.get('num_each', 1)
        epochs_per_cycle = kwargs.get('epochs_per_cycle', 1)
        setup = kwargs.get('setup', "experiment")
        # This need to be relatively bigger to allow for more accurate classification
        if_warmup = kwargs.get('if_warmup', False)
        LR_CEGA = kwargs.get('LR_CEGA', 1e-2)
        # Tuning parameters for adaptive weight in each of the CEGA iteration
        # Default works for cora and amazonphoto and coauthorCS
        # Need specific modification for citeseer and pubmed
        curve = kwargs.get('curve', 0.3)
        init_1 = kwargs.get('init_1', 0.2)
        init_2 = kwargs.get('init_2', 0.2)
        init_3 = kwargs.get('init_3', 0.2)
        gap = kwargs.get('gap', 0.6)
        # Derivative parameters
        num_node = sub_features.shape[0]
        total_epochs = epochs_per_cycle * 18 * C_var
        total_num = 20 * C_var
        num_cycles = total_epochs // epochs_per_cycle
        # Set up adaptive weights: set the numbers then reweight them
        # For citeseer, try k = 0.5, init_1 = 0.3. The other parameters seem to be working fine
        cycles = np.linspace(0, 1, num_cycles)
        # index_1/2/3 weight centrality / uncertainty / diversity per cycle,
        # normalized to sum to 1.
        index_1 = init_1 + gap * np.exp(-1 * curve * cycles)
        index_2 = init_2 + gap * (1 - np.exp(-1 * curve * cycles))
        index_3 = init_3 * (1 - np.exp(-1 * cycles))
        total = index_1 + index_2 + index_3
        index_1 /= total
        index_2 /= total
        index_3 /= total
        # create GCN model
        max_acc1 = 0
        max_acc2 = 0
        max_f1 = 0
        dur = []
        if dropout == True:
            net = GCN_drop(feature_number, label_number)
        else:
            net = GcnNet(feature_number, label_number)
        optimizer = th.optim.Adam(net.parameters(), lr=LR_CEGA, weight_decay=5e-4)
        ## Set up initial set which is iteratively progressive
        train_inits = init_mask(C_var, sub_train_mask, sub_labels)
        train_inits_tensor = th.tensor(train_inits)
        sub_train_mask_new = th.zeros(len(sub_train_mask), dtype=th.bool)
        sub_train_mask_new[train_inits] = True
        ## Record the initial nodes in torch object
        nodes_queried = th.tensor([], dtype=th.long)
        nodes_queried = th.cat((nodes_queried, train_inits_tensor))
        ## Do warm up if that is ever an option
        if if_warmup == True:
            sub_train_mask_warmup = th.zeros(len(sub_train_mask), dtype=th.bool)
            sub_train_mask_warmup[train_inits] = True
            net.train()
            for epoch in range(WARMUP_EPOCH):
                logits = net(sub_g, sub_features)
                logp = F.log_softmax(logits, dim=1)
                loss = F.nll_loss(logp[sub_train_mask_warmup], sub_labels_query[sub_train_mask_warmup])
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                acc, f1score = evaluate(net, g, features, labels, test_mask)
                print("Epoch {:05d} | Loss {:.4f} | Test Acc {:.4f} | Test F1 score {:.4f}".format(
                    epoch + 1, loss.item(), acc, f1score))
            net.eval()
        ## Now start timing when the real cycles begin
        start_time = time.time()
        # log_dir = f"{file_path}/timelogs/{dataset_name}/logtime_cega_{seed}"
        # os.makedirs(os.path.dirname(log_dir), exist_ok=True)
        print("=========Learn a node in each cycle==========================")
        # Learn a node in each cycle
        for cycle in range(num_cycles):
            net.train()
            for epoch in range(epochs_per_cycle):
                logits = net(sub_g, sub_features)
                ## Need to get new sub_train_mask
                logp = F.log_softmax(logits, dim=1)
                loss = F.nll_loss(logp[sub_train_mask_new], sub_labels_query[sub_train_mask_new])
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            # NOTE(review): evaluation placed once per cycle (the log line
            # below uses the final `epoch` of the inner loop) — confirm
            # intended indentation against the upstream source.
            acc1, _ = evaluate(net, g, features, labels_query, test_mask)
            acc2, f1score = evaluate(net, g, features, labels, test_mask)
            if acc1 > max_acc1:
                max_acc1 = acc1
            if acc2 > max_acc2:
                max_acc2 = acc2
            if f1score > max_f1:
                max_f1 = f1score
            # Add f1 in output
            print(
                "Cycle {:05d} | Epoch {:05d} | Loss {:.4f} | Test Acc {:.4f} | Test Fid {:.4f} | Test F1score {:.4f} ".format(
                    cycle + 1, epoch + 1 + cycle * epochs_per_cycle, loss.item(), acc2, acc1, max_f1))
            net.eval()
            # Update the sub_train_mask using your specially-designed algorithm
            if sub_train_mask_new.sum() < total_num:
                # Random
                if setup == "random":
                    print("Setup: Random")
                    # Add the entry to the node pool nodes_queried on the supposed order
                    node_queried = update_sub_train_mask(num_each, sub_train_mask, sub_train_mask_new)
                    node_queried_tensor = th.tensor(node_queried)
                    # node_queried_tensor = th.tensor(node_queried, dtype = th.long)
                    nodes_queried = th.cat((nodes_queried, node_queried_tensor))
                    sub_train_mask_new[node_queried] = True
                elif setup == "experiment":
                    print("Setup: Experiment")
                    ## First: Representativeness
                    ## Can be replaced by other centrality measurement
                    Rank1 = rank_centrality(sub_g, sub_train_mask, sub_train_mask_new, num_each, return_rank=True)
                    ## Second: Uncertainty
                    Rank2 = rank_entropy(net, sub_g, sub_features, sub_train_mask, sub_train_mask_new, num_each,
                                         return_rank=True)
                    ## Third: Diversity
                    Rank3 = rank_diversity(net, sub_g, sub_features, sub_train_mask, sub_train_mask_new, num_each,
                                           C_var, rho, return_rank=True)
                    # NOTE(review): only the print is guarded — quantile_selection
                    # is still called when Rank1 is None; verify that is intended.
                    if Rank1 is None:
                        print("Completed!")
                    selected_indices = quantile_selection(Rank1, Rank2, Rank3, index_1[cycle], index_2[cycle],
                                                          index_3[cycle], sub_train_mask, sub_train_mask_new, num_each)
                    selected_indices_tensor = selected_indices.clone().detach()  # th.tensor(, dtype = th.long)
                    nodes_queried = th.cat((nodes_queried, selected_indices_tensor))
                    sub_train_mask_new[selected_indices] = True
                elif setup == "perturbation":
                    print("Setup: Experiment with Perturbation")
                    Rank1 = rank_centrality(sub_g, sub_train_mask, sub_train_mask_new, num_each, return_rank=True)
                    Rank2 = rank_perturb(net, sub_g, sub_features, num_perturbations, sub_train_mask,
                                         sub_train_mask_new, noise_level, num_each, return_rank=True)
                    Rank3 = rank_diversity(net, sub_g, sub_features, sub_train_mask, sub_train_mask_new, num_each,
                                           C_var, rho, return_rank=True)
                    if Rank1 is None:
                        print("Completed!")
                    selected_indices = quantile_selection(Rank1, Rank2, Rank3, index_1[cycle], index_2[cycle],
                                                          index_3[cycle], sub_train_mask, sub_train_mask_new, num_each)
                    selected_indices_tensor = selected_indices.clone().detach()
                    nodes_queried = th.cat((nodes_queried, selected_indices_tensor))
                    sub_train_mask_new[selected_indices] = True
                else:
                    print("Wrong Setup!")
                    return 1
            else:
                print("Move on with designated nodes!")
                sub_train_mask_new = sub_train_mask_new
        ## Record time for all these cycles when the loop is complete
        # node_selection_time = time.time() - start_time
        # with open(log_dir, 'a') as log_file:
        #     log_file.write(f"CEGA {dataset_name} {seed} ")
        #     log_file.write(f"{node_selection_time:.4f}s\n")
        idx_train = nodes_queried.tolist()
        # NOTE(review): output_data is built but its json dump is commented
        # out below, so it is currently unused.
        output_data = {
            'total_sub_nodes': total_sub_nodes,
            'idx_train': idx_train
        }
        ## Assertation and printing
        # assert len(idx_train) == 20 * C_var
        print('node selection finished')
        # with open(f'./node_selection/CEGA_{setup}_{dataset_name}_selected_nodes_{(20 * label_number)}_{seed}.json',
        #           'w') as f:
        #     json.dump(output_data, f)
        sub_g = sub_g.to(self.device)
        sub_features = sub_features.to(self.device)
        sub_labels_query = sub_labels_query.to(self.device)
        labels_query = labels_query.to(self.device)
        g = g.to(self.device)
        features = features.to(self.device)
        test_mask = test_mask.to(self.device)
        labels = labels.to(self.device)
        print('=========Model Evaluation==========================')
        if model_performance:
            # Retrain a fresh surrogate for each budget 2C, 3C, ..., 20C.
            # NOTE(review): `iter` shadows the builtin of the same name.
            for iter in range(2 * C_var, 21 * C_var, C_var):
                set_seed(seed)
                ## Create net from scratch
                if dropout == True:
                    net_scratch = GCN_drop(feature_number, label_number)
                else:
                    net_scratch = GcnNet(feature_number, label_number)
                optimizer = th.optim.Adam(net_scratch.parameters(), lr=LR, weight_decay=5e-4)
                ## set up training nodes and send them to device
                sub_train_scratch = th.zeros(sub_features.size()[0], dtype=th.bool)
                sub_train_scratch[idx_train[:iter]] = True
                sub_train_scratch = sub_train_scratch.to(self.device)
                net_scratch = net_scratch.to(self.device)
                ## Reset data
                max_acc1 = 0
                max_acc2 = 0
                max_f1 = 0
                dur = []
                for epoch in range(EVAL_EPOCH):
                    if epoch >= 3:
                        t0 = time.time()
                    net_scratch.train()
                    logits = net_scratch(sub_g, sub_features)
                    logp = F.log_softmax(logits, dim=1)
                    loss = F.nll_loss(logp[sub_train_scratch], sub_labels_query[sub_train_scratch])
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    if epoch >= 3:
                        dur.append(time.time() - t0)
                    acc1, _ = evaluate(net_scratch, g, features, labels_query, test_mask)
                    acc2, f1score = evaluate(net_scratch, g, features, labels, test_mask)
                    if acc1 > max_acc1:
                        max_acc1 = acc1
                    if acc2 > max_acc2:
                        max_acc2 = acc2
                    if f1score > max_f1:
                        max_f1 = f1score
                # Output Epoch Scores
                epoch_metrics = pd.DataFrame({
                    'Num Attack Nodes': [iter],
                    'Method': ['CEGA'],
                    'Test Accuracy': [max_acc2],
                    'Test Fidelity': [max_acc1],
                    'Test F1score': [max_f1],
                })
                metrics_df = pd.concat([metrics_df, epoch_metrics], ignore_index=True)
                print("Test Acc {:.4f} | Test Fid {:.4f} | Test F1score {:.4f} | Time(s) {:.4f}".format(
                    acc2, acc1, max_f1, np.mean(dur)))
                ## Should this be 'f1score'?
            # Append the victim model's own scores as the reference row
            # (fidelity of the target against itself is 1 by definition).
            epoch_metrics = pd.DataFrame({
                'Num Attack Nodes': [int(th.sum(train_mask))],
                'Method': ['CEGA'],
                'Test Accuracy': [target_performance['acc']],
                'Test Fidelity': [1],
                'Test F1score': [target_performance['f1score']],
            })
            metrics_df = pd.concat([metrics_df, epoch_metrics], ignore_index=True)
            # log_file_path = f"{file_path}/{dataset_name}/log_cega_{seed}.csv"
            # metrics_df.to_csv(log_file_path, mode='w', header=False, index=False)
        # Set net_full for the next graph to be taken care of, which is expected to include all nodes
        if True:
            set_seed(seed)
            # log_file_path = f"{file_path}/{dataset_name}/log_cega_{seed}.csv"
            if dropout == True:
                net_full = GCN_drop(feature_number, label_number)
            else:
                net_full = GcnNet(feature_number, label_number)
            optimizer_full = th.optim.Adam(net_full.parameters(), lr=LR, weight_decay=5e-4)
            net_full = net_full.to(self.device)
            net = net.to(self.device)
            perfm_attack = {
                'acc': 0,
                'fid': 0,
                'f1score': 0
            }
            print('========================== Model Evaluation ==========================')
            progress_bar = tqdm(range(EVAL_EPOCH), desc="Generating model with ALL attack nodes", ncols=100)
            for epoch in progress_bar:
                if epoch >= 3:
                    t0 = time.time()
                net_full.train()
                logits = net_full(sub_g, sub_features)
                logp = F.log_softmax(logits, 1)
                # Trains on ALL sub-graph nodes (no mask), unlike the budgeted
                # runs above.
                loss = F.nll_loss(logp, sub_labels_query)  # [sub_train_mask]
                optimizer_full.zero_grad()
                loss.backward()
                optimizer_full.step()
                if epoch >= 3:
                    dur.append(time.time() - t0)
                acc, f1score = evaluate(net_full, g, features, labels, test_mask)
                fid, _ = evaluate(net_full, g, features, labels_query, test_mask)
                if acc > perfm_attack['acc']:
                    perfm_attack['acc'] = acc
                if fid > perfm_attack['fid']:
                    perfm_attack['fid'] = fid
                if f1score > perfm_attack['f1score']:
                    perfm_attack['f1score'] = f1score
                progress_bar.set_postfix({
                    "Loss": f"{loss.item():.4f}",
                    "Test Acc": f"{acc:.4f}",
                    "Test F1": f"{f1score:.4f}",
                    # "Processed %": f"{(epoch + 1) / TGT_EPOCH * 100:.2f}",
                    # "Time(s)": f"{np.mean(dur) if dur else 0:.4f}"
                })
            print("Test Acc {:.4f} | Test Fid {:.4f} | Test F1score {:.4f} | Time(s) {:.4f}".format(
                perfm_attack['acc'], perfm_attack['fid'], perfm_attack['f1score'], np.mean(dur)))
            epoch_metrics = pd.DataFrame({
                'Num Attack Nodes': [sub_train_mask.sum().item()],
                'Method': ['cega'],
                'Test Accuracy': [perfm_attack['acc']],
                'Test Fidelity': [perfm_attack['fid']],
                'Test F1score': [perfm_attack['f1score']],
            })
            metrics_df = pd.concat([metrics_df, epoch_metrics], ignore_index=True)
            # log_file_path = f"{file_path}/{dataset_name}/log_cega_{seed}.csv"
            # metrics_df.to_csv(log_file_path, mode='w', header=False, index=False)