Psst.. new poll here.
Psst.. new forums here.
Microsoft is blocking us again (TY IP Reputation!) so just use oauth login instead. :)
Paste
Pasted as Python by Michaelazmy ( 7 years ago )
import torch
import torch.nn as nn
import json, random, math
from collections import defaultdict
import numpy as np
from collections import defaultdict
from tqdm import tqdm
from torch.utils import data
import itertools
from knowledge_graph.knowledge_graph import KnowledgeGraph
import logging
# Module-level logger used by every class in this file; its threshold is set to INFO.
logger = logging.getLogger('GraphEmbeddingMapper')
logger.setLevel(logging.INFO)
# Seed Python's `random` module, which drives the dataset shuffle in
# GraphEmbeddingMapper._split and the negative sampling in Dataset._get_X_y.
# NOTE: numpy's RNG (used for out-of-vocabulary embeddings) is not seeded here.
random.seed(23)
class GraphEmbeddingMapper:
    """Train and evaluate a binary classifier that scores (source, candidate) URI pairs.

    The input pairs are split 70/10/20 into train/valid/test, each split is
    wrapped in a ``Dataset`` (which adds negative samples) and a ``DataLoader``,
    and models are scored with mean reciprocal rank (MRR) per source URI.
    """

    def __init__(self, dataset, source_kg: "KnowledgeGraph", target_kg: "KnowledgeGraph"):
        """Split *dataset* and build the data loaders.

        Args:
            dataset: Sequence of (source uri, target uri) pairs.
            source_kg: Knowledge graph providing embeddings for source URIs.
            target_kg: Knowledge graph providing embeddings for target URIs.
        """
        self.source_kg = source_kg
        self.target_kg = target_kg

        logger.info('Splitting dataset')
        self.train, self.valid, self.test = GraphEmbeddingMapper._split(dataset)
        logger.info('Sizes: train {}, valid {}, test {}'.format(len(self.train), len(self.valid), len(self.test)))

        # Device configuration
        self.device = GraphEmbeddingMapper._select_device()

        # Hyper-parameters
        # input_size is the length of the concatenated source + target
        # embedding (presumably 500 + 500 -- confirm against the graphs).
        self.input_size = 1000
        self.hidden_size = 750
        self.num_classes = 2
        self.num_epochs = 10
        self.batch_size = 1000
        self.learning_rate = 0.001

        self.train_dataset = Dataset(self.train, self.source_kg, self.target_kg)
        self.train_loader = data.DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)
        self.valid_dataset = Dataset(self.valid, self.source_kg, self.target_kg)
        self.valid_loader = data.DataLoader(self.valid_dataset, batch_size=self.batch_size, shuffle=False)
        self.test_dataset = Dataset(self.test, self.source_kg, self.target_kg)
        self.test_loader = data.DataLoader(self.test_dataset, batch_size=self.batch_size, shuffle=False)

    @staticmethod
    def _select_device():
        """Return the torch device to run on.

        Keeps the original preference for the second GPU (``cuda:1``) but
        falls back to ``cuda:0`` on single-GPU hosts, where ``cuda:1`` does
        not exist, and to the CPU when CUDA is unavailable.
        """
        if not torch.cuda.is_available():
            return torch.device('cpu')
        index = 1 if torch.cuda.device_count() > 1 else 0
        return torch.device('cuda:{}'.format(index))

    @staticmethod
    def _split(dataset):
        """Shuffle a copy of *dataset* and return (train, valid, test) lists (70/10/20).

        The caller's sequence is left untouched; the permutation comes from
        the module-level ``random`` generator.
        """
        shuffled = list(dataset)
        random.shuffle(shuffled)
        train_end = int(0.7 * len(shuffled))
        valid_end = int(0.8 * len(shuffled))
        return shuffled[:train_end], shuffled[train_end:valid_end], shuffled[valid_end:]

    def train_model(self, model_type="nn"):
        """Train a model, log train/valid MRR, save a checkpoint and return the model.

        Args:
            model_type: ``"nn"`` for ``NeuralNet`` (Adam) or ``"lr"`` for
                ``LogisticRegression`` (SGD).

        Returns:
            The trained model. Its ``state_dict`` is also written to
            ``model_<model_type>.ckpt`` in the working directory.

        Raises:
            ValueError: If *model_type* is not one of the supported names.
        """
        logger.info("Training Model {}".format(model_type))
        if model_type == "nn":
            model = NeuralNet(self.input_size, self.hidden_size, self.num_classes).to(self.device)
            optimizer = torch.optim.Adam(model.parameters(), lr=self.learning_rate)
        elif model_type == "lr":
            model = LogisticRegression(self.input_size, self.num_classes).to(self.device)
            optimizer = torch.optim.SGD(model.parameters(), lr=self.learning_rate)
        else:
            # Fail early instead of crashing later on a None model.
            raise ValueError("Unknown model_type {!r}; expected 'nn' or 'lr'".format(model_type))
        # Both model types are trained with the same loss.
        criterion = nn.CrossEntropyLoss()

        # Train the model
        total_step = len(self.train_loader)
        logger.info("Number of minibatches {}".format(total_step))
        for epoch in range(self.num_epochs):
            logger.info("Training epoch {}".format(epoch))
            for i, (X, y, source_uri, candidate_uri) in tqdm(enumerate(self.train_loader), total=total_step):
                logger.debug("%s %s %s %s", X, y, source_uri, candidate_uri)
                # Move tensors to the configured device
                X = X.to(self.device)
                y = y.to(self.device)

                # Forward pass
                outputs, prob = model(X)
                loss = criterion(outputs, y)

                # Backward and optimize
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                if (i + 1) % 100 == 0:
                    logger.info('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                                .format(epoch + 1, self.num_epochs, i + 1, total_step, loss.item()))

            # NOTE(review): the source of this file had lost its indentation;
            # evaluating once per epoch (rather than once after training) is
            # the reconstructed nesting -- confirm against the original.
            with torch.no_grad():
                mrr = self.evaluate(self.train_loader, model)
                logger.info("Train MRR {}".format(mrr))
            with torch.no_grad():
                mrr = self.evaluate(self.valid_loader, model)
                logger.info("Valid MRR {}".format(mrr))

        torch.save(model.state_dict(), 'model_{}.ckpt'.format(model_type))
        return model

    def test_model(self, model):
        """Log the MRR of *model* on the held-out test split."""
        with torch.no_grad():
            mrr = self.evaluate(self.test_loader, model)
            logger.info("Test MRR {}".format(mrr))

    def evaluate(self, loader, model):
        """Return the mean reciprocal rank of *model* over *loader*.

        Every batch yields (features, label, source uris, candidate uris).
        Candidates are grouped by source URI, ranked by the predicted
        probability of the positive class, and the reciprocal rank of the
        first true match is averaged over all source URIs.

        Args:
            loader: Iterable of batches as produced by the ``DataLoader``s
                built in ``__init__``. Batches may arrive in any order.
            model: Callable returning ``(outputs, positive_probability)``.

        Returns:
            Mean reciprocal rank as computed by ``mean_reciprocal_rank``.
        """
        source_uris = []
        candidate_uris = []
        probs = []      # predicted probability of the positive class
        y_target = []   # true labels

        # Get predictions for each pair
        logger.info("Evaluating...")
        for X, y, source_uri, candidate_uri in tqdm(loader):
            logger.debug("Source %s candidate %s label %s", source_uri, candidate_uri, y)
            X = X.to(self.device)
            y_target.extend(y.cpu().data.numpy())
            source_uris.extend(source_uri)
            candidate_uris.extend(candidate_uri)
            outputs, prob = model(X)
            probs.extend(prob.cpu().data.numpy())

        # (source uri, candidate uri, probability, true label)
        combined = list(zip(source_uris, candidate_uris, probs, y_target))
        # Lazy %-formatting at DEBUG: the full evaluation set is only
        # rendered to text when debug logging is actually enabled.
        logger.debug("Combined %s", combined)

        ranks = []
        for key, sorted_candidates in GraphEmbeddingMapper._sorted_groups(combined):
            logger.debug("Key %s", key)
            logger.debug("Sorted %s", sorted_candidates)
            # Relevance vector in rank order: 1 where the candidate is the true match
            y_predicted = [row[3] for row in sorted_candidates]
            logger.debug("y predicted %s", y_predicted)
            ranks.append(y_predicted)
        logger.debug("ranks %s ", ranks)

        return GraphEmbeddingMapper.mean_reciprocal_rank(ranks)

    @staticmethod
    def _sorted_groups(combined):
        """Group rows by source URI and sort each group by probability, descending.

        Unlike ``itertools.groupby`` this does not require rows with the same
        key to be adjacent, so it is correct for shuffled loaders as well.

        Args:
            combined: Iterable of (source uri, candidate uri, probability, label).

        Returns:
            List of ``(source uri, rows sorted by probability)`` in order of
            first appearance of each source URI.
        """
        groups = defaultdict(list)
        for row in combined:
            key = row[0]
            if isinstance(key, torch.Tensor):
                # Tensors hash by identity; use the underlying Python value.
                key = key.item()
            groups[key].append(row)
        return [(key, sorted(rows, key=lambda row: row[2], reverse=True))
                for key, rows in groups.items()]

    @staticmethod
    def mean_reciprocal_rank(rs):
        """Score is reciprocal of the rank of the first relevant item

        First element is 'rank 1'. Relevance is binary (nonzero is relevant).
        A list without any relevant item contributes 0.

        Example from http://en.wikipedia.org/wiki/Mean_reciprocal_rank
        >>> rs = [[0, 0, 1], [0, 1, 0], [1, 0, 0]]
        >>> round(float(GraphEmbeddingMapper.mean_reciprocal_rank(rs)), 4)
        0.6111
        >>> rs = np.array([[0, 0, 0], [0, 1, 0], [1, 0, 0]])
        >>> round(float(GraphEmbeddingMapper.mean_reciprocal_rank(rs)), 4)
        0.5
        >>> rs = [[0, 0, 0, 1], [1, 0, 0], [1, 0, 0]]
        >>> round(float(GraphEmbeddingMapper.mean_reciprocal_rank(rs)), 4)
        0.75

        Args:
            rs: Iterator of relevance scores (list or numpy) in rank order
                (first element is the first item)

        Returns:
            Mean reciprocal rank
        """
        # Indices of the relevant (nonzero) entries of every ranking.
        rs = (np.asarray(r).nonzero()[0] for r in rs)
        return np.mean([1. / (r[0] + 1) if r.size else 0. for r in rs])
class Dataset(data.Dataset):
    """PyTorch dataset of positive and randomly sampled negative URI pairs.

    Every input pair (source uri, target uri) becomes one positive example
    (label 1) followed by ``negative_samples`` negatives (label 0) that pair
    the same source with the target of another, randomly chosen input pair.
    """

    def __init__(self, data, source_kg: "KnowledgeGraph", target_kg: "KnowledgeGraph", negative_samples=10,
                 embedding_dim=500):
        """Store the inputs and build the example list.

        Args:
            data: Sequence of (source uri, target uri) pairs.
            source_kg: Object whose ``get_embedding_by_uri`` returns the
                embedding of a source URI, or ``None`` when unknown.
            target_kg: Same, for target URIs.
            negative_samples: Number of negatives generated per positive pair.
            embedding_dim: Length of the random vector used for URIs that
                have no embedding. Defaults to the previously hard-coded 500.
        """
        self.data = data  # (source uri, target uri)
        self.source_kg = source_kg
        self.target_kg = target_kg
        # Random embeddings handed out for unknown URIs, cached so that a
        # URI keeps the same vector for the lifetime of the dataset.
        self.source_out_of_vocab = {}
        self.target_out_of_vocab = {}
        self.negative_samples = negative_samples
        self.embedding_dim = embedding_dim
        self.src_cnt = 0
        self.dst_cnt = 0
        self.X, self.y = self._get_X_y()

    def __len__(self):
        """Denotes the total number of samples"""
        return len(self.X)

    def __getitem__(self, index):
        """Return (embedding, label, source uri, target uri) for one example.

        The embedding is the source embedding concatenated with the target
        embedding, converted to a double-precision tensor.
        """
        source_uri, target_uri = self.X[index][0], self.X[index][1]
        features = np.concatenate([self.get_source_embedding(source_uri),
                                   self.get_target_embedding(target_uri)])
        return torch.from_numpy(features).double(), self.y[index], source_uri, target_uri

    def _embedding_or_random(self, kg, cache, uri):
        """Return the embedding of *uri* from *kg*, or a cached random vector."""
        embedding = kg.get_embedding_by_uri(uri)
        if embedding is not None:
            return embedding
        if uri not in cache:
            cache[uri] = np.random.rand(self.embedding_dim)
        return cache[uri]

    def get_source_embedding(self, uri):
        """Return the source-graph embedding of *uri* (random if unknown)."""
        return self._embedding_or_random(self.source_kg, self.source_out_of_vocab, uri)

    def get_target_embedding(self, uri):
        """Return the target-graph embedding of *uri* (random if unknown)."""
        return self._embedding_or_random(self.target_kg, self.target_out_of_vocab, uri)

    @staticmethod
    def _draw_other_index(current, size):
        """Return a random index in ``[0, size)`` different from *current*.

        Returns ``None`` when no other index exists (``size < 2``); retrying
        in that situation would never terminate.
        """
        if size < 2:
            return None
        idx = random.randint(0, size - 1)
        while idx == current:
            idx = random.randint(0, size - 1)
        return idx

    def _get_X_y(self):
        """Build the list of URI pairs and the matching list of labels.

        Returns:
            ``(X, y)`` where ``X`` holds (source uri, target uri) tuples and
            ``y`` holds 1 for true pairs and 0 for sampled negatives.
        """
        logger.info('Building X, y')
        X = []  # (source, target) uris
        y = []  # label, 1 => +ve , 0 => -ve
        size = len(self.data)
        # A negative needs a *different* pair to borrow a target from.
        negatives_per_sample = self.negative_samples if size > 1 else 0
        if size == 1 and self.negative_samples > 0:
            logger.warning('Only one pair available; no negative samples can be generated')

        for i, sample in enumerate(self.data):
            source_uri = sample[0]
            target_uri = sample[1]
            logger.debug('Sample %s: %s %s', i, source_uri, target_uri)
            X.append((source_uri, target_uri))
            y.append(1)
            for negative_sample in range(negatives_per_sample):
                logger.debug('Generating -info sample %s', negative_sample)
                # get a random sample idx that is not equivalent to current sample
                idx = Dataset._draw_other_index(i, size)
                logger.debug('-ve index %s', idx)
                # add negative sample
                X.append((source_uri, self.data[idx][1]))
                y.append(0)
        logger.debug(X)
        logger.debug(y)
        return X, y
class NeuralNet(nn.Module):
    """Fully connected network with one hidden layer (linear -> ReLU -> linear).

    ``forward`` returns both the raw class scores and the softmax
    probability of class index 1.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """Return ``(scores, probability of class 1)`` for a batch *x*.

        The input is cast to single precision before the first layer.
        """
        hidden = self.relu(self.fc1(x.float()))
        scores = self.fc2(hidden)
        positive_prob = self.softmax(scores)[:, 1]
        return scores, positive_prob
class LogisticRegression(nn.Module):
    """Logistic-regression model: a single linear layer followed by softmax.

    ``forward`` returns both the raw class scores and the softmax
    probability of class index 1.
    """

    def __init__(self, input_size, num_classes):
        super().__init__()
        self.linear = nn.Linear(input_size, num_classes)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """Return ``(scores, probability of class 1)`` for a batch *x*.

        The input is cast to single precision before the linear layer.
        """
        scores = self.linear(x.float())
        positive_prob = self.softmax(scores)[:, 1]
        return scores, positive_prob
Revise this Paste