from collections import defaultdict
import json

import numpy as np
import pandas as pd
from gensim import models
from nltk import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity

# Word2vec embeddings used to score query/relation similarity.
# NOTE: update this string with the path to the trained model file.
word2vec_model = models.Word2Vec.load('../data/Graphs/word2vec_train_dev.dat')


def get_rel_score_word2vecbase(rel: str, query: str) -> float:
    """
    Get score for query and relation. Used to inform exploration of knowledge graph.

    The score is the mean cosine similarity between the relation embedding and
    the embeddings of every query token present in the word2vec vocabulary.

    :param rel: relation, or edge in knowledge graph
    :param query: query, question to answer
    :return: float score similarity between question and relation (0.0 when the
        relation or all query tokens are out of vocabulary)
    """
    # Vocabulary stores relations with an 'ns:' prefix, so normalize BEFORE
    # the membership test. (The original checked the bare name but looked up
    # the prefixed one, which could KeyError or wrongly return 0.0.)
    # NOTE(review): assumes vocab keys are 'ns:'-prefixed — confirm with the
    # training pipeline.
    rel = rel if rel.startswith('ns:') else 'ns:' + rel

    # Relation not in embedding vocabulary.
    if rel not in word2vec_model.wv:
        return 0.0

    words = word_tokenize(query.lower())
    w_embs = [word2vec_model.wv[w] for w in words if w in word2vec_model.wv]

    # No query token in vocabulary: nothing to compare against.
    if not w_embs:
        return 0.0

    return float(np.mean(cosine_similarity(w_embs, [word2vec_model.wv[rel]])))


def load_node_label_lookup(filepath: str) -> dict:
    """
    Load the lookup dictionary for nodes from the provided json file.

    Args:
        filepath: Path to the json file containing the lookup dictionary.

    Returns:
        Dictionary of node ids to text description of node.
    """
    with open(filepath, 'rb') as fp:
        return json.load(fp)


def load_query_df(filepath: str) -> pd.DataFrame:
    """
    Load a simplified dataframe of queries.

    Generated from the original queries nested dictionary, this simplified
    version contains all necessary information for performing the graph
    traversal testing without all the extra information and difficult
    formatting. Simply loop through this dataframe row by row, start at the
    start node with the query for that row, and the expected answers are given
    in that same row.

    Args:
        filepath: Path to the provided parquet file

    Returns:
        Dataframe of queries to perform on the graph.
    """
    return pd.read_parquet(filepath)


def load_graph() -> dict:
    """
    Load the graph from the given file.

    Returns:
        Graph, in form of node_id key, and nested list value. Nested list is
        adjacency list, with each list containing the relation, and
        destination node_id.
    """
    graph = defaultdict(list)
    # NOTE(review): eval on file contents is unsafe for untrusted data;
    # if each line is a plain Python literal, prefer ast.literal_eval.
    with open('../data/Graphs/graph.txt') as fp:
        for line in fp:
            # eval ignores trailing whitespace, so no need to slice off the
            # newline (line[:-1] would corrupt a final line without one).
            src, rel, dst = eval(line)
            graph[src].append([rel, dst])
    return graph


def load_queries() -> list:
    """
    Load the original queries file.

    This format can be extremely confusing; for a simplified format use
    load_query_df.

    Returns:
        Nested list, with index, node_id, relation types for answers, text
        description of start node, and dict of answers.
    """
    # NOTE(review): eval on file contents is unsafe for untrusted data;
    # if each line is a plain Python literal, prefer ast.literal_eval.
    with open('../data/Graphs/annotations.txt') as fp:
        return [eval(line) for line in fp]