from collections import defaultdict
import json

import numpy as np
import pandas as pd
from gensim import models
from nltk import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity

# Word2vec embeddings used to score query/relation similarity.
# NOTE: update this string with the path to the trained model file.
word2vec_model = models.Word2Vec.load('../data/Graphs/word2vec_train_dev.dat')


def get_rel_score_word2vecbase(rel: str, query: str) -> float:
    """
    Get score for query and relation. Used to inform exploration of knowledge graph.

    The score is the mean cosine similarity between the relation embedding and
    the embeddings of every query token present in the word2vec vocabulary.

    :param rel: relation, or edge in knowledge graph
    :param query: query, question to answer
    :return: float score similarity between question and relation (0.0 when the
        relation or all query tokens are out of vocabulary)
    """
    # Vocabulary stores relations with an 'ns:' prefix, so normalize BEFORE
    # the membership test. (The original checked the bare name but looked up
    # the prefixed one, which could KeyError or wrongly return 0.0.)
    # NOTE(review): assumes vocab keys are 'ns:'-prefixed — confirm with the
    # training pipeline.
    rel = rel if rel.startswith('ns:') else 'ns:' + rel

    # Relation not in embedding vocabulary.
    if rel not in word2vec_model.wv:
        return 0.0

    words = word_tokenize(query.lower())
    w_embs = [word2vec_model.wv[w] for w in words if w in word2vec_model.wv]

    # No query token in vocabulary: nothing to compare against.
    if not w_embs:
        return 0.0

    return float(np.mean(cosine_similarity(w_embs, [word2vec_model.wv[rel]])))


def load_node_label_lookup(filepath: str) -> dict:
    """
    Load the lookup dictionary for nodes from the provided json file.

    Args:
        filepath: Path to the json file containing the lookup dictionary.

    Returns:
        Dictionary of node ids to text description of node.
    """
    with open(filepath, 'rb') as fp:
        return json.load(fp)


def load_query_df(filepath: str) -> pd.DataFrame:
    """
    Load a simplified dataframe of queries.

    Generated from the original queries nested dictionary, this simplified
    version contains all necessary information for performing the graph
    traversal testing without all the extra information and difficult
    formatting. Simply loop through this dataframe row by row, start at the
    start node with the query for that row, and the expected answers are given
    in that same row.

    Args:
        filepath: Path to the provided parquet file

    Returns:
        Dataframe of queries to perform on the graph.
    """
    return pd.read_parquet(filepath)


def load_graph() -> dict:
    """
    Load the graph from the given file.

    Returns:
        Graph, in form of node_id key, and nested list value. Nested list is
        adjacency list, with each list containing the relation, and
        destination node_id.
    """
    graph = defaultdict(list)
    # NOTE(review): eval on file contents is unsafe for untrusted data;
    # if each line is a plain Python literal, prefer ast.literal_eval.
    with open('../data/Graphs/graph.txt') as fp:
        for line in fp:
            # eval ignores trailing whitespace, so no need to slice off the
            # newline (line[:-1] would corrupt a final line without one).
            src, rel, dst = eval(line)
            graph[src].append([rel, dst])
    return graph


def load_queries() -> list:
    """
    Load the original queries file.

    This format can be extremely confusing; for a simplified format use
    load_query_df.

    Returns:
        Nested list, with index, node_id, relation types for answers, text
        description of start node, and dict of answers.
    """
    # NOTE(review): eval on file contents is unsafe for untrusted data;
    # if each line is a plain Python literal, prefer ast.literal_eval.
    with open('../data/Graphs/annotations.txt') as fp:
        return [eval(line) for line in fp]