# Imports
import functools, requests, json, scipy.sparse, re, string, gensim, pyLDAvis
import pandas as pd, geopandas as gpd, numpy as np, matplotlib.pyplot as plt
from gensim import matutils, models, corpora
from gensim.models import LdaModel
from collections import defaultdict
from geojson import FeatureCollection
from pyLDAvis import gensim_models

import spacy
from spacy import language
from spacy.lang.en import English
from spacy.lang.de import German
from spacy.tokenizer import Tokenizer

# NLTK
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer


# Class to perform the Data Collection tasks
class DataCollecter:
    """
    Static helpers that download DIPAS participation data (project areas,
    geo-located contributions, comments and replies) and store it as .csv files.
    """

    # Folder below which every process has its own sub-folder for raw downloads
    _RAW_DATA_FOLDER = "../data/00_raw/01_participation-processes/"

    # Column layouts of the tables assembled from the API responses
    _CONTRIBUTION_COLUMNS = ["nid", "name", "description", "link", "theme", "rubric", "lat", "lon"]
    _COMMENT_COLUMNS = ["Node ID", "Comment ID", "Subject", "Comment_text", "created"]
    _REPLY_COLUMNS = ["Node ID", "Comment ID", "Reply ID", "Subject", "Reply_text", "created"]

    @staticmethod
    def get_participatory_project_area(process_id, process_name, url):
        """
        A function to retrieve the project areas of a given participation process.

        Keyword arguments:
        process_id -- the ID of the participation process (currently unused)
        process_name -- the name of the participation process (only used for messages)
        url -- the URL to the participation process

        Returns:
        The geometry column of the GeoDataFrame built from the response, or an
        empty string if the project area could not be retrieved.
        """
        api_path = ""  # enter the API path here to download DIPAS project areas

        try:
            response = requests.get(url + api_path).json()

            # Convert geojson into a geopandas Dataframe
            project_area_json = FeatureCollection(response)
            project_area = gpd.GeoDataFrame.from_features(project_area_json['features'])

            print(process_name + " project area retrieved successfully")
            return project_area.geometry
        except Exception as error:
            # Best effort: report the failure (including its cause) and hand back
            # the empty-string marker instead of aborting the caller.
            print("An issue occured: " + process_name + " project area could not be retrieved")
            print("-- Reason: " + repr(error))
            return ""

    @staticmethod
    def _contributions_frame(features):
        """
        Build the contributions table (indexed by "nid") from a list of GeoJSON features.
        """
        # Collect plain dicts first and build the frame once; DataFrame.append
        # was removed in pandas 2.0 and copied the whole frame on every call.
        rows = [
            {
                "nid": feature["properties"]["nid"],
                "name": feature["properties"]["name"],
                "description": feature["properties"]["description"],
                "link": feature["properties"]["link"],
                "theme": feature["properties"]["Thema"],
                "rubric": feature["properties"]["Rubric"],
                # coordinates are read as [lon, lat]
                "lat": feature["geometry"]["coordinates"][1],
                "lon": feature["geometry"]["coordinates"][0],
            }
            for feature in features
        ]
        frame = pd.DataFrame(rows, columns=DataCollecter._CONTRIBUTION_COLUMNS)
        return frame.set_index("nid")

    @staticmethod
    def get_geolocated_contributions(process_id, process_name, process_url):
        """
        A function to retrieve the geolocations of a participatory process and save them in a .csv file.

        Keyword arguments:
        process_id -- the ID of the participation process (names the output folder)
        process_name -- the name of the participation process (only used for messages)
        process_url -- the URL to the participation process

        Returns:
        The process ID, the url and a pandas dataframe (indexed by "nid") containing
        all the geo-located contributions.

        Errors raised by the download or by writing the file are not caught here.
        """
        api_path = ""  # enter the API path here to download DIPAS participatory data

        response = requests.get(process_url + api_path).json()
        contributions = DataCollecter._contributions_frame(response["features"])

        # Save the contributions in the folder of the process
        contributions.to_csv(
            DataCollecter._RAW_DATA_FOLDER + str(process_id) + "/georeference.csv",
            index_label="contribution_id",
        )

        print(process_name + " data downloaded successfully")

        return process_id, process_url, contributions

    @staticmethod
    def _flatten_comments(node_id, node_comments):
        """
        Turn the comments of one contribution into flat comment rows and reply rows.

        Returns a tuple (comment_rows, reply_rows) of lists of dictionaries.
        """
        comment_rows = []
        reply_rows = []
        for comment in node_comments:
            comment_rows.append({
                "Node ID": node_id,
                "Comment ID": comment["cid"],
                "Subject": comment["subject"],
                "Comment_text": comment["comment"],
                "created": comment["created"],
            })
            for reply in comment["replies"]:
                reply_rows.append({
                    "Node ID": node_id,
                    "Comment ID": comment["cid"],
                    "Reply ID": reply["cid"],
                    "Subject": reply["subject"],
                    "Reply_text": reply["comment"],
                    "created": reply["created"],
                })
        return comment_rows, reply_rows

    @staticmethod
    def get_comments_and_replies(process_id, process_url, process_contributions):
        """
        Function to scrape all comments and replies from a participation process, and save them in a .csv file in the respective process folder.

        Keyword arguments:
        process_id -- The ID of the participation process (names the output folder)
        process_url -- The url of the participation process
        process_contributions -- The contributions table whose index holds the node IDs,
                                 as returned by get_geolocated_contributions()

        Output:
        Prints a success message, or a failure message with the reason if anything
        went wrong. Nothing is returned and nothing is written on failure.
        """
        api_path = ""  # enter the API path here to download DIPAS comments and replies
        process_folder = DataCollecter._RAW_DATA_FOLDER + str(process_id)

        try:
            comment_rows = []
            reply_rows = []

            # Receive and flatten the comments of every contribution
            for node_id in process_contributions.index.to_list():
                response = requests.get(process_url + api_path + str(node_id)).json()
                node_comments, node_replies = DataCollecter._flatten_comments(node_id, response["comments"])
                comment_rows.extend(node_comments)
                reply_rows.extend(node_replies)

            comments = pd.DataFrame(comment_rows, columns=DataCollecter._COMMENT_COLUMNS)
            comments = comments.set_index(["Node ID", "Comment ID"])
            replies = pd.DataFrame(reply_rows, columns=DataCollecter._REPLY_COLUMNS)
            replies = replies.set_index(["Node ID", "Comment ID", "Reply ID"])

            # NOTE(review): index_label is a single string although the index has
            # several levels -- confirm the written header is what readers expect.
            comments.to_csv(process_folder + "/comments.csv", index_label="Node ID")
            replies.to_csv(process_folder + "/replies.csv", index_label="Node ID")

            print("-- Comments and replies downloaded successfully")

        except Exception as error:
            # Best effort: a failing process must not stop the overall collection,
            # but the cause is reported so real errors are not mistaken for "no data".
            print("-- No Comments or replies found")
            print("-- Reason: " + repr(error))


# Class to perform preprocessing
class Preprocesser:
    """
    Static text preprocessing helpers: cleaning, stop word removal, lemmatization,
    part-of-speech filtering and bigram/trigram detection for German ("de") and
    English ("en") texts.
    """

    # spaCy pipeline per supported language code
    _SPACY_MODELS = {"de": "de_core_news_lg", "en": "en_core_web_sm"}

    # NLTK stop word list per supported language code
    _NLTK_STOPWORD_LANGUAGES = {"de": "german", "en": "english"}

    # Words that are in the NLTK lists but shall not be treated as stop words
    _KEPT_STOPWORDS = frozenset(["nicht", "nichts"])

    # Connector words that may appear inside German phrases
    _GERMAN_CONNECTOR_WORDS = frozenset("der die das von mit bei in ein eine eines für auf oder zu ohne".split())

    # Patterns used by remove_punctuation_and_numbers(), compiled once
    _BRACKETED = re.compile(r'\[.*?\]')
    _PUNCTUATION = re.compile('[%s]' % re.escape(string.punctuation))
    _WORDS_WITH_DIGITS = re.compile(r'\w*\d\w*')
    _TYPOGRAPHIC_MARKS = re.compile('[‘’“”„…]')

    @staticmethod
    @functools.lru_cache(maxsize=None)
    def _load_spacy_model(language):
        """
        Return the spaCy pipeline for the language; each pipeline is loaded only once.

        Raises a ValueError for languages other than "de" and "en".
        """
        try:
            model_name = Preprocesser._SPACY_MODELS[language]
        except KeyError:
            raise ValueError("Unsupported language: " + repr(language) + " (expected 'de' or 'en')") from None
        return spacy.load(model_name)

    @staticmethod
    @functools.lru_cache(maxsize=None)
    def _stopword_set(language):
        """
        Return the NLTK stop words of the language as a frozenset (empty for
        unsupported languages), without the words that shall be kept.
        """
        corpus_name = Preprocesser._NLTK_STOPWORD_LANGUAGES.get(language)
        if corpus_name is None:
            return frozenset()
        return frozenset(stopwords.words(corpus_name)) - Preprocesser._KEPT_STOPWORDS

    @staticmethod
    def _as_stopword_list(custom_stopwords):
        """
        Normalize custom stop words: None becomes an empty list, a single string
        becomes a one-element list, any other iterable is copied into a list.
        """
        if custom_stopwords is None:
            return []
        if isinstance(custom_stopwords, str):
            # A bare string is one phrase, not a sequence of characters
            return [custom_stopwords]
        return list(custom_stopwords)

    @staticmethod
    def remove_locations(text, location_list):
        """
        Function to remove locations from a given text. The locations that should be removed are passed to the function as a list.
        Every occurrence is removed by plain (case-sensitive) substring replacement.

        Returns:
        The text stripped from locations and from leading/trailing whitespace
        """
        for location in location_list:
            text = text.replace(location, '')

        return text.strip()

    @staticmethod
    def remove_punctuation_and_numbers(text):
        """
        Returns a String that is removed of punctuations and numbers.

        Removed are: text in square brackets, ASCII punctuation, every word that
        contains a digit, and typographic quotes/ellipses. Whitespace (including
        line breaks) is collapsed to single spaces. None is returned unchanged.
        """
        if text is None:
            return text

        # Collapse all whitespace first; this also removes every line break
        text = " ".join(text.split())

        # Order matters: brackets must go before the punctuation removal deletes them
        text = Preprocesser._BRACKETED.sub('', text)
        text = Preprocesser._PUNCTUATION.sub('', text)
        text = Preprocesser._WORDS_WITH_DIGITS.sub('', text)
        text = Preprocesser._TYPOGRAPHIC_MARKS.sub('', text)

        # The removals leave gaps behind, so collapse the whitespace again
        return " ".join(text.split())

    @staticmethod
    def remove_stopwords(language, text, custom_stopwords):
        """
        Function to remove the stopwords within a given text and language.

        Keyword arguments:
        language -- "de" or "en"; for any other value only custom stop words are removed
        text -- the text to clean; None is returned unchanged
        custom_stopwords -- additional words or phrases (a list, a single string or None)
                            that are removed by substring replacement

        Returns:
        The lowercased text without stop words, words separated by single spaces
        """
        if text is None:
            return text

        text = text.lower()

        # Handle custom stop words first, on the whole text, to catch phrases
        for stopword in Preprocesser._as_stopword_list(custom_stopwords):
            text = text.replace(stopword.lower(), "")

        stop_words = Preprocesser._stopword_set(language)
        return " ".join(word for word in text.split() if word not in stop_words)

    @staticmethod
    def lemmatize(language, text):
        """
        Function to lemmatize a given text with the help of the spacy library and a specified input language.

        Returns the lemmas joined by single spaces; None is returned unchanged.
        Raises a ValueError if the language is neither "de" nor "en".
        """
        if text is None:
            return text

        nlp = Preprocesser._load_spacy_model(language)
        return " ".join(token.lemma_ for token in nlp(text))

    @staticmethod
    def train_bigram_trigram_model(training_contributions, min_count, threshold, language):
        """
        Function to train a bigram and trigram model on the based of contributions.

        Keyword arguments:
        training_contributions -- the texts to train on (an object offering tolist(),
                                  each entry a whitespace separated string)
        min_count -- minimum count passed to the bigram model
        threshold -- score threshold for both models (higher threshold, fewer phrases)
        language -- "en" or "de" selects the connector words; otherwise none are used

        Returns a gensim bigram and trigram model.
        """
        if language == "en":
            connector_words = gensim.models.phrases.ENGLISH_CONNECTOR_WORDS
        elif language == "de":
            connector_words = Preprocesser._GERMAN_CONNECTOR_WORDS
        else:
            connector_words = frozenset()

        data_words = [contribution.split() for contribution in training_contributions.tolist()]
        bigram = gensim.models.Phrases(data_words, min_count=min_count, threshold=threshold, connector_words=connector_words)
        # NOTE(review): the trigram model uses gensim's default min_count and
        # connector words instead of the ones given above -- confirm this is intended.
        trigram = gensim.models.Phrases(bigram[data_words], threshold=threshold)

        bigram_mod = gensim.models.phrases.Phraser(bigram)
        trigram_mod = gensim.models.phrases.Phraser(trigram)

        return bigram_mod, trigram_mod

    @staticmethod
    def extract_nouns_and_adjectives(text, language, adverbs):
        """
        Function to extract only nouns and adjectives from the text.

        Keyword arguments:
        text -- the text to analyse
        language -- "de" or "en"; any other value yields an empty string
        adverbs -- if truthy, adverbs are extracted as well

        Returns a String with nouns and adjectives in the order they appear and separated by a whitespace; already lemmatized
        """
        if language not in Preprocesser._SPACY_MODELS:
            return ""

        wanted_tags = {"NOUN", "ADJ"}
        if adverbs:
            wanted_tags.add("ADV")

        doc = Preprocesser._load_spacy_model(language)(text)
        return ' '.join(token.lemma_ for token in doc if token.pos_ in wanted_tags)

    @staticmethod
    def make_trigrams(bigram_model, trigram_model, text):
        """
        Returns the text with the bigrams and trigrams identified by the given models
        merged into single tokens. Returns an empty String if the text is None.
        """
        if text is None:
            return ""
        return " ".join(trigram_model[bigram_model[text.split()]])

    @staticmethod
    def complete_preprocessing(language, text, **kwargs):
        """
        A function to call the various preprocessing functions in sequence on a given text and language.

        Steps: remove punctuation and numbers, lemmatize, merge bigrams/trigrams
        (only if both models are given), remove stop words.

        Optional keyword arguments (others are ignored):
        custom_stopwords -- additional stop words, as a list or a single string
        bigram_model -- the bigram model from train_bigram_trigram_model()
        trigram_model -- the trigram model from train_bigram_trigram_model()
        """
        custom_stopwords = Preprocesser._as_stopword_list(kwargs.get("custom_stopwords"))
        bi_mod = kwargs.get("bigram_model")
        tri_mod = kwargs.get("trigram_model")

        text = Preprocesser.remove_punctuation_and_numbers(text)
        text = Preprocesser.lemmatize(language, text)
        if (bi_mod is not None) and (tri_mod is not None):
            text = Preprocesser.make_trigrams(bi_mod, tri_mod, text)
        text = Preprocesser.remove_stopwords(language, text, custom_stopwords)

        return text

# Class to perform the data loading
class DataLoader:
    """Static helper that loads the intermediate data sets from disk."""

    # Folder that holds the intermediate .geojson files
    _INTERMEDIATE_FOLDER = "../data/01_intermediate/"

    @staticmethod
    def _contributions_path(sentiments, translated):
        """
        Return the path of the contributions file that matches the requested variant.
        """
        if sentiments:
            # The sentiment file takes precedence; `translated` is not consulted
            file_name = "process_contributions_comments_replies_en_sentiments.geojson"
        elif translated:
            file_name = "process_contributions_comments_replies_en.geojson"
        else:
            file_name = "process_contributions_comments_replies.geojson"
        return DataLoader._INTERMEDIATE_FOLDER + file_name

    @staticmethod
    def load_data(sentiments = True, translated = True):
        """
        Function to load the citizen's contributions, comments and replies

        Keyword arguments:
        sentiments -- if True, the file with sentiments is loaded
                      (regardless of `translated`)
        translated -- if True (and `sentiments` is False), the "_en" file is loaded;
                      if both are False, the file without suffix is loaded

        Returns:
        The participation processes (indexed by "ID") and the contributions
        (indexed by "Process ID", "Node ID" and "Comment ID").
        """
        # Read participation processes
        participation_processes = gpd.read_file(
            DataLoader._INTERMEDIATE_FOLDER + "participation_processes.geojson"
        ).set_index("ID")

        # Read contributions, comments and replies depending on the method's inputs
        contributions_gdf = gpd.read_file(
            DataLoader._contributions_path(sentiments, translated)
        ).set_index(["Process ID", "Node ID", "Comment ID"])

        # Return both data sets
        return participation_processes, contributions_gdf


# Class to perform the topic modelling tasks
class TopicModeller:
    """Static helpers to train and visualize an LDA topic model with gensim."""

    @staticmethod
    def _topic_table(probabilities, index, num_topics):
        """
        Build the per-document topic table from a (documents x topics) array.

        The result has one "Topic i" column per topic plus "Assigned Topic"
        (the number of the most probable topic, as a String) and
        "Probability Assigned Topic" (the probability of that topic).
        """
        topic_columns = ["Topic " + str(i) for i in range(num_topics)]
        table = pd.DataFrame(probabilities, index=index, columns=topic_columns)

        # Compute both summaries from the topic columns only, before any
        # non-numeric column is added to the table.
        assigned_topic = table[topic_columns].idxmax(axis=1)
        assigned_probability = table[topic_columns].max(axis=1)

        table["Assigned Topic"] = assigned_topic.apply(lambda name: name.split(" ")[1])
        table["Probability Assigned Topic"] = assigned_probability
        return table

    @staticmethod
    def get_topics(comment_column, num_topics, iterations, *, passes=20, random_state=None):
        """
        Performs topic modelling on a Pandas dataframe colums containing Strings.

        Arguments:
        comment_column -- The column containing Strings (tokens separated by single spaces)
        num_topics -- The number of topics that should be returned
        iterations -- The number of iterations that should be performed (The higher, the more the algorithm will converge)
        passes -- The number of passes over the corpus during training (keyword only)
        random_state -- Optional seed passed to the model to make results reproducible (keyword only)

        Returns:
        topic_probablities -- A dataframe that contains every contributions' probabilities for each topic as well as one colums containing the topic with the maximum probability. It has the same index as the input column (without its empty rows).
        topic_list -- A dictionary mapping "Topic i" to the 20 most relevant tokens of that topic
        topic_model -- The original gensim instance of the topic model for further visualization
        dictionary -- The dictionary used for the topic model
        corpus -- The corpus used for the topic model

        """
        # Drop empty rows
        comment_column = comment_column.dropna()

        # Separate Words
        tokenized = [contribution.split(" ") for contribution in comment_column.to_list()]

        # Remove every word that only occurs once
        frequency = defaultdict(int)
        for tokens in tokenized:
            for token in tokens:
                frequency[token] += 1
        processed_corpus = [[token for token in tokens if frequency[token] > 1] for tokens in tokenized]

        # Create Dictionary and corpus for LDA
        dictionary = corpora.Dictionary(processed_corpus)
        corpus = [dictionary.doc2bow(tokens) for tokens in processed_corpus]

        # Make a index to word dictionary.
        _ = dictionary[0]  # This is only to "load" the dictionary.
        id2word = dictionary.id2token

        # Setup the model
        topic_model = LdaModel(
            corpus=corpus,
            id2word=id2word,
            chunksize=5000,
            alpha='auto',
            eta='auto',
            iterations=iterations,
            num_topics=num_topics,
            passes=passes,
            eval_every=5000,
            random_state=random_state
        )

        # Assign topic probability to dataframe. num_terms pins the matrix to
        # one row per topic even if a trailing topic is missing for all documents.
        all_topics = topic_model.get_document_topics(corpus, minimum_probability=0.0)
        all_topics_csc = gensim.matutils.corpus2csc(all_topics, num_terms=num_topics)
        topic_probablities = TopicModeller._topic_table(
            all_topics_csc.T.toarray(), comment_column.index, num_topics
        )

        # Create the topic list
        topic_list = {
            'Topic ' + str(i): [token for token, score in topic_model.show_topic(i, topn=20)]
            for i in range(0, topic_model.num_topics)
        }

        return topic_probablities, topic_list, topic_model, dictionary, corpus

    @staticmethod
    def visualize_topics(model,corpus,dictionary):
        """
        Function to visualize a topic model with the pyLDAvis library.

        Enables the notebook mode of pyLDAvis and returns the prepared visualization data.
        """
        pyLDAvis.enable_notebook()
        return pyLDAvis.gensim_models.prepare(model, corpus, dictionary=dictionary)

