# -*- coding: utf-8 -*-
"""
Calculate the agreement between the two coders for "willingness to continue."

@author: Nele Albers, Nadyne L. Aretz

Required files:
    - First_Coder_Willingness_to_Continue.xlsx
    - Second_Coder_Willingness_to_Continue.xlsx
    
Output files:
    - calculate_agreement_willingness_to_continue_output.txt
"""

import numpy as np
import pandas as pd
from sklearn.metrics import cohen_kappa_score


def get_one_hot_encodings(df, df_max, all_codes, num_codes):
    """
    Get one-hot encoding of codes.
    
    Every response (row of df) can carry several codes, stored in the columns
    "Code_1" ... "Code_<df_max>". Each non-missing code is looked up in
    all_codes and the corresponding entry of the response's row is set to 1.
    
    Required:
        df (dataframe): Codes in text-form assigned to responses.
        df_max (int): Max. number of codes a response has.
        all_codes (list): All possible codes from coding scheme.
        num_codes (int): Number of codes in coding scheme.
        
    Returns:
        np-array (num_responses x num_codes): One-hot encoded codes per response.
        
    Raises:
        KeyError: If one of the columns "Code_1" ... "Code_<df_max>" is missing.
        ValueError: If a response has a code that is not part of all_codes.
    """

    # Build the code -> position lookup once instead of scanning the list for
    # every cell. setdefault keeps the first position of a duplicated code,
    # which is what list.index would return.
    code_to_index = {}
    for position, known_code in enumerate(all_codes):
        code_to_index.setdefault(known_code, position)

    codes = np.zeros((len(df), num_codes))
    for c in range(1, df_max + 1):  # For each code column
        column = "Code_" + str(c)

        for p, code in enumerate(df[column]):  # For each response
            if pd.isna(code):
                continue

            # First coder used "motivational" instead of "motivating".
            # The replacement keeps its trailing space (presumably the exact
            # spelling used in the coding scheme -- confirm against the sheet).
            # Only strings can contain the phrase; other cell types are looked
            # up unchanged.
            if isinstance(code, str) and "Not motivational enough" in code:
                code = "Not motivating enough "

            try:
                index = code_to_index[code]
            except KeyError:
                raise ValueError(
                    "Code " + repr(code) + " in column '" + column
                    + "' of response " + str(p)
                    + " is not part of the coding scheme"
                ) from None
            codes[p][index] = 1

    return codes


if __name__ == "__main__":

    # Codes assigned by the two coders. The *_max values give the number of
    # "Code_<i>" columns that are read per response for the respective coder.
    df_1 = pd.read_excel("First_Coder_Willingness_to_Continue.xlsx",
                         sheet_name = "Coding")
    df_1_max = 10

    df_2 = pd.read_excel("Second_Coder_Willingness_to_Continue.xlsx",
                         sheet_name = "Coding")
    df_2_max = 7

    # The coding scheme (all possible codes) is stored in the second coder's
    # file
    scheme = pd.read_excel("Second_Coder_Willingness_to_Continue.xlsx",
                           sheet_name = "Coding scheme drop down")
    all_codes = scheme["Coding scheme drop down"].to_list()
    num_codes = len(all_codes)

    # The first 20 responses served as training data and are therefore
    # excluded from the Cohen's Kappa computation
    df_1 = df_1.iloc[20:].reset_index()
    df_2 = df_2.iloc[20:].reset_index()

    # One-hot encoded codes per response for the first and the second coder
    codes_1 = get_one_hot_encodings(df_1, df_1_max, all_codes, num_codes)
    codes_2 = get_one_hot_encodings(df_2, df_2_max, all_codes, num_codes)

    # Cohen's Kappa per code (i.e., per column of the one-hot encodings);
    # a nan-value counts as 0
    results = []
    for first_coder, second_coder in zip(codes_1.T, codes_2.T):
        cohen = cohen_kappa_score(first_coder, second_coder)
        results.append(0 if np.isnan(cohen) else cohen)

    # Average across all codes, rounded to two decimals for reporting
    avg_cohen = np.mean(results)
    avg_cohen_rounded = round(avg_cohen, 2)
    print("Average Cohen's Kappa:", avg_cohen_rounded)

    # Write average to file
    with open("calculate_agreement_willingness_to_continue_output.txt", "w") as f:
        f.write("Average Cohen's Kappa: " + str(avg_cohen_rounded))
