# -*- coding: utf-8 -*-
"""
Preprocess the data by combining data from the different study components,
which are spread over multiple data files.

@author: Nele Albers

Required files: 
    - Dataset_Acceptance_Virtual_Coach/Post_Questionnaire_Acceptance_Anonym_Preprocessed.xlsx
    - Dataset_Acceptance_Virtual_Coach/pre_questionnaire_anonym.csv
    - Dataset_Acceptance_Virtual_Coach/prescreening_prolific_numeric_anonym.csv
    
Output files:
    - Participant_Characteristics/preprocessed_data.csv
    - Mean_Ratings/preprocessed_data.csv
    - Correlations/preprocessed_data.csv
"""

import numpy as np
import os
import pandas as pd


def cronbach_alpha(df):
    """
    Calculate the standardized Cronbach's alpha for a set of items.

    Alpha is derived from the mean inter-item correlation ``r`` and the
    number of items ``N`` as ``(N * r) / (1 + (N - 1) * r)``.
    Adapted from https://towardsdatascience.com/cronbachs-alpha-theory-and-application-in-python-d2915dd63586.

    Args:
        df: DataFrame with one column per item and one row per observation.

    Returns:
        Cronbach's alpha as a float. NaN is returned when there are fewer
        than two items (no inter-item correlation exists to average) or
        when any of the pairwise correlations is NaN.
    """

    # 1. Transform the df into a correlation matrix
    corr_matrix = df.corr().to_numpy()

    # 2.1 Calculate N
    # The number of variables equals the number of columns in the df
    n_items = df.shape[1]

    # Without at least two items there is nothing to average; return NaN
    # directly instead of letting np.mean warn about an empty slice.
    if corr_matrix.shape[0] < 2:
        return np.nan

    # 2.2 Calculate R
    # Every pair of items appears exactly once strictly below the diagonal
    # of the (symmetric) correlation matrix, so average those entries.
    pair_rows, pair_cols = np.tril_indices(corr_matrix.shape[0], k=-1)
    mean_r = np.mean(corr_matrix[pair_rows, pair_cols])

    # 3. Use the formula to calculate Cronbach's Alpha
    alpha = (n_items * mean_r) / (1 + (n_items - 1) * mean_r)

    return alpha


def get_char_pre_questionnaire(df):
    """
    Add participant characteristics from the pre-questionnaire to df.

    Reads the pre-questionnaire csv-file, computes the quitter self-identity
    index as the mean of its three items, prints Cronbach's alpha of those
    items (computed only over participants whose rand_id occurs in df), and
    merges "Quit_Before_24h" and "Quitter_Self_Identity" into df on "rand_id".

    Args:
        df: DataFrame with a "rand_id" column.

    Returns:
        The merged DataFrame (pandas' default inner join on "rand_id").
    """

    identity_items = ["Quitting_Self_Identity_1", "Quitting_Self_Identity_2",
                      "Quitting_Self_Identity_3"]

    pre_questionnaire = pd.read_csv("Dataset_Acceptance_Virtual_Coach/pre_questionnaire_anonym.csv")

    # Index for quitter self-identity: average of the three item scores
    item_total = (pre_questionnaire["Quitting_Self_Identity_1"]
                  + pre_questionnaire["Quitting_Self_Identity_2"]
                  + pre_questionnaire["Quitting_Self_Identity_3"])
    pre_questionnaire["Quitter_Self_Identity"] = 1/3 * item_total

    # Cronbach's alpha, restricted to participants that also appear in the
    # post-questionnaire data passed in as df
    in_post_questionnaire = pre_questionnaire["rand_id"].isin(df["rand_id"])
    item_scores = pre_questionnaire.loc[in_post_questionnaire, identity_items]
    print("Cronbach's alpha quitter self-identity:",
          round(cronbach_alpha(item_scores), 2))

    # Only these columns are carried over into the combined data
    columns_to_keep = ["rand_id", "Quit_Before_24h", "Quitter_Self_Identity"]

    return df.merge(pre_questionnaire[columns_to_keep], on="rand_id")


def get_char_prolific(df):
    """
    Add participant characteristics from the Prolific profiles to df.

    Reads the Prolific prescreening csv-file and merges its "age_bin" and
    "Gender identity" columns into df on "rand_id".

    Args:
        df: DataFrame with a "rand_id" column.

    Returns:
        The merged DataFrame (pandas' default inner join on "rand_id").
    """

    prolific_profiles = pd.read_csv("Dataset_Acceptance_Virtual_Coach/prescreening_prolific_numeric_anonym.csv")

    # Only these columns are carried over into the combined data
    columns_to_keep = ["age_bin", "Gender identity", "rand_id"]

    return df.merge(prolific_profiles[columns_to_keep], on="rand_id")


if __name__ == "__main__":

    # The post-questionnaire file is located relative to this script
    curr_directory = os.path.abspath(os.path.dirname(__file__))
    post_questionnaire_path = curr_directory + "/Dataset_Acceptance_Virtual_Coach/Post_Questionnaire_Acceptance_Anonym_Preprocessed.xlsx"

    # The post-questionnaire data is the starting point
    participants = pd.read_excel(post_questionnaire_path)

    # Add user variables from the pre-questionnaire
    # (e.g., quitter self-identity)
    print("... Extract data on participants from pre-questionnaire")
    participants = get_char_pre_questionnaire(participants)

    # Add user variables from Prolific (e.g., age)
    print("... Extract data on participants from Prolific profiles")
    participants = get_char_prolific(participants)

    # The same combined data is written once per analysis folder
    output_paths = (
        "Participant_Characteristics/preprocessed_data.csv",
        "Mean_Ratings/preprocessed_data.csv",
        "Correlations/preprocessed_data.csv",
    )
    for output_path in output_paths:
        participants.to_csv(output_path, index=False)
