import numpy as np
import pandas as pd
import os
import sys
from util import UserPerformance
from util import load_answers, load_user_info, find_valid_users, read_decisions, get_condition_users
from util import calc_ATI_scale, calc_familiarity, calc_propensity_to_trust, read_TiA_scales
from util import find_valid_users, load_answers, read_decisions, get_assessment, calc_user_reliance_measures
from scipy.stats import wilcoxon, kruskal, spearmanr, mannwhitneyu
from pingouin import ancova

# Task identifiers of the two batches: "p1-0" to "p1-9" and "p2-0" to "p2-9".
p1_tasks = [f"p1-{index}" for index in range(10)]
p2_tasks = [f"p2-{index}" for index in range(10)]


def test_wilcoxon(list_1, list_2):
	"""Run a Wilcoxon signed-rank test on two paired samples and return the p-value.

	The mean of each sample and the raw scipy result are printed before returning.

	Args:
		list_1: First sequence of paired observations.
		list_2: Second sequence of paired observations; must be as long as ``list_1``.

	Returns:
		The p-value reported by ``scipy.stats.wilcoxon``.

	Raises:
		ValueError: If the two samples differ in length.
	"""
	# Explicit check instead of ``assert``: assertions are stripped under ``python -O``.
	if len(list_1) != len(list_2):
		raise ValueError(
			f"paired samples must have equal length, got {len(list_1)} and {len(list_2)}"
		)
	res = wilcoxon(x=list_1, y=list_2)
	print("Considering all participants with tutorial, the results are:")
	print("Mean: %.3f\t%.3f"%(np.mean(list_1), np.mean(list_2)))
	print(res)
	# The result unpacks as (statistic, pvalue); only the p-value is returned.
	_, pvalue = res
	return pvalue


# Despite the ``test_`` prefix this is a statistics helper, not a unit test;
# keep pytest from collecting it when the module is imported into a test file.
test_wilcoxon.__test__ = False

def compare_trust(user_set, user_trust_first, user_trust_second, scale):
	"""Print statistical comparisons of one trust scale between two measurements.

	For every user in ``user_set`` the value stored under ``scale`` is read from
	both trust mappings. The two resulting lists are compared with a Wilcoxon
	signed-rank test (via ``test_wilcoxon``) and with two Mann-Whitney U tests
	(two-sided and ``alternative='less'``). All results are printed; nothing is
	returned.

	Args:
		user_set: Iterable of user identifiers to include.
		user_trust_first: Mapping ``user -> {scale: value}`` for the first measurement.
		user_trust_second: Mapping ``user -> {scale: value}`` for the second measurement.
		scale: Key of the scale to compare; also printed as ``TiA-<scale>``.
	"""
	# Read both measurements in a single pass so the two lists stay aligned per
	# user even if ``user_set`` is a one-shot iterator.
	paired_scores = [
		(user_trust_first[user][scale], user_trust_second[user][scale])
		for user in user_set
	]
	trust_list_1 = [first for first, _ in paired_scores]
	trust_list_2 = [second for _, second in paired_scores]

	print(f"{len(trust_list_1)} participants considered")
	print("for scale TiA-{}".format(scale))
	# The helper prints the Wilcoxon result itself; its return value is not needed here.
	test_wilcoxon(trust_list_1, trust_list_2)

	# NOTE(review): Mann-Whitney U assumes independent samples, while these two
	# lists are paired per user -- confirm this test is intended.
	statistic, pvalue = mannwhitneyu(trust_list_1, trust_list_2, alternative='two-sided')
	print("Alternative before debugging <> after debugging,", "pvalue %.4f"%pvalue, "statistic %.4f"%statistic)
	statistic, pvalue = mannwhitneyu(trust_list_1, trust_list_2, alternative='less')
	print("Alternative before debugging < after debugging,", "pvalue %.4f"%pvalue, "statistic %.4f"%statistic)
	print("-" * 17)

def compare_performance_across_conditions(users, user_condition_dict, user_batch_order, usertask_dict, user_flip_dict, answer_dict):
	"""Collect each user's accuracy and reliance measures for both task batches.

	Args:
		users: Iterable of user identifiers to process.
		user_condition_dict: Mapping from user to condition. Not read by this
			function; the parameter is kept so existing callers keep working.
		user_batch_order: Mapping from user to a sequence whose first element is
			1 (the ``p1`` tasks were worked on first) or 2 (the ``p2`` tasks first).
		usertask_dict: Passed unchanged to ``calc_user_reliance_measures``.
		user_flip_dict: Passed unchanged to ``calc_user_reliance_measures``.
		answer_dict: Passed unchanged to ``calc_user_reliance_measures``.

	Returns:
		A dict mapping every user to a ``UserPerformance`` that received one
		``add_performance`` call for group ``"first_batch"`` and one for
		``"second_batch"``.

	Raises:
		ValueError: If a user's first batch indicator is neither 1 nor 2.
	"""
	user2performance = {}
	for user in users:
		first_position = user_batch_order[user][0]
		if first_position == 1:
			ordered_batches = (("first_batch", p1_tasks), ("second_batch", p2_tasks))
		elif first_position == 2:
			ordered_batches = (("first_batch", p2_tasks), ("second_batch", p1_tasks))
		else:
			# Explicit error instead of ``assert``: under ``python -O`` an invalid
			# order would otherwise be silently treated as "p2 first".
			raise ValueError(
				f"unexpected batch order {first_position!r} for user {user!r}; expected 1 or 2"
			)

		tp_performance = UserPerformance(username=user)
		for group, batch_tasks in ordered_batches:
			measures = calc_user_reliance_measures(user, usertask_dict, user_flip_dict, answer_dict, batch_tasks)
			# The fourth returned value (initial disagreement) is not stored.
			tp_correct, tp_agreement_fraction, tp_switching_fraction, _initial_disagreement, \
				relative_positive_ai_reliance, relative_positive_self_reliance = measures
			# Divide by the batch size; this is 10 for the current task lists,
			# matching the previously hard-coded 10.0.
			tp_accuracy = tp_correct / len(batch_tasks)
			tp_performance.add_performance(
				accuracy=tp_accuracy,
				agreement_fraction=tp_agreement_fraction,
				switching_fraction=tp_switching_fraction,
				relative_positive_ai_reliance=relative_positive_ai_reliance,
				relative_positive_self_reliance=relative_positive_self_reliance,
				group=group,
			)

		user2performance[user] = tp_performance
	return user2performance

	# def compare_performance(condition_performance_dict):
	# 	keys = ["accuracy", "agreement_fraction", "switching_fraction", "relative_positive_ai_reliance", "relative_positive_self_reliance"]
	# 	for var_name in keys:
	# 		performance_dict = {
	# 			0: [], # "Debugging-R"
	# 			1: [], # "Debugging-D"
	# 			2: [], # "Debugging-I"
	# 			3: []  # "Control"
	# 		}
	# 		for condition in range(4):
	# 			for tp_performance in condition_performance_dict[condition]:
	# 				performance_improvement = tp_performance.performance["second_batch"][var_name] - tp_performance.performance["first_batch"][var_name]
	# 				performance_dict[condition].append(performance_improvement)
			
	# 		# statistic, pvalue = kruskal(performance_list_1, performance_list_2)
	# 		print(var_name)
	# 		for condition, condition_name in enumerate(["Debugging-R", "Debugging-D", "Debugging-I", "Control"]):
	# 			performance_list = performance_dict[condition]
	# 			print("Mean: M(first):{:.2f}, SD(first):{:.2f}".format(np.mean(performance_list), np.std(performance_list)))

	# 		res = kruskal(performance_dict[0], performance_dict[1], performance_dict[2], performance_dict[3])
	# 		print("Kruskal results:", res)
	# 		print("-" * 17)

	# print("Compare participants's performance improvement with different intervention (second - first):")
	# compare_performance(condition_performance_dict)
	# print("-" * 34)

# TiA subscale names; they index the per-user scale dicts and are used to
# build the "TiA-<subscale>" column names.
TiA_subscales = [
	"Reliability/Competence",
	"Understanding/Predictability",
	"Intention of Developers",
	"Trust in Automation",
]

if __name__ == "__main__":
	# Merge the answers of both batches under batch-prefixed task ids
	# ("p1-<id>" / "p2-<id>").
	answers_p1 = load_answers("reviews_p1.json")
	answers_p2 = load_answers("reviews_p2.json")
	answer_dict = {"p1-{}".format(task_id): answers_p1[task_id] for task_id in answers_p1}
	answer_dict.update({"p2-{}".format(task_id): answers_p2[task_id] for task_id in answers_p2})

	# Load participant information, questionnaire scales and decisions,
	# restricted to the valid users.
	valid_users, _approved_users = find_valid_users()
	user_condition_dict, user_batch_order = load_user_info(reserved_users=valid_users)

	user_TiA_scale_first, user_TiA_scale_second = read_TiA_scales(reserved_users=valid_users)
	user_ATI_scale = calc_ATI_scale(valid_users)
	user_familiarity_dict = calc_familiarity(valid_users)
	user_ptt_scale = calc_propensity_to_trust(valid_users)
	user_task_dict, _user_confidence_dict, user_flip_dict = read_decisions(user_batch_order, reserved_users=valid_users)
	user2performance = compare_performance_across_conditions(
		valid_users, user_condition_dict, user_batch_order, user_task_dict, user_flip_dict, answer_dict
	)

	# Valid users per condition; condition index i corresponds to the i-th name.
	# (Currently only needed for a per-subscale compare_trust analysis that is
	# not run by default.)
	condition_users = get_condition_users(user_condition_dict)
	condition_names = ["Debugging-R", "Debugging-D", "Debugging-I", "Control"]
	users_dict = {
		name: valid_users & condition_users[index]
		for index, name in enumerate(condition_names)
	}

	reliance_metrics = [
		"accuracy",
		"agreement_fraction",
		"switching_fraction",
		"relative_positive_ai_reliance",
		"relative_positive_self_reliance",
	]
	trust_columns = ["TiA-{}".format(subscale) for subscale in TiA_subscales]

	# One list per column; key order is identifiers, covariates, reliance
	# metrics, then the trust subscales.
	leading_columns = ["user_id", "condition", "ATI", "TiA-Propensity", "TiA-Familiarity"]
	user_data = {column: [] for column in leading_columns + reliance_metrics + trust_columns}

	# Consider all users for the analysis of covariates on trust and reliance.
	for user in user_TiA_scale_first:
		user_data["user_id"].append(user)
		user_data["condition"].append(user_condition_dict[user])
		user_data["ATI"].append(user_ATI_scale[user])
		user_data["TiA-Propensity"].append(user_ptt_scale[user])
		user_data["TiA-Familiarity"].append(user_familiarity_dict[user])

		# Trust per subscale is the mean of the two measurements (not the change
		# from first to second).
		second_scores = user_TiA_scale_second[user]
		first_scores = user_TiA_scale_first[user]
		for subscale, column in zip(TiA_subscales, trust_columns):
			user_data[column].append((second_scores[subscale] + first_scores[subscale]) / 2)

		# Reliance metrics are likewise averaged over the two batches.
		per_batch = user2performance[user].performance
		for metric in reliance_metrics:
			user_data[metric].append((per_batch["second_batch"][metric] + per_batch["first_batch"][metric]) / 2)

	# The frame is only used to report its shape here; a per-subscale ANCOVA
	# (pingouin `ancova`, covariates ATI / TiA-Propensity / TiA-Familiarity,
	# between-factor `condition`) can be run on it if needed.
	df = pd.DataFrame(user_data)
	print(df.shape)

	# Spearman correlation of every covariate with all trust and reliance
	# dependent variables.
	# 0.05 / 3 -- presumably a Bonferroni correction over the three covariates.
	corrected_alpha = 0.05 / 3
	report_template = "Variable {} and variable {} have spearman correlation {:.3f} and pvalue {:.3f}"
	for covariate_name in ["TiA-Propensity", "TiA-Familiarity", "ATI"]:
		covariate_values = user_data[covariate_name]

		# Trust subscales: only correlations below the corrected threshold are printed.
		for variable_name in trust_columns:
			rho, p_value = spearmanr(covariate_values, user_data[variable_name])
			if p_value < corrected_alpha:
				print(report_template.format(covariate_name, variable_name, rho, p_value))
		print("-" * 17)

		# Reliance metrics: every correlation is printed, regardless of significance.
		for metric in reliance_metrics:
			metric_values = user_data[metric]
			rho, p_value = spearmanr(covariate_values, metric_values)
			print(covariate_name, metric)
			print(len(covariate_values), len(metric_values))
			print(report_template.format(covariate_name, metric, rho, p_value))
		print("-" * 34)




