import numpy as np
import pandas as pd
import os
import sys
import json

# Directory containing the study CSV files read by the loaders below.
# The path is relative, so it is resolved against the current working directory.
data_folder = "../anonymous_data"

def load_debugging_performance(reserved_users=None):
	"""Load every user's debugging performance score.

	Reads ``user_debugging_performance.csv`` from ``data_folder``.

	Args:
		reserved_users: optional collection of user ids to keep; rows that
			belong to any other user are discarded. ``None`` keeps all rows.

	Returns:
		dict mapping user_id -> performance (as a float).
	"""
	filename = os.path.join(data_folder, "user_debugging_performance.csv")
	usecols = ["user_id", "performance"]
	# read_csv ignores the order of ``usecols`` and keeps the file's own
	# column order, so re-select the columns to make the positional
	# unpacking below safe.
	df = pd.read_csv(filename, usecols=usecols)[usecols]
	if reserved_users is not None:
		# keep only the requested users
		df = df[df["user_id"].isin(reserved_users)]
	return {user_id: float(performance) for user_id, performance in df.values.tolist()}

def load_user_info(reserved_users=None):
	"""Load the experimental condition and batch order of every user.

	Reads ``user_input_user_data.csv`` from ``data_folder``.

	Args:
		reserved_users: optional collection of user ids to keep; rows that
			belong to any other user are discarded. ``None`` keeps all rows.

	Returns:
		A pair ``(user_condition_dict, user_batch_order)``:
		``user_condition_dict`` maps user_id -> experiment_group
		(0 - tutorial random, 1 - tutorialDec, 2 - tutorialInc, 3 - control);
		``user_batch_order`` maps user_id -> ``(first_batch, second_batch)``
		as a tuple of ints.
	"""
	filename = os.path.join(data_folder, "user_input_user_data.csv")
	usecols = ["user_id", "first_batch", "second_batch", "experiment_group"]
	# read_csv keeps the file's column order regardless of ``usecols``;
	# re-select so the 4-way unpacking below always matches.
	df = pd.read_csv(filename, usecols=usecols)[usecols]
	if reserved_users is not None:
		# keep only the requested users
		df = df[df["user_id"].isin(reserved_users)]
	user_condition_dict = {}
	user_batch_order = {}
	for user_id, first_batch, second_batch, experiment_group in df.values.tolist():
		user_condition_dict[user_id] = experiment_group
		user_batch_order[user_id] = (int(first_batch), int(second_batch))
	return user_condition_dict, user_batch_order

def calc_mean(value_list):
	"""Return the arithmetic mean of ``value_list`` as computed by ``np.mean``."""
	average = np.mean(value_list)
	return average

def reverse_code(value, max_scale):
	"""Mirror ``value`` on a 1..``max_scale`` scale (1 <-> max_scale)."""
	mirror_base = max_scale + 1
	return mirror_base - value

# Questionnaire answers are expected to lie on the scale [1, max_scale].
class questionnaire(object):
	"""A questionnaire whose score is the mean of its (re-coded) answers.

	Args:
		values: the raw answers, one per item; each is converted with ``float``.
		reverse_code_index: 1-based item numbers whose answers are reverse coded.
		max_scale: highest value of the answer scale, used for reverse coding.
		questionnaire_size: expected number of answers; an assertion fails
			when ``len(values)`` differs.
		value_add_one: when true, 1 is added to every answer before any
			reverse coding.
	"""

	def __init__(self, values, reverse_code_index, max_scale=6, questionnaire_size=0, value_add_one=False):
		self.questionnaire_size = questionnaire_size
		assert len(values) == self.questionnaire_size
		coded_answers = []
		for item_number, raw_answer in enumerate(values, start=1):
			answer = float(raw_answer)
			if value_add_one:
				answer = answer + 1
			if item_number in reverse_code_index:
				answer = reverse_code(answer, max_scale)
			coded_answers.append(answer)
		self.values = coded_answers
		self.max_scale = max_scale

	def calc_value(self):
		"""Return the mean of the re-coded answers."""
		return calc_mean(self.values)

class UserPerformance(object):
	"""Container for one user's per-batch performance and miscalibration."""

	def __init__(self, username):
		self.username = username
		# metric dictionaries, filled in by add_performance
		self.performance = {"overall": {}, "first_batch": {}, "second_batch": {}}
		# per-batch miscalibration, filled in by add_miscalibration
		self.miscalibration = {"first_batch": {}, "second_batch": {}}
		# metric names in the order print_information reports them
		self.keys = [
			"accuracy",
			"agreement_fraction",
			"switching_fraction",
			"relative_positive_ai_reliance",
			"relative_positive_self_reliance",
		]

	def add_performance(self, accuracy, agreement_fraction, switching_fraction, relative_positive_ai_reliance, relative_positive_self_reliance, group="first_batch"):
		"""Store the five reliance metrics for ``group``, replacing any previous entry."""
		metrics = {}
		metrics["accuracy"] = accuracy
		metrics["agreement_fraction"] = agreement_fraction
		metrics["switching_fraction"] = switching_fraction
		metrics["relative_positive_ai_reliance"] = relative_positive_ai_reliance
		metrics["relative_positive_self_reliance"] = relative_positive_self_reliance
		self.performance[group] = metrics

	def add_miscalibration(self, self_assessment, actual_correct_number, group="first_batch"):
		"""Store the gap between self-assessed and actual correct answers for ``group``."""
		# positive gap = the user claimed more correct answers than they had
		gap = self_assessment - actual_correct_number
		self.miscalibration[group] = gap

	def print_information(self):
		"""Print the stored metrics and miscalibration of both task batches."""
		separator = "-" * 17
		print(separator)
		print(f"User {self.username}")
		for group in ("first_batch", "second_batch"):
			print(group)
			group_metrics = self.performance[group]
			for metric in self.keys:
				print(metric, f"{group_metrics[metric]}")
			print("miscalibration", self.miscalibration[group])
		print(separator)

def get_assessment(reserved_users=None):
	"""Load the post-task assessments given after each task batch.

	Reads ``user_input_post_task_questionnaire_1.csv`` and
	``user_input_post_task_questionnaire_2.csv`` from ``data_folder``. In both
	files ``question_1`` is stored as the AI assessment and ``question_2`` as
	the self assessment, each converted to ``int``.

	Args:
		reserved_users: optional collection of user ids to keep; rows that
			belong to any other user are discarded. ``None`` keeps all rows.

	Returns:
		``(ai_assessment_first, ai_assessment_second, self_assessment_first,
		self_assessment_second)``, each a dict keyed by user_id.
	"""
	usecols = ["user_id", "question_1", "question_2"]

	def _load(csv_name):
		# Return (ai_assessment, self_assessment) for one questionnaire file.
		# read_csv keeps the file's column order regardless of ``usecols``;
		# re-select so the positional unpacking below always matches.
		df = pd.read_csv(os.path.join(data_folder, csv_name), usecols=usecols)[usecols]
		if reserved_users is not None:
			# keep only the requested users
			df = df[df["user_id"].isin(reserved_users)]
		ai_assessment = {}
		self_assessment = {}
		for user_id, question_1, question_2 in df.values.tolist():
			ai_assessment[user_id] = int(question_1)
			self_assessment[user_id] = int(question_2)
		return ai_assessment, self_assessment

	ai_assessment_first, self_assessment_first = _load("user_input_post_task_questionnaire_1.csv")
	ai_assessment_second, self_assessment_second = _load("user_input_post_task_questionnaire_2.csv")
	return ai_assessment_first, ai_assessment_second, self_assessment_first, self_assessment_second


def calc_user_reliance_measures(user, usertask_dict, user_flip_dict, answer_dict, task_id_list):
	"""Compute reliance measures for one user over a list of tasks.

	For every task the system advice equals the correct answer, unless the
	task is listed in ``user_flip_dict[user]``, in which case the advice is
	the opposite (0 <-> 1) of the correct answer.

	Args:
		user: id of the user to evaluate.
		usertask_dict: ``usertask_dict[user][(task_id, "base")]`` is the
			initial decision and ``[(task_id, "advice")]`` the final decision.
		user_flip_dict: maps user -> collection of task ids with wrong advice.
		answer_dict: maps task_id -> correct answer.
		task_id_list: the tasks to evaluate.

	Returns:
		A tuple ``(correct, agreement_fraction, switching_fraction,
		initial_disagreement, relative_positive_ai_reliance,
		relative_positive_self_reliance)`` where ``correct`` and
		``initial_disagreement`` are counts and the rest are fractions.
		Any fraction whose denominator is zero is reported as 0.0; in
		particular an empty ``task_id_list`` yields all zeros instead of
		raising ``ZeroDivisionError``.
	"""
	def _ratio(numerator, denominator):
		# Fractions with an empty denominator are reported as 0.0.
		if denominator > 0:
			return numerator / float(denominator)
		return 0.0

	if len(task_id_list) == 0:
		# nothing to evaluate: avoid dividing by the number of tasks
		return 0, 0.0, 0.0, 0, 0.0, 0.0

	user_choices = usertask_dict[user]
	flipped_tasks = user_flip_dict[user]

	tp_correct = 0
	tp_agreement = 0
	initial_disagreement = 0
	tp_switch_reliance = 0
	positive_ai_reliance = 0
	negative_ai_reliance = 0
	positive_self_reliance = 0
	negative_self_reliance = 0
	for task_id in task_id_list:
		first_choice = user_choices[(task_id, "base")]
		second_choice = user_choices[(task_id, "advice")]
		correct_answer = answer_dict[task_id]
		if task_id in flipped_tasks:
			# wrong advice: the opposite of the correct answer
			system_advice = 0 if correct_answer == 1 else 1
		else:
			system_advice = correct_answer

		if second_choice == system_advice:
			# final decision agrees with the advice
			tp_agreement += 1
		if second_choice == correct_answer:
			tp_correct += 1

		if first_choice == system_advice:
			# the reliance measures only consider initial disagreement
			continue
		initial_disagreement += 1
		if second_choice == system_advice:
			# the user switched to the advice
			tp_switch_reliance += 1
		if system_advice == correct_answer:
			if second_choice == system_advice:
				# switched to correct advice
				positive_ai_reliance += 1
			else:
				# ignored correct advice
				negative_self_reliance += 1
		elif first_choice == correct_answer:
			if second_choice == correct_answer:
				# kept the correct initial decision despite wrong advice
				positive_self_reliance += 1
			else:
				# abandoned a correct decision after wrong advice
				negative_ai_reliance += 1

	number_of_tasks = len(task_id_list)
	tp_agreement_fraction = _ratio(tp_agreement, number_of_tasks)
	tp_switching_fraction = _ratio(tp_switch_reliance, initial_disagreement)
	relative_positive_ai_reliance = _ratio(positive_ai_reliance, positive_ai_reliance + negative_self_reliance)
	relative_positive_self_reliance = _ratio(positive_self_reliance, positive_self_reliance + negative_ai_reliance)
	return tp_correct, tp_agreement_fraction, tp_switching_fraction, initial_disagreement, relative_positive_ai_reliance, relative_positive_self_reliance


def load_answers(filename):
	"""Load the ground-truth label of every review in a JSON file.

	The file is looked up under ``../review_data`` and must contain a
	``"reviews"`` list whose entries have an ``"id"`` and a
	``"classification"`` of ``"d"`` (deceptive) or ``"t"`` (genuine).

	Args:
		filename: name of the JSON file inside ``../review_data``.

	Returns:
		dict mapping review id -> 0 for deceptive, 1 for genuine.

	Raises:
		NotImplementedError: if a review has any other classification.
	"""
	answer_file = os.path.join("../review_data", filename)
	# the context manager closes the file even when parsing fails
	with open(answer_file) as f:
		data_list = json.load(f)
	answer_dict = {}
	for data in data_list["reviews"]:
		tp_id = data["id"]
		classification = data["classification"]
		if classification == "d":
			# deceptive
			tp_answer = 0
		elif classification == "t":
			# genuine
			tp_answer = 1
		else:
			# report the offending label itself
			raise NotImplementedError("Unknown answer {}".format(classification))
		answer_dict[tp_id] = tp_answer
	return answer_dict

def calc_familiarity(reserved_users=None):
	"""Compute each user's familiarity score from the pre-task questionnaire.

	The score is the mean of ``question_3`` and ``question_17`` of
	``user_input_pre_task_questionnaire_tia.csv`` on a 5-point scale. No item
	is reverse coded. Every answer is shifted by +1 before averaging
	(``value_add_one=True``) — presumably because the CSV stores 0-based
	answers; confirm against the data export.

	Args:
		reserved_users: optional collection of user ids to keep; rows that
			belong to any other user are discarded. ``None`` keeps all rows.

	Returns:
		dict mapping user_id -> familiarity score.
	"""
	filename = os.path.join(data_folder, "user_input_pre_task_questionnaire_tia.csv")
	usecols = ["user_id", "question_3", "question_17"]
	# read_csv keeps the file's column order regardless of ``usecols``;
	# re-select so that row[0] is always the user id.
	df = pd.read_csv(filename, usecols=usecols)[usecols]
	if reserved_users is not None:
		# keep only the requested users
		df = df[df["user_id"].isin(reserved_users)]
	reverse_code_index = []
	max_scale = 5
	questionnaire_size = 2
	user_familiarity = {}
	for row in df.values.tolist():
		user_id, answers = row[0], row[1:]
		user_familiarity[user_id] = questionnaire(answers, reverse_code_index, max_scale=max_scale, questionnaire_size=questionnaire_size, value_add_one=True).calc_value()
	return user_familiarity

def calc_propensity_to_trust(reserved_users=None):
	"""Compute each user's propensity-to-trust score from the pre-task questionnaire.

	The score is the mean of ``question_5`` (reverse coded), ``question_12``
	and ``question_18`` of ``user_input_pre_task_questionnaire_tia.csv`` on a
	5-point scale. Every answer is shifted by +1 before reverse coding and
	averaging (``value_add_one=True``) — presumably because the CSV stores
	0-based answers; confirm against the data export.

	Args:
		reserved_users: optional collection of user ids to keep; rows that
			belong to any other user are discarded. ``None`` keeps all rows.

	Returns:
		dict mapping user_id -> propensity-to-trust score.
	"""
	filename = os.path.join(data_folder, "user_input_pre_task_questionnaire_tia.csv")
	usecols = ["user_id", "question_5", "question_12", "question_18"]
	# read_csv keeps the file's column order regardless of ``usecols``;
	# re-select so the positional reverse coding below hits question_5.
	df = pd.read_csv(filename, usecols=usecols)[usecols]
	if reserved_users is not None:
		# keep only the requested users
		df = df[df["user_id"].isin(reserved_users)]
	# 1-based position within the selected answers: the first one is question_5
	reverse_code_index = [1]
	max_scale = 5
	questionnaire_size = 3
	user_ptt_scale = {}
	for row in df.values.tolist():
		user_id, answers = row[0], row[1:]
		user_ptt_scale[user_id] = questionnaire(answers, reverse_code_index, max_scale=max_scale, questionnaire_size=questionnaire_size, value_add_one=True).calc_value()
	return user_ptt_scale

def calc_ATI_scale(reserved_users=None):
	"""Compute each user's ATI score from ``user_input_ati.csv``.

	The score is the mean of ``question_1`` .. ``question_9`` on a 6-point
	scale, with questions 3, 6 and 8 reverse coded. Every answer is shifted
	by +1 before reverse coding and averaging (``value_add_one=True``) —
	presumably because the CSV stores 0-based answers; confirm against the
	data export.

	Args:
		reserved_users: optional collection of user ids to keep; rows that
			belong to any other user are discarded. ``None`` keeps all rows.

	Returns:
		dict mapping user_id -> ATI score.
	"""
	filename = os.path.join(data_folder, "user_input_ati.csv")
	ATI_keys = ["user_id"] + ["question_{}".format(i + 1) for i in range(9)]
	# read_csv keeps the file's column order regardless of ``usecols``;
	# re-select so the positional reverse coding below hits the right items.
	df = pd.read_csv(filename, usecols=ATI_keys)[ATI_keys]
	if reserved_users is not None:
		# keep only the requested users
		df = df[df["user_id"].isin(reserved_users)]
	# reverse code questions 3, 6 and 8
	reverse_code_index = [3, 6, 8]
	max_scale = 6
	questionnaire_size = 9
	user_ATI_scale = {}
	for row in df.values.tolist():
		user_id, answers = row[0], row[1:]
		user_ATI_scale[user_id] = questionnaire(answers, reverse_code_index, max_scale=max_scale, questionnaire_size=questionnaire_size, value_add_one=True).calc_value()
	return user_ATI_scale

def read_nasa_tlx(reserved_users=None):
	"""Load the six NASA-TLX answers of every user.

	Reads ``question_1`` .. ``question_6`` of ``user_input_nasa_tlx.csv`` and
	stores them, converted to ``int``, under the names "Mental Demand",
	"Physical Demand", "Temporal Demand", "Performance", "Effort" and
	"Frustration" (in that order).

	Args:
		reserved_users: optional collection of user ids to keep; rows that
			belong to any other user are discarded. ``None`` keeps all rows.

	Returns:
		dict mapping user_id -> {dimension name -> answer}.
	"""
	filename = os.path.join(data_folder, "user_input_nasa_tlx.csv")
	keys_ = ["user_id"] + ["question_{}".format(i + 1) for i in range(6)]
	# read_csv keeps the file's column order regardless of ``usecols``;
	# re-select so each answer is paired with the right dimension name.
	df = pd.read_csv(filename, usecols=keys_)[keys_]
	if reserved_users is not None:
		# keep only the requested users
		df = df[df["user_id"].isin(reserved_users)]
	variable_names = ["Mental Demand", "Physical Demand", "Temporal Demand", "Performance", "Effort", "Frustration"]
	user_nasa_tlx_dict = {}
	for row in df.values.tolist():
		user_id = row[0]
		# a later row of the same user overwrites the earlier values
		scores = user_nasa_tlx_dict.setdefault(user_id, {})
		for var_name, answer in zip(variable_names, row[1:]):
			scores[var_name] = int(answer)
	return user_nasa_tlx_dict

class TiA_questionnaire(object):
	"""A 19-item, 5-point questionnaire scored as the mean of each subscale.

	Items 5, 10, 15 and 16 are reverse coded on construction.

	Args:
		values: the 19 answers in item order; an assertion fails for any
			other length.
	"""

	def __init__(self, values):
		self.questionnaire_size = 19
		self.max_scale = 5
		self.reverse_code_index = [5, 10, 15, 16]
		# 1-based item numbers that make up each reported subscale
		self.subscales = {
			"Reliability/Competence": [1, 6, 10, 13, 15, 19],
			"Understanding/Predictability": [2, 7, 11, 16],
			# "Familiarity": [3, 17],
			"Intention of Developers": [4, 8],
			# "Propensity to Trust": [5, 12, 18],
			"Trust in Automation": [9, 14]
		}
		assert len(values) == self.questionnaire_size
		self.values = [
			reverse_code(answer, self.max_scale) if item_number in self.reverse_code_index else answer
			for item_number, answer in enumerate(values, start=1)
		]
		self.scale_dict = {}

	def calc_value(self):
		"""Fill and return ``scale_dict``: subscale name -> mean of its items."""
		for subscale, item_numbers in self.subscales.items():
			subscale_answers = [self.values[item_number - 1] for item_number in item_numbers]
			self.scale_dict[subscale] = calc_mean(subscale_answers)
		return self.scale_dict

def read_TiA_scales(reserved_users=None):
	"""Compute the post-task TiA subscale scores after each task batch.

	Reads ``user_input_post_task_questionnaire_tia_1.csv`` and
	``user_input_post_task_questionnaire_tia_2.csv``. Questions 3, 5, 12, 17
	and 18 are not read from these files (they are assessed in the pre-task
	questionnaire); their slots are filled with the placeholder value 1,
	which no subscale of ``TiA_questionnaire`` uses.

	Args:
		reserved_users: optional collection of user ids to keep; rows that
			belong to any other user are discarded. ``None`` keeps all rows.

	Returns:
		``(user_TiA_scale_first, user_TiA_scale_second)``, each a dict mapping
		user_id -> {subscale name -> score}.
	"""
	pre_task_questions = [3, 5, 12, 17, 18]
	# 0-based slots of the 19-item questionnaire that the CSV files provide
	valid_index = [i for i in range(19) if (i + 1) not in pre_task_questions]
	TiA_keys = ["user_id"] + ["question_{}".format(i + 1) for i in valid_index]

	def _load(csv_name):
		# Return user_id -> subscale scores for one questionnaire file.
		# read_csv keeps the file's column order regardless of ``usecols``;
		# re-select so each answer lands in the slot of its own question.
		df = pd.read_csv(os.path.join(data_folder, csv_name), usecols=TiA_keys)[TiA_keys]
		if reserved_users is not None:
			# keep only the requested users
			df = df[df["user_id"].isin(reserved_users)]
		user_TiA_scale = {}
		for row in df.values.tolist():
			user_id = row[0]
			TiA_sample = [1] * 19
			for slot, answer in zip(valid_index, row[1:]):
				TiA_sample[slot] = answer
			user_TiA_scale[user_id] = TiA_questionnaire(TiA_sample).calc_value()
		return user_TiA_scale

	user_TiA_scale_first = _load("user_input_post_task_questionnaire_tia_1.csv")
	user_TiA_scale_second = _load("user_input_post_task_questionnaire_tia_2.csv")
	return user_TiA_scale_first, user_TiA_scale_second

def read_attention_checks(reserved_users=None):
	"""Count how many attention checks each user passed.

	Reads ``user_input_attention_checks.csv``. A check counts as passed when
	its column equals 1. Users in condition 3 are not scored on
	``passed_attention_check_debugging``; all other users are not scored on
	``passed_attention_check_no_tutorial``. Conditions come from
	``load_user_info()`` (called without a user filter).

	Args:
		reserved_users: optional collection of user ids to keep; rows that
			belong to any other user are discarded. ``None`` keeps all rows.

	Returns:
		dict mapping user_id -> number of passed attention checks.

	Raises:
		NotImplementedError: if a user in the file has no known condition.
	"""
	user_condition_dict, user_batch_order = load_user_info()
	filename = os.path.join(data_folder, "user_input_attention_checks.csv")
	usecols = ["user_id","passed_attention_check_qualification_test","passed_attention_check_pre_task_questionnaire","passed_attention_check_t1_ATI",
	"passed_attention_check_assignments_1","passed_attention_check_post_task_questionnaire_1","passed_attention_check_debugging","passed_attention_check_assignments_2","passed_attention_check_post_task_questionnaire_2","passed_attention_check_no_tutorial"]
	# read_csv keeps the file's column order regardless of ``usecols``;
	# re-select so every value is paired with its own column name below.
	df = pd.read_csv(filename, usecols=usecols)[usecols]
	if reserved_users is not None:
		# keep only the requested users
		df = df[df["user_id"].isin(reserved_users)]
	check_names = usecols[1:]
	user_attention_check_correct = {}
	for row in df.values.tolist():
		user_id = row[0]
		if user_id not in user_condition_dict:
			raise NotImplementedError("Can not find {} user in any conditions".format(user_id))
		# the one check that does not apply to this user's condition
		if user_condition_dict[user_id] == 3:
			skipped_check = "passed_attention_check_debugging"
		else:
			skipped_check = "passed_attention_check_no_tutorial"
		passed = sum(
			1
			for check_name, outcome in zip(check_names, row[1:])
			if check_name != skipped_check and outcome == 1
		)
		user_attention_check_correct[user_id] = user_attention_check_correct.get(user_id, 0) + passed
	return user_attention_check_correct

def read_user_question_order(user_batch_order, reserved_users=None):
	"""Load the order in which each user saw the tasks of both phases.

	Reads the ``question_order`` column (a comma-separated list of task
	numbers) of ``user_input_user_answers_phase_1.csv`` and
	``user_input_user_answers_phase_2.csv``. Task number 10 is skipped. Every
	remaining task is prefixed with ``"p1-"`` or ``"p2-"`` according to the
	batch the user worked on in that phase: the first entry of
	``user_batch_order[user_id]`` for phase 1 and the second entry for
	phase 2.

	Args:
		user_batch_order: maps user_id -> ``(first_batch, second_batch)``.
		reserved_users: optional collection of user ids to keep; rows that
			belong to any other user are discarded. ``None`` keeps all rows.

	Returns:
		dict mapping user_id -> list of prefixed task ids, phase 1 first.

	Raises:
		NotImplementedError: if a batch number is neither 1 nor 2.
	"""
	usecols = ["user_id", "question_order"]
	user_question_order = {}

	def _append_phase(csv_name, batch_position):
		# Append one phase's tasks; ``batch_position`` selects which entry of
		# the user's batch order applies to this phase.
		# read_csv keeps the file's column order regardless of ``usecols``;
		# re-select so the positional unpacking below always matches.
		df = pd.read_csv(os.path.join(data_folder, csv_name), usecols=usecols)[usecols]
		if reserved_users is not None:
			# keep only the requested users
			df = df[df["user_id"].isin(reserved_users)]
		for user_id, question_order in df.values.tolist():
			assert user_batch_order[user_id][0] != user_batch_order[user_id][1]
			batch = int(user_batch_order[user_id][batch_position])
			if batch == 1:
				task_prefix = "p1-"
			elif batch == 2:
				task_prefix = "p2-"
			else:
				raise NotImplementedError("Unknown task batch")
			ordered_tasks = user_question_order.setdefault(user_id, [])
			for task in question_order.split(","):
				if int(task) == 10:
					continue
				ordered_tasks.append(task_prefix + task)

	_append_phase("user_input_user_answers_phase_1.csv", 0)
	# phase 2 uses the SECOND batch, matching read_decisions; the previous
	# code reused the first batch's prefix here
	_append_phase("user_input_user_answers_phase_2.csv", 1)
	return user_question_order

def read_decisions(user_batch_order, reserved_users=None):
	"""Load every user's decisions, confidences and flipped tasks for both phases.

	Reads ``user_input_user_answers_phase_1.csv`` and
	``user_input_user_answers_phase_2.csv``. Tasks 0..9 of a phase are keyed
	as ``"p1-<n>"`` or ``"p2-<n>"`` according to the batch the user worked on
	in that phase (first entry of ``user_batch_order[user_id]`` for phase 1,
	second entry for phase 2).

	Args:
		user_batch_order: maps user_id -> ``(first_batch, second_batch)``.
		reserved_users: optional collection of user ids to keep; rows that
			belong to any other user are discarded. ``None`` keeps all rows.

	Returns:
		``(user_task_dict, user_confidence_dict, user_flip_dict)``:
		the first two map user_id -> {(task_id, "base" | "advice") -> value}
		for the initial ("base") and final ("advice") answer / confidence;
		``user_flip_dict`` maps user_id -> list of task ids built from the
		``flip_1`` / ``flip_2`` columns.

	Raises:
		NotImplementedError: if a batch number is neither 1 nor 2.
	"""
	usecols = ["user_id"]
	for i in range(10):
		usecols.append("initial_answer_review_{}".format(i))
		usecols.append("confidence_initial_answer_review_{}".format(i))
		usecols.append("answer_after_prediction_review_{}".format(i))
		usecols.append("confidence_answer_after_prediction_review_{}".format(i))
	usecols += ["flip_1", "flip_2"]

	user_task_dict = {}
	user_confidence_dict = {}
	user_flip_dict = {}

	def _load_phase(csv_name, batch_position):
		# Merge one phase file into the three result dictionaries;
		# ``batch_position`` selects which entry of the user's batch order
		# applies to this phase.
		# read_csv keeps the file's column order regardless of ``usecols``;
		# re-select so the positional indexing below always matches.
		df = pd.read_csv(os.path.join(data_folder, csv_name), usecols=usecols)[usecols]
		if reserved_users is not None:
			# keep only the requested users
			df = df[df["user_id"].isin(reserved_users)]
		for row in df.values.tolist():
			user_id = row[0]
			flip_1 = int(row[-2])
			flip_2 = int(row[-1])
			assert user_batch_order[user_id][0] != user_batch_order[user_id][1]
			batch = int(user_batch_order[user_id][batch_position])
			if batch == 1:
				task_prefix = "p1-"
			elif batch == 2:
				task_prefix = "p2-"
			else:
				raise NotImplementedError("Unknown task batch")
			flips = user_flip_dict.setdefault(user_id, [])
			confidences = user_confidence_dict.setdefault(user_id, {})
			decisions = user_task_dict.setdefault(user_id, {})
			flips.append(task_prefix + str(flip_1))
			flips.append(task_prefix + str(flip_2))
			for task_id in range(10):
				task_id_ = task_prefix + str(task_id)
				# each task occupies four consecutive columns after user_id
				start = 4 * task_id + 1
				initial_answer, initial_confidence, final_answer, final_confidence = row[start:start + 4]
				decisions[(task_id_, "base")] = initial_answer
				confidences[(task_id_, "base")] = initial_confidence
				decisions[(task_id_, "advice")] = final_answer
				confidences[(task_id_, "advice")] = final_confidence

	_load_phase("user_input_user_answers_phase_1.csv", 0)
	_load_phase("user_input_user_answers_phase_2.csv", 1)
	return user_task_dict, user_confidence_dict, user_flip_dict

def read_task_batch_data(filename, number_task=10, live_file=False):
	"""Load the complete task-phase records of every user in one CSV file.

	Rows with any missing value in the selected columns are dropped.

	Args:
		filename: name of the CSV file inside ``data_folder``.
		number_task: number of tasks whose four answer/confidence columns
			are read (tasks ``0 .. number_task - 1``).
		live_file: when true the flip columns are read as ``flip1`` /
			``flip2``; they are still reported as ``flip_1`` / ``flip_2``.

	Returns:
		dict mapping user_id -> {column name -> value}, including the
		``user_id`` column itself.
	"""
	filename = os.path.join(data_folder, filename)
	usecols = ["user_id"]
	for i in range(number_task):
		usecols.append("initial_answer_review_{}".format(i))
		usecols.append("confidence_initial_answer_review_{}".format(i))
		usecols.append("answer_after_prediction_review_{}".format(i))
		usecols.append("confidence_answer_after_prediction_review_{}".format(i))
	if live_file:
		usecols += ["flip1", "flip2", "batch_1", "batch_2", "question_order"]
	else:
		usecols += ["flip_1", "flip_2", "batch_1", "batch_2", "question_order"]
	# live files spell the flip columns without an underscore
	renamed_columns = {"flip1": "flip_1", "flip2": "flip_2"}
	# read_csv keeps the file's column order regardless of ``usecols``;
	# re-select so every value is paired with its own column name below.
	df = pd.read_csv(filename, usecols=usecols)[usecols]
	# drop all rows with any missing values
	df = df.dropna(axis=0)
	user_batch_data = {}
	for row in df.values.tolist():
		user_id = row[0]
		user_batch_data[user_id] = {
			renamed_columns.get(col_name, col_name): value
			for col_name, value in zip(usecols, row)
		}
	return user_batch_data

def compare_live_all_data(suffix="phase_1", number_task=10):
	"""Return the users whose live record equals their complete record.

	Compares ``user_input_live_user_answers_<suffix>.csv`` with
	``user_input_user_answers_<suffix>.csv``. A user is valid when present in
	both files with identical values for every column of the live record.

	Args:
		suffix: one of "phase_1", "phase_2" or "phase_nt".
		number_task: number of tasks per record, forwarded to
			``read_task_batch_data``.

	Returns:
		set of valid user ids.
	"""
	assert suffix in ["phase_1", "phase_2", "phase_nt"]
	file_live = "user_input_live_user_answers_{}.csv".format(suffix)
	file_complete = "user_input_user_answers_{}.csv".format(suffix)
	data_live = read_task_batch_data(file_live, number_task=number_task, live_file=True)
	# only the "phase_nt" complete file is read with the live column names
	complete_uses_live_names = suffix == "phase_nt"
	data_complete = read_task_batch_data(file_complete, number_task=number_task, live_file=complete_uses_live_names)
	valid_users = set()
	for user, live_record in data_live.items():
		if user not in data_complete:
			continue
		complete_record = data_complete[user]
		has_mismatch = any(live_record[key_] != complete_record[key_] for key_ in live_record)
		if not has_mismatch:
			# data exists in both files and every value is the same
			valid_users.add(user)
	return valid_users

def get_valid_user_from_file(filename):
	"""Return the set of user ids whose row in ``filename`` has no missing value.

	``filename`` is looked up inside ``data_folder``.
	"""
	csv_path = os.path.join(data_folder, filename)
	# rows with any missing value are discarded before collecting the ids
	complete_rows = pd.read_csv(csv_path).dropna(axis=0)
	return set(complete_rows["user_id"])

def find_complete_users():
	"""Return the users who have a complete record in every required file.

	A user from ``user_input_user_data.csv`` is complete when they appear,
	without missing values, in all questionnaire and task-phase files and
	additionally in ``user_input_user_answers_phase_nt.csv`` (condition 3) or
	``user_input_nasa_tlx.csv`` (every other condition).
	"""
	# all questionnaires used + task phase 1 + task phase 2
	filenames_ = [
		"user_input_ati.csv",
		"user_input_pre_task_questionnaire_tia.csv",
		"user_input_post_task_questionnaire_1.csv",
		"user_input_post_task_questionnaire_2.csv",
		"user_input_post_task_questionnaire_tia_1.csv",
		"user_input_post_task_questionnaire_tia_2.csv",
		"user_input_user_answers_phase_1.csv",
		"user_input_user_answers_phase_2.csv",
	]

	# condition-specific requirements
	nt_valid_users = get_valid_user_from_file("user_input_user_answers_phase_nt.csv")
	nasa_tlx_users = get_valid_user_from_file("user_input_nasa_tlx.csv")
	# requirements shared by every condition
	valid_user_list = [get_valid_user_from_file(filename) for filename in filenames_]

	initial_user_set = get_valid_user_from_file("user_input_user_data.csv")
	user_condition_dict, user_batch_order = load_user_info()

	complete_users = set()
	for user in initial_user_set:
		if user_condition_dict[user] == 3:
			condition_specific_users = nt_valid_users
		else:
			condition_specific_users = nasa_tlx_users
		if user not in condition_specific_users:
			continue
		if all(user in user_set for user_set in valid_user_list):
			complete_users.add(user)
	return complete_users

def find_valid_users(threshold=8):
	"""Split the complete users into valid and approved participants.

	Args:
		threshold: minimum number of passed attention checks for a user to
			count as valid.

	Returns:
		``(valid_users, approved_users)``: the complete users that passed at
		least ``threshold`` attention checks, and all complete users.
	"""
	complete_users = find_complete_users()
	user_attention_check_correct = read_attention_checks(reserved_users=complete_users)
	valid_users = {
		user for user in complete_users
		if user_attention_check_correct[user] >= threshold
	}
	# every complete user is approved, whatever their attention-check count
	approved_users = set(complete_users)
	print("-" * 17)
	print("Valid users: {}, complete users: {}".format(len(valid_users), len(complete_users)))
	return valid_users, approved_users

def calculate_bonus(valid_users, filename="bonus_valid_users.txt"):
	"""Write each valid user's bonus to ``filename`` as ``user_id,amount`` lines.

	The bonus is 0.05 per correct final ("advice") decision over the ten
	tasks of batch p1 and the ten tasks of batch p2.

	Args:
		valid_users: iterable of user ids to pay.
		filename: path of the output file; it is overwritten.
	"""
	answer_dict_p1 = load_answers("reviews_p1.json")
	answer_dict_p2 = load_answers("reviews_p2.json")
	answer_dict = {}
	for task_id in answer_dict_p1:
		answer_dict["p1-{}".format(task_id)] = answer_dict_p1[task_id]
	for task_id in answer_dict_p2:
		answer_dict["p2-{}".format(task_id)] = answer_dict_p2[task_id]
	user_condition_dict, user_batch_order = load_user_info()

	user_task_dict, user_confidence_dict, _ = read_decisions(user_batch_order, reserved_users=valid_users)
	# the context manager closes the file even if a check below fails
	with open(filename, "w") as f:
		for user_id in valid_users:
			# make sure all of the user's decisions on batch 1 and batch 2 are saved
			assert len(user_task_dict[user_id]) == 40
			correct_number = 0
			for task_id in range(10):
				for batch_prefix in ("p1", "p2"):
					task_key = "{}-{}".format(batch_prefix, task_id)
					if user_task_dict[user_id][(task_key, "advice")] == answer_dict[task_key]:
						correct_number += 1
			f.write("%s,%.2f\n"%(user_id, correct_number * 0.05))

def get_condition_users(user_condition_dict):
	"""Group users by condition.

	Args:
		user_condition_dict: maps user -> condition (0, 1, 2 or 3).

	Returns:
		dict mapping each condition 0..3 -> set of its users; conditions
		without users map to an empty set.
	"""
	condition_users = {condition: set() for condition in range(4)}
	for user, tp_condition in user_condition_dict.items():
		condition_users[tp_condition].add(user)
	return condition_users


if __name__ == "__main__":
	# ground truth for the tasks of both task batches
	answer_dict_p1 = load_answers("reviews_p1.json")
	answer_dict_p2 = load_answers("reviews_p2.json")
	user_condition_dict, user_batch_order = load_user_info()

	valid_users, approved_users = find_valid_users()
	condition_users = get_condition_users(user_condition_dict)

	# report how many valid participants each condition has
	for condition in range(4):
		valid_in_condition = valid_users & condition_users[condition]
		print(f"Condition {condition} have {len(valid_in_condition)} valid participants")

	# To export the bonus payments:
	# calculate_bonus(valid_users, filename="bonus_valid_users.txt")

	# To export the approved users, one id per line:
	# f = open("approved_users", "w")
	# for user in approved_users:
	# 	f.write("%s\n"%(user))
	# f.close()

	# covariates, assessed in pre-task questionnaire
	user_ATI_scale = calc_ATI_scale()
	user_familiarity = calc_familiarity()
	user_ptt_scale = calc_propensity_to_trust()

	# dependent variables, in post-task questionnaire
