Author: Martin Dierikx
Date: 25-09-2023
Required files: data/cleaned_database_data.csv
Output files: results/preprocessed_database_data_reproduced.csv
This file contains the code to reproduce the preprocessing of the database data. After running this code, the output file (in the results folder) can be compared to the existing preprocessed_database_data.csv to check the reproducibility.
# The unprocessed database data had the following fields:
# - Anonymized id of the participant
# - Session number of the datapoint
# - Type of information of the datapoint (e.g. previous activity, mood, self-motivation, etc.)
# - Actual information of the datapoint
# Values of the sample that is currently being built; all of them are kept as strings
sm = se = re = at = mo = action = init_proposal = prev_activity = ""
steps = ga = sef = goal = num_reject = rejection_reason = ""
# Participant id and session number of the sample that is currently being built
p_id_of_sample = s_num_of_sample = ""
# Finished samples: one list of values per run of rows that share a participant id and session number
samples = []
# Read the file inside a context manager so that it is closed again after the last row
with open("../data/cleaned_database_data.csv") as database_file:
    for line in database_file:
        # Skip empty lines, they do not contain a datapoint
        if not line.strip():
            continue
        # The row is split on every comma, also on the commas inside a quoted value
        data = line.split(",")
        # Get the participant id and session number of the datapoint
        p_id = data[0].strip("\"")
        s_num = data[1].strip("\"")
        # Check whether this datapoint belongs to the sample that is being built
        if p_id != p_id_of_sample or s_num != s_num_of_sample:
            # It is a new sample: add the previous sample to the list, if there was one
            if p_id_of_sample != "":
                samples.append([p_id_of_sample, sm, se, re, at, mo, action, init_proposal, prev_activity, ga, sef,
                                goal, num_reject, s_num_of_sample, steps, rejection_reason])
            # Reset the values so that nothing of the previous rows carries over into the new sample
            sm = se = re = at = mo = action = init_proposal = prev_activity = ""
            steps = ga = sef = goal = num_reject = rejection_reason = ""
            p_id_of_sample = ""
            # Start a new sample, unless the row is the header row (whose first field is "ID")
            if p_id != "ID":
                p_id_of_sample = p_id
                s_num_of_sample = s_num
        # Read the information of the datapoint based on the type of information
        info_type = data[2]
        if info_type == "prev_activity":
            # The value is spread over the fields 3 up to and including 11 because of the split on commas;
            # store the steps of each of the previous 9 days separated by semicolons
            prev_activity = ";".join([data[3].strip("\""), *data[4:11], data[11].strip("\"\n")])
            # Store the steps taken yesterday in a separate variable
            steps = data[3].strip("\"")
        elif info_type == "mood":
            # Only the newline is removed here, quotes around the value are kept
            mo = data[3].strip("\n")
        elif info_type == "rest":
            re = data[3].strip("\"\n")
        elif info_type == "available_time":
            at = data[3].strip("\"\n")
        elif info_type == "self_motivation":
            sm = data[3].strip("\"\n")
        elif info_type == "self_efficacy":
            se = data[3].strip("\"\n")
        elif info_type == "rl_action":
            # Only the newline is removed here, quotes around the value are kept
            action = data[3].strip("\n")
        elif info_type == "initial_proposal":
            init_proposal = data[3].strip("\"\n")
        elif info_type == "number_of_rejected_proposals":
            num_reject = data[3].strip("\"\n")
        elif info_type == "goal_achievability":
            ga = data[3].strip("\"\n")
        elif info_type == "self_efficacy_feedback":
            sef = data[3].strip("\"\n")
        elif info_type == "goal":
            goal = data[3].strip("\"\n")
        elif info_type == "rejection_reason":
            rejection_reason = data[3].strip("\"\n")
# Add the final sample to the list, it is not followed by a row that would trigger the append above
if p_id_of_sample != "":
    samples.append([p_id_of_sample, sm, se, re, at, mo, action, init_proposal, prev_activity, ga, sef, goal,
                    num_reject, s_num_of_sample, steps, rejection_reason])
# The processed database data has the following fields:
# - Anonymized id of the participant
# - Self-motivation of the participant of the current session
# - Self-efficacy of the participant of the current session
# - Rest of the participant of the current session
# - Available time of the participant of the current session
# - Mood of the participant of the current session
# - Reinforcement learning model action taken in the current session
# - Self-motivation of the participant of the next session
# - Self-efficacy of the participant of the next session
# - Rest of the participant of the next session
# - Available time of the participant of the next session
# - Mood of the participant of the next session
# - Initial step goal proposal done in the current session
# - Previous activity of the past 9 days of the current session
# - Steps taken on the day of the current session
# - Perceived goal achievability of the goal of the current session
# - Self-efficacy accuracy of the self-efficacy of the current session
# - Goal of the current session
# - Number of rejected proposals of the current session
# - Reason for rejection of the current session
# - Session number of the current session
# Write the data samples to a file. The file is opened once for the header and all rows, and the
# context manager closes it again, also when writing fails halfway.
with open("../results/preprocessed_database_data_reproduced.csv", "w") as output_file:
    output_file.write(
        "ID,Self-motivation,Self-efficacy,Rest,Available time,Mood,RL action,Next self-motivation,Next self-efficacy,Next rest,Next available time,Next mood,Initial proposal,Previous_activity,Steps taken,Goal achievability,Self-efficacy feedback,Goal,Rejected proposals,Rejection reason,Session number\n")
    # Store every grouping of two consecutive sessions to make one data sample
    for current, following in zip(samples, samples[1:]):
        # Two sessions only form a data sample when they belong to the same participant
        if following[0] != current[0]:
            continue
        row = [
            *current[0:7],  # ID, self-motivation, self-efficacy, rest, available time, mood, RL action
            *following[1:6],  # The same five state values, taken from the next session
            current[7],  # Initial proposal
            current[8],  # Previous activity
            following[14],  # Steps taken, read from the next session
            following[9],  # Goal achievability, read from the next session
            following[10],  # Self-efficacy feedback, read from the next session
            current[11],  # Goal
            current[12],  # Rejected proposals
            current[15],  # Rejection reason
            current[13],  # Session number
        ]
        output_file.write(",".join(map(str, row)) + "\n")