Database data preprocessing

Author: Martin Dierikx
Date: 25-09-2023
Required files: data/cleaned_database_data.csv
Output files: results/preprocessed_database_data_reproduced.csv

This file contains the code to reproduce the preprocessing of the database data. After running this code, the output file (in the results folder) can be compared to the existing preprocessed_database_data.csv to check the reproducibility.

In [ ]:
# The unprocessed database data has the following fields:
# - Anonymized id of the participant
# - Session number of the datapoint
# - Type of information of the datapoint (e.g. previous activity, mood, self-motivation, etc.)
# - Actual information of the datapoint

# Characters stripped from the raw value of every single-valued information type.
# Mood and RL action only lose the trailing newline, so any quotes in them are kept.
VALUE_STRIP_CHARS = {
    "mood": "\n",
    "rest": "\"\n",
    "available_time": "\"\n",
    "self_motivation": "\"\n",
    "self_efficacy": "\"\n",
    "rl_action": "\n",
    "initial_proposal": "\"\n",
    "number_of_rejected_proposals": "\"\n",
    "goal_achievability": "\"\n",
    "self_efficacy_feedback": "\"\n",
    "goal": "\"\n",
    "rejection_reason": "\"\n",
}

# Information types stored at positions 1-12 of a sample, in this order
LEADING_SAMPLE_FIELDS = [
    "self_motivation",
    "self_efficacy",
    "rest",
    "available_time",
    "mood",
    "rl_action",
    "initial_proposal",
    "prev_activity",
    "goal_achievability",
    "self_efficacy_feedback",
    "goal",
    "number_of_rejected_proposals",
]


def _new_record():
    """Return an empty record with one (empty string) entry per collected value."""
    return dict.fromkeys([*VALUE_STRIP_CHARS, "prev_activity", "steps"], "")


def _build_sample(participant_id, session_number, record):
    """Lay out one session as the 16-element list that later code indexes by position.

    Positions: 0 participant id, 1-12 the LEADING_SAMPLE_FIELDS values,
    13 session number, 14 steps taken yesterday, 15 rejection reason.
    """
    return [participant_id, *(record[field] for field in LEADING_SAMPLE_FIELDS),
            session_number, record["steps"], record["rejection_reason"]]


samples = []
p_id_of_sample = s_num_of_sample = ""
record = _new_record()

# The with-statement guarantees the input file is closed, also when a row fails to parse
with open("../data/cleaned_database_data.csv") as database_file:
    for line in database_file:
        # A blank line (e.g. at the end of the file) carries no data point
        if not line.strip():
            continue

        # The previous-activity value is a quoted field that itself contains commas, so the
        # line is deliberately split on every comma and the quotes are stripped by hand
        data = line.split(",")

        # Get the participant id and session number of the data point
        p_id = data[0].strip("\"")
        s_num = data[1].strip("\"")

        # Check whether this data point belongs to the sample that is being collected
        if p_id != p_id_of_sample or s_num != s_num_of_sample:
            # It starts a new sample: store the collected one (if any) and start from scratch
            if p_id_of_sample != "":
                samples.append(_build_sample(p_id_of_sample, s_num_of_sample, record))
            record = _new_record()
            p_id_of_sample = ""

            # Only start collecting a new sample if this row is a data point and not the header row
            if p_id != "ID":
                p_id_of_sample = p_id
                s_num_of_sample = s_num

        # Read the information of the datapoint based on the type of information
        info_type = data[2]
        if info_type == "prev_activity":
            # Store the steps of each of the previous 9 days, separated by semicolons
            first_day = data[3].strip("\"")
            record["prev_activity"] = ";".join([first_day, *data[4:11], data[11].strip("\"\n")])
            # Store the steps taken yesterday in a separate entry
            record["steps"] = first_day
        elif info_type in VALUE_STRIP_CHARS:
            record[info_type] = data[3].strip(VALUE_STRIP_CHARS[info_type])
        # Rows with any other information type only take part in the sample grouping above

# Add the final sample to the list
if p_id_of_sample != "":
    samples.append(_build_sample(p_id_of_sample, s_num_of_sample, record))

# The processed database data has the following fields:
# - Anonymized id of the participant
# - Self-motivation of the participant of the current session
# - Self-efficacy of the participant of the current session
# - Rest of the participant of the current session
# - Available time of the participant of the current session
# - Mood of the participant of the current session
# - Reinforcement learning model action taken in the current session
# - Self-motivation of the participant of the next session
# - Self-efficacy of the participant of the next session
# - Rest of the participant of the next session
# - Available time of the participant of the next session
# - Mood of the participant of the next session
# - Initial step goal proposal done in the current session
# - Previous activity of the past 9 days of the current session
# - Steps taken on the day of the current session
# - Perceived goal achievability of the goal of the current session
# - Self-efficacy accuracy of the self-efficacy of the current session
# - Goal of the current session
# - Number of rejected proposals of the current session
# - Reason for rejection of the current session
# - Session number of the current session
#
# Each entry of `samples` is a list with the layout:
#   0 id, 1 self-motivation, 2 self-efficacy, 3 rest, 4 available time, 5 mood, 6 RL action,
#   7 initial proposal, 8 previous activity, 9 goal achievability, 10 self-efficacy feedback,
#   11 goal, 12 rejected proposals, 13 session number, 14 steps, 15 rejection reason

HEADER = ("ID,Self-motivation,Self-efficacy,Rest,Available time,Mood,RL action,"
          "Next self-motivation,Next self-efficacy,Next rest,Next available time,Next mood,"
          "Initial proposal,Previous_activity,Steps taken,Goal achievability,"
          "Self-efficacy feedback,Goal,Rejected proposals,Rejection reason,Session number\n")

# Write the data samples to a file. The file is opened once and closed by the with-statement,
# instead of being reopened in append mode for every single row.
with open("../results/preprocessed_database_data_reproduced.csv", "w") as output_file:
    output_file.write(HEADER)

    # Every pair of consecutive sessions of the same participant makes one data sample
    for current, following in zip(samples, samples[1:]):
        if following[0] != current[0]:
            # The next entry belongs to another participant, so there is no next session
            continue

        row = [
            *current[0:7],      # id, state of the current session and the RL action
            *following[1:6],    # state of the next session
            current[7],         # initial proposal
            current[8],         # previous activity
            following[14],      # steps taken; read from the next session's entry
            following[9],       # goal achievability; read from the next session's entry
            following[10],      # self-efficacy feedback; read from the next session's entry
            current[11],        # goal
            current[12],        # rejected proposals
            current[15],        # rejection reason
            current[13],        # session number
        ]
        output_file.write(",".join(str(value) for value in row) + "\n")
In [ ]: