Database data preprocessing

Author: Martin Dierikx

Date: January 2024

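Required files: data/cleaned_database_data.csv, data/preprocessed_post_questionnaire_data.csv, data/preprocessed_database_data.csv
Output files: results/preprocessed_database_data_reproduced.csv, results/preprocessed_step_data_reproduced.csv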

This file contains the code to reproduce the preprocessing of the database data. After running this code, the output file (in the results folder) can be compared to the existing preprocessed_database_data.csv to check the reproducibility.

In [1]:
# The unprocessed database data has one row per datapoint with the following fields:
# - Anonymized id of the participant
# - Session number of the datapoint
# - Type of information of the datapoint (e.g. previous activity, mood, self-motivation, etc.)
# - Actual information of the datapoint
#
# Consecutive rows that share the same participant id and session number are merged into
# one entry of `samples`, laid out as:
# [id, self-motivation, self-efficacy, rest, available time, mood, RL action, initial proposal,
#  previous activity, goal achievability, self-efficacy feedback, goal, number of rejected proposals,
#  session number, steps, rejection reason]

sm = se = re = at = mo = prev_activity = goal = action = ga = num_reject = sef = init_proposal = steps = rejection_reason = p_id_of_sample = s_num_of_sample = ""
samples = []
# The context manager makes sure the input file is closed once parsing finishes (or fails)
with open("../data/cleaned_database_data.csv") as database_file:
    for d in database_file:
        # Skip empty lines (e.g. a trailing newline at the end of the file): they have no fields to index
        if not d.strip():
            continue

        data = d.split(",")
        # Get the participant id and session number of the data point
        p_id = data[0].strip("\"")
        s_num = data[1].strip("\"")

        # Check whether this data point belongs to the previous sample
        if p_id != p_id_of_sample or s_num != s_num_of_sample:
            # If it is a new sample, add the previous sample to the list and reset the variables
            if p_id_of_sample != '':
                samples.append([p_id_of_sample, sm, se, re, at, mo, action, init_proposal, prev_activity, ga, sef,
                                goal, num_reject, s_num_of_sample, steps, rejection_reason])
            # s_num_of_sample is not reset here; it is overwritten below when a new sample starts
            sm = se = re = at = mo = prev_activity = goal = action = ga = num_reject = sef = init_proposal = p_id_of_sample = steps = rejection_reason = ""

            # Create a new sample if the id is of a datapoint (i.e. this is not the header row)
            if p_id != "ID":
                p_id_of_sample = p_id
                s_num_of_sample = s_num

        # Read the information of the datapoint based on the type of information
        if data[2] == "prev_activity":
            # The value is a quoted, comma-separated list, so splitting the row on commas spreads it
            # over fields 3 to 11. Store the steps of each of the previous 9 days, joined by ';'
            prev_activity = ";".join([data[3].strip("\""), *data[4:11], data[11].strip("\"\n")])
            # Store the steps taken yesterday in a separate variable
            steps = data[3].strip("\"")
        elif data[2] == "mood":
            # NOTE(review): unlike most other fields only the newline is stripped here, not the quotes;
            # kept as-is so the output stays identical to the reference file
            mo = data[3].strip("\n")
        elif data[2] == "rest":
            re = data[3].strip("\"\n")
        elif data[2] == "available_time":
            at = data[3].strip("\"\n")
        elif data[2] == "self_motivation":
            sm = data[3].strip("\"\n")
        elif data[2] == "self_efficacy":
            se = data[3].strip("\"\n")
        elif data[2] == "rl_action":
            # Only the newline is stripped here as well (see the note on "mood")
            action = data[3].strip("\n")
        elif data[2] == "initial_proposal":
            init_proposal = data[3].strip("\"\n")
        elif data[2] == "number_of_rejected_proposals":
            num_reject = data[3].strip("\"\n")
        elif data[2] == "goal_achievability":
            ga = data[3].strip("\"\n")
        elif data[2] == "self_efficacy_feedback":
            sef = data[3].strip("\"\n")
        elif data[2] == "goal":
            goal = data[3].strip("\"\n")
        elif data[2] == "rejection_reason":
            rejection_reason = data[3].strip("\"\n")

# Add the final sample to the list (the loop only stores a sample when the next one starts)
if p_id_of_sample != '':
    samples.append(
        [p_id_of_sample, sm, se, re, at, mo, action, init_proposal, prev_activity, ga, sef, goal, num_reject,
         s_num_of_sample, steps, rejection_reason])

# The processed database data has the following fields (in the order of the output columns):
# - Anonymized id of the participant
# - Self-motivation of the participant of the current session
# - Self-efficacy of the participant of the current session
# - Rest of the participant of the current session
# - Available time of the participant of the current session
# - Mood of the participant of the current session
# - Reinforcement learning model action taken in the current session
# - Self-motivation of the participant of the next session
# - Self-efficacy of the participant of the next session
# - Rest of the participant of the next session
# - Available time of the participant of the next session
# - Mood of the participant of the next session
# - Initial step goal proposal done in the current session
# - Previous activity of the past 9 days of the current session
# - Steps taken on the day of the current session (read from the record of the next session)
# - Perceived goal achievability of the goal of the current session (read from the record of the next session)
# - Self-efficacy feedback on the current session (read from the record of the next session)
# - Goal of the current session
# - Number of rejected proposals of the current session
# - Reason for rejection of the current session
# - Session number of the current session

# Write the data samples to a file. The file is opened once and closed by the context manager,
# instead of being reopened in append mode for every row
with open("../results/preprocessed_database_data_reproduced.csv", "w") as f:
    f.write(
        "ID,Self-motivation,Self-efficacy,Rest,Available time,Mood,RL action,Next self-motivation,Next self-efficacy,Next rest,Next available time,Next mood,Initial proposal,Previous_activity,Steps taken,Goal achievability,Self-efficacy feedback,Goal,Rejected proposals,Rejection reason,Session number\n")

    # Store every grouping of two consecutive sessions to make one data sample
    for sample, sample_2 in zip(samples, samples[1:]):
        # Only pair sessions that belong to the same participant
        if sample_2[0] == sample[0]:
            f.write(
                f"{sample[0]},{sample[1]},{sample[2]},{sample[3]},{sample[4]},{sample[5]},{sample[6]},"
                f"{sample_2[1]},{sample_2[2]},{sample_2[3]},{sample_2[4]},{sample_2[5]},"
                f"{sample[7]},{sample[8]},{sample_2[14]},{sample_2[9]},{sample_2[10]},"
                f"{sample[11]},{sample[12]},{sample[15]},{sample[13]}\n")

Process the steps before and after the intervention of people who completed the whole study

For the analysis in R of the increase in steps between the 9 days before the intervention and the last day of the intervention, this code creates a data file that contains, next to the participant ID, 2 columns. The first one has the average steps over the 9 days before the intervention and the second one has the steps of the final day of the intervention. This allows for a paired t-test.

In [2]:
import math
from matplotlib import pyplot as plt
import numpy as np
import scipy.stats as sc
from sklearn import metrics

# Keep track of the steps on the last day from the people who completed the whole study.
# Both lists are filled in parallel: average_steps_final_day[k] belongs to p_ids[k]
p_ids = []
average_steps_final_day = []

# Read the data from the file; the context manager closes the file afterwards
with open("../data/preprocessed_post_questionnaire_data.csv") as questionnaire_file:
    for d in questionnaire_file:
        # Skip empty lines (e.g. a trailing newline at the end of the file): they have no fields to index
        if not d.strip():
            continue

        data = d.split(",")
        # To ignore the header cells
        if data[0] != 'ID':

            # Extract data on the average number of steps taken by people on the final day of the intervention
            p_ids.append(data[0].strip("\""))
            average_steps_final_day.append(int(data[1]))

# Keep track of the steps on the days before the intervention from the people who completed the whole study.
# Entry k belongs to p_ids[k]; it stays 0 when no first-session row is found for that participant
average_steps_before_intervention = [0] * len(p_ids)

# Position of every participant id in p_ids (first occurrence, like list.index), so that
# each row needs a single dictionary lookup instead of scanning the list
position_of_p_id = {}
for position, known_p_id in enumerate(p_ids):
    position_of_p_id.setdefault(known_p_id, position)

# Read the data from the file; the context manager closes the file afterwards
with open("../data/preprocessed_database_data.csv") as database_file:
    for d in database_file:
        # Skip empty lines (e.g. a trailing newline at the end of the file)
        if not d.strip():
            continue

        data = d.split(",")
        # To ignore the header cells
        if data[0] != "ID":

            # Store the data if it is the first session and it is a person who completed the whole study
            p_id = data[0].strip("\"")
            if int(data[20].strip('\n')) == 1 and p_id in position_of_p_id:
                # Get the steps before the intervention (column 13 holds the ';'-separated previous activity)
                prev_activity = data[13].split(";")
                prev_steps = [int(day_steps) for day_steps in prev_activity]
                # int() truncates the mean to a whole number of steps
                average_steps_before_intervention[position_of_p_id[p_id]] = int(np.mean(prev_steps))

# Write the step data to a file: one row per participant who completed the whole study.
# The context manager closes the file even if writing a row fails
with open("../results/preprocessed_step_data_reproduced.csv", "w") as f:
    f.write("ID,Steps_before,Steps_after\n")
    # The three lists are parallel (same length, same participant order)
    for participant, steps_before, steps_after in zip(p_ids, average_steps_before_intervention,
                                                      average_steps_final_day):
        f.write(f"{participant},{steps_before},{steps_after}\n")
In [ ]: