Author: Andrei Stefan
Date: 13-11-2023
Required files: data/prolific_ids.csv, data/anonymised_data/anonymised_data_demographic.csv, data/anonymised_data/anonymised_data_state_action_state.csv, data/anonymised_data/anonymised_data_conversation_items.csv, data/anonymised_data/anonymised_data_prescreening.csv, data/anonymised_data/anonymised_data_prescreening_no_changes.csv, data/anonymised_data/anonymised_data_post_questionnaire.csv
Output files: data/anonymised_data/anonymised_data_demographic_2.csv, data/anonymised_data/anonymised_data_state_action_state_2.csv, data/anonymised_data/anonymised_data_conversation_items_2.csv, data/anonymised_data/anonymised_data_prescreening_2.csv, data/anonymised_data/anonymised_data_post_questionnaire_2.csv
This file contains the code to reproduce the preprocessing of the pre-screening questionnaire and database data. Since all the published files are already anonymised, this notebook can only be used to run the same preprocessing on the anonymised files and double-check the results. In some cases, the processed and anonymised files used as input are already identical to the output files that will be produced.
# import all required packages
import csv
import numpy as np
import os
import pandas as pd
Define a helper function for mapping Prolific ids to anonymised ids (labeled participant_1 through participant_114).
def get_id_mappings(path='../data/prolific_ids.csv'):
    """
    Map each Prolific id to an anonymised one (participant_1, participant_2, ...).

    Args:
        path: location of the file containing one Prolific id per line
              (defaults to the file used in the study).

    Returns:
        A dict mapping each original Prolific id (key) to its anonymised id (value).
    """
    # initialise an empty dict to hold the mappings
    id_mappings = {}
    # open the prolific_ids file; each line holds exactly one raw Prolific id
    with open(path) as f:
        # loop over the lines with enumerate to also have the index
        for index, line in enumerate(f):
            # strip the trailing newline (and stray whitespace) to get the bare id;
            # this replaces the fragile csv-reader + str(list)-slicing parsing
            prolific_id = line.strip()
            # participants are numbered starting from 1
            id_mappings[prolific_id] = f"participant_{index + 1}"
    # return the dict with the mappings
    return id_mappings
Define functions for processing and anonymising the data gathered.
def process_and_anonymise_demographic_data():
    """
    Process and anonymise the demographic data obtained from the participants'
    Prolific profiles.

    Args: none.
    Returns: none.

    Fields in the original data which were removed because we could not publish them:
    Submission id, Status, Started at, Completed at, Reviewed at, Archived at,
    Time taken, Completion code, Sex, Ethnicity simplified, Country of birth,
    Country of residence, Nationality, Language, Student status, Employment status
    """
    # mapping from original Prolific ids to anonymised ids
    id_mappings = get_id_mappings()
    # load the (un-)anonymised data from the participants' Prolific profiles
    profiles = pd.read_csv("../data/anonymised_data/anonymised_data_demographic.csv")
    # drop participants who are not part of the final analysis
    # (the original data could be from people we did not keep in the end)
    kept = profiles[profiles['Participant id'].isin(id_mappings)]
    # swap the original Prolific ids for the anonymised ones
    anonymised = kept.replace({"Participant id": id_mappings})
    # publish only the id, age, and gender columns
    # (number of approvals was also in the data, but it was removed because it was not needed)
    published_columns = ['Participant id', "Age", "Gender"]
    anonymised[published_columns].to_csv(
        "../data/anonymised_data/anonymised_data_demographic_2.csv", index=False)
def _reformat_state(state):
    """
    Rewrite a comma-separated state string into the list-like format needed downstream.

    The state consists of 8 items (in the original code's naming: plans, c, pu, a,
    a1, a2, a3, a4 — presumably number of plans, confidence, perceived usefulness,
    attitude and four auxiliary values; verify against the data dictionary).

    Args:
        state: the raw state string, with items separated by ", ".

    Returns:
        The reformatted state string, with items 2-4 wrapped in single quotes.
    """
    parts = state.split(", ")
    # first item and last four items stay unquoted; items 2-4 are quoted
    return (f"[{parts[0]}, '{parts[1]}', '{parts[2]}', '{parts[3]}', "
            f"{parts[4]}, {parts[5]}, {parts[6]}, {parts[7]}]")


def _reformat_reward(reward_text):
    """
    Convert a plain-text reward (four "... = value" segments, comma-separated)
    into a compact "[s, c_1, c_f, c]" list-like string.

    Args:
        reward_text: the reward as stored in the conversation items table.

    Returns:
        The reformatted reward string.
    """
    segments = reward_text.split("= ")
    # each value ends at the next comma, except the last one
    first = segments[1].split(",")[0]
    second = segments[2].split(",")[0]
    third = segments[3].split(",")[0]
    fourth = segments[4]
    return f"[{first}, {second}, {third}, {fourth}]"


def process_and_anonymise_database_data():
    """
    Process and anonymise the data gathered during the interaction with the
    virtual coach.

    Args: none.
    Returns: none.

    All the data gathered during the interaction is published, except the time
    when the samples were recorded, since we did not get permission to publish those.
    """
    # get the mappings for the ids
    id_mappings = get_id_mappings()
    # state_action_state table: keep only ids which are in the mappings
    # (the original data could be from people we did not keep in the end)
    df1 = pd.read_csv("../data/anonymised_data/anonymised_data_state_action_state.csv")
    df1 = df1[df1['prolific_id'].isin(id_mappings)]
    # replace the original Prolific id with the anonymised one
    df1 = df1.replace({"prolific_id": id_mappings})
    # re-number rows 0..n-1 so the positional look-ahead below cannot hit a gap
    # left by the filtering (fix: the original used index + 1 on the possibly
    # non-contiguous filtered index); the saved csv is unaffected (index=False)
    df1 = df1.reset_index(drop=True)
    df1.to_csv("../data/anonymised_data/anonymised_data_state_action_state_2.csv", index=False)
    # users table (data about each user's conversation): same filtering and replacement
    df2 = pd.read_csv("../data/anonymised_data/anonymised_data_conversation_items.csv")
    df2 = df2[df2['prolific_id'].isin(id_mappings)]
    df2 = df2.replace({"prolific_id": id_mappings})
    df2.to_csv("../data/anonymised_data/anonymised_data_conversation_items_2.csv", index=False)
    # next, add the corresponding reward for each sample:
    # zero (0) everywhere, except the last sample of each person
    df1["reward"] = 0
    last_row = len(df1) - 1
    for index, row in df1.iterrows():
        # the participant this sample belongs to
        current_id = row['prolific_id']
        # rewrite both states in the format needed
        df1.loc[index, ['state_before']] = _reformat_state(row['state_before'])
        df1.loc[index, ['state_after']] = _reformat_state(row['state_after'])
        # a sample is a participant's last one when it is the final row overall,
        # or when the next row belongs to a different participant
        if index == last_row or df1['prolific_id'][index + 1] != current_id:
            # look up this participant's plain-text reward in the conversation
            # table and reformat it (shared helper replaces the duplicated
            # if/else extraction code of the original)
            raw_reward = df2.loc[df2['prolific_id'] == current_id]['reward'].iloc[0]
            df1.loc[index, ['reward']] = _reformat_reward(raw_reward)
    # save the full dataframe to a csv
    df1.to_csv("../data/anonymised_data/anonymised_data_final_2.csv", index=False)
    # save only state, action, next state, rewards to a different csv for convenience
    df1[['state_before', 'action', 'state_after', 'reward']].to_csv(
        "../data/anonymised_data/anonymised_data_final_without_id_2.csv", index=False)
def process_and_anonymise_prescreening_data():
    """
    Process and anonymise the demographic data gathered in the pre-screening
    questionnaire.

    Args: none.
    Returns: none.

    Fields in the original data which were removed because we could not publish them:
    StartDate, EndDate, Status, Progress, Duration (in seconds), Finished,
    RecordedDate, ResponseId, DistributionChannel, UserLanguage,
    Q_RecaptchaScore, STUDY_ID, SESSION_ID, agentName
    """
    # get the mappings for the ids
    id_mappings = get_id_mappings()
    # read the (un-)anonymised prescreening data into a dataframe
    df1 = pd.read_csv("../data/anonymised_data/anonymised_data_prescreening_no_changes.csv")
    # create the Godin score column BEFORE copying the column layout, so the new
    # dataframe has the column from the start (fix: it was previously added
    # afterwards, so df2 only acquired the column through index enlargement)
    df1["Godin score"] = 0
    # second dataframe with the same columns, holding only the kept participants
    df2 = pd.DataFrame(columns=df1.columns)
    # per-column table mapping an identifying fragment of the textual answer to
    # the value it is recoded as; fragments are tested in order, mirroring the
    # original chain of ifs (a later match overwrites an earlier one)
    answer_recodings = {
        'TTM stage': [
            ("30", "Preparing"),        # "No, but I intend to in the next 30 days."
            ("6", "Contemplating"),     # "No, but I intend to in the next 6 months."
        ],
        'Feeling about creating plans in general': [
            ("Neutral", 0), ("Good (5)", 5), ("Bad (-5)", -5),
        ],
        'Feeling about creating plans for walks': [
            ("Neutral", 0), ("Good (5)", 5), ("Bad (-5)", -5),
        ],
        'Confidence': [
            ("Moderately confident", 5), ("Very confident (10)", 10),
            ("Not confident at all (0)", 0),
        ],
        'Perceived usefulness': [
            ("Neutral", 0),
            ("I think it can help me very much (10)", 10),
            ("I think it can hinder me very much (-10)", -10),
        ],
        'Attitude': [
            ("Neutral", 0), ("Good (10)", 10), ("Bad (-10)", -10),
        ],
    }
    # single pass over the original dataframe
    for index, row in df1.iterrows():
        # get the id of the participant
        current_id = row['Prolific id']
        # skip participants who were not kept in the final analysis
        # (fix: the original second pass also processed non-kept rows and wrote
        # them into df2 using df1's index, which only lines up when no row is
        # filtered out; now only kept rows are processed, at df2's own position)
        if current_id not in id_mappings:
            continue
        # replace the un-anonymised id with the anonymised one in the original
        # dataframe, then copy the row into the new dataframe
        df1.loc[index, ['Prolific id']] = id_mappings[current_id]
        df2.loc[len(df2.index)] = df1.loc[index]
        # position of the row just appended to df2
        target = len(df2.index) - 1
        # normalise the weekly physical activity answer; all kept participants
        # answered "Never (0 - 60 minutes per week)" (others were screened out)
        if "Never" in row['Weekly physical activity']:
            df2.loc[target, ['Weekly physical activity']] = "Never (0 - 60 minutes per week)"
        # recode the remaining textual answers to their numeric/categorical values
        for column, recodings in answer_recodings.items():
            answer = row[column]
            for fragment, value in recodings:
                if fragment in answer:
                    df2.loc[target, [column]] = value
        # Godin leisure-time score: 9 * strenuous + 5 * moderate + 3 * mild
        df2.loc[target, ['Godin score']] = (9 * row['Godin strenuous']
                                            + 5 * row['Godin moderate']
                                            + 3 * row['Godin mild'])
    # save the second dataframe to a csv, keeping only the important columns
    df2[["Prolific id", "Weekly physical activity", "TTM stage", "Feeling about creating plans in general",
         "Feeling about creating plans for walks", "Number of plans general", "Number of plans for walking",
         "Confidence", "Perceived usefulness", "Attitude", "Godin score"]].to_csv(
        "../data/anonymised_data/anonymised_data_prescreening_2.csv", index=False)
def process_and_anonymise_post_questionnaire_data():
    """
    Process and anonymise the demographic data gathered in the post questionnaire.

    Args: none.
    Returns: none.

    Fields in the original data which were removed because we could not publish them:
    StartDate, EndDate, Status, Progress, Duration (in seconds), Finished,
    RecordedDate, ResponseId, DistributionChannel, UserLanguage,
    Q_RecaptchaScore, STUDY_ID, SESSION_ID, agentName
    """
    # mapping from original Prolific ids to anonymised ids
    id_mappings = get_id_mappings()
    # load the (un-)anonymised post questionnaire answers
    answers = pd.read_csv("../data/anonymised_data/anonymised_data_post_questionnaire.csv")
    # recode the textual Likert answers as their numeric values
    # (exact cell-value matches, so one combined replace is equivalent)
    answers = answers.replace({"disagree": -3, "neither agree nor disagree": 0, "agree": 3})
    # new dataframe with the same columns, holding only the kept participants
    kept_rows = pd.DataFrame(columns=answers.columns)
    for index, row in answers.iterrows():
        original_id = row['Prolific id']
        # ignore participants who were not kept in the final analysis
        if original_id not in id_mappings:
            continue
        # swap in the anonymised id, then append the row to the new dataframe
        answers.loc[index, ['Prolific id']] = id_mappings[original_id]
        kept_rows.loc[len(kept_rows.index)] = answers.loc[index]
    # save the anonymised answers to a csv
    kept_rows.to_csv("../data/anonymised_data/anonymised_data_post_questionnaire_2.csv", index=False)
Running any of the functions defined above (except get_id_mappings()) produces one or more new csv files with the anonymised data specified in the name of the function. The input data is already anonymised (as we could not publish the raw data), so it is possible to compare the newly created files against the input files. Examples are shown below.
# run the function for anonymising the demographic data
process_and_anonymise_demographic_data()
Then, the files anonymised_data_demographic.csv
and anonymised_data_demographic_2.csv
can be compared.
Note: if the file does not appear right away in the file browser on the left, click the ⟳ (Refresh the file browser) button.
The code below removes all newly created files.
# Remove all files created by re-running the functions above.
# list all the files in the data/anonymised_data directory
for filename in os.listdir("../data/anonymised_data"):
    # files created by this notebook all have "_2" in their name;
    # anything else is an original input file and must be kept
    if "_2" in filename:
        # fix: the f-string previously contained a literal "(unknown)" placeholder
        # instead of interpolating the filename, so nothing was ever deleted
        os.remove(f"../data/anonymised_data/{filename}")