##### Calculating movable wealth segregation with movable wealth divided by the number of adults in a household #####
# 01_load_and_process_data script
# 30/07/2025
# Movable wealth segregation with local environments calculated with a radius of 500m and 4000m
#####

# load the required packages
library(haven)
library(ineq)
library(DescTools)
library(tidyverse)
library(OasisR)
library(seg)
library(sf)
library(sp)

# Do not use scientific notation
options(scipen = 9999999)

# Load the database about household characteristics
households <- read_sav("G:/Bevolking/GBAHUISHOUDENSBUS/GBAHUISHOUDENS2022BUSV1.sav")

# Use mutate() to create a new column "ID" by combining the RINPERSOON and RINPERSOONS columns of the databases of interest
households <- mutate(households, ID = paste(households$RINPERSOONS, households$RINPERSOON))

# make the households database more readable
households <- rename(households, start_date = DATUMAANVANGHH)
households <- rename(households, end_date = DATUMEINDEHH)

# Delete households that ceased to exist before the 1st January of 2011
households <- households %>%
  mutate(start_date = as.Date(as.character(start_date), format = "%Y%m%d"),
         end_date = as.Date(as.character(end_date), format = "%Y%m%d")) %>%
  filter(end_date > as.Date("2011-01-01"))

# Delete superflous information in households database due to its huge size and our limited computational power
households <- households %>%
  select(ID, start_date, end_date, AANTALPERSHH, AANTALKINDHH)

#### Radius = 500 m ####

#### Calculate Spatial Information Theory Index for movable wealth segregation per adult capita in 2022 ####

# extract observations from the "address" database corresponding to the year 2022, taking the 1st of January as reference
households_2022 <- households %>%
  #mutate(start_date = as.Date(as.character(start_date), format = "%Y%m%d"),
  #end_date = as.Date(as.character(end_date), format = "%Y%m%d")) %>%
  filter(start_date < as.Date("2022-01-02") & end_date > as.Date("2022-01-01"))

# Load the data from 2022
wealth2022 <- read.csv("raw_data/database_2022.csv") # only if not already loaded

# Merge both databases
wealth2022 <- merge(wealth2022, households_2022, by ="ID", all.x = TRUE, all.y = FALSE)

# Calculate the number of adults
wealth2022 <- wealth2022 %>%
  mutate(number_of_adults = AANTALPERSHH - AANTALKINDHH)

# Delete missing values of wealth
wealth2022 <- wealth2022[wealth2022$VEHW1110FINH != 99999999999, ]
wealth2022 <- wealth2022[wealth2022$VEHW1130ONDH != 99999999999, ]
wealth2022 <- wealth2022[wealth2022$VEHW1140ABEH != 99999999999, ]
wealth2022 <- wealth2022[wealth2022$VEHW1150OVEH != 99999999999, ]

# Calculate movable wealth
wealth2022$Movable_wealth <- wealth2022$VEHW1110FINH + wealth2022$VEHW1130ONDH + wealth2022$VEHW1140ABEH + wealth2022$VEHW1150OVEH

# Calculate movable wealth per adult capita
wealth2022$Movable_wealth_per_adult_capita <- (wealth2022$Movable_wealth/wealth2022$number_of_adults)

# Define all possible FUAs
cities <- c("'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar","Almelo", "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem", "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven", "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden", "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal", "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal", "Venlo", "Zwolle")

# Order values according to their real estate wealth within every FUA
wealth2022 <- wealth2022 %>%
  arrange(FUA, Movable_wealth_per_adult_capita)

# Calculate the wealth percentile rank every household belongs to in their FUA
wealth2022 <- wealth2022 %>%
  group_by(FUA) %>% 
  mutate(Movable_wealth_per_adult_capita_percentile = ntile(Movable_wealth_per_adult_capita, 100))

# Clean rows of missing data 
wealth2022 <- wealth2022 %>%
  filter(VRLVIERKANT100M != "----------")

# Correct coordinates to make sure they represent meters and not kilometers
wealth2022 <- wealth2022 %>%
  mutate(VRLVIERKANT100M = gsub("(E)(\\d+)(N)(\\d+)","\\1\\200\\3\\400", VRLVIERKANT100M))

# Loop over each city (sigma = 500m)
for (city in cities) {
  # Set the variable city for the given city
  City <- city
  
  # Re-start the original database
  df <- wealth2022
  
  # Create the dataframe out of the wealth database  for the FUA we are interested on
  df <- df %>%
    filter(FUA == City)
  
  # Keep only relevant columns
  df <- df %>%
    select(VRLVIERKANT100M, Movable_wealth_per_adult_capita_percentile)
  df$FUA <- NULL
  
  # create a pivot table
  pivot <- df %>%
    group_by(VRLVIERKANT100M, Movable_wealth_per_adult_capita_percentile) %>%
    summarise(Population = n()) %>%
    tidyr::pivot_wider(names_from = Movable_wealth_per_adult_capita_percentile, values_from = Population, values_fill = 0)
  
  # Create an element for only the coordinates of the grid cells
  grid_cells <- pivot %>%
    select(VRLVIERKANT100M)
  
  # Parse the coordinates
  grid_cells <- grid_cells %>%
    mutate(
      x = as.numeric(sub("E", "", str_extract(VRLVIERKANT100M, "E\\d+"))),
      y = as.numeric(sub("N", "", str_extract(VRLVIERKANT100M, "N\\d+"))),
    )
  
  # Delete superflous columns for coordinates
  grid_cells$VRLVIERKANT100M <- NULL
  
  # Create a spatial object for grid cells
  Spatial_grid_cells <- st_as_sf(grid_cells, coords = c("x", "y"), crs = 28992)
  coords <- st_coordinates(Spatial_grid_cells)
  
  # Create SpatialPoints object
  Spatial_grid_cells_sp <- SpatialPoints(coords, proj4string = CRS("+init=epsg:28992"))
  
  # Delete superflous columns for the income percentiles data
  pivot$VRLVIERKANT100M <- NULL
  
  # Order columns in the right order
  col_names <- as.numeric(colnames(pivot))
  
  # Re-order the dataframe
  pivot <- pivot[, order(col_names)]
  
  # Initialize a list to store each database for the income percentile pairwise calculation
  list_of_databases <- list()
  
  # Loop through each combination
  for(i in 1:(ncol(pivot)-1)) {
    # Calculate the cumulative sum of people up to the current percentile
    sumUpToCurrentPerc <- rowSums(pivot[, 1:i], na.rm = TRUE)
    
    # Calculate the sum of people above the current percentile
    sumAboveCurrentPerc <- rowSums(pivot[, (i+1):ncol(pivot)], na.rm = TRUE)
    
    # Create a new dataframe for the current percentile
    newData <- data.frame(sumUpToCurrentPerc = sumUpToCurrentPerc, sumAboveCurrentPerc = sumAboveCurrentPerc)
    
    # Store it in the list
    list_of_databases[[i]] <- newData
  }
  
  # Extract specific databases from it
  indices <- seq(1,99)
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    assign(database_name,list_of_databases[[i]])
  }
  
  # Initialize a list for storing SITI values
  SITI_results <- list()
  
  # Loop through the indices and calculate SITI results
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    SITI_results_name <- paste0("SITI_result", i)
    
    # Calculate SITI
    assign(SITI_results_name, spseg(x = Spatial_grid_cells_sp, data = get(database_name), method = "information", smoothing = "kernel", sigma = 500, useC = FALSE))
    
    # Store the result in the list
    SITI_results[[SITI_results_name]] <- get(SITI_results_name)
    
  }
  
  # Transform the results into something readable
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    
    #Transform the SITI result
    assign(SITI_result_name, get(SITI_result_name)@h)
  }
  
  
  # Prepare them for being saved and plotted
  combined_data_list <- list()
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    df <- data.frame(Value = get(SITI_result_name), Group = as.character(i))
    combined_data_list[[SITI_result_name]] <- df
  }
  
  # Make them be in a single database
  combined_data <- do.call(rbind, combined_data_list)
  
  # Convert the Group column to a factor with custom levels
  combined_data$Group <- factor(combined_data$Group, levels = as.character(indices))
  
  # Save the data
  write.csv(combined_data, file = paste0("Movable_wealth_per_adult_capita_segregation_500m_2022_", City, ".csv"), row.names = FALSE)
}

# Delete data for enabling further operations
rm(wealth2022)
rm(households_2022)



#### Calculate Spatial Information Theory Index for movable wealth segregation per adult capita in 2021 ####

# extract observations from the "address" database corresponding to the year 2021, taking the 1st of January as reference
households_2021 <- households %>%
  #mutate(start_date = as.Date(as.character(start_date), format = "%Y%m%d"),
  #end_date = as.Date(as.character(end_date), format = "%Y%m%d")) %>%
  filter(start_date < as.Date("2021-01-02") & end_date > as.Date("2021-01-01"))

# Load the data from 2021
wealth2021 <- read.csv("raw_data/database_2021.csv") # only if not already loaded

# Merge both databases
wealth2021 <- merge(wealth2021, households_2021, by ="ID", all.x = TRUE, all.y = FALSE)

# Calculate the number of adults
wealth2021 <- wealth2021 %>%
  mutate(number_of_adults = AANTALPERSHH - AANTALKINDHH)

# Delete missing values of wealth
wealth2021 <- wealth2021[wealth2021$VEHW1110FINH != 99999999999, ]
wealth2021 <- wealth2021[wealth2021$VEHW1130ONDH != 99999999999, ]
wealth2021 <- wealth2021[wealth2021$VEHW1140ABEH != 99999999999, ]
wealth2021 <- wealth2021[wealth2021$VEHW1150OVEH != 99999999999, ]

# Calculate movable wealth
wealth2021$Movable_wealth <- wealth2021$VEHW1110FINH + wealth2021$VEHW1130ONDH + wealth2021$VEHW1140ABEH + wealth2021$VEHW1150OVEH

# Calculate movable wealth per adult capita
wealth2021$Movable_wealth_per_adult_capita <- (wealth2021$Movable_wealth/wealth2021$number_of_adults)

# Define all possible FUAs
cities <- c("'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar","Almelo", "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem", "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven", "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden", "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal", "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal", "Venlo", "Zwolle")

# Order values according to their real estate wealth within every FUA
wealth2021 <- wealth2021 %>%
  arrange(FUA, Movable_wealth_per_adult_capita)

# Calculate the wealth percentile rank every household belongs to in their FUA
wealth2021 <- wealth2021 %>%
  group_by(FUA) %>% 
  mutate(Movable_wealth_per_adult_capita_percentile = ntile(Movable_wealth_per_adult_capita, 100))

# Clean rows of missing data 
wealth2021 <- wealth2021 %>%
  filter(VRLVIERKANT100M != "----------")

# Correct coordinates to make sure they represent meters and not kilometers
wealth2021 <- wealth2021 %>%
  mutate(VRLVIERKANT100M = gsub("(E)(\\d+)(N)(\\d+)","\\1\\200\\3\\400", VRLVIERKANT100M))

# Loop over each city (sigma = 500m)
for (city in cities) {
  # Set the variable city for the given city
  City <- city
  
  # Re-start the original database
  df <- wealth2021
  
  # Create the dataframe out of the wealth database  for the FUA we are interested on
  df <- df %>%
    filter(FUA == City)
  
  # Keep only relevant columns
  df <- df %>%
    select(VRLVIERKANT100M, Movable_wealth_per_adult_capita_percentile)
  df$FUA <- NULL
  
  # create a pivot table
  pivot <- df %>%
    group_by(VRLVIERKANT100M, Movable_wealth_per_adult_capita_percentile) %>%
    summarise(Population = n()) %>%
    tidyr::pivot_wider(names_from = Movable_wealth_per_adult_capita_percentile, values_from = Population, values_fill = 0)
  
  # Create an element for only the coordinates of the grid cells
  grid_cells <- pivot %>%
    select(VRLVIERKANT100M)
  
  # Parse the coordinates
  grid_cells <- grid_cells %>%
    mutate(
      x = as.numeric(sub("E", "", str_extract(VRLVIERKANT100M, "E\\d+"))),
      y = as.numeric(sub("N", "", str_extract(VRLVIERKANT100M, "N\\d+"))),
    )
  
  # Delete superflous columns for coordinates
  grid_cells$VRLVIERKANT100M <- NULL
  
  # Create a spatial object for grid cells
  Spatial_grid_cells <- st_as_sf(grid_cells, coords = c("x", "y"), crs = 28992)
  coords <- st_coordinates(Spatial_grid_cells)
  
  # Create SpatialPoints object
  Spatial_grid_cells_sp <- SpatialPoints(coords, proj4string = CRS("+init=epsg:28992"))
  
  # Delete superflous columns for the income percentiles data
  pivot$VRLVIERKANT100M <- NULL
  
  # Order columns in the right order
  col_names <- as.numeric(colnames(pivot))
  
  # Re-order the dataframe
  pivot <- pivot[, order(col_names)]
  
  # Initialize a list to store each database for the income percentile pairwise calculation
  list_of_databases <- list()
  
  # Loop through each combination
  for(i in 1:(ncol(pivot)-1)) {
    # Calculate the cumulative sum of people up to the current percentile
    sumUpToCurrentPerc <- rowSums(pivot[, 1:i], na.rm = TRUE)
    
    # Calculate the sum of people above the current percentile
    sumAboveCurrentPerc <- rowSums(pivot[, (i+1):ncol(pivot)], na.rm = TRUE)
    
    # Create a new dataframe for the current percentile
    newData <- data.frame(sumUpToCurrentPerc = sumUpToCurrentPerc, sumAboveCurrentPerc = sumAboveCurrentPerc)
    
    # Store it in the list
    list_of_databases[[i]] <- newData
  }
  
  # Extract specific databases from it
  indices <- seq(1,99)
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    assign(database_name,list_of_databases[[i]])
  }
  
  # Initialize a list for storing SITI values
  SITI_results <- list()
  
  # Loop through the indices and calculate SITI results
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    SITI_results_name <- paste0("SITI_result", i)
    
    # Calculate SITI
    assign(SITI_results_name, spseg(x = Spatial_grid_cells_sp, data = get(database_name), method = "information", smoothing = "kernel", sigma = 500, useC = FALSE))
    
    # Store the result in the list
    SITI_results[[SITI_results_name]] <- get(SITI_results_name)
    
  }
  
  # Transform the results into something readable
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    
    #Transform the SITI result
    assign(SITI_result_name, get(SITI_result_name)@h)
  }
  
  
  # Prepare them for being saved and plotted
  combined_data_list <- list()
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    df <- data.frame(Value = get(SITI_result_name), Group = as.character(i))
    combined_data_list[[SITI_result_name]] <- df
  }
  
  # Make them be in a single database
  combined_data <- do.call(rbind, combined_data_list)
  
  # Convert the Group column to a factor with custom levels
  combined_data$Group <- factor(combined_data$Group, levels = as.character(indices))
  
  # Save the data
  write.csv(combined_data, file = paste0("Movable_wealth_per_adult_capita_segregation_500m_2021_", City, ".csv"), row.names = FALSE)
}

# Delete data for enabling further operations
rm(wealth2021)
rm(households_2021)


#### Calculate Spatial Information Theory Index for movable wealth segregation per adult capita in 2020 ####

# extract observations from the "address" database corresponding to the year 2020, taking the 1st of January as reference
households_2020 <- households %>%
  #mutate(start_date = as.Date(as.character(start_date), format = "%Y%m%d"),
  #end_date = as.Date(as.character(end_date), format = "%Y%m%d")) %>%
  filter(start_date < as.Date("2020-01-02") & end_date > as.Date("2020-01-01"))

# Load the data from 2020
wealth2020 <- read.csv("raw_data/database_2020.csv") # only if not already loaded

# Merge both databases
wealth2020 <- merge(wealth2020, households_2020, by ="ID", all.x = TRUE, all.y = FALSE)

# Calculate the number of adults
wealth2020 <- wealth2020 %>%
  mutate(number_of_adults = AANTALPERSHH - AANTALKINDHH)

# Delete missing values of wealth
wealth2020 <- wealth2020[wealth2020$VEHW1110FINH != 99999999999, ]
wealth2020 <- wealth2020[wealth2020$VEHW1130ONDH != 99999999999, ]
wealth2020 <- wealth2020[wealth2020$VEHW1140ABEH != 99999999999, ]
wealth2020 <- wealth2020[wealth2020$VEHW1150OVEH != 99999999999, ]

# Calculate movable wealth
wealth2020$Movable_wealth <- wealth2020$VEHW1110FINH + wealth2020$VEHW1130ONDH + wealth2020$VEHW1140ABEH + wealth2020$VEHW1150OVEH

# Calculate movable wealth per adult capita
wealth2020$Movable_wealth_per_adult_capita <- (wealth2020$Movable_wealth/wealth2020$number_of_adults)

# Define all possible FUAs
cities <- c("'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar","Almelo", "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem", "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven", "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden", "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal", "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal", "Venlo", "Zwolle")

# Order values according to their real estate wealth within every FUA
wealth2020 <- wealth2020 %>%
  arrange(FUA, Movable_wealth_per_adult_capita)

# Calculate the wealth percentile rank every household belongs to in their FUA
wealth2020 <- wealth2020 %>%
  group_by(FUA) %>% 
  mutate(Movable_wealth_per_adult_capita_percentile = ntile(Movable_wealth_per_adult_capita, 100))

# Clean rows of missing data 
wealth2020 <- wealth2020 %>%
  filter(VRLVIERKANT100M != "----------")

# Correct coordinates to make sure they represent meters and not kilometers
wealth2020 <- wealth2020 %>%
  mutate(VRLVIERKANT100M = gsub("(E)(\\d+)(N)(\\d+)","\\1\\200\\3\\400", VRLVIERKANT100M))

# Loop over each city (sigma = 500m)
for (city in cities) {
  # Set the variable city for the given city
  City <- city
  
  # Re-start the original database
  df <- wealth2020
  
  # Create the dataframe out of the wealth database  for the FUA we are interested on
  df <- df %>%
    filter(FUA == City)
  
  # Keep only relevant columns
  df <- df %>%
    select(VRLVIERKANT100M, Movable_wealth_per_adult_capita_percentile)
  df$FUA <- NULL
  
  # create a pivot table
  pivot <- df %>%
    group_by(VRLVIERKANT100M, Movable_wealth_per_adult_capita_percentile) %>%
    summarise(Population = n()) %>%
    tidyr::pivot_wider(names_from = Movable_wealth_per_adult_capita_percentile, values_from = Population, values_fill = 0)
  
  # Create an element for only the coordinates of the grid cells
  grid_cells <- pivot %>%
    select(VRLVIERKANT100M)
  
  # Parse the coordinates
  grid_cells <- grid_cells %>%
    mutate(
      x = as.numeric(sub("E", "", str_extract(VRLVIERKANT100M, "E\\d+"))),
      y = as.numeric(sub("N", "", str_extract(VRLVIERKANT100M, "N\\d+"))),
    )
  
  # Delete superflous columns for coordinates
  grid_cells$VRLVIERKANT100M <- NULL
  
  # Create a spatial object for grid cells
  Spatial_grid_cells <- st_as_sf(grid_cells, coords = c("x", "y"), crs = 28992)
  coords <- st_coordinates(Spatial_grid_cells)
  
  # Create SpatialPoints object
  Spatial_grid_cells_sp <- SpatialPoints(coords, proj4string = CRS("+init=epsg:28992"))
  
  # Delete superflous columns for the income percentiles data
  pivot$VRLVIERKANT100M <- NULL
  
  # Order columns in the right order
  col_names <- as.numeric(colnames(pivot))
  
  # Re-order the dataframe
  pivot <- pivot[, order(col_names)]
  
  # Initialize a list to store each database for the income percentile pairwise calculation
  list_of_databases <- list()
  
  # Loop through each combination
  for(i in 1:(ncol(pivot)-1)) {
    # Calculate the cumulative sum of people up to the current percentile
    sumUpToCurrentPerc <- rowSums(pivot[, 1:i], na.rm = TRUE)
    
    # Calculate the sum of people above the current percentile
    sumAboveCurrentPerc <- rowSums(pivot[, (i+1):ncol(pivot)], na.rm = TRUE)
    
    # Create a new dataframe for the current percentile
    newData <- data.frame(sumUpToCurrentPerc = sumUpToCurrentPerc, sumAboveCurrentPerc = sumAboveCurrentPerc)
    
    # Store it in the list
    list_of_databases[[i]] <- newData
  }
  
  # Extract specific databases from it
  indices <- seq(1,99)
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    assign(database_name,list_of_databases[[i]])
  }
  
  # Initialize a list for storing SITI values
  SITI_results <- list()
  
  # Loop through the indices and calculate SITI results
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    SITI_results_name <- paste0("SITI_result", i)
    
    # Calculate SITI
    assign(SITI_results_name, spseg(x = Spatial_grid_cells_sp, data = get(database_name), method = "information", smoothing = "kernel", sigma = 500, useC = FALSE))
    
    # Store the result in the list
    SITI_results[[SITI_results_name]] <- get(SITI_results_name)
    
  }
  
  # Transform the results into something readable
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    
    #Transform the SITI result
    assign(SITI_result_name, get(SITI_result_name)@h)
  }
  
  
  # Prepare them for being saved and plotted
  combined_data_list <- list()
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    df <- data.frame(Value = get(SITI_result_name), Group = as.character(i))
    combined_data_list[[SITI_result_name]] <- df
  }
  
  # Make them be in a single database
  combined_data <- do.call(rbind, combined_data_list)
  
  # Convert the Group column to a factor with custom levels
  combined_data$Group <- factor(combined_data$Group, levels = as.character(indices))
  
  # Save the data
  write.csv(combined_data, file = paste0("Movable_wealth_per_adult_capita_segregation_500m_2020_", City, ".csv"), row.names = FALSE)
}

# Delete data for enabling further operations
rm(wealth2020)
rm(households_2020)



#### Calculate Spatial Information Theory Index for movable wealth segregation per adult capita in 2019 ####

# extract observations from the "address" database corresponding to the year 2019, taking the 1st of January as reference
households_2019 <- households %>%
  #mutate(start_date = as.Date(as.character(start_date), format = "%Y%m%d"),
  #end_date = as.Date(as.character(end_date), format = "%Y%m%d")) %>%
  filter(start_date < as.Date("2019-01-02") & end_date > as.Date("2019-01-01"))

# Load the data from 2019
wealth2019 <- read.csv("raw_data/database_2019.csv") # only if not already loaded

# Merge both databases
wealth2019 <- merge(wealth2019, households_2019, by ="ID", all.x = TRUE, all.y = FALSE)

# Calculate the number of adults
wealth2019 <- wealth2019 %>%
  mutate(number_of_adults = AANTALPERSHH - AANTALKINDHH)

# Delete missing values of wealth
wealth2019 <- wealth2019[wealth2019$VEHW1110FINH != 99999999999, ]
wealth2019 <- wealth2019[wealth2019$VEHW1130ONDH != 99999999999, ]
wealth2019 <- wealth2019[wealth2019$VEHW1140ABEH != 99999999999, ]
wealth2019 <- wealth2019[wealth2019$VEHW1150OVEH != 99999999999, ]

# Calculate movable wealth
wealth2019$Movable_wealth <- wealth2019$VEHW1110FINH + wealth2019$VEHW1130ONDH + wealth2019$VEHW1140ABEH + wealth2019$VEHW1150OVEH

# Calculate movable wealth per adult capita
wealth2019$Movable_wealth_per_adult_capita <- (wealth2019$Movable_wealth/wealth2019$number_of_adults)

# Define all possible FUAs
cities <- c("'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar","Almelo", "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem", "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven", "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden", "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal", "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal", "Venlo", "Zwolle")

# Order values according to their real estate wealth within every FUA
wealth2019 <- wealth2019 %>%
  arrange(FUA, Movable_wealth_per_adult_capita)

# Calculate the wealth percentile rank every household belongs to in their FUA
wealth2019 <- wealth2019 %>%
  group_by(FUA) %>% 
  mutate(Movable_wealth_per_adult_capita_percentile = ntile(Movable_wealth_per_adult_capita, 100))

# Clean rows of missing data 
wealth2019 <- wealth2019 %>%
  filter(VRLVIERKANT100M != "----------")

# Correct coordinates to make sure they represent meters and not kilometers
wealth2019 <- wealth2019 %>%
  mutate(VRLVIERKANT100M = gsub("(E)(\\d+)(N)(\\d+)","\\1\\200\\3\\400", VRLVIERKANT100M))

# Loop over each city (sigma = 500m)
for (city in cities) {
  # Set the variable city for the given city
  City <- city
  
  # Re-start the original database
  df <- wealth2019
  
  # Create the dataframe out of the wealth database  for the FUA we are interested on
  df <- df %>%
    filter(FUA == City)
  
  # Keep only relevant columns
  df <- df %>%
    select(VRLVIERKANT100M, Movable_wealth_per_adult_capita_percentile)
  df$FUA <- NULL
  
  # create a pivot table
  pivot <- df %>%
    group_by(VRLVIERKANT100M, Movable_wealth_per_adult_capita_percentile) %>%
    summarise(Population = n()) %>%
    tidyr::pivot_wider(names_from = Movable_wealth_per_adult_capita_percentile, values_from = Population, values_fill = 0)
  
  # Create an element for only the coordinates of the grid cells
  grid_cells <- pivot %>%
    select(VRLVIERKANT100M)
  
  # Parse the coordinates
  grid_cells <- grid_cells %>%
    mutate(
      x = as.numeric(sub("E", "", str_extract(VRLVIERKANT100M, "E\\d+"))),
      y = as.numeric(sub("N", "", str_extract(VRLVIERKANT100M, "N\\d+"))),
    )
  
  # Delete superflous columns for coordinates
  grid_cells$VRLVIERKANT100M <- NULL
  
  # Create a spatial object for grid cells
  Spatial_grid_cells <- st_as_sf(grid_cells, coords = c("x", "y"), crs = 28992)
  coords <- st_coordinates(Spatial_grid_cells)
  
  # Create SpatialPoints object
  Spatial_grid_cells_sp <- SpatialPoints(coords, proj4string = CRS("+init=epsg:28992"))
  
  # Delete superflous columns for the income percentiles data
  pivot$VRLVIERKANT100M <- NULL
  
  # Order columns in the right order
  col_names <- as.numeric(colnames(pivot))
  
  # Re-order the dataframe
  pivot <- pivot[, order(col_names)]
  
  # Initialize a list to store each database for the income percentile pairwise calculation
  list_of_databases <- list()
  
  # Loop through each combination
  for(i in 1:(ncol(pivot)-1)) {
    # Calculate the cumulative sum of people up to the current percentile
    sumUpToCurrentPerc <- rowSums(pivot[, 1:i], na.rm = TRUE)
    
    # Calculate the sum of people above the current percentile
    sumAboveCurrentPerc <- rowSums(pivot[, (i+1):ncol(pivot)], na.rm = TRUE)
    
    # Create a new dataframe for the current percentile
    newData <- data.frame(sumUpToCurrentPerc = sumUpToCurrentPerc, sumAboveCurrentPerc = sumAboveCurrentPerc)
    
    # Store it in the list
    list_of_databases[[i]] <- newData
  }
  
  # Extract specific databases from it
  indices <- seq(1,99)
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    assign(database_name,list_of_databases[[i]])
  }
  
  # Initialize a list for storing SITI values
  SITI_results <- list()
  
  # Loop through the indices and calculate SITI results
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    SITI_results_name <- paste0("SITI_result", i)
    
    # Calculate SITI
    assign(SITI_results_name, spseg(x = Spatial_grid_cells_sp, data = get(database_name), method = "information", smoothing = "kernel", sigma = 500, useC = FALSE))
    
    # Store the result in the list
    SITI_results[[SITI_results_name]] <- get(SITI_results_name)
    
  }
  
  # Transform the results into something readable
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    
    #Transform the SITI result
    assign(SITI_result_name, get(SITI_result_name)@h)
  }
  
  
  # Prepare them for being saved and plotted
  combined_data_list <- list()
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    df <- data.frame(Value = get(SITI_result_name), Group = as.character(i))
    combined_data_list[[SITI_result_name]] <- df
  }
  
  # Make them be in a single database
  combined_data <- do.call(rbind, combined_data_list)
  
  # Convert the Group column to a factor with custom levels
  combined_data$Group <- factor(combined_data$Group, levels = as.character(indices))
  
  # Save the data
  write.csv(combined_data, file = paste0("Movable_wealth_per_adult_capita_segregation_500m_2019_", City, ".csv"), row.names = FALSE)
}

# Delete data for enabling further operations
rm(wealth2019)
rm(households_2019)



#### Calculate Spatial Information Theory Index for movable wealth segregation per adult capita in 2018 ####

# extract observations from the "address" database corresponding to the year 2018, taking the 1st of January as reference
households_2018 <- households %>%
  #mutate(start_date = as.Date(as.character(start_date), format = "%Y%m%d"),
  #end_date = as.Date(as.character(end_date), format = "%Y%m%d")) %>%
  filter(start_date < as.Date("2018-01-02") & end_date > as.Date("2018-01-01"))

# Load the data from 2018
wealth2018 <- read.csv("raw_data/database_2018.csv") # only if not already loaded

# Merge both databases
wealth2018 <- merge(wealth2018, households_2018, by ="ID", all.x = TRUE, all.y = FALSE)

# Calculate the number of adults
wealth2018 <- wealth2018 %>%
  mutate(number_of_adults = AANTALPERSHH - AANTALKINDHH)

# Delete missing values of wealth
wealth2018 <- wealth2018[wealth2018$VEHW1110FINH != 99999999999, ]
wealth2018 <- wealth2018[wealth2018$VEHW1130ONDH != 99999999999, ]
wealth2018 <- wealth2018[wealth2018$VEHW1140ABEH != 99999999999, ]
wealth2018 <- wealth2018[wealth2018$VEHW1150OVEH != 99999999999, ]

# Calculate movable wealth
wealth2018$Movable_wealth <- wealth2018$VEHW1110FINH + wealth2018$VEHW1130ONDH + wealth2018$VEHW1140ABEH + wealth2018$VEHW1150OVEH

# Calculate movable wealth per adult capita
wealth2018$Movable_wealth_per_adult_capita <- (wealth2018$Movable_wealth/wealth2018$number_of_adults)

# Define all possible FUAs
cities <- c("'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar","Almelo", "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem", "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven", "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden", "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal", "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal", "Venlo", "Zwolle")

# Order values according to their real estate wealth within every FUA
wealth2018 <- wealth2018 %>%
  arrange(FUA, Movable_wealth_per_adult_capita)

# Calculate the wealth percentile rank every household belongs to in their FUA
wealth2018 <- wealth2018 %>%
  group_by(FUA) %>% 
  mutate(Movable_wealth_per_adult_capita_percentile = ntile(Movable_wealth_per_adult_capita, 100))

# Clean rows of missing data 
wealth2018 <- wealth2018 %>%
  filter(VRLVIERKANT100M != "----------")

# Correct coordinates to make sure they represent meters and not kilometers
wealth2018 <- wealth2018 %>%
  mutate(VRLVIERKANT100M = gsub("(E)(\\d+)(N)(\\d+)","\\1\\200\\3\\400", VRLVIERKANT100M))

# Loop over each city (sigma = 500m)
for (city in cities) {
  # Set the variable city for the given city
  City <- city
  
  # Re-start the original database
  df <- wealth2018
  
  # Create the dataframe out of the wealth database  for the FUA we are interested on
  df <- df %>%
    filter(FUA == City)
  
  # Keep only relevant columns
  df <- df %>%
    select(VRLVIERKANT100M, Movable_wealth_per_adult_capita_percentile)
  df$FUA <- NULL
  
  # create a pivot table
  pivot <- df %>%
    group_by(VRLVIERKANT100M, Movable_wealth_per_adult_capita_percentile) %>%
    summarise(Population = n()) %>%
    tidyr::pivot_wider(names_from = Movable_wealth_per_adult_capita_percentile, values_from = Population, values_fill = 0)
  
  # Create an element for only the coordinates of the grid cells
  grid_cells <- pivot %>%
    select(VRLVIERKANT100M)
  
  # Parse the coordinates
  grid_cells <- grid_cells %>%
    mutate(
      x = as.numeric(sub("E", "", str_extract(VRLVIERKANT100M, "E\\d+"))),
      y = as.numeric(sub("N", "", str_extract(VRLVIERKANT100M, "N\\d+"))),
    )
  
  # Delete superflous columns for coordinates
  grid_cells$VRLVIERKANT100M <- NULL
  
  # Create a spatial object for grid cells
  Spatial_grid_cells <- st_as_sf(grid_cells, coords = c("x", "y"), crs = 28992)
  coords <- st_coordinates(Spatial_grid_cells)
  
  # Create SpatialPoints object
  Spatial_grid_cells_sp <- SpatialPoints(coords, proj4string = CRS("+init=epsg:28992"))
  
  # Delete superflous columns for the income percentiles data
  pivot$VRLVIERKANT100M <- NULL
  
  # Order columns in the right order
  col_names <- as.numeric(colnames(pivot))
  
  # Re-order the dataframe
  pivot <- pivot[, order(col_names)]
  
  # Initialize a list to store each database for the income percentile pairwise calculation
  list_of_databases <- list()
  
  # Loop through each combination
  for(i in 1:(ncol(pivot)-1)) {
    # Calculate the cumulative sum of people up to the current percentile
    sumUpToCurrentPerc <- rowSums(pivot[, 1:i], na.rm = TRUE)
    
    # Calculate the sum of people above the current percentile
    sumAboveCurrentPerc <- rowSums(pivot[, (i+1):ncol(pivot)], na.rm = TRUE)
    
    # Create a new dataframe for the current percentile
    newData <- data.frame(sumUpToCurrentPerc = sumUpToCurrentPerc, sumAboveCurrentPerc = sumAboveCurrentPerc)
    
    # Store it in the list
    list_of_databases[[i]] <- newData
  }
  
  # Extract specific databases from it
  indices <- seq(1,99)
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    assign(database_name,list_of_databases[[i]])
  }
  
  # Initialize a list for storing SITI values
  SITI_results <- list()
  
  # Loop through the indices and calculate SITI results
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    SITI_results_name <- paste0("SITI_result", i)
    
    # Calculate SITI
    assign(SITI_results_name, spseg(x = Spatial_grid_cells_sp, data = get(database_name), method = "information", smoothing = "kernel", sigma = 500, useC = FALSE))
    
    # Store the result in the list
    SITI_results[[SITI_results_name]] <- get(SITI_results_name)
    
  }
  
  # Transform the results into something readable
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    
    #Transform the SITI result
    assign(SITI_result_name, get(SITI_result_name)@h)
  }
  
  
  # Prepare them for being saved and plotted
  combined_data_list <- list()
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    df <- data.frame(Value = get(SITI_result_name), Group = as.character(i))
    combined_data_list[[SITI_result_name]] <- df
  }
  
  # Make them be in a single database
  combined_data <- do.call(rbind, combined_data_list)
  
  # Convert the Group column to a factor with custom levels
  combined_data$Group <- factor(combined_data$Group, levels = as.character(indices))
  
  # Save the data
  write.csv(combined_data, file = paste0("Movable_wealth_per_adult_capita_segregation_500m_2018_", City, ".csv"), row.names = FALSE)
}

# Delete data for enabling further operations
rm(wealth2018)
rm(households_2018)



#### Calculate Spatial Information Theory Index for movable wealth segregation per adult capita in 2017 ####

# extract observations from the "address" database corresponding to the year 2017, taking the 1st of January as reference
households_2017 <- households %>%
  #mutate(start_date = as.Date(as.character(start_date), format = "%Y%m%d"),
  #end_date = as.Date(as.character(end_date), format = "%Y%m%d")) %>%
  filter(start_date < as.Date("2017-01-02") & end_date > as.Date("2017-01-01"))

# Load the data from 2017
wealth2017 <- read.csv("raw_data/database_2017.csv") # only if not already loaded

# Merge both databases
wealth2017 <- merge(wealth2017, households_2017, by ="ID", all.x = TRUE, all.y = FALSE)

# Calculate the number of adults
wealth2017 <- wealth2017 %>%
  mutate(number_of_adults = AANTALPERSHH - AANTALKINDHH)

# Delete missing values of wealth
wealth2017 <- wealth2017[wealth2017$VEHW1110FINH != 99999999999, ]
wealth2017 <- wealth2017[wealth2017$VEHW1130ONDH != 99999999999, ]
wealth2017 <- wealth2017[wealth2017$VEHW1140ABEH != 99999999999, ]
wealth2017 <- wealth2017[wealth2017$VEHW1150OVEH != 99999999999, ]

# Calculate movable wealth
wealth2017$Movable_wealth <- wealth2017$VEHW1110FINH + wealth2017$VEHW1130ONDH + wealth2017$VEHW1140ABEH + wealth2017$VEHW1150OVEH

# Calculate movable wealth per adult capita
wealth2017$Movable_wealth_per_adult_capita <- (wealth2017$Movable_wealth/wealth2017$number_of_adults)

# Define all possible FUAs
cities <- c("'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar","Almelo", "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem", "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven", "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden", "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal", "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal", "Venlo", "Zwolle")

# Order values according to their real estate wealth within every FUA
wealth2017 <- wealth2017 %>%
  arrange(FUA, Movable_wealth_per_adult_capita)

# Calculate the wealth percentile rank every household belongs to in their FUA
wealth2017 <- wealth2017 %>%
  group_by(FUA) %>% 
  mutate(Movable_wealth_per_adult_capita_percentile = ntile(Movable_wealth_per_adult_capita, 100))

# Clean rows of missing data 
wealth2017 <- wealth2017 %>%
  filter(VRLVIERKANT100M != "----------")

# Correct coordinates to make sure they represent meters and not kilometers
wealth2017 <- wealth2017 %>%
  mutate(VRLVIERKANT100M = gsub("(E)(\\d+)(N)(\\d+)","\\1\\200\\3\\400", VRLVIERKANT100M))

# Loop over each city (sigma = 500m)
for (city in cities) {
  # Set the variable city for the given city
  City <- city
  
  # Re-start the original database
  df <- wealth2017
  
  # Create the dataframe out of the wealth database  for the FUA we are interested on
  df <- df %>%
    filter(FUA == City)
  
  # Keep only relevant columns
  df <- df %>%
    select(VRLVIERKANT100M, Movable_wealth_per_adult_capita_percentile)
  df$FUA <- NULL
  
  # create a pivot table
  pivot <- df %>%
    group_by(VRLVIERKANT100M, Movable_wealth_per_adult_capita_percentile) %>%
    summarise(Population = n()) %>%
    tidyr::pivot_wider(names_from = Movable_wealth_per_adult_capita_percentile, values_from = Population, values_fill = 0)
  
  # Create an element for only the coordinates of the grid cells
  grid_cells <- pivot %>%
    select(VRLVIERKANT100M)
  
  # Parse the coordinates
  grid_cells <- grid_cells %>%
    mutate(
      x = as.numeric(sub("E", "", str_extract(VRLVIERKANT100M, "E\\d+"))),
      y = as.numeric(sub("N", "", str_extract(VRLVIERKANT100M, "N\\d+"))),
    )
  
  # Delete superflous columns for coordinates
  grid_cells$VRLVIERKANT100M <- NULL
  
  # Create a spatial object for grid cells
  Spatial_grid_cells <- st_as_sf(grid_cells, coords = c("x", "y"), crs = 28992)
  coords <- st_coordinates(Spatial_grid_cells)
  
  # Create SpatialPoints object
  Spatial_grid_cells_sp <- SpatialPoints(coords, proj4string = CRS("+init=epsg:28992"))
  
  # Delete superflous columns for the income percentiles data
  pivot$VRLVIERKANT100M <- NULL
  
  # Order columns in the right order
  col_names <- as.numeric(colnames(pivot))
  
  # Re-order the dataframe
  pivot <- pivot[, order(col_names)]
  
  # Initialize a list to store each database for the income percentile pairwise calculation
  list_of_databases <- list()
  
  # Loop through each combination
  for(i in 1:(ncol(pivot)-1)) {
    # Calculate the cumulative sum of people up to the current percentile
    sumUpToCurrentPerc <- rowSums(pivot[, 1:i], na.rm = TRUE)
    
    # Calculate the sum of people above the current percentile
    sumAboveCurrentPerc <- rowSums(pivot[, (i+1):ncol(pivot)], na.rm = TRUE)
    
    # Create a new dataframe for the current percentile
    newData <- data.frame(sumUpToCurrentPerc = sumUpToCurrentPerc, sumAboveCurrentPerc = sumAboveCurrentPerc)
    
    # Store it in the list
    list_of_databases[[i]] <- newData
  }
  
  # Extract specific databases from it
  indices <- seq(1,99)
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    assign(database_name,list_of_databases[[i]])
  }
  
  # Initialize a list for storing SITI values
  SITI_results <- list()
  
  # Loop through the indices and calculate SITI results
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    SITI_results_name <- paste0("SITI_result", i)
    
    # Calculate SITI
    assign(SITI_results_name, spseg(x = Spatial_grid_cells_sp, data = get(database_name), method = "information", smoothing = "kernel", sigma = 500, useC = FALSE))
    
    # Store the result in the list
    SITI_results[[SITI_results_name]] <- get(SITI_results_name)
    
  }
  
  # Transform the results into something readable
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    
    #Transform the SITI result
    assign(SITI_result_name, get(SITI_result_name)@h)
  }
  
  
  # Prepare them for being saved and plotted
  combined_data_list <- list()
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    df <- data.frame(Value = get(SITI_result_name), Group = as.character(i))
    combined_data_list[[SITI_result_name]] <- df
  }
  
  # Make them be in a single database
  combined_data <- do.call(rbind, combined_data_list)
  
  # Convert the Group column to a factor with custom levels
  combined_data$Group <- factor(combined_data$Group, levels = as.character(indices))
  
  # Save the data
  write.csv(combined_data, file = paste0("Movable_wealth_per_adult_capita_segregation_500m_2017_", City, ".csv"), row.names = FALSE)
}

# Delete data for enabling further operations
rm(wealth2017)
rm(households_2017)



#### Calculate Spatial Information Theory Index for movable wealth segregation per adult capita in 2016 ####

# extract observations from the "address" database corresponding to the year 2016, taking the 1st of January as reference
households_2016 <- households %>%
  #mutate(start_date = as.Date(as.character(start_date), format = "%Y%m%d"),
  #end_date = as.Date(as.character(end_date), format = "%Y%m%d")) %>%
  filter(start_date < as.Date("2016-01-02") & end_date > as.Date("2016-01-01"))

# Load the data from 2016
wealth2016 <- read.csv("raw_data/database_2016.csv") # only if not already loaded

# Merge both databases
wealth2016 <- merge(wealth2016, households_2016, by ="ID", all.x = TRUE, all.y = FALSE)

# Calculate the number of adults
wealth2016 <- wealth2016 %>%
  mutate(number_of_adults = AANTALPERSHH - AANTALKINDHH)

# Delete missing values of wealth
wealth2016 <- wealth2016[wealth2016$VEHW1110FINH != 99999999999, ]
wealth2016 <- wealth2016[wealth2016$VEHW1130ONDH != 99999999999, ]
wealth2016 <- wealth2016[wealth2016$VEHW1140ABEH != 99999999999, ]
wealth2016 <- wealth2016[wealth2016$VEHW1150OVEH != 99999999999, ]

# Calculate movable wealth
wealth2016$Movable_wealth <- wealth2016$VEHW1110FINH + wealth2016$VEHW1130ONDH + wealth2016$VEHW1140ABEH + wealth2016$VEHW1150OVEH

# Calculate movable wealth per adult capita
wealth2016$Movable_wealth_per_adult_capita <- (wealth2016$Movable_wealth/wealth2016$number_of_adults)

# Define all possible FUAs
cities <- c("'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar","Almelo", "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem", "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven", "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden", "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal", "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal", "Venlo", "Zwolle")

# Order values according to their real estate wealth within every FUA
wealth2016 <- wealth2016 %>%
  arrange(FUA, Movable_wealth_per_adult_capita)

# Calculate the wealth percentile rank every household belongs to in their FUA
wealth2016 <- wealth2016 %>%
  group_by(FUA) %>% 
  mutate(Movable_wealth_per_adult_capita_percentile = ntile(Movable_wealth_per_adult_capita, 100))

# Clean rows of missing data 
wealth2016 <- wealth2016 %>%
  filter(VRLVIERKANT100M != "----------")

# Correct coordinates to make sure they represent meters and not kilometers
wealth2016 <- wealth2016 %>%
  mutate(VRLVIERKANT100M = gsub("(E)(\\d+)(N)(\\d+)","\\1\\200\\3\\400", VRLVIERKANT100M))

# Loop over each city (sigma = 500m)
for (city in cities) {
  # Set the variable city for the given city
  City <- city
  
  # Re-start the original database
  df <- wealth2016
  
  # Create the dataframe out of the wealth database  for the FUA we are interested on
  df <- df %>%
    filter(FUA == City)
  
  # Keep only relevant columns
  df <- df %>%
    select(VRLVIERKANT100M, Movable_wealth_per_adult_capita_percentile)
  df$FUA <- NULL
  
  # create a pivot table
  pivot <- df %>%
    group_by(VRLVIERKANT100M, Movable_wealth_per_adult_capita_percentile) %>%
    summarise(Population = n()) %>%
    tidyr::pivot_wider(names_from = Movable_wealth_per_adult_capita_percentile, values_from = Population, values_fill = 0)
  
  # Create an element for only the coordinates of the grid cells
  grid_cells <- pivot %>%
    select(VRLVIERKANT100M)
  
  # Parse the coordinates
  grid_cells <- grid_cells %>%
    mutate(
      x = as.numeric(sub("E", "", str_extract(VRLVIERKANT100M, "E\\d+"))),
      y = as.numeric(sub("N", "", str_extract(VRLVIERKANT100M, "N\\d+"))),
    )
  
  # Delete superflous columns for coordinates
  grid_cells$VRLVIERKANT100M <- NULL
  
  # Create a spatial object for grid cells
  Spatial_grid_cells <- st_as_sf(grid_cells, coords = c("x", "y"), crs = 28992)
  coords <- st_coordinates(Spatial_grid_cells)
  
  # Create SpatialPoints object
  Spatial_grid_cells_sp <- SpatialPoints(coords, proj4string = CRS("+init=epsg:28992"))
  
  # Delete superflous columns for the income percentiles data
  pivot$VRLVIERKANT100M <- NULL
  
  # Order columns in the right order
  col_names <- as.numeric(colnames(pivot))
  
  # Re-order the dataframe
  pivot <- pivot[, order(col_names)]
  
  # Initialize a list to store each database for the income percentile pairwise calculation
  list_of_databases <- list()
  
  # Loop through each combination
  for(i in 1:(ncol(pivot)-1)) {
    # Calculate the cumulative sum of people up to the current percentile
    sumUpToCurrentPerc <- rowSums(pivot[, 1:i], na.rm = TRUE)
    
    # Calculate the sum of people above the current percentile
    sumAboveCurrentPerc <- rowSums(pivot[, (i+1):ncol(pivot)], na.rm = TRUE)
    
    # Create a new dataframe for the current percentile
    newData <- data.frame(sumUpToCurrentPerc = sumUpToCurrentPerc, sumAboveCurrentPerc = sumAboveCurrentPerc)
    
    # Store it in the list
    list_of_databases[[i]] <- newData
  }
  
  # Extract specific databases from it
  indices <- seq(1,99)
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    assign(database_name,list_of_databases[[i]])
  }
  
  # Initialize a list for storing SITI values
  SITI_results <- list()
  
  # Loop through the indices and calculate SITI results
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    SITI_results_name <- paste0("SITI_result", i)
    
    # Calculate SITI
    assign(SITI_results_name, spseg(x = Spatial_grid_cells_sp, data = get(database_name), method = "information", smoothing = "kernel", sigma = 500, useC = FALSE))
    
    # Store the result in the list
    SITI_results[[SITI_results_name]] <- get(SITI_results_name)
    
  }
  
  # Transform the results into something readable
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    
    #Transform the SITI result
    assign(SITI_result_name, get(SITI_result_name)@h)
  }
  
  
  # Prepare them for being saved and plotted
  combined_data_list <- list()
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    df <- data.frame(Value = get(SITI_result_name), Group = as.character(i))
    combined_data_list[[SITI_result_name]] <- df
  }
  
  # Make them be in a single database
  combined_data <- do.call(rbind, combined_data_list)
  
  # Convert the Group column to a factor with custom levels
  combined_data$Group <- factor(combined_data$Group, levels = as.character(indices))
  
  # Save the data
  write.csv(combined_data, file = paste0("Movable_wealth_per_adult_capita_segregation_500m_2016_", City, ".csv"), row.names = FALSE)
}

# Delete data for enabling further operations
rm(wealth2016)
rm(households_2016)



#### Calculate Spatial Information Theory Index for movable wealth segregation per adult capita in 2015 ####

# extract observations from the "address" database corresponding to the year 2015, taking the 1st of January as reference
households_2015 <- households %>%
  #mutate(start_date = as.Date(as.character(start_date), format = "%Y%m%d"),
  #end_date = as.Date(as.character(end_date), format = "%Y%m%d")) %>%
  filter(start_date < as.Date("2015-01-02") & end_date > as.Date("2015-01-01"))

# Load the data from 2015
wealth2015 <- read.csv("raw_data/database_2015.csv") # only if not already loaded

# Merge both databases
wealth2015 <- merge(wealth2015, households_2015, by ="ID", all.x = TRUE, all.y = FALSE)

# Calculate the number of adults
wealth2015 <- wealth2015 %>%
  mutate(number_of_adults = AANTALPERSHH - AANTALKINDHH)

# Delete missing values of wealth
wealth2015 <- wealth2015[wealth2015$VEHW1110FINH != 99999999999, ]
wealth2015 <- wealth2015[wealth2015$VEHW1130ONDH != 99999999999, ]
wealth2015 <- wealth2015[wealth2015$VEHW1140ABEH != 99999999999, ]
wealth2015 <- wealth2015[wealth2015$VEHW1150OVEH != 99999999999, ]

# Calculate movable wealth
wealth2015$Movable_wealth <- wealth2015$VEHW1110FINH + wealth2015$VEHW1130ONDH + wealth2015$VEHW1140ABEH + wealth2015$VEHW1150OVEH

# Calculate movable wealth per adult capita
wealth2015$Movable_wealth_per_adult_capita <- (wealth2015$Movable_wealth/wealth2015$number_of_adults)

# Define all possible FUAs
cities <- c("'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar","Almelo", "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem", "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven", "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden", "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal", "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal", "Venlo", "Zwolle")

# Order values according to their real estate wealth within every FUA
wealth2015 <- wealth2015 %>%
  arrange(FUA, Movable_wealth_per_adult_capita)

# Calculate the wealth percentile rank every household belongs to in their FUA
wealth2015 <- wealth2015 %>%
  group_by(FUA) %>% 
  mutate(Movable_wealth_per_adult_capita_percentile = ntile(Movable_wealth_per_adult_capita, 100))

# Clean rows of missing data 
wealth2015 <- wealth2015 %>%
  filter(VRLVIERKANT100M != "----------")

# Correct coordinates to make sure they represent meters and not kilometers
wealth2015 <- wealth2015 %>%
  mutate(VRLVIERKANT100M = gsub("(E)(\\d+)(N)(\\d+)","\\1\\200\\3\\400", VRLVIERKANT100M))

# Loop over each city (sigma = 500m)
for (city in cities) {
  # Set the variable city for the given city
  City <- city
  
  # Re-start the original database
  df <- wealth2015
  
  # Create the dataframe out of the wealth database  for the FUA we are interested on
  df <- df %>%
    filter(FUA == City)
  
  # Keep only relevant columns
  df <- df %>%
    select(VRLVIERKANT100M, Movable_wealth_per_adult_capita_percentile)
  df$FUA <- NULL
  
  # create a pivot table
  pivot <- df %>%
    group_by(VRLVIERKANT100M, Movable_wealth_per_adult_capita_percentile) %>%
    summarise(Population = n()) %>%
    tidyr::pivot_wider(names_from = Movable_wealth_per_adult_capita_percentile, values_from = Population, values_fill = 0)
  
  # Create an element for only the coordinates of the grid cells
  grid_cells <- pivot %>%
    select(VRLVIERKANT100M)
  
  # Parse the coordinates
  grid_cells <- grid_cells %>%
    mutate(
      x = as.numeric(sub("E", "", str_extract(VRLVIERKANT100M, "E\\d+"))),
      y = as.numeric(sub("N", "", str_extract(VRLVIERKANT100M, "N\\d+"))),
    )
  
  # Delete superflous columns for coordinates
  grid_cells$VRLVIERKANT100M <- NULL
  
  # Create a spatial object for grid cells
  Spatial_grid_cells <- st_as_sf(grid_cells, coords = c("x", "y"), crs = 28992)
  coords <- st_coordinates(Spatial_grid_cells)
  
  # Create SpatialPoints object
  Spatial_grid_cells_sp <- SpatialPoints(coords, proj4string = CRS("+init=epsg:28992"))
  
  # Delete superflous columns for the income percentiles data
  pivot$VRLVIERKANT100M <- NULL
  
  # Order columns in the right order
  col_names <- as.numeric(colnames(pivot))
  
  # Re-order the dataframe
  pivot <- pivot[, order(col_names)]
  
  # Initialize a list to store each database for the income percentile pairwise calculation
  list_of_databases <- list()
  
  # Loop through each combination
  for(i in 1:(ncol(pivot)-1)) {
    # Calculate the cumulative sum of people up to the current percentile
    sumUpToCurrentPerc <- rowSums(pivot[, 1:i], na.rm = TRUE)
    
    # Calculate the sum of people above the current percentile
    sumAboveCurrentPerc <- rowSums(pivot[, (i+1):ncol(pivot)], na.rm = TRUE)
    
    # Create a new dataframe for the current percentile
    newData <- data.frame(sumUpToCurrentPerc = sumUpToCurrentPerc, sumAboveCurrentPerc = sumAboveCurrentPerc)
    
    # Store it in the list
    list_of_databases[[i]] <- newData
  }
  
  # Extract specific databases from it
  indices <- seq(1,99)
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    assign(database_name,list_of_databases[[i]])
  }
  
  # Initialize a list for storing SITI values
  SITI_results <- list()
  
  # Loop through the indices and calculate SITI results
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    SITI_results_name <- paste0("SITI_result", i)
    
    # Calculate SITI
    assign(SITI_results_name, spseg(x = Spatial_grid_cells_sp, data = get(database_name), method = "information", smoothing = "kernel", sigma = 500, useC = FALSE))
    
    # Store the result in the list
    SITI_results[[SITI_results_name]] <- get(SITI_results_name)
    
  }
  
  # Transform the results into something readable
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    
    #Transform the SITI result
    assign(SITI_result_name, get(SITI_result_name)@h)
  }
  
  
  # Prepare them for being saved and plotted
  combined_data_list <- list()
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    df <- data.frame(Value = get(SITI_result_name), Group = as.character(i))
    combined_data_list[[SITI_result_name]] <- df
  }
  
  # Make them be in a single database
  combined_data <- do.call(rbind, combined_data_list)
  
  # Convert the Group column to a factor with custom levels
  combined_data$Group <- factor(combined_data$Group, levels = as.character(indices))
  
  # Save the data
  write.csv(combined_data, file = paste0("Movable_wealth_per_adult_capita_segregation_500m_2015_", City, ".csv"), row.names = FALSE)
}

# Delete data for enabling further operations
rm(wealth2015)
rm(households_2015)



#### Calculate Spatial Information Theory Index for movable wealth segregation per adult capita in 2014 ####

# extract observations from the "address" database corresponding to the year 2014, taking the 1st of January as reference
households_2014 <- households %>%
  #mutate(start_date = as.Date(as.character(start_date), format = "%Y%m%d"),
  #end_date = as.Date(as.character(end_date), format = "%Y%m%d")) %>%
  filter(start_date < as.Date("2014-01-02") & end_date > as.Date("2014-01-01"))

# Load the data from 2014
wealth2014 <- read.csv("raw_data/database_2014.csv") # only if not already loaded

# Merge both databases
wealth2014 <- merge(wealth2014, households_2014, by ="ID", all.x = TRUE, all.y = FALSE)

# Calculate the number of adults
wealth2014 <- wealth2014 %>%
  mutate(number_of_adults = AANTALPERSHH - AANTALKINDHH)

# Delete missing values of wealth
wealth2014 <- wealth2014[wealth2014$VEHW1110FINH != 99999999999, ]
wealth2014 <- wealth2014[wealth2014$VEHW1130ONDH != 99999999999, ]
wealth2014 <- wealth2014[wealth2014$VEHW1140ABEH != 99999999999, ]
wealth2014 <- wealth2014[wealth2014$VEHW1150OVEH != 99999999999, ]

# Calculate movable wealth
wealth2014$Movable_wealth <- wealth2014$VEHW1110FINH + wealth2014$VEHW1130ONDH + wealth2014$VEHW1140ABEH + wealth2014$VEHW1150OVEH

# Calculate movable wealth per adult capita
wealth2014$Movable_wealth_per_adult_capita <- (wealth2014$Movable_wealth/wealth2014$number_of_adults)

# Define all possible FUAs
cities <- c("'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar","Almelo", "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem", "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven", "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden", "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal", "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal", "Venlo", "Zwolle")

# Order values according to their real estate wealth within every FUA
wealth2014 <- wealth2014 %>%
  arrange(FUA, Movable_wealth_per_adult_capita)

# Calculate the wealth percentile rank every household belongs to in their FUA
wealth2014 <- wealth2014 %>%
  group_by(FUA) %>% 
  mutate(Movable_wealth_per_adult_capita_percentile = ntile(Movable_wealth_per_adult_capita, 100))

# Clean rows of missing data 
wealth2014 <- wealth2014 %>%
  filter(VRLVIERKANT100M != "----------")

# Correct coordinates to make sure they represent meters and not kilometers
wealth2014 <- wealth2014 %>%
  mutate(VRLVIERKANT100M = gsub("(E)(\\d+)(N)(\\d+)","\\1\\200\\3\\400", VRLVIERKANT100M))

# Loop over each city (sigma = 500m)
for (city in cities) {
  # Set the variable city for the given city
  City <- city
  
  # Re-start the original database
  df <- wealth2014
  
  # Create the dataframe out of the wealth database  for the FUA we are interested on
  df <- df %>%
    filter(FUA == City)
  
  # Keep only relevant columns
  df <- df %>%
    select(VRLVIERKANT100M, Movable_wealth_per_adult_capita_percentile)
  df$FUA <- NULL
  
  # create a pivot table
  pivot <- df %>%
    group_by(VRLVIERKANT100M, Movable_wealth_per_adult_capita_percentile) %>%
    summarise(Population = n()) %>%
    tidyr::pivot_wider(names_from = Movable_wealth_per_adult_capita_percentile, values_from = Population, values_fill = 0)
  
  # Create an element for only the coordinates of the grid cells
  grid_cells <- pivot %>%
    select(VRLVIERKANT100M)
  
  # Parse the coordinates
  grid_cells <- grid_cells %>%
    mutate(
      x = as.numeric(sub("E", "", str_extract(VRLVIERKANT100M, "E\\d+"))),
      y = as.numeric(sub("N", "", str_extract(VRLVIERKANT100M, "N\\d+"))),
    )
  
  # Delete superflous columns for coordinates
  grid_cells$VRLVIERKANT100M <- NULL
  
  # Create a spatial object for grid cells
  Spatial_grid_cells <- st_as_sf(grid_cells, coords = c("x", "y"), crs = 28992)
  coords <- st_coordinates(Spatial_grid_cells)
  
  # Create SpatialPoints object
  Spatial_grid_cells_sp <- SpatialPoints(coords, proj4string = CRS("+init=epsg:28992"))
  
  # Delete superflous columns for the income percentiles data
  pivot$VRLVIERKANT100M <- NULL
  
  # Order columns in the right order
  col_names <- as.numeric(colnames(pivot))
  
  # Re-order the dataframe
  pivot <- pivot[, order(col_names)]
  
  # Initialize a list to store each database for the income percentile pairwise calculation
  list_of_databases <- list()
  
  # Loop through each combination
  for(i in 1:(ncol(pivot)-1)) {
    # Calculate the cumulative sum of people up to the current percentile
    sumUpToCurrentPerc <- rowSums(pivot[, 1:i], na.rm = TRUE)
    
    # Calculate the sum of people above the current percentile
    sumAboveCurrentPerc <- rowSums(pivot[, (i+1):ncol(pivot)], na.rm = TRUE)
    
    # Create a new dataframe for the current percentile
    newData <- data.frame(sumUpToCurrentPerc = sumUpToCurrentPerc, sumAboveCurrentPerc = sumAboveCurrentPerc)
    
    # Store it in the list
    list_of_databases[[i]] <- newData
  }
  
  # Extract specific databases from it
  indices <- seq(1,99)
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    assign(database_name,list_of_databases[[i]])
  }
  
  # Initialize a list for storing SITI values
  SITI_results <- list()
  
  # Loop through the indices and calculate SITI results
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    SITI_results_name <- paste0("SITI_result", i)
    
    # Calculate SITI
    assign(SITI_results_name, spseg(x = Spatial_grid_cells_sp, data = get(database_name), method = "information", smoothing = "kernel", sigma = 500, useC = FALSE))
    
    # Store the result in the list
    SITI_results[[SITI_results_name]] <- get(SITI_results_name)
    
  }
  
  # Transform the results into something readable
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    
    #Transform the SITI result
    assign(SITI_result_name, get(SITI_result_name)@h)
  }
  
  
  # Prepare them for being saved and plotted
  combined_data_list <- list()
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    df <- data.frame(Value = get(SITI_result_name), Group = as.character(i))
    combined_data_list[[SITI_result_name]] <- df
  }
  
  # Make them be in a single database
  combined_data <- do.call(rbind, combined_data_list)
  
  # Convert the Group column to a factor with custom levels
  combined_data$Group <- factor(combined_data$Group, levels = as.character(indices))
  
  # Save the data
  write.csv(combined_data, file = paste0("Movable_wealth_per_adult_capita_segregation_500m_2014_", City, ".csv"), row.names = FALSE)
}

# Delete data for enabling further operations
rm(wealth2014)
rm(households_2014)




#### Calculate Spatial Information Theory Index for movable wealth segregation per adult capita in 2013 ####

# extract observations from the "address" database corresponding to the year 2013, taking the 1st of January as reference
households_2013 <- households %>%
  #mutate(start_date = as.Date(as.character(start_date), format = "%Y%m%d"),
  #end_date = as.Date(as.character(end_date), format = "%Y%m%d")) %>%
  filter(start_date < as.Date("2013-01-02") & end_date > as.Date("2013-01-01"))

# Load the data from 2013
wealth2013 <- read.csv("raw_data/database_2013.csv") # only if not already loaded

# Merge both databases
wealth2013 <- merge(wealth2013, households_2013, by ="ID", all.x = TRUE, all.y = FALSE)

# Calculate the number of adults
wealth2013 <- wealth2013 %>%
  mutate(number_of_adults = AANTALPERSHH - AANTALKINDHH)

# Delete missing values of wealth
wealth2013 <- wealth2013[wealth2013$VEHW1110FINH != 99999999999, ]
wealth2013 <- wealth2013[wealth2013$VEHW1130ONDH != 99999999999, ]
wealth2013 <- wealth2013[wealth2013$VEHW1140ABEH != 99999999999, ]
wealth2013 <- wealth2013[wealth2013$VEHW1150OVEH != 99999999999, ]

# Calculate movable wealth
wealth2013$Movable_wealth <- wealth2013$VEHW1110FINH + wealth2013$VEHW1130ONDH + wealth2013$VEHW1140ABEH + wealth2013$VEHW1150OVEH

# Calculate movable wealth per adult capita
wealth2013$Movable_wealth_per_adult_capita <- (wealth2013$Movable_wealth/wealth2013$number_of_adults)

# Define all possible FUAs
cities <- c("'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar","Almelo", "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem", "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven", "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden", "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal", "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal", "Venlo", "Zwolle")

# Order values according to their real estate wealth within every FUA
wealth2013 <- wealth2013 %>%
  arrange(FUA, Movable_wealth_per_adult_capita)

# Calculate the wealth percentile rank every household belongs to in their FUA
wealth2013 <- wealth2013 %>%
  group_by(FUA) %>% 
  mutate(Movable_wealth_per_adult_capita_percentile = ntile(Movable_wealth_per_adult_capita, 100))

# Clean rows of missing data 
wealth2013 <- wealth2013 %>%
  filter(VRLVIERKANT100M != "----------")

# Correct coordinates to make sure they represent meters and not kilometers
wealth2013 <- wealth2013 %>%
  mutate(VRLVIERKANT100M = gsub("(E)(\\d+)(N)(\\d+)","\\1\\200\\3\\400", VRLVIERKANT100M))

# Loop over each city (sigma = 500m)
for (city in cities) {
  # Set the variable city for the given city
  City <- city
  
  # Re-start the original database
  df <- wealth2013
  
  # Create the dataframe out of the wealth database  for the FUA we are interested on
  df <- df %>%
    filter(FUA == City)
  
  # Keep only relevant columns
  df <- df %>%
    select(VRLVIERKANT100M, Movable_wealth_per_adult_capita_percentile)
  df$FUA <- NULL
  
  # create a pivot table
  pivot <- df %>%
    group_by(VRLVIERKANT100M, Movable_wealth_per_adult_capita_percentile) %>%
    summarise(Population = n()) %>%
    tidyr::pivot_wider(names_from = Movable_wealth_per_adult_capita_percentile, values_from = Population, values_fill = 0)
  
  # Create an element for only the coordinates of the grid cells
  grid_cells <- pivot %>%
    select(VRLVIERKANT100M)
  
  # Parse the coordinates
  grid_cells <- grid_cells %>%
    mutate(
      x = as.numeric(sub("E", "", str_extract(VRLVIERKANT100M, "E\\d+"))),
      y = as.numeric(sub("N", "", str_extract(VRLVIERKANT100M, "N\\d+"))),
    )
  
  # Delete superflous columns for coordinates
  grid_cells$VRLVIERKANT100M <- NULL
  
  # Create a spatial object for grid cells
  Spatial_grid_cells <- st_as_sf(grid_cells, coords = c("x", "y"), crs = 28992)
  coords <- st_coordinates(Spatial_grid_cells)
  
  # Create SpatialPoints object
  Spatial_grid_cells_sp <- SpatialPoints(coords, proj4string = CRS("+init=epsg:28992"))
  
  # Delete superflous columns for the income percentiles data
  pivot$VRLVIERKANT100M <- NULL
  
  # Order columns in the right order
  col_names <- as.numeric(colnames(pivot))
  
  # Re-order the dataframe
  pivot <- pivot[, order(col_names)]
  
  # Initialize a list to store each database for the income percentile pairwise calculation
  list_of_databases <- list()
  
  # Loop through each combination
  for(i in 1:(ncol(pivot)-1)) {
    # Calculate the cumulative sum of people up to the current percentile
    sumUpToCurrentPerc <- rowSums(pivot[, 1:i], na.rm = TRUE)
    
    # Calculate the sum of people above the current percentile
    sumAboveCurrentPerc <- rowSums(pivot[, (i+1):ncol(pivot)], na.rm = TRUE)
    
    # Create a new dataframe for the current percentile
    newData <- data.frame(sumUpToCurrentPerc = sumUpToCurrentPerc, sumAboveCurrentPerc = sumAboveCurrentPerc)
    
    # Store it in the list
    list_of_databases[[i]] <- newData
  }
  
  # Extract specific databases from it
  indices <- seq(1,99)
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    assign(database_name,list_of_databases[[i]])
  }
  
  # Initialize a list for storing SITI values
  SITI_results <- list()
  
  # Loop through the indices and calculate SITI results
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    SITI_results_name <- paste0("SITI_result", i)
    
    # Calculate SITI
    assign(SITI_results_name, spseg(x = Spatial_grid_cells_sp, data = get(database_name), method = "information", smoothing = "kernel", sigma = 500, useC = FALSE))
    
    # Store the result in the list
    SITI_results[[SITI_results_name]] <- get(SITI_results_name)
    
  }
  
  # Transform the results into something readable
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    
    #Transform the SITI result
    assign(SITI_result_name, get(SITI_result_name)@h)
  }
  
  
  # Prepare them for being saved and plotted
  combined_data_list <- list()
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    df <- data.frame(Value = get(SITI_result_name), Group = as.character(i))
    combined_data_list[[SITI_result_name]] <- df
  }
  
  # Make them be in a single database
  combined_data <- do.call(rbind, combined_data_list)
  
  # Convert the Group column to a factor with custom levels
  combined_data$Group <- factor(combined_data$Group, levels = as.character(indices))
  
  # Save the data
  write.csv(combined_data, file = paste0("Movable_wealth_per_adult_capita_segregation_500m_2013_", City, ".csv"), row.names = FALSE)
}

# Delete data for enabling further operations
rm(wealth2013)
rm(households_2013)



#### Calculate Spatial Information Theory Index for movable wealth segregation per adult capita in 2012 ####

# extract observations from the "address" database corresponding to the year 2012, taking the 1st of January as reference
households_2012 <- households %>%
  #mutate(start_date = as.Date(as.character(start_date), format = "%Y%m%d"),
  #end_date = as.Date(as.character(end_date), format = "%Y%m%d")) %>%
  filter(start_date < as.Date("2012-01-02") & end_date > as.Date("2012-01-01"))

# Load the data from 2012
wealth2012 <- read.csv("raw_data/database_2012.csv") # only if not already loaded

# Merge both databases
wealth2012 <- merge(wealth2012, households_2012, by ="ID", all.x = TRUE, all.y = FALSE)

# Calculate the number of adults
wealth2012 <- wealth2012 %>%
  mutate(number_of_adults = AANTALPERSHH - AANTALKINDHH)

# Delete missing values of wealth
wealth2012 <- wealth2012[wealth2012$VEHW1110FINH != 99999999999, ]
wealth2012 <- wealth2012[wealth2012$VEHW1130ONDH != 99999999999, ]
wealth2012 <- wealth2012[wealth2012$VEHW1140ABEH != 99999999999, ]
wealth2012 <- wealth2012[wealth2012$VEHW1150OVEH != 99999999999, ]

# Calculate movable wealth
wealth2012$Movable_wealth <- wealth2012$VEHW1110FINH + wealth2012$VEHW1130ONDH + wealth2012$VEHW1140ABEH + wealth2012$VEHW1150OVEH

# Calculate movable wealth per adult capita
wealth2012$Movable_wealth_per_adult_capita <- (wealth2012$Movable_wealth/wealth2012$number_of_adults)

# Define all possible FUAs
cities <- c("'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar","Almelo", "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem", "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven", "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden", "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal", "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal", "Venlo", "Zwolle")

# Order values according to their real estate wealth within every FUA
wealth2012 <- wealth2012 %>%
  arrange(FUA, Movable_wealth_per_adult_capita)

# Calculate the wealth percentile rank every household belongs to in their FUA
wealth2012 <- wealth2012 %>%
  group_by(FUA) %>% 
  mutate(Movable_wealth_per_adult_capita_percentile = ntile(Movable_wealth_per_adult_capita, 100))

# Clean rows of missing data 
wealth2012 <- wealth2012 %>%
  filter(VRLVIERKANT100M != "----------")

# Correct coordinates to make sure they represent meters and not kilometers
wealth2012 <- wealth2012 %>%
  mutate(VRLVIERKANT100M = gsub("(E)(\\d+)(N)(\\d+)","\\1\\200\\3\\400", VRLVIERKANT100M))

# Loop over each city (sigma = 500m)
for (city in cities) {
  # Set the variable city for the given city
  City <- city
  
  # Re-start the original database
  df <- wealth2012
  
  # Create the dataframe out of the wealth database  for the FUA we are interested on
  df <- df %>%
    filter(FUA == City)
  
  # Keep only relevant columns
  df <- df %>%
    select(VRLVIERKANT100M, Movable_wealth_per_adult_capita_percentile)
  df$FUA <- NULL
  
  # create a pivot table
  pivot <- df %>%
    group_by(VRLVIERKANT100M, Movable_wealth_per_adult_capita_percentile) %>%
    summarise(Population = n()) %>%
    tidyr::pivot_wider(names_from = Movable_wealth_per_adult_capita_percentile, values_from = Population, values_fill = 0)
  
  # Create an element for only the coordinates of the grid cells
  grid_cells <- pivot %>%
    select(VRLVIERKANT100M)
  
  # Parse the coordinates
  grid_cells <- grid_cells %>%
    mutate(
      x = as.numeric(sub("E", "", str_extract(VRLVIERKANT100M, "E\\d+"))),
      y = as.numeric(sub("N", "", str_extract(VRLVIERKANT100M, "N\\d+"))),
    )
  
  # Delete superflous columns for coordinates
  grid_cells$VRLVIERKANT100M <- NULL
  
  # Create a spatial object for grid cells
  Spatial_grid_cells <- st_as_sf(grid_cells, coords = c("x", "y"), crs = 28992)
  coords <- st_coordinates(Spatial_grid_cells)
  
  # Create SpatialPoints object
  Spatial_grid_cells_sp <- SpatialPoints(coords, proj4string = CRS("+init=epsg:28992"))
  
  # Delete superflous columns for the income percentiles data
  pivot$VRLVIERKANT100M <- NULL
  
  # Order columns in the right order
  col_names <- as.numeric(colnames(pivot))
  
  # Re-order the dataframe
  pivot <- pivot[, order(col_names)]
  
  # Initialize a list to store each database for the income percentile pairwise calculation
  list_of_databases <- list()
  
  # Loop through each combination
  for(i in 1:(ncol(pivot)-1)) {
    # Calculate the cumulative sum of people up to the current percentile
    sumUpToCurrentPerc <- rowSums(pivot[, 1:i], na.rm = TRUE)
    
    # Calculate the sum of people above the current percentile
    sumAboveCurrentPerc <- rowSums(pivot[, (i+1):ncol(pivot)], na.rm = TRUE)
    
    # Create a new dataframe for the current percentile
    newData <- data.frame(sumUpToCurrentPerc = sumUpToCurrentPerc, sumAboveCurrentPerc = sumAboveCurrentPerc)
    
    # Store it in the list
    list_of_databases[[i]] <- newData
  }
  
  # Extract specific databases from it
  indices <- seq(1,99)
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    assign(database_name,list_of_databases[[i]])
  }
  
  # Initialize a list for storing SITI values
  SITI_results <- list()
  
  # Loop through the indices and calculate SITI results
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    SITI_results_name <- paste0("SITI_result", i)
    
    # Calculate SITI
    assign(SITI_results_name, spseg(x = Spatial_grid_cells_sp, data = get(database_name), method = "information", smoothing = "kernel", sigma = 500, useC = FALSE))
    
    # Store the result in the list
    SITI_results[[SITI_results_name]] <- get(SITI_results_name)
    
  }
  
  # Transform the results into something readable
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    
    #Transform the SITI result
    assign(SITI_result_name, get(SITI_result_name)@h)
  }
  
  
  # Prepare them for being saved and plotted
  combined_data_list <- list()
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    df <- data.frame(Value = get(SITI_result_name), Group = as.character(i))
    combined_data_list[[SITI_result_name]] <- df
  }
  
  # Make them be in a single database
  combined_data <- do.call(rbind, combined_data_list)
  
  # Convert the Group column to a factor with custom levels
  combined_data$Group <- factor(combined_data$Group, levels = as.character(indices))
  
  # Save the data
  write.csv(combined_data, file = paste0("Movable_wealth_per_adult_capita_segregation_500m_2012_", City, ".csv"), row.names = FALSE)
}

# Delete data for enabling further operations
rm(wealth2012)
rm(households_2012)



#### Calculate Spatial Information Theory Index for movable wealth segregation per adult capita in 2011 ####

# extract observations from the "address" database corresponding to the year 2011, taking the 1st of January as reference
households_2011 <- households %>%
  #mutate(start_date = as.Date(as.character(start_date), format = "%Y%m%d"),
  #end_date = as.Date(as.character(end_date), format = "%Y%m%d")) %>%
  filter(start_date < as.Date("2011-01-02") & end_date > as.Date("2011-01-01"))

# Load the data from 2011
wealth2011 <- read.csv("raw_data/database_2011.csv") # only if not already loaded

# Merge both databases
wealth2011 <- merge(wealth2011, households_2011, by ="ID", all.x = TRUE, all.y = FALSE)

# Calculate the number of adults
wealth2011 <- wealth2011 %>%
  mutate(number_of_adults = AANTALPERSHH - AANTALKINDHH)

# Delete missing values of wealth
wealth2011 <- wealth2011[wealth2011$VEHW1110FINH != 99999999999, ]
wealth2011 <- wealth2011[wealth2011$VEHW1130ONDH != 99999999999, ]
wealth2011 <- wealth2011[wealth2011$VEHW1140ABEH != 99999999999, ]
wealth2011 <- wealth2011[wealth2011$VEHW1150OVEH != 99999999999, ]

# Calculate movable wealth
wealth2011$Movable_wealth <- wealth2011$VEHW1110FINH + wealth2011$VEHW1130ONDH + wealth2011$VEHW1140ABEH + wealth2011$VEHW1150OVEH

# Calculate movable wealth per adult capita
wealth2011$Movable_wealth_per_adult_capita <- (wealth2011$Movable_wealth/wealth2011$number_of_adults)

# Define all possible FUAs
cities <- c("'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar","Almelo", "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem", "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven", "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden", "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal", "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal", "Venlo", "Zwolle")

# Order values according to their real estate wealth within every FUA
wealth2011 <- wealth2011 %>%
  arrange(FUA, Movable_wealth_per_adult_capita)

# Calculate the wealth percentile rank every household belongs to in their FUA
wealth2011 <- wealth2011 %>%
  group_by(FUA) %>% 
  mutate(Movable_wealth_per_adult_capita_percentile = ntile(Movable_wealth_per_adult_capita, 100))

# Clean rows of missing data 
wealth2011 <- wealth2011 %>%
  filter(VRLVIERKANT100M != "----------")

# Correct coordinates to make sure they represent meters and not kilometers
wealth2011 <- wealth2011 %>%
  mutate(VRLVIERKANT100M = gsub("(E)(\\d+)(N)(\\d+)","\\1\\200\\3\\400", VRLVIERKANT100M))

# Loop over each city (sigma = 500m)
for (city in cities) {
  # Set the variable city for the given city
  City <- city
  
  # Re-start the original database
  df <- wealth2011
  
  # Create the dataframe out of the wealth database  for the FUA we are interested on
  df <- df %>%
    filter(FUA == City)
  
  # Keep only relevant columns
  df <- df %>%
    select(VRLVIERKANT100M, Movable_wealth_per_adult_capita_percentile)
  df$FUA <- NULL
  
  # create a pivot table
  pivot <- df %>%
    group_by(VRLVIERKANT100M, Movable_wealth_per_adult_capita_percentile) %>%
    summarise(Population = n()) %>%
    tidyr::pivot_wider(names_from = Movable_wealth_per_adult_capita_percentile, values_from = Population, values_fill = 0)
  
  # Create an element for only the coordinates of the grid cells
  grid_cells <- pivot %>%
    select(VRLVIERKANT100M)
  
  # Parse the coordinates
  grid_cells <- grid_cells %>%
    mutate(
      x = as.numeric(sub("E", "", str_extract(VRLVIERKANT100M, "E\\d+"))),
      y = as.numeric(sub("N", "", str_extract(VRLVIERKANT100M, "N\\d+"))),
    )
  
  # Delete superflous columns for coordinates
  grid_cells$VRLVIERKANT100M <- NULL
  
  # Create a spatial object for grid cells
  Spatial_grid_cells <- st_as_sf(grid_cells, coords = c("x", "y"), crs = 28992)
  coords <- st_coordinates(Spatial_grid_cells)
  
  # Create SpatialPoints object
  Spatial_grid_cells_sp <- SpatialPoints(coords, proj4string = CRS("+init=epsg:28992"))
  
  # Delete superflous columns for the income percentiles data
  pivot$VRLVIERKANT100M <- NULL
  
  # Order columns in the right order
  col_names <- as.numeric(colnames(pivot))
  
  # Re-order the dataframe
  pivot <- pivot[, order(col_names)]
  
  # Initialize a list to store each database for the income percentile pairwise calculation
  list_of_databases <- list()
  
  # Loop through each combination
  for(i in 1:(ncol(pivot)-1)) {
    # Calculate the cumulative sum of people up to the current percentile
    sumUpToCurrentPerc <- rowSums(pivot[, 1:i], na.rm = TRUE)
    
    # Calculate the sum of people above the current percentile
    sumAboveCurrentPerc <- rowSums(pivot[, (i+1):ncol(pivot)], na.rm = TRUE)
    
    # Create a new dataframe for the current percentile
    newData <- data.frame(sumUpToCurrentPerc = sumUpToCurrentPerc, sumAboveCurrentPerc = sumAboveCurrentPerc)
    
    # Store it in the list
    list_of_databases[[i]] <- newData
  }
  
  # Extract specific databases from it
  indices <- seq(1,99)
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    assign(database_name,list_of_databases[[i]])
  }
  
  # Initialize a list for storing SITI values
  SITI_results <- list()
  
  # Loop through the indices and calculate SITI results
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    SITI_results_name <- paste0("SITI_result", i)
    
    # Calculate SITI
    assign(SITI_results_name, spseg(x = Spatial_grid_cells_sp, data = get(database_name), method = "information", smoothing = "kernel", sigma = 500, useC = FALSE))
    
    # Store the result in the list
    SITI_results[[SITI_results_name]] <- get(SITI_results_name)
    
  }
  
  # Transform the results into something readable
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    
    #Transform the SITI result
    assign(SITI_result_name, get(SITI_result_name)@h)
  }
  
  
  # Prepare them for being saved and plotted
  combined_data_list <- list()
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    df <- data.frame(Value = get(SITI_result_name), Group = as.character(i))
    combined_data_list[[SITI_result_name]] <- df
  }
  
  # Make them be in a single database
  combined_data <- do.call(rbind, combined_data_list)
  
  # Convert the Group column to a factor with custom levels
  combined_data$Group <- factor(combined_data$Group, levels = as.character(indices))
  
  # Save the data
  write.csv(combined_data, file = paste0("Movable_wealth_per_adult_capita_segregation_500m_2011_", City, ".csv"), row.names = FALSE)
}

# Delete data for enabling further operations
rm(wealth2011)
rm(households_2011)

#### Radius = 4000 m ####


#### Calculate Spatial Information Theory Index for movable wealth segregation per adult capita in 2022 ####

# extract observations from the "address" database corresponding to the year 2022, taking the 1st of January as reference
households_2022 <- households %>%
  #mutate(start_date = as.Date(as.character(start_date), format = "%Y%m%d"),
  #end_date = as.Date(as.character(end_date), format = "%Y%m%d")) %>%
  filter(start_date < as.Date("2022-01-02") & end_date > as.Date("2022-01-01"))

# Load the data from 2022
wealth2022 <- read.csv("raw_data/database_2022.csv") # only if not already loaded

# Merge both databases
wealth2022 <- merge(wealth2022, households_2022, by ="ID", all.x = TRUE, all.y = FALSE)

# Calculate the number of adults
wealth2022 <- wealth2022 %>%
  mutate(number_of_adults = AANTALPERSHH - AANTALKINDHH)

# Delete missing values of wealth
wealth2022 <- wealth2022[wealth2022$VEHW1110FINH != 99999999999, ]
wealth2022 <- wealth2022[wealth2022$VEHW1130ONDH != 99999999999, ]
wealth2022 <- wealth2022[wealth2022$VEHW1140ABEH != 99999999999, ]
wealth2022 <- wealth2022[wealth2022$VEHW1150OVEH != 99999999999, ]

# Calculate movable wealth
wealth2022$Movable_wealth <- wealth2022$VEHW1110FINH + wealth2022$VEHW1130ONDH + wealth2022$VEHW1140ABEH + wealth2022$VEHW1150OVEH

# Calculate movable wealth per adult capita
wealth2022$Movable_wealth_per_adult_capita <- (wealth2022$Movable_wealth/wealth2022$number_of_adults)

# Define all possible FUAs
cities <- c("'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar","Almelo", "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem", "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven", "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden", "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal", "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal", "Venlo", "Zwolle")

# Order values according to their real estate wealth within every FUA
wealth2022 <- wealth2022 %>%
  arrange(FUA, Movable_wealth_per_adult_capita)

# Calculate the wealth percentile rank every household belongs to in their FUA
wealth2022 <- wealth2022 %>%
  group_by(FUA) %>% 
  mutate(Movable_wealth_per_adult_capita_percentile = ntile(Movable_wealth_per_adult_capita, 100))

# Clean rows of missing data 
wealth2022 <- wealth2022 %>%
  filter(VRLVIERKANT100M != "----------")

# Correct coordinates to make sure they represent meters and not kilometers
wealth2022 <- wealth2022 %>%
  mutate(VRLVIERKANT100M = gsub("(E)(\\d+)(N)(\\d+)","\\1\\200\\3\\400", VRLVIERKANT100M))

# Loop over each city (sigma = 4000m)
for (city in cities) {
  # Set the variable city for the given city
  City <- city
  
  # Re-start the original database
  df <- wealth2022
  
  # Create the dataframe out of the wealth database  for the FUA we are interested on
  df <- df %>%
    filter(FUA == City)
  
  # Keep only relevant columns
  df <- df %>%
    select(VRLVIERKANT100M, Movable_wealth_per_adult_capita_percentile)
  df$FUA <- NULL
  
  # create a pivot table
  pivot <- df %>%
    group_by(VRLVIERKANT100M, Movable_wealth_per_adult_capita_percentile) %>%
    summarise(Population = n()) %>%
    tidyr::pivot_wider(names_from = Movable_wealth_per_adult_capita_percentile, values_from = Population, values_fill = 0)
  
  # Create an element for only the coordinates of the grid cells
  grid_cells <- pivot %>%
    select(VRLVIERKANT100M)
  
  # Parse the coordinates
  grid_cells <- grid_cells %>%
    mutate(
      x = as.numeric(sub("E", "", str_extract(VRLVIERKANT100M, "E\\d+"))),
      y = as.numeric(sub("N", "", str_extract(VRLVIERKANT100M, "N\\d+"))),
    )
  
  # Delete superflous columns for coordinates
  grid_cells$VRLVIERKANT100M <- NULL
  
  # Create a spatial object for grid cells
  Spatial_grid_cells <- st_as_sf(grid_cells, coords = c("x", "y"), crs = 28992)
  coords <- st_coordinates(Spatial_grid_cells)
  
  # Create SpatialPoints object
  Spatial_grid_cells_sp <- SpatialPoints(coords, proj4string = CRS("+init=epsg:28992"))
  
  # Delete superflous columns for the income percentiles data
  pivot$VRLVIERKANT100M <- NULL
  
  # Order columns in the right order
  col_names <- as.numeric(colnames(pivot))
  
  # Re-order the dataframe
  pivot <- pivot[, order(col_names)]
  
  # Initialize a list to store each database for the income percentile pairwise calculation
  list_of_databases <- list()
  
  # Loop through each combination
  for(i in 1:(ncol(pivot)-1)) {
    # Calculate the cumulative sum of people up to the current percentile
    sumUpToCurrentPerc <- rowSums(pivot[, 1:i], na.rm = TRUE)
    
    # Calculate the sum of people above the current percentile
    sumAboveCurrentPerc <- rowSums(pivot[, (i+1):ncol(pivot)], na.rm = TRUE)
    
    # Create a new dataframe for the current percentile
    newData <- data.frame(sumUpToCurrentPerc = sumUpToCurrentPerc, sumAboveCurrentPerc = sumAboveCurrentPerc)
    
    # Store it in the list
    list_of_databases[[i]] <- newData
  }
  
  # Extract specific databases from it
  indices <- seq(1,99)
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    assign(database_name,list_of_databases[[i]])
  }
  
  # Initialize a list for storing SITI values
  SITI_results <- list()
  
  # Loop through the indices and calculate SITI results
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    SITI_results_name <- paste0("SITI_result", i)
    
    # Calculate SITI
    assign(SITI_results_name, spseg(x = Spatial_grid_cells_sp, data = get(database_name), method = "information", smoothing = "kernel", sigma = 4000, useC = FALSE))
    
    # Store the result in the list
    SITI_results[[SITI_results_name]] <- get(SITI_results_name)
    
  }
  
  # Transform the results into something readable
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    
    #Transform the SITI result
    assign(SITI_result_name, get(SITI_result_name)@h)
  }
  
  
  # Prepare them for being saved and plotted
  combined_data_list <- list()
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    df <- data.frame(Value = get(SITI_result_name), Group = as.character(i))
    combined_data_list[[SITI_result_name]] <- df
  }
  
  # Make them be in a single database
  combined_data <- do.call(rbind, combined_data_list)
  
  # Convert the Group column to a factor with custom levels
  combined_data$Group <- factor(combined_data$Group, levels = as.character(indices))
  
  # Save the data
  write.csv(combined_data, file = paste0("Movable_wealth_per_adult_capita_segregation_4000m_2022_", City, ".csv"), row.names = FALSE)
}

# Delete data for enabling further operations
rm(wealth2022)
rm(households_2022)



#### Calculate Spatial Information Theory Index for movable wealth segregation per adult capita in 2021 ####

# extract observations from the "address" database corresponding to the year 2021, taking the 1st of January as reference
households_2021 <- households %>%
  #mutate(start_date = as.Date(as.character(start_date), format = "%Y%m%d"),
  #end_date = as.Date(as.character(end_date), format = "%Y%m%d")) %>%
  filter(start_date < as.Date("2021-01-02") & end_date > as.Date("2021-01-01"))

# Load the data from 2021
wealth2021 <- read.csv("raw_data/database_2021.csv") # only if not already loaded

# Merge both databases
wealth2021 <- merge(wealth2021, households_2021, by ="ID", all.x = TRUE, all.y = FALSE)

# Calculate the number of adults
wealth2021 <- wealth2021 %>%
  mutate(number_of_adults = AANTALPERSHH - AANTALKINDHH)

# Delete missing values of wealth
wealth2021 <- wealth2021[wealth2021$VEHW1110FINH != 99999999999, ]
wealth2021 <- wealth2021[wealth2021$VEHW1130ONDH != 99999999999, ]
wealth2021 <- wealth2021[wealth2021$VEHW1140ABEH != 99999999999, ]
wealth2021 <- wealth2021[wealth2021$VEHW1150OVEH != 99999999999, ]

# Calculate movable wealth
wealth2021$Movable_wealth <- wealth2021$VEHW1110FINH + wealth2021$VEHW1130ONDH + wealth2021$VEHW1140ABEH + wealth2021$VEHW1150OVEH

# Calculate movable wealth per adult capita
wealth2021$Movable_wealth_per_adult_capita <- (wealth2021$Movable_wealth/wealth2021$number_of_adults)

# Define all possible FUAs
cities <- c("'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar","Almelo", "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem", "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven", "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden", "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal", "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal", "Venlo", "Zwolle")

# Order values according to their real estate wealth within every FUA
wealth2021 <- wealth2021 %>%
  arrange(FUA, Movable_wealth_per_adult_capita)

# Calculate the wealth percentile rank every household belongs to in their FUA
wealth2021 <- wealth2021 %>%
  group_by(FUA) %>% 
  mutate(Movable_wealth_per_adult_capita_percentile = ntile(Movable_wealth_per_adult_capita, 100))

# Clean rows of missing data 
wealth2021 <- wealth2021 %>%
  filter(VRLVIERKANT100M != "----------")

# Correct coordinates to make sure they represent meters and not kilometers
wealth2021 <- wealth2021 %>%
  mutate(VRLVIERKANT100M = gsub("(E)(\\d+)(N)(\\d+)","\\1\\200\\3\\400", VRLVIERKANT100M))

# Loop over each city (sigma = 4000m)
for (city in cities) {
  # Set the variable city for the given city
  City <- city
  
  # Re-start the original database
  df <- wealth2021
  
  # Create the dataframe out of the wealth database  for the FUA we are interested on
  df <- df %>%
    filter(FUA == City)
  
  # Keep only relevant columns
  df <- df %>%
    select(VRLVIERKANT100M, Movable_wealth_per_adult_capita_percentile)
  df$FUA <- NULL
  
  # create a pivot table
  pivot <- df %>%
    group_by(VRLVIERKANT100M, Movable_wealth_per_adult_capita_percentile) %>%
    summarise(Population = n()) %>%
    tidyr::pivot_wider(names_from = Movable_wealth_per_adult_capita_percentile, values_from = Population, values_fill = 0)
  
  # Create an element for only the coordinates of the grid cells
  grid_cells <- pivot %>%
    select(VRLVIERKANT100M)
  
  # Parse the coordinates
  grid_cells <- grid_cells %>%
    mutate(
      x = as.numeric(sub("E", "", str_extract(VRLVIERKANT100M, "E\\d+"))),
      y = as.numeric(sub("N", "", str_extract(VRLVIERKANT100M, "N\\d+"))),
    )
  
  # Delete superflous columns for coordinates
  grid_cells$VRLVIERKANT100M <- NULL
  
  # Create a spatial object for grid cells
  Spatial_grid_cells <- st_as_sf(grid_cells, coords = c("x", "y"), crs = 28992)
  coords <- st_coordinates(Spatial_grid_cells)
  
  # Create SpatialPoints object
  Spatial_grid_cells_sp <- SpatialPoints(coords, proj4string = CRS("+init=epsg:28992"))
  
  # Delete superflous columns for the income percentiles data
  pivot$VRLVIERKANT100M <- NULL
  
  # Order columns in the right order
  col_names <- as.numeric(colnames(pivot))
  
  # Re-order the dataframe
  pivot <- pivot[, order(col_names)]
  
  # Initialize a list to store each database for the income percentile pairwise calculation
  list_of_databases <- list()
  
  # Loop through each combination
  for(i in 1:(ncol(pivot)-1)) {
    # Calculate the cumulative sum of people up to the current percentile
    sumUpToCurrentPerc <- rowSums(pivot[, 1:i], na.rm = TRUE)
    
    # Calculate the sum of people above the current percentile
    sumAboveCurrentPerc <- rowSums(pivot[, (i+1):ncol(pivot)], na.rm = TRUE)
    
    # Create a new dataframe for the current percentile
    newData <- data.frame(sumUpToCurrentPerc = sumUpToCurrentPerc, sumAboveCurrentPerc = sumAboveCurrentPerc)
    
    # Store it in the list
    list_of_databases[[i]] <- newData
  }
  
  # Extract specific databases from it
  indices <- seq(1,99)
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    assign(database_name,list_of_databases[[i]])
  }
  
  # Initialize a list for storing SITI values
  SITI_results <- list()
  
  # Loop through the indices and calculate SITI results
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    SITI_results_name <- paste0("SITI_result", i)
    
    # Calculate SITI
    assign(SITI_results_name, spseg(x = Spatial_grid_cells_sp, data = get(database_name), method = "information", smoothing = "kernel", sigma = 4000, useC = FALSE))
    
    # Store the result in the list
    SITI_results[[SITI_results_name]] <- get(SITI_results_name)
    
  }
  
  # Transform the results into something readable
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    
    #Transform the SITI result
    assign(SITI_result_name, get(SITI_result_name)@h)
  }
  
  
  # Prepare them for being saved and plotted
  combined_data_list <- list()
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    df <- data.frame(Value = get(SITI_result_name), Group = as.character(i))
    combined_data_list[[SITI_result_name]] <- df
  }
  
  # Make them be in a single database
  combined_data <- do.call(rbind, combined_data_list)
  
  # Convert the Group column to a factor with custom levels
  combined_data$Group <- factor(combined_data$Group, levels = as.character(indices))
  
  # Save the data
  write.csv(combined_data, file = paste0("Movable_wealth_per_adult_capita_segregation_4000m_2021_", City, ".csv"), row.names = FALSE)
}

# Delete data for enabling further operations
rm(wealth2021)
rm(households_2021)


#### Calculate Spatial Information Theory Index for movable wealth segregation per adult capita in 2020 ####

# extract observations from the "address" database corresponding to the year 2020, taking the 1st of January as reference
households_2020 <- households %>%
  #mutate(start_date = as.Date(as.character(start_date), format = "%Y%m%d"),
  #end_date = as.Date(as.character(end_date), format = "%Y%m%d")) %>%
  filter(start_date < as.Date("2020-01-02") & end_date > as.Date("2020-01-01"))

# Load the data from 2020
wealth2020 <- read.csv("raw_data/database_2020.csv") # only if not already loaded

# Merge both databases
wealth2020 <- merge(wealth2020, households_2020, by ="ID", all.x = TRUE, all.y = FALSE)

# Calculate the number of adults
wealth2020 <- wealth2020 %>%
  mutate(number_of_adults = AANTALPERSHH - AANTALKINDHH)

# Delete missing values of wealth
wealth2020 <- wealth2020[wealth2020$VEHW1110FINH != 99999999999, ]
wealth2020 <- wealth2020[wealth2020$VEHW1130ONDH != 99999999999, ]
wealth2020 <- wealth2020[wealth2020$VEHW1140ABEH != 99999999999, ]
wealth2020 <- wealth2020[wealth2020$VEHW1150OVEH != 99999999999, ]

# Calculate movable wealth
wealth2020$Movable_wealth <- wealth2020$VEHW1110FINH + wealth2020$VEHW1130ONDH + wealth2020$VEHW1140ABEH + wealth2020$VEHW1150OVEH

# Calculate movable wealth per adult capita
wealth2020$Movable_wealth_per_adult_capita <- (wealth2020$Movable_wealth/wealth2020$number_of_adults)

# Define all possible FUAs
cities <- c("'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar","Almelo", "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem", "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven", "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden", "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal", "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal", "Venlo", "Zwolle")

# Order values according to their real estate wealth within every FUA
wealth2020 <- wealth2020 %>%
  arrange(FUA, Movable_wealth_per_adult_capita)

# Calculate the wealth percentile rank every household belongs to in their FUA
wealth2020 <- wealth2020 %>%
  group_by(FUA) %>% 
  mutate(Movable_wealth_per_adult_capita_percentile = ntile(Movable_wealth_per_adult_capita, 100))

# Clean rows of missing data 
wealth2020 <- wealth2020 %>%
  filter(VRLVIERKANT100M != "----------")

# Correct coordinates to make sure they represent meters and not kilometers
wealth2020 <- wealth2020 %>%
  mutate(VRLVIERKANT100M = gsub("(E)(\\d+)(N)(\\d+)","\\1\\200\\3\\400", VRLVIERKANT100M))

# Loop over each city (sigma = 4000m)
for (city in cities) {
  # Set the variable city for the given city
  City <- city
  
  # Re-start the original database
  df <- wealth2020
  
  # Create the dataframe out of the wealth database  for the FUA we are interested on
  df <- df %>%
    filter(FUA == City)
  
  # Keep only relevant columns
  df <- df %>%
    select(VRLVIERKANT100M, Movable_wealth_per_adult_capita_percentile)
  df$FUA <- NULL
  
  # create a pivot table
  pivot <- df %>%
    group_by(VRLVIERKANT100M, Movable_wealth_per_adult_capita_percentile) %>%
    summarise(Population = n()) %>%
    tidyr::pivot_wider(names_from = Movable_wealth_per_adult_capita_percentile, values_from = Population, values_fill = 0)
  
  # Create an element for only the coordinates of the grid cells
  grid_cells <- pivot %>%
    select(VRLVIERKANT100M)
  
  # Parse the coordinates
  grid_cells <- grid_cells %>%
    mutate(
      x = as.numeric(sub("E", "", str_extract(VRLVIERKANT100M, "E\\d+"))),
      y = as.numeric(sub("N", "", str_extract(VRLVIERKANT100M, "N\\d+"))),
    )
  
  # Delete superflous columns for coordinates
  grid_cells$VRLVIERKANT100M <- NULL
  
  # Create a spatial object for grid cells
  Spatial_grid_cells <- st_as_sf(grid_cells, coords = c("x", "y"), crs = 28992)
  coords <- st_coordinates(Spatial_grid_cells)
  
  # Create SpatialPoints object
  Spatial_grid_cells_sp <- SpatialPoints(coords, proj4string = CRS("+init=epsg:28992"))
  
  # Delete superflous columns for the income percentiles data
  pivot$VRLVIERKANT100M <- NULL
  
  # Order columns in the right order
  col_names <- as.numeric(colnames(pivot))
  
  # Re-order the dataframe
  pivot <- pivot[, order(col_names)]
  
  # Initialize a list to store each database for the income percentile pairwise calculation
  list_of_databases <- list()
  
  # Loop through each combination
  for(i in 1:(ncol(pivot)-1)) {
    # Calculate the cumulative sum of people up to the current percentile
    sumUpToCurrentPerc <- rowSums(pivot[, 1:i], na.rm = TRUE)
    
    # Calculate the sum of people above the current percentile
    sumAboveCurrentPerc <- rowSums(pivot[, (i+1):ncol(pivot)], na.rm = TRUE)
    
    # Create a new dataframe for the current percentile
    newData <- data.frame(sumUpToCurrentPerc = sumUpToCurrentPerc, sumAboveCurrentPerc = sumAboveCurrentPerc)
    
    # Store it in the list
    list_of_databases[[i]] <- newData
  }
  
  # Extract specific databases from it
  indices <- seq(1,99)
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    assign(database_name,list_of_databases[[i]])
  }
  
  # Initialize a list for storing SITI values
  SITI_results <- list()
  
  # Loop through the indices and calculate SITI results
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    SITI_results_name <- paste0("SITI_result", i)
    
    # Calculate SITI
    assign(SITI_results_name, spseg(x = Spatial_grid_cells_sp, data = get(database_name), method = "information", smoothing = "kernel", sigma = 4000, useC = FALSE))
    
    # Store the result in the list
    SITI_results[[SITI_results_name]] <- get(SITI_results_name)
    
  }
  
  # Transform the results into something readable
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    
    #Transform the SITI result
    assign(SITI_result_name, get(SITI_result_name)@h)
  }
  
  
  # Prepare them for being saved and plotted
  combined_data_list <- list()
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    df <- data.frame(Value = get(SITI_result_name), Group = as.character(i))
    combined_data_list[[SITI_result_name]] <- df
  }
  
  # Make them be in a single database
  combined_data <- do.call(rbind, combined_data_list)
  
  # Convert the Group column to a factor with custom levels
  combined_data$Group <- factor(combined_data$Group, levels = as.character(indices))
  
  # Save the data
  write.csv(combined_data, file = paste0("Movable_wealth_per_adult_capita_segregation_4000m_2020_", City, ".csv"), row.names = FALSE)
}

# Delete data for enabling further operations
rm(wealth2020)
rm(households_2020)



#### Calculate Spatial Information Theory Index for movable wealth segregation per adult capita in 2019 ####

# extract observations from the "address" database corresponding to the year 2019, taking the 1st of January as reference
households_2019 <- households %>%
  #mutate(start_date = as.Date(as.character(start_date), format = "%Y%m%d"),
  #end_date = as.Date(as.character(end_date), format = "%Y%m%d")) %>%
  filter(start_date < as.Date("2019-01-02") & end_date > as.Date("2019-01-01"))

# Load the data from 2019
wealth2019 <- read.csv("raw_data/database_2019.csv") # only if not already loaded

# Merge both databases
wealth2019 <- merge(wealth2019, households_2019, by ="ID", all.x = TRUE, all.y = FALSE)

# Calculate the number of adults
wealth2019 <- wealth2019 %>%
  mutate(number_of_adults = AANTALPERSHH - AANTALKINDHH)

# Delete missing values of wealth
wealth2019 <- wealth2019[wealth2019$VEHW1110FINH != 99999999999, ]
wealth2019 <- wealth2019[wealth2019$VEHW1130ONDH != 99999999999, ]
wealth2019 <- wealth2019[wealth2019$VEHW1140ABEH != 99999999999, ]
wealth2019 <- wealth2019[wealth2019$VEHW1150OVEH != 99999999999, ]

# Calculate movable wealth
wealth2019$Movable_wealth <- wealth2019$VEHW1110FINH + wealth2019$VEHW1130ONDH + wealth2019$VEHW1140ABEH + wealth2019$VEHW1150OVEH

# Calculate movable wealth per adult capita
wealth2019$Movable_wealth_per_adult_capita <- (wealth2019$Movable_wealth/wealth2019$number_of_adults)

# Define all possible FUAs
cities <- c("'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar","Almelo", "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem", "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven", "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden", "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal", "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal", "Venlo", "Zwolle")

# Order values according to their real estate wealth within every FUA
wealth2019 <- wealth2019 %>%
  arrange(FUA, Movable_wealth_per_adult_capita)

# Calculate the wealth percentile rank every household belongs to in their FUA
wealth2019 <- wealth2019 %>%
  group_by(FUA) %>% 
  mutate(Movable_wealth_per_adult_capita_percentile = ntile(Movable_wealth_per_adult_capita, 100))

# Clean rows of missing data 
wealth2019 <- wealth2019 %>%
  filter(VRLVIERKANT100M != "----------")

# Correct coordinates to make sure they represent meters and not kilometers
wealth2019 <- wealth2019 %>%
  mutate(VRLVIERKANT100M = gsub("(E)(\\d+)(N)(\\d+)","\\1\\200\\3\\400", VRLVIERKANT100M))

# Loop over each city (sigma = 4000m)
for (city in cities) {
  # Set the variable city for the given city
  City <- city
  
  # Re-start the original database
  df <- wealth2019
  
  # Create the dataframe out of the wealth database  for the FUA we are interested on
  df <- df %>%
    filter(FUA == City)
  
  # Keep only relevant columns
  df <- df %>%
    select(VRLVIERKANT100M, Movable_wealth_per_adult_capita_percentile)
  df$FUA <- NULL
  
  # create a pivot table
  pivot <- df %>%
    group_by(VRLVIERKANT100M, Movable_wealth_per_adult_capita_percentile) %>%
    summarise(Population = n()) %>%
    tidyr::pivot_wider(names_from = Movable_wealth_per_adult_capita_percentile, values_from = Population, values_fill = 0)
  
  # Create an element for only the coordinates of the grid cells
  grid_cells <- pivot %>%
    select(VRLVIERKANT100M)
  
  # Parse the coordinates
  grid_cells <- grid_cells %>%
    mutate(
      x = as.numeric(sub("E", "", str_extract(VRLVIERKANT100M, "E\\d+"))),
      y = as.numeric(sub("N", "", str_extract(VRLVIERKANT100M, "N\\d+"))),
    )
  
  # Delete superflous columns for coordinates
  grid_cells$VRLVIERKANT100M <- NULL
  
  # Create a spatial object for grid cells
  Spatial_grid_cells <- st_as_sf(grid_cells, coords = c("x", "y"), crs = 28992)
  coords <- st_coordinates(Spatial_grid_cells)
  
  # Create SpatialPoints object
  Spatial_grid_cells_sp <- SpatialPoints(coords, proj4string = CRS("+init=epsg:28992"))
  
  # Delete superflous columns for the income percentiles data
  pivot$VRLVIERKANT100M <- NULL
  
  # Order columns in the right order
  col_names <- as.numeric(colnames(pivot))
  
  # Re-order the dataframe
  pivot <- pivot[, order(col_names)]
  
  # Initialize a list to store each database for the income percentile pairwise calculation
  list_of_databases <- list()
  
  # Loop through each combination
  for(i in 1:(ncol(pivot)-1)) {
    # Calculate the cumulative sum of people up to the current percentile
    sumUpToCurrentPerc <- rowSums(pivot[, 1:i], na.rm = TRUE)
    
    # Calculate the sum of people above the current percentile
    sumAboveCurrentPerc <- rowSums(pivot[, (i+1):ncol(pivot)], na.rm = TRUE)
    
    # Create a new dataframe for the current percentile
    newData <- data.frame(sumUpToCurrentPerc = sumUpToCurrentPerc, sumAboveCurrentPerc = sumAboveCurrentPerc)
    
    # Store it in the list
    list_of_databases[[i]] <- newData
  }
  
  # Extract specific databases from it
  indices <- seq(1,99)
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    assign(database_name,list_of_databases[[i]])
  }
  
  # Initialize a list for storing SITI values
  SITI_results <- list()
  
  # Loop through the indices and calculate SITI results
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    SITI_results_name <- paste0("SITI_result", i)
    
    # Calculate SITI
    assign(SITI_results_name, spseg(x = Spatial_grid_cells_sp, data = get(database_name), method = "information", smoothing = "kernel", sigma = 4000, useC = FALSE))
    
    # Store the result in the list
    SITI_results[[SITI_results_name]] <- get(SITI_results_name)
    
  }
  
  # Transform the results into something readable
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    
    #Transform the SITI result
    assign(SITI_result_name, get(SITI_result_name)@h)
  }
  
  
  # Prepare them for being saved and plotted
  combined_data_list <- list()
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    df <- data.frame(Value = get(SITI_result_name), Group = as.character(i))
    combined_data_list[[SITI_result_name]] <- df
  }
  
  # Make them be in a single database
  combined_data <- do.call(rbind, combined_data_list)
  
  # Convert the Group column to a factor with custom levels
  combined_data$Group <- factor(combined_data$Group, levels = as.character(indices))
  
  # Save the data
  write.csv(combined_data, file = paste0("Movable_wealth_per_adult_capita_segregation_4000m_2019_", City, ".csv"), row.names = FALSE)
}

# Delete data for enabling further operations
rm(wealth2019)
rm(households_2019)



#### Calculate Spatial Information Theory Index for movable wealth segregation per adult capita in 2018 ####

# extract observations from the "address" database corresponding to the year 2018, taking the 1st of January as reference
households_2018 <- households %>%
  #mutate(start_date = as.Date(as.character(start_date), format = "%Y%m%d"),
  #end_date = as.Date(as.character(end_date), format = "%Y%m%d")) %>%
  filter(start_date < as.Date("2018-01-02") & end_date > as.Date("2018-01-01"))

# Load the data from 2018
wealth2018 <- read.csv("raw_data/database_2018.csv") # only if not already loaded

# Merge both databases
wealth2018 <- merge(wealth2018, households_2018, by ="ID", all.x = TRUE, all.y = FALSE)

# Calculate the number of adults
wealth2018 <- wealth2018 %>%
  mutate(number_of_adults = AANTALPERSHH - AANTALKINDHH)

# Delete missing values of wealth
wealth2018 <- wealth2018[wealth2018$VEHW1110FINH != 99999999999, ]
wealth2018 <- wealth2018[wealth2018$VEHW1130ONDH != 99999999999, ]
wealth2018 <- wealth2018[wealth2018$VEHW1140ABEH != 99999999999, ]
wealth2018 <- wealth2018[wealth2018$VEHW1150OVEH != 99999999999, ]

# Calculate movable wealth
wealth2018$Movable_wealth <- wealth2018$VEHW1110FINH + wealth2018$VEHW1130ONDH + wealth2018$VEHW1140ABEH + wealth2018$VEHW1150OVEH

# Calculate movable wealth per adult capita
wealth2018$Movable_wealth_per_adult_capita <- (wealth2018$Movable_wealth/wealth2018$number_of_adults)

# Define all possible FUAs
cities <- c("'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar","Almelo", "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem", "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven", "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden", "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal", "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal", "Venlo", "Zwolle")

# Order values according to their real estate wealth within every FUA
wealth2018 <- wealth2018 %>%
  arrange(FUA, Movable_wealth_per_adult_capita)

# Calculate the wealth percentile rank every household belongs to in their FUA
wealth2018 <- wealth2018 %>%
  group_by(FUA) %>% 
  mutate(Movable_wealth_per_adult_capita_percentile = ntile(Movable_wealth_per_adult_capita, 100))

# Clean rows of missing data 
wealth2018 <- wealth2018 %>%
  filter(VRLVIERKANT100M != "----------")

# Correct coordinates to make sure they represent meters and not kilometers
wealth2018 <- wealth2018 %>%
  mutate(VRLVIERKANT100M = gsub("(E)(\\d+)(N)(\\d+)","\\1\\200\\3\\400", VRLVIERKANT100M))

# Loop over each city (sigma = 4000m)
for (city in cities) {
  # Set the variable city for the given city
  City <- city
  
  # Re-start the original database
  df <- wealth2018
  
  # Create the dataframe out of the wealth database  for the FUA we are interested on
  df <- df %>%
    filter(FUA == City)
  
  # Keep only relevant columns
  df <- df %>%
    select(VRLVIERKANT100M, Movable_wealth_per_adult_capita_percentile)
  df$FUA <- NULL
  
  # create a pivot table
  pivot <- df %>%
    group_by(VRLVIERKANT100M, Movable_wealth_per_adult_capita_percentile) %>%
    summarise(Population = n()) %>%
    tidyr::pivot_wider(names_from = Movable_wealth_per_adult_capita_percentile, values_from = Population, values_fill = 0)
  
  # Create an element for only the coordinates of the grid cells
  grid_cells <- pivot %>%
    select(VRLVIERKANT100M)
  
  # Parse the coordinates
  grid_cells <- grid_cells %>%
    mutate(
      x = as.numeric(sub("E", "", str_extract(VRLVIERKANT100M, "E\\d+"))),
      y = as.numeric(sub("N", "", str_extract(VRLVIERKANT100M, "N\\d+"))),
    )
  
  # Delete superflous columns for coordinates
  grid_cells$VRLVIERKANT100M <- NULL
  
  # Create a spatial object for grid cells
  Spatial_grid_cells <- st_as_sf(grid_cells, coords = c("x", "y"), crs = 28992)
  coords <- st_coordinates(Spatial_grid_cells)
  
  # Create SpatialPoints object
  Spatial_grid_cells_sp <- SpatialPoints(coords, proj4string = CRS("+init=epsg:28992"))
  
  # Delete superflous columns for the income percentiles data
  pivot$VRLVIERKANT100M <- NULL
  
  # Order columns in the right order
  col_names <- as.numeric(colnames(pivot))
  
  # Re-order the dataframe
  pivot <- pivot[, order(col_names)]
  
  # Initialize a list to store each database for the income percentile pairwise calculation
  list_of_databases <- list()
  
  # Loop through each combination
  for(i in 1:(ncol(pivot)-1)) {
    # Calculate the cumulative sum of people up to the current percentile
    sumUpToCurrentPerc <- rowSums(pivot[, 1:i], na.rm = TRUE)
    
    # Calculate the sum of people above the current percentile
    sumAboveCurrentPerc <- rowSums(pivot[, (i+1):ncol(pivot)], na.rm = TRUE)
    
    # Create a new dataframe for the current percentile
    newData <- data.frame(sumUpToCurrentPerc = sumUpToCurrentPerc, sumAboveCurrentPerc = sumAboveCurrentPerc)
    
    # Store it in the list
    list_of_databases[[i]] <- newData
  }
  
  # Extract specific databases from it
  indices <- seq(1,99)
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    assign(database_name,list_of_databases[[i]])
  }
  
  # Initialize a list for storing SITI values
  SITI_results <- list()
  
  # Loop through the indices and calculate SITI results
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    SITI_results_name <- paste0("SITI_result", i)
    
    # Calculate SITI
    assign(SITI_results_name, spseg(x = Spatial_grid_cells_sp, data = get(database_name), method = "information", smoothing = "kernel", sigma = 4000, useC = FALSE))
    
    # Store the result in the list
    SITI_results[[SITI_results_name]] <- get(SITI_results_name)
    
  }
  
  # Transform the results into something readable
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    
    #Transform the SITI result
    assign(SITI_result_name, get(SITI_result_name)@h)
  }
  
  
  # Prepare them for being saved and plotted
  combined_data_list <- list()
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    df <- data.frame(Value = get(SITI_result_name), Group = as.character(i))
    combined_data_list[[SITI_result_name]] <- df
  }
  
  # Make them be in a single database
  combined_data <- do.call(rbind, combined_data_list)
  
  # Convert the Group column to a factor with custom levels
  combined_data$Group <- factor(combined_data$Group, levels = as.character(indices))
  
  # Save the data
  write.csv(combined_data, file = paste0("Movable_wealth_per_adult_capita_segregation_4000m_2018_", City, ".csv"), row.names = FALSE)
}

# Delete data for enabling further operations
rm(wealth2018)
rm(households_2018)



#### Calculate Spatial Information Theory Index for movable wealth segregation per adult capita in 2017 ####

# extract observations from the "address" database corresponding to the year 2017, taking the 1st of January as reference
households_2017 <- households %>%
  #mutate(start_date = as.Date(as.character(start_date), format = "%Y%m%d"),
  #end_date = as.Date(as.character(end_date), format = "%Y%m%d")) %>%
  filter(start_date < as.Date("2017-01-02") & end_date > as.Date("2017-01-01"))

# Load the data from 2017
wealth2017 <- read.csv("raw_data/database_2017.csv") # only if not already loaded

# Merge both databases
wealth2017 <- merge(wealth2017, households_2017, by ="ID", all.x = TRUE, all.y = FALSE)

# Calculate the number of adults
wealth2017 <- wealth2017 %>%
  mutate(number_of_adults = AANTALPERSHH - AANTALKINDHH)

# Delete missing values of wealth
wealth2017 <- wealth2017[wealth2017$VEHW1110FINH != 99999999999, ]
wealth2017 <- wealth2017[wealth2017$VEHW1130ONDH != 99999999999, ]
wealth2017 <- wealth2017[wealth2017$VEHW1140ABEH != 99999999999, ]
wealth2017 <- wealth2017[wealth2017$VEHW1150OVEH != 99999999999, ]

# Calculate movable wealth
wealth2017$Movable_wealth <- wealth2017$VEHW1110FINH + wealth2017$VEHW1130ONDH + wealth2017$VEHW1140ABEH + wealth2017$VEHW1150OVEH

# Calculate movable wealth per adult capita
wealth2017$Movable_wealth_per_adult_capita <- (wealth2017$Movable_wealth/wealth2017$number_of_adults)

# Define all possible FUAs
cities <- c("'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar","Almelo", "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem", "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven", "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden", "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal", "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal", "Venlo", "Zwolle")

# Order values according to their real estate wealth within every FUA
wealth2017 <- wealth2017 %>%
  arrange(FUA, Movable_wealth_per_adult_capita)

# Calculate the wealth percentile rank every household belongs to in their FUA
wealth2017 <- wealth2017 %>%
  group_by(FUA) %>% 
  mutate(Movable_wealth_per_adult_capita_percentile = ntile(Movable_wealth_per_adult_capita, 100))

# Clean rows of missing data 
wealth2017 <- wealth2017 %>%
  filter(VRLVIERKANT100M != "----------")

# Correct coordinates to make sure they represent meters and not kilometers
wealth2017 <- wealth2017 %>%
  mutate(VRLVIERKANT100M = gsub("(E)(\\d+)(N)(\\d+)","\\1\\200\\3\\400", VRLVIERKANT100M))

# Loop over each city (sigma = 4000m)
for (city in cities) {
  # Set the variable city for the given city
  City <- city
  
  # Re-start the original database
  df <- wealth2017
  
  # Create the dataframe out of the wealth database  for the FUA we are interested on
  df <- df %>%
    filter(FUA == City)
  
  # Keep only relevant columns
  df <- df %>%
    select(VRLVIERKANT100M, Movable_wealth_per_adult_capita_percentile)
  df$FUA <- NULL
  
  # create a pivot table
  pivot <- df %>%
    group_by(VRLVIERKANT100M, Movable_wealth_per_adult_capita_percentile) %>%
    summarise(Population = n()) %>%
    tidyr::pivot_wider(names_from = Movable_wealth_per_adult_capita_percentile, values_from = Population, values_fill = 0)
  
  # Create an element for only the coordinates of the grid cells
  grid_cells <- pivot %>%
    select(VRLVIERKANT100M)
  
  # Parse the coordinates
  grid_cells <- grid_cells %>%
    mutate(
      x = as.numeric(sub("E", "", str_extract(VRLVIERKANT100M, "E\\d+"))),
      y = as.numeric(sub("N", "", str_extract(VRLVIERKANT100M, "N\\d+"))),
    )
  
  # Delete superflous columns for coordinates
  grid_cells$VRLVIERKANT100M <- NULL
  
  # Create a spatial object for grid cells
  Spatial_grid_cells <- st_as_sf(grid_cells, coords = c("x", "y"), crs = 28992)
  coords <- st_coordinates(Spatial_grid_cells)
  
  # Create SpatialPoints object
  Spatial_grid_cells_sp <- SpatialPoints(coords, proj4string = CRS("+init=epsg:28992"))
  
  # Delete superflous columns for the income percentiles data
  pivot$VRLVIERKANT100M <- NULL
  
  # Order columns in the right order
  col_names <- as.numeric(colnames(pivot))
  
  # Re-order the dataframe
  pivot <- pivot[, order(col_names)]
  
  # Initialize a list to store each database for the income percentile pairwise calculation
  list_of_databases <- list()
  
  # Loop through each combination
  for(i in 1:(ncol(pivot)-1)) {
    # Calculate the cumulative sum of people up to the current percentile
    sumUpToCurrentPerc <- rowSums(pivot[, 1:i], na.rm = TRUE)
    
    # Calculate the sum of people above the current percentile
    sumAboveCurrentPerc <- rowSums(pivot[, (i+1):ncol(pivot)], na.rm = TRUE)
    
    # Create a new dataframe for the current percentile
    newData <- data.frame(sumUpToCurrentPerc = sumUpToCurrentPerc, sumAboveCurrentPerc = sumAboveCurrentPerc)
    
    # Store it in the list
    list_of_databases[[i]] <- newData
  }
  
  # Extract specific databases from it
  indices <- seq(1,99)
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    assign(database_name,list_of_databases[[i]])
  }
  
  # Initialize a list for storing SITI values
  SITI_results <- list()
  
  # Loop through the indices and calculate SITI results
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    SITI_results_name <- paste0("SITI_result", i)
    
    # Calculate SITI
    assign(SITI_results_name, spseg(x = Spatial_grid_cells_sp, data = get(database_name), method = "information", smoothing = "kernel", sigma = 4000, useC = FALSE))
    
    # Store the result in the list
    SITI_results[[SITI_results_name]] <- get(SITI_results_name)
    
  }
  
  # Transform the results into something readable
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    
    #Transform the SITI result
    assign(SITI_result_name, get(SITI_result_name)@h)
  }
  
  
  # Prepare them for being saved and plotted
  combined_data_list <- list()
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    df <- data.frame(Value = get(SITI_result_name), Group = as.character(i))
    combined_data_list[[SITI_result_name]] <- df
  }
  
  # Make them be in a single database
  combined_data <- do.call(rbind, combined_data_list)
  
  # Convert the Group column to a factor with custom levels
  combined_data$Group <- factor(combined_data$Group, levels = as.character(indices))
  
  # Save the data
  write.csv(combined_data, file = paste0("Movable_wealth_per_adult_capita_segregation_4000m_2017_", City, ".csv"), row.names = FALSE)
}

# Delete data for enabling further operations
rm(wealth2017)
rm(households_2017)



#### Calculate Spatial Information Theory Index for movable wealth segregation per adult capita in 2016 ####

# extract observations from the "address" database corresponding to the year 2016, taking the 1st of January as reference
households_2016 <- households %>%
  #mutate(start_date = as.Date(as.character(start_date), format = "%Y%m%d"),
  #end_date = as.Date(as.character(end_date), format = "%Y%m%d")) %>%
  filter(start_date < as.Date("2016-01-02") & end_date > as.Date("2016-01-01"))

# Load the data from 2016
wealth2016 <- read.csv("raw_data/database_2016.csv") # only if not already loaded

# Merge both databases
wealth2016 <- merge(wealth2016, households_2016, by ="ID", all.x = TRUE, all.y = FALSE)

# Calculate the number of adults
wealth2016 <- wealth2016 %>%
  mutate(number_of_adults = AANTALPERSHH - AANTALKINDHH)

# Delete missing values of wealth
wealth2016 <- wealth2016[wealth2016$VEHW1110FINH != 99999999999, ]
wealth2016 <- wealth2016[wealth2016$VEHW1130ONDH != 99999999999, ]
wealth2016 <- wealth2016[wealth2016$VEHW1140ABEH != 99999999999, ]
wealth2016 <- wealth2016[wealth2016$VEHW1150OVEH != 99999999999, ]

# Calculate movable wealth
wealth2016$Movable_wealth <- wealth2016$VEHW1110FINH + wealth2016$VEHW1130ONDH + wealth2016$VEHW1140ABEH + wealth2016$VEHW1150OVEH

# Calculate movable wealth per adult capita
wealth2016$Movable_wealth_per_adult_capita <- (wealth2016$Movable_wealth/wealth2016$number_of_adults)

# Define all possible FUAs
cities <- c("'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar","Almelo", "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem", "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven", "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden", "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal", "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal", "Venlo", "Zwolle")

# Order values according to their real estate wealth within every FUA
wealth2016 <- wealth2016 %>%
  arrange(FUA, Movable_wealth_per_adult_capita)

# Calculate the wealth percentile rank every household belongs to in their FUA
wealth2016 <- wealth2016 %>%
  group_by(FUA) %>% 
  mutate(Movable_wealth_per_adult_capita_percentile = ntile(Movable_wealth_per_adult_capita, 100))

# Clean rows of missing data 
wealth2016 <- wealth2016 %>%
  filter(VRLVIERKANT100M != "----------")

# Correct coordinates to make sure they represent meters and not kilometers
wealth2016 <- wealth2016 %>%
  mutate(VRLVIERKANT100M = gsub("(E)(\\d+)(N)(\\d+)","\\1\\200\\3\\400", VRLVIERKANT100M))

# Loop over each city (sigma = 4000m)
for (city in cities) {
  # Set the variable city for the given city
  City <- city
  
  # Re-start the original database
  df <- wealth2016
  
  # Create the dataframe out of the wealth database  for the FUA we are interested on
  df <- df %>%
    filter(FUA == City)
  
  # Keep only relevant columns
  df <- df %>%
    select(VRLVIERKANT100M, Movable_wealth_per_adult_capita_percentile)
  df$FUA <- NULL
  
  # create a pivot table
  pivot <- df %>%
    group_by(VRLVIERKANT100M, Movable_wealth_per_adult_capita_percentile) %>%
    summarise(Population = n()) %>%
    tidyr::pivot_wider(names_from = Movable_wealth_per_adult_capita_percentile, values_from = Population, values_fill = 0)
  
  # Create an element for only the coordinates of the grid cells
  grid_cells <- pivot %>%
    select(VRLVIERKANT100M)
  
  # Parse the coordinates
  grid_cells <- grid_cells %>%
    mutate(
      x = as.numeric(sub("E", "", str_extract(VRLVIERKANT100M, "E\\d+"))),
      y = as.numeric(sub("N", "", str_extract(VRLVIERKANT100M, "N\\d+"))),
    )
  
  # Delete superflous columns for coordinates
  grid_cells$VRLVIERKANT100M <- NULL
  
  # Create a spatial object for grid cells
  Spatial_grid_cells <- st_as_sf(grid_cells, coords = c("x", "y"), crs = 28992)
  coords <- st_coordinates(Spatial_grid_cells)
  
  # Create SpatialPoints object
  Spatial_grid_cells_sp <- SpatialPoints(coords, proj4string = CRS("+init=epsg:28992"))
  
  # Delete superflous columns for the income percentiles data
  pivot$VRLVIERKANT100M <- NULL
  
  # Order columns in the right order
  col_names <- as.numeric(colnames(pivot))
  
  # Re-order the dataframe
  pivot <- pivot[, order(col_names)]
  
  # Initialize a list to store each database for the income percentile pairwise calculation
  list_of_databases <- list()
  
  # Loop through each combination
  for(i in 1:(ncol(pivot)-1)) {
    # Calculate the cumulative sum of people up to the current percentile
    sumUpToCurrentPerc <- rowSums(pivot[, 1:i], na.rm = TRUE)
    
    # Calculate the sum of people above the current percentile
    sumAboveCurrentPerc <- rowSums(pivot[, (i+1):ncol(pivot)], na.rm = TRUE)
    
    # Create a new dataframe for the current percentile
    newData <- data.frame(sumUpToCurrentPerc = sumUpToCurrentPerc, sumAboveCurrentPerc = sumAboveCurrentPerc)
    
    # Store it in the list
    list_of_databases[[i]] <- newData
  }
  
  # Extract specific databases from it
  indices <- seq(1,99)
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    assign(database_name,list_of_databases[[i]])
  }
  
  # Initialize a list for storing SITI values
  SITI_results <- list()
  
  # Loop through the indices and calculate SITI results
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    SITI_results_name <- paste0("SITI_result", i)
    
    # Calculate SITI
    assign(SITI_results_name, spseg(x = Spatial_grid_cells_sp, data = get(database_name), method = "information", smoothing = "kernel", sigma = 4000, useC = FALSE))
    
    # Store the result in the list
    SITI_results[[SITI_results_name]] <- get(SITI_results_name)
    
  }
  
  # Transform the results into something readable
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    
    #Transform the SITI result
    assign(SITI_result_name, get(SITI_result_name)@h)
  }
  
  
  # Prepare them for being saved and plotted
  combined_data_list <- list()
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    df <- data.frame(Value = get(SITI_result_name), Group = as.character(i))
    combined_data_list[[SITI_result_name]] <- df
  }
  
  # Make them be in a single database
  combined_data <- do.call(rbind, combined_data_list)
  
  # Convert the Group column to a factor with custom levels
  combined_data$Group <- factor(combined_data$Group, levels = as.character(indices))
  
  # Save the data
  write.csv(combined_data, file = paste0("Movable_wealth_per_adult_capita_segregation_4000m_2016_", City, ".csv"), row.names = FALSE)
}

# Delete data for enabling further operations
rm(wealth2016)
rm(households_2016)



#### Calculate Spatial Information Theory Index for movable wealth segregation per adult capita in 2015 ####

# extract observations from the "address" database corresponding to the year 2015, taking the 1st of January as reference
households_2015 <- households %>%
  #mutate(start_date = as.Date(as.character(start_date), format = "%Y%m%d"),
  #end_date = as.Date(as.character(end_date), format = "%Y%m%d")) %>%
  filter(start_date < as.Date("2015-01-02") & end_date > as.Date("2015-01-01"))

# Load the data from 2015
wealth2015 <- read.csv("raw_data/database_2015.csv") # only if not already loaded

# Merge both databases
wealth2015 <- merge(wealth2015, households_2015, by ="ID", all.x = TRUE, all.y = FALSE)

# Calculate the number of adults
wealth2015 <- wealth2015 %>%
  mutate(number_of_adults = AANTALPERSHH - AANTALKINDHH)

# Delete missing values of wealth
wealth2015 <- wealth2015[wealth2015$VEHW1110FINH != 99999999999, ]
wealth2015 <- wealth2015[wealth2015$VEHW1130ONDH != 99999999999, ]
wealth2015 <- wealth2015[wealth2015$VEHW1140ABEH != 99999999999, ]
wealth2015 <- wealth2015[wealth2015$VEHW1150OVEH != 99999999999, ]

# Calculate movable wealth
wealth2015$Movable_wealth <- wealth2015$VEHW1110FINH + wealth2015$VEHW1130ONDH + wealth2015$VEHW1140ABEH + wealth2015$VEHW1150OVEH

# Calculate movable wealth per adult capita
wealth2015$Movable_wealth_per_adult_capita <- (wealth2015$Movable_wealth/wealth2015$number_of_adults)

# Define all possible FUAs
cities <- c("'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar","Almelo", "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem", "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven", "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden", "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal", "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal", "Venlo", "Zwolle")

# Order values according to their real estate wealth within every FUA
wealth2015 <- wealth2015 %>%
  arrange(FUA, Movable_wealth_per_adult_capita)

# Calculate the wealth percentile rank every household belongs to in their FUA
wealth2015 <- wealth2015 %>%
  group_by(FUA) %>% 
  mutate(Movable_wealth_per_adult_capita_percentile = ntile(Movable_wealth_per_adult_capita, 100))

# Clean rows of missing data 
wealth2015 <- wealth2015 %>%
  filter(VRLVIERKANT100M != "----------")

# Correct coordinates to make sure they represent meters and not kilometers
wealth2015 <- wealth2015 %>%
  mutate(VRLVIERKANT100M = gsub("(E)(\\d+)(N)(\\d+)","\\1\\200\\3\\400", VRLVIERKANT100M))

# Loop over each city (sigma = 4000m)
for (city in cities) {
  # Set the variable city for the given city
  City <- city
  
  # Re-start the original database
  df <- wealth2015
  
  # Create the dataframe out of the wealth database  for the FUA we are interested on
  df <- df %>%
    filter(FUA == City)
  
  # Keep only relevant columns
  df <- df %>%
    select(VRLVIERKANT100M, Movable_wealth_per_adult_capita_percentile)
  df$FUA <- NULL
  
  # create a pivot table
  pivot <- df %>%
    group_by(VRLVIERKANT100M, Movable_wealth_per_adult_capita_percentile) %>%
    summarise(Population = n()) %>%
    tidyr::pivot_wider(names_from = Movable_wealth_per_adult_capita_percentile, values_from = Population, values_fill = 0)
  
  # Create an element for only the coordinates of the grid cells
  grid_cells <- pivot %>%
    select(VRLVIERKANT100M)
  
  # Parse the coordinates
  grid_cells <- grid_cells %>%
    mutate(
      x = as.numeric(sub("E", "", str_extract(VRLVIERKANT100M, "E\\d+"))),
      y = as.numeric(sub("N", "", str_extract(VRLVIERKANT100M, "N\\d+"))),
    )
  
  # Delete superflous columns for coordinates
  grid_cells$VRLVIERKANT100M <- NULL
  
  # Create a spatial object for grid cells
  Spatial_grid_cells <- st_as_sf(grid_cells, coords = c("x", "y"), crs = 28992)
  coords <- st_coordinates(Spatial_grid_cells)
  
  # Create SpatialPoints object
  Spatial_grid_cells_sp <- SpatialPoints(coords, proj4string = CRS("+init=epsg:28992"))
  
  # Delete superflous columns for the income percentiles data
  pivot$VRLVIERKANT100M <- NULL
  
  # Order columns in the right order
  col_names <- as.numeric(colnames(pivot))
  
  # Re-order the dataframe
  pivot <- pivot[, order(col_names)]
  
  # Initialize a list to store each database for the income percentile pairwise calculation
  list_of_databases <- list()
  
  # Loop through each combination
  for(i in 1:(ncol(pivot)-1)) {
    # Calculate the cumulative sum of people up to the current percentile
    sumUpToCurrentPerc <- rowSums(pivot[, 1:i], na.rm = TRUE)
    
    # Calculate the sum of people above the current percentile
    sumAboveCurrentPerc <- rowSums(pivot[, (i+1):ncol(pivot)], na.rm = TRUE)
    
    # Create a new dataframe for the current percentile
    newData <- data.frame(sumUpToCurrentPerc = sumUpToCurrentPerc, sumAboveCurrentPerc = sumAboveCurrentPerc)
    
    # Store it in the list
    list_of_databases[[i]] <- newData
  }
  
  # Extract specific databases from it
  indices <- seq(1,99)
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    assign(database_name,list_of_databases[[i]])
  }
  
  # Initialize a list for storing SITI values
  SITI_results <- list()
  
  # Loop through the indices and calculate SITI results
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    SITI_results_name <- paste0("SITI_result", i)
    
    # Calculate SITI
    assign(SITI_results_name, spseg(x = Spatial_grid_cells_sp, data = get(database_name), method = "information", smoothing = "kernel", sigma = 4000, useC = FALSE))
    
    # Store the result in the list
    SITI_results[[SITI_results_name]] <- get(SITI_results_name)
    
  }
  
  # Transform the results into something readable
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    
    #Transform the SITI result
    assign(SITI_result_name, get(SITI_result_name)@h)
  }
  
  
  # Prepare them for being saved and plotted
  combined_data_list <- list()
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    df <- data.frame(Value = get(SITI_result_name), Group = as.character(i))
    combined_data_list[[SITI_result_name]] <- df
  }
  
  # Make them be in a single database
  combined_data <- do.call(rbind, combined_data_list)
  
  # Convert the Group column to a factor with custom levels
  combined_data$Group <- factor(combined_data$Group, levels = as.character(indices))
  
  # Save the data
  write.csv(combined_data, file = paste0("Movable_wealth_per_adult_capita_segregation_4000m_2015_", City, ".csv"), row.names = FALSE)
}

# Delete data for enabling further operations
rm(wealth2015)
rm(households_2015)



#### Calculate Spatial Information Theory Index for movable wealth segregation per adult capita in 2014 ####

# extract observations from the "address" database corresponding to the year 2014, taking the 1st of January as reference
households_2014 <- households %>%
  #mutate(start_date = as.Date(as.character(start_date), format = "%Y%m%d"),
  #end_date = as.Date(as.character(end_date), format = "%Y%m%d")) %>%
  filter(start_date < as.Date("2014-01-02") & end_date > as.Date("2014-01-01"))

# Load the data from 2014
wealth2014 <- read.csv("raw_data/database_2014.csv") # only if not already loaded

# Merge both databases
wealth2014 <- merge(wealth2014, households_2014, by ="ID", all.x = TRUE, all.y = FALSE)

# Calculate the number of adults
wealth2014 <- wealth2014 %>%
  mutate(number_of_adults = AANTALPERSHH - AANTALKINDHH)

# Delete missing values of wealth
wealth2014 <- wealth2014[wealth2014$VEHW1110FINH != 99999999999, ]
wealth2014 <- wealth2014[wealth2014$VEHW1130ONDH != 99999999999, ]
wealth2014 <- wealth2014[wealth2014$VEHW1140ABEH != 99999999999, ]
wealth2014 <- wealth2014[wealth2014$VEHW1150OVEH != 99999999999, ]

# Calculate movable wealth
wealth2014$Movable_wealth <- wealth2014$VEHW1110FINH + wealth2014$VEHW1130ONDH + wealth2014$VEHW1140ABEH + wealth2014$VEHW1150OVEH

# Calculate movable wealth per adult capita
wealth2014$Movable_wealth_per_adult_capita <- (wealth2014$Movable_wealth/wealth2014$number_of_adults)

# Define all possible FUAs
cities <- c("'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar","Almelo", "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem", "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven", "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden", "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal", "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal", "Venlo", "Zwolle")

# Order values according to their real estate wealth within every FUA
wealth2014 <- wealth2014 %>%
  arrange(FUA, Movable_wealth_per_adult_capita)

# Calculate the wealth percentile rank every household belongs to in their FUA
wealth2014 <- wealth2014 %>%
  group_by(FUA) %>% 
  mutate(Movable_wealth_per_adult_capita_percentile = ntile(Movable_wealth_per_adult_capita, 100))

# Clean rows of missing data 
wealth2014 <- wealth2014 %>%
  filter(VRLVIERKANT100M != "----------")

# Correct coordinates to make sure they represent meters and not kilometers
wealth2014 <- wealth2014 %>%
  mutate(VRLVIERKANT100M = gsub("(E)(\\d+)(N)(\\d+)","\\1\\200\\3\\400", VRLVIERKANT100M))

# Loop over each city (sigma = 4000m)
for (city in cities) {
  # Set the variable city for the given city
  City <- city
  
  # Re-start the original database
  df <- wealth2014
  
  # Create the dataframe out of the wealth database  for the FUA we are interested on
  df <- df %>%
    filter(FUA == City)
  
  # Keep only relevant columns
  df <- df %>%
    select(VRLVIERKANT100M, Movable_wealth_per_adult_capita_percentile)
  df$FUA <- NULL
  
  # create a pivot table
  pivot <- df %>%
    group_by(VRLVIERKANT100M, Movable_wealth_per_adult_capita_percentile) %>%
    summarise(Population = n()) %>%
    tidyr::pivot_wider(names_from = Movable_wealth_per_adult_capita_percentile, values_from = Population, values_fill = 0)
  
  # Create an element for only the coordinates of the grid cells
  grid_cells <- pivot %>%
    select(VRLVIERKANT100M)
  
  # Parse the coordinates
  grid_cells <- grid_cells %>%
    mutate(
      x = as.numeric(sub("E", "", str_extract(VRLVIERKANT100M, "E\\d+"))),
      y = as.numeric(sub("N", "", str_extract(VRLVIERKANT100M, "N\\d+"))),
    )
  
  # Delete superflous columns for coordinates
  grid_cells$VRLVIERKANT100M <- NULL
  
  # Create a spatial object for grid cells
  Spatial_grid_cells <- st_as_sf(grid_cells, coords = c("x", "y"), crs = 28992)
  coords <- st_coordinates(Spatial_grid_cells)
  
  # Create SpatialPoints object
  Spatial_grid_cells_sp <- SpatialPoints(coords, proj4string = CRS("+init=epsg:28992"))
  
  # Delete superflous columns for the income percentiles data
  pivot$VRLVIERKANT100M <- NULL
  
  # Order columns in the right order
  col_names <- as.numeric(colnames(pivot))
  
  # Re-order the dataframe
  pivot <- pivot[, order(col_names)]
  
  # Initialize a list to store each database for the income percentile pairwise calculation
  list_of_databases <- list()
  
  # Loop through each combination
  for(i in 1:(ncol(pivot)-1)) {
    # Calculate the cumulative sum of people up to the current percentile
    sumUpToCurrentPerc <- rowSums(pivot[, 1:i], na.rm = TRUE)
    
    # Calculate the sum of people above the current percentile
    sumAboveCurrentPerc <- rowSums(pivot[, (i+1):ncol(pivot)], na.rm = TRUE)
    
    # Create a new dataframe for the current percentile
    newData <- data.frame(sumUpToCurrentPerc = sumUpToCurrentPerc, sumAboveCurrentPerc = sumAboveCurrentPerc)
    
    # Store it in the list
    list_of_databases[[i]] <- newData
  }
  
  # Extract specific databases from it
  indices <- seq(1,99)
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    assign(database_name,list_of_databases[[i]])
  }
  
  # Initialize a list for storing SITI values
  SITI_results <- list()
  
  # Loop through the indices and calculate SITI results
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    SITI_results_name <- paste0("SITI_result", i)
    
    # Calculate SITI
    assign(SITI_results_name, spseg(x = Spatial_grid_cells_sp, data = get(database_name), method = "information", smoothing = "kernel", sigma = 4000, useC = FALSE))
    
    # Store the result in the list
    SITI_results[[SITI_results_name]] <- get(SITI_results_name)
    
  }
  
  # Transform the results into something readable
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    
    #Transform the SITI result
    assign(SITI_result_name, get(SITI_result_name)@h)
  }
  
  
  # Prepare them for being saved and plotted
  combined_data_list <- list()
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    df <- data.frame(Value = get(SITI_result_name), Group = as.character(i))
    combined_data_list[[SITI_result_name]] <- df
  }
  
  # Make them be in a single database
  combined_data <- do.call(rbind, combined_data_list)
  
  # Convert the Group column to a factor with custom levels
  combined_data$Group <- factor(combined_data$Group, levels = as.character(indices))
  
  # Save the data
  write.csv(combined_data, file = paste0("Movable_wealth_per_adult_capita_segregation_4000m_2014_", City, ".csv"), row.names = FALSE)
}

# Delete data for enabling further operations
rm(wealth2014)
rm(households_2014)




#### Calculate Spatial Information Theory Index for movable wealth segregation per adult capita in 2013 ####

# extract observations from the "address" database corresponding to the year 2013, taking the 1st of January as reference
households_2013 <- households %>%
  #mutate(start_date = as.Date(as.character(start_date), format = "%Y%m%d"),
  #end_date = as.Date(as.character(end_date), format = "%Y%m%d")) %>%
  filter(start_date < as.Date("2013-01-02") & end_date > as.Date("2013-01-01"))

# Load the data from 2013
wealth2013 <- read.csv("raw_data/database_2013.csv") # only if not already loaded

# Merge both databases
wealth2013 <- merge(wealth2013, households_2013, by ="ID", all.x = TRUE, all.y = FALSE)

# Calculate the number of adults
wealth2013 <- wealth2013 %>%
  mutate(number_of_adults = AANTALPERSHH - AANTALKINDHH)

# Delete missing values of wealth
wealth2013 <- wealth2013[wealth2013$VEHW1110FINH != 99999999999, ]
wealth2013 <- wealth2013[wealth2013$VEHW1130ONDH != 99999999999, ]
wealth2013 <- wealth2013[wealth2013$VEHW1140ABEH != 99999999999, ]
wealth2013 <- wealth2013[wealth2013$VEHW1150OVEH != 99999999999, ]

# Calculate movable wealth
wealth2013$Movable_wealth <- wealth2013$VEHW1110FINH + wealth2013$VEHW1130ONDH + wealth2013$VEHW1140ABEH + wealth2013$VEHW1150OVEH

# Calculate movable wealth per adult capita
wealth2013$Movable_wealth_per_adult_capita <- (wealth2013$Movable_wealth/wealth2013$number_of_adults)

# Define all possible FUAs
cities <- c("'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar","Almelo", "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem", "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven", "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden", "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal", "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal", "Venlo", "Zwolle")

# Order values according to their real estate wealth within every FUA
wealth2013 <- wealth2013 %>%
  arrange(FUA, Movable_wealth_per_adult_capita)

# Calculate the wealth percentile rank every household belongs to in their FUA
wealth2013 <- wealth2013 %>%
  group_by(FUA) %>% 
  mutate(Movable_wealth_per_adult_capita_percentile = ntile(Movable_wealth_per_adult_capita, 100))

# Clean rows of missing data 
wealth2013 <- wealth2013 %>%
  filter(VRLVIERKANT100M != "----------")

# Correct coordinates to make sure they represent meters and not kilometers
wealth2013 <- wealth2013 %>%
  mutate(VRLVIERKANT100M = gsub("(E)(\\d+)(N)(\\d+)","\\1\\200\\3\\400", VRLVIERKANT100M))

# Loop over each city (sigma = 4000m)
for (city in cities) {
  # Set the variable city for the given city
  City <- city
  
  # Re-start the original database
  df <- wealth2013
  
  # Create the dataframe out of the wealth database  for the FUA we are interested on
  df <- df %>%
    filter(FUA == City)
  
  # Keep only relevant columns
  df <- df %>%
    select(VRLVIERKANT100M, Movable_wealth_per_adult_capita_percentile)
  df$FUA <- NULL
  
  # create a pivot table
  pivot <- df %>%
    group_by(VRLVIERKANT100M, Movable_wealth_per_adult_capita_percentile) %>%
    summarise(Population = n()) %>%
    tidyr::pivot_wider(names_from = Movable_wealth_per_adult_capita_percentile, values_from = Population, values_fill = 0)
  
  # Create an element for only the coordinates of the grid cells
  grid_cells <- pivot %>%
    select(VRLVIERKANT100M)
  
  # Parse the coordinates
  grid_cells <- grid_cells %>%
    mutate(
      x = as.numeric(sub("E", "", str_extract(VRLVIERKANT100M, "E\\d+"))),
      y = as.numeric(sub("N", "", str_extract(VRLVIERKANT100M, "N\\d+"))),
    )
  
  # Delete superflous columns for coordinates
  grid_cells$VRLVIERKANT100M <- NULL
  
  # Create a spatial object for grid cells
  Spatial_grid_cells <- st_as_sf(grid_cells, coords = c("x", "y"), crs = 28992)
  coords <- st_coordinates(Spatial_grid_cells)
  
  # Create SpatialPoints object
  Spatial_grid_cells_sp <- SpatialPoints(coords, proj4string = CRS("+init=epsg:28992"))
  
  # Delete superflous columns for the income percentiles data
  pivot$VRLVIERKANT100M <- NULL
  
  # Order columns in the right order
  col_names <- as.numeric(colnames(pivot))
  
  # Re-order the dataframe
  pivot <- pivot[, order(col_names)]
  
  # Initialize a list to store each database for the income percentile pairwise calculation
  list_of_databases <- list()
  
  # Loop through each combination
  for(i in 1:(ncol(pivot)-1)) {
    # Calculate the cumulative sum of people up to the current percentile
    sumUpToCurrentPerc <- rowSums(pivot[, 1:i], na.rm = TRUE)
    
    # Calculate the sum of people above the current percentile
    sumAboveCurrentPerc <- rowSums(pivot[, (i+1):ncol(pivot)], na.rm = TRUE)
    
    # Create a new dataframe for the current percentile
    newData <- data.frame(sumUpToCurrentPerc = sumUpToCurrentPerc, sumAboveCurrentPerc = sumAboveCurrentPerc)
    
    # Store it in the list
    list_of_databases[[i]] <- newData
  }
  
  # Extract specific databases from it
  indices <- seq(1,99)
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    assign(database_name,list_of_databases[[i]])
  }
  
  # Initialize a list for storing SITI values
  SITI_results <- list()
  
  # Loop through the indices and calculate SITI results
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    SITI_results_name <- paste0("SITI_result", i)
    
    # Calculate SITI
    assign(SITI_results_name, spseg(x = Spatial_grid_cells_sp, data = get(database_name), method = "information", smoothing = "kernel", sigma = 4000, useC = FALSE))
    
    # Store the result in the list
    SITI_results[[SITI_results_name]] <- get(SITI_results_name)
    
  }
  
  # Transform the results into something readable
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    
    #Transform the SITI result
    assign(SITI_result_name, get(SITI_result_name)@h)
  }
  
  
  # Prepare them for being saved and plotted
  combined_data_list <- list()
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    df <- data.frame(Value = get(SITI_result_name), Group = as.character(i))
    combined_data_list[[SITI_result_name]] <- df
  }
  
  # Make them be in a single database
  combined_data <- do.call(rbind, combined_data_list)
  
  # Convert the Group column to a factor with custom levels
  combined_data$Group <- factor(combined_data$Group, levels = as.character(indices))
  
  # Save the data
  write.csv(combined_data, file = paste0("Movable_wealth_per_adult_capita_segregation_4000m_2013_", City, ".csv"), row.names = FALSE)
}

# Delete data for enabling further operations
rm(wealth2013)
rm(households_2013)



#### Calculate Spatial Information Theory Index for movable wealth segregation per adult capita in 2012 ####

# extract observations from the "address" database corresponding to the year 2012, taking the 1st of January as reference
households_2012 <- households %>%
  #mutate(start_date = as.Date(as.character(start_date), format = "%Y%m%d"),
  #end_date = as.Date(as.character(end_date), format = "%Y%m%d")) %>%
  filter(start_date < as.Date("2012-01-02") & end_date > as.Date("2012-01-01"))

# Load the data from 2012
wealth2012 <- read.csv("raw_data/database_2012.csv") # only if not already loaded

# Merge both databases
wealth2012 <- merge(wealth2012, households_2012, by ="ID", all.x = TRUE, all.y = FALSE)

# Calculate the number of adults
wealth2012 <- wealth2012 %>%
  mutate(number_of_adults = AANTALPERSHH - AANTALKINDHH)

# Delete missing values of wealth
wealth2012 <- wealth2012[wealth2012$VEHW1110FINH != 99999999999, ]
wealth2012 <- wealth2012[wealth2012$VEHW1130ONDH != 99999999999, ]
wealth2012 <- wealth2012[wealth2012$VEHW1140ABEH != 99999999999, ]
wealth2012 <- wealth2012[wealth2012$VEHW1150OVEH != 99999999999, ]

# Calculate movable wealth
wealth2012$Movable_wealth <- wealth2012$VEHW1110FINH + wealth2012$VEHW1130ONDH + wealth2012$VEHW1140ABEH + wealth2012$VEHW1150OVEH

# Calculate movable wealth per adult capita
wealth2012$Movable_wealth_per_adult_capita <- (wealth2012$Movable_wealth/wealth2012$number_of_adults)

# Define all possible FUAs
cities <- c("'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar","Almelo", "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem", "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven", "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden", "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal", "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal", "Venlo", "Zwolle")

# Order values according to their real estate wealth within every FUA
wealth2012 <- wealth2012 %>%
  arrange(FUA, Movable_wealth_per_adult_capita)

# Calculate the wealth percentile rank every household belongs to in their FUA
wealth2012 <- wealth2012 %>%
  group_by(FUA) %>% 
  mutate(Movable_wealth_per_adult_capita_percentile = ntile(Movable_wealth_per_adult_capita, 100))

# Clean rows of missing data 
wealth2012 <- wealth2012 %>%
  filter(VRLVIERKANT100M != "----------")

# Correct coordinates to make sure they represent meters and not kilometers
wealth2012 <- wealth2012 %>%
  mutate(VRLVIERKANT100M = gsub("(E)(\\d+)(N)(\\d+)","\\1\\200\\3\\400", VRLVIERKANT100M))

# Loop over each city (sigma = 4000m)
for (city in cities) {
  # Set the variable city for the given city
  City <- city
  
  # Re-start the original database
  df <- wealth2012
  
  # Create the dataframe out of the wealth database  for the FUA we are interested on
  df <- df %>%
    filter(FUA == City)
  
  # Keep only relevant columns
  df <- df %>%
    select(VRLVIERKANT100M, Movable_wealth_per_adult_capita_percentile)
  df$FUA <- NULL
  
  # create a pivot table
  pivot <- df %>%
    group_by(VRLVIERKANT100M, Movable_wealth_per_adult_capita_percentile) %>%
    summarise(Population = n()) %>%
    tidyr::pivot_wider(names_from = Movable_wealth_per_adult_capita_percentile, values_from = Population, values_fill = 0)
  
  # Create an element for only the coordinates of the grid cells
  grid_cells <- pivot %>%
    select(VRLVIERKANT100M)
  
  # Parse the coordinates
  grid_cells <- grid_cells %>%
    mutate(
      x = as.numeric(sub("E", "", str_extract(VRLVIERKANT100M, "E\\d+"))),
      y = as.numeric(sub("N", "", str_extract(VRLVIERKANT100M, "N\\d+"))),
    )
  
  # Delete superflous columns for coordinates
  grid_cells$VRLVIERKANT100M <- NULL
  
  # Create a spatial object for grid cells
  Spatial_grid_cells <- st_as_sf(grid_cells, coords = c("x", "y"), crs = 28992)
  coords <- st_coordinates(Spatial_grid_cells)
  
  # Create SpatialPoints object
  Spatial_grid_cells_sp <- SpatialPoints(coords, proj4string = CRS("+init=epsg:28992"))
  
  # Delete superflous columns for the income percentiles data
  pivot$VRLVIERKANT100M <- NULL
  
  # Order columns in the right order
  col_names <- as.numeric(colnames(pivot))
  
  # Re-order the dataframe
  pivot <- pivot[, order(col_names)]
  
  # Initialize a list to store each database for the income percentile pairwise calculation
  list_of_databases <- list()
  
  # Loop through each combination
  for(i in 1:(ncol(pivot)-1)) {
    # Calculate the cumulative sum of people up to the current percentile
    sumUpToCurrentPerc <- rowSums(pivot[, 1:i], na.rm = TRUE)
    
    # Calculate the sum of people above the current percentile
    sumAboveCurrentPerc <- rowSums(pivot[, (i+1):ncol(pivot)], na.rm = TRUE)
    
    # Create a new dataframe for the current percentile
    newData <- data.frame(sumUpToCurrentPerc = sumUpToCurrentPerc, sumAboveCurrentPerc = sumAboveCurrentPerc)
    
    # Store it in the list
    list_of_databases[[i]] <- newData
  }
  
  # Extract specific databases from it
  indices <- seq(1,99)
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    assign(database_name,list_of_databases[[i]])
  }
  
  # Initialize a list for storing SITI values
  SITI_results <- list()
  
  # Loop through the indices and calculate SITI results
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    SITI_results_name <- paste0("SITI_result", i)
    
    # Calculate SITI
    assign(SITI_results_name, spseg(x = Spatial_grid_cells_sp, data = get(database_name), method = "information", smoothing = "kernel", sigma = 4000, useC = FALSE))
    
    # Store the result in the list
    SITI_results[[SITI_results_name]] <- get(SITI_results_name)
    
  }
  
  # Transform the results into something readable
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    
    #Transform the SITI result
    assign(SITI_result_name, get(SITI_result_name)@h)
  }
  
  
  # Prepare them for being saved and plotted
  combined_data_list <- list()
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    df <- data.frame(Value = get(SITI_result_name), Group = as.character(i))
    combined_data_list[[SITI_result_name]] <- df
  }
  
  # Make them be in a single database
  combined_data <- do.call(rbind, combined_data_list)
  
  # Convert the Group column to a factor with custom levels
  combined_data$Group <- factor(combined_data$Group, levels = as.character(indices))
  
  # Save the data
  write.csv(combined_data, file = paste0("Movable_wealth_per_adult_capita_segregation_4000m_2012_", City, ".csv"), row.names = FALSE)
}

# Delete data for enabling further operations
rm(wealth2012)
rm(households_2012)



#### Calculate Spatial Information Theory Index for movable wealth segregation per adult capita in 2011 ####

# extract observations from the "address" database corresponding to the year 2011, taking the 1st of January as reference
households_2011 <- households %>%
  #mutate(start_date = as.Date(as.character(start_date), format = "%Y%m%d"),
  #end_date = as.Date(as.character(end_date), format = "%Y%m%d")) %>%
  filter(start_date < as.Date("2011-01-02") & end_date > as.Date("2011-01-01"))

# Load the data from 2011
wealth2011 <- read.csv("raw_data/database_2011.csv") # only if not already loaded

# Merge both databases
wealth2011 <- merge(wealth2011, households_2011, by ="ID", all.x = TRUE, all.y = FALSE)

# Calculate the number of adults
wealth2011 <- wealth2011 %>%
  mutate(number_of_adults = AANTALPERSHH - AANTALKINDHH)

# Delete missing values of wealth
wealth2011 <- wealth2011[wealth2011$VEHW1110FINH != 99999999999, ]
wealth2011 <- wealth2011[wealth2011$VEHW1130ONDH != 99999999999, ]
wealth2011 <- wealth2011[wealth2011$VEHW1140ABEH != 99999999999, ]
wealth2011 <- wealth2011[wealth2011$VEHW1150OVEH != 99999999999, ]

# Calculate movable wealth
wealth2011$Movable_wealth <- wealth2011$VEHW1110FINH + wealth2011$VEHW1130ONDH + wealth2011$VEHW1140ABEH + wealth2011$VEHW1150OVEH

# Calculate movable wealth per adult capita
wealth2011$Movable_wealth_per_adult_capita <- (wealth2011$Movable_wealth/wealth2011$number_of_adults)

# Define all possible FUAs
cities <- c("'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar","Almelo", "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem", "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven", "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden", "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal", "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal", "Venlo", "Zwolle")

# Order values according to their real estate wealth within every FUA
wealth2011 <- wealth2011 %>%
  arrange(FUA, Movable_wealth_per_adult_capita)

# Calculate the wealth percentile rank every household belongs to in their FUA
wealth2011 <- wealth2011 %>%
  group_by(FUA) %>% 
  mutate(Movable_wealth_per_adult_capita_percentile = ntile(Movable_wealth_per_adult_capita, 100))

# Clean rows of missing data 
wealth2011 <- wealth2011 %>%
  filter(VRLVIERKANT100M != "----------")

# Correct coordinates to make sure they represent meters and not kilometers
wealth2011 <- wealth2011 %>%
  mutate(VRLVIERKANT100M = gsub("(E)(\\d+)(N)(\\d+)","\\1\\200\\3\\400", VRLVIERKANT100M))

# Loop over each city (sigma = 4000m)
for (city in cities) {
  # Set the variable city for the given city
  City <- city
  
  # Re-start the original database
  df <- wealth2011
  
  # Create the dataframe out of the wealth database  for the FUA we are interested on
  df <- df %>%
    filter(FUA == City)
  
  # Keep only relevant columns
  df <- df %>%
    select(VRLVIERKANT100M, Movable_wealth_per_adult_capita_percentile)
  df$FUA <- NULL
  
  # create a pivot table
  pivot <- df %>%
    group_by(VRLVIERKANT100M, Movable_wealth_per_adult_capita_percentile) %>%
    summarise(Population = n()) %>%
    tidyr::pivot_wider(names_from = Movable_wealth_per_adult_capita_percentile, values_from = Population, values_fill = 0)
  
  # Create an element for only the coordinates of the grid cells
  grid_cells <- pivot %>%
    select(VRLVIERKANT100M)
  
  # Parse the coordinates
  grid_cells <- grid_cells %>%
    mutate(
      x = as.numeric(sub("E", "", str_extract(VRLVIERKANT100M, "E\\d+"))),
      y = as.numeric(sub("N", "", str_extract(VRLVIERKANT100M, "N\\d+"))),
    )
  
  # Delete superflous columns for coordinates
  grid_cells$VRLVIERKANT100M <- NULL
  
  # Create a spatial object for grid cells
  Spatial_grid_cells <- st_as_sf(grid_cells, coords = c("x", "y"), crs = 28992)
  coords <- st_coordinates(Spatial_grid_cells)
  
  # Create SpatialPoints object
  Spatial_grid_cells_sp <- SpatialPoints(coords, proj4string = CRS("+init=epsg:28992"))
  
  # Delete superflous columns for the income percentiles data
  pivot$VRLVIERKANT100M <- NULL
  
  # Order columns in the right order
  col_names <- as.numeric(colnames(pivot))
  
  # Re-order the dataframe
  pivot <- pivot[, order(col_names)]
  
  # Initialize a list to store each database for the income percentile pairwise calculation
  list_of_databases <- list()
  
  # Loop through each combination
  for(i in 1:(ncol(pivot)-1)) {
    # Calculate the cumulative sum of people up to the current percentile
    sumUpToCurrentPerc <- rowSums(pivot[, 1:i], na.rm = TRUE)
    
    # Calculate the sum of people above the current percentile
    sumAboveCurrentPerc <- rowSums(pivot[, (i+1):ncol(pivot)], na.rm = TRUE)
    
    # Create a new dataframe for the current percentile
    newData <- data.frame(sumUpToCurrentPerc = sumUpToCurrentPerc, sumAboveCurrentPerc = sumAboveCurrentPerc)
    
    # Store it in the list
    list_of_databases[[i]] <- newData
  }
  
  # Extract specific databases from it
  indices <- seq(1,99)
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    assign(database_name,list_of_databases[[i]])
  }
  
  # Initialize a list for storing SITI values
  SITI_results <- list()
  
  # Loop through the indices and calculate SITI results
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    SITI_results_name <- paste0("SITI_result", i)
    
    # Calculate SITI
    assign(SITI_results_name, spseg(x = Spatial_grid_cells_sp, data = get(database_name), method = "information", smoothing = "kernel", sigma = 4000, useC = FALSE))
    
    # Store the result in the list
    SITI_results[[SITI_results_name]] <- get(SITI_results_name)
    
  }
  
  # Transform the results into something readable
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    
    #Transform the SITI result
    assign(SITI_result_name, get(SITI_result_name)@h)
  }
  
  
  # Prepare them for being saved and plotted
  combined_data_list <- list()
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    df <- data.frame(Value = get(SITI_result_name), Group = as.character(i))
    combined_data_list[[SITI_result_name]] <- df
  }
  
  # Make them be in a single database
  combined_data <- do.call(rbind, combined_data_list)
  
  # Convert the Group column to a factor with custom levels
  combined_data$Group <- factor(combined_data$Group, levels = as.character(indices))
  
  # Save the data
  write.csv(combined_data, file = paste0("Movable_wealth_per_adult_capita_segregation_4000m_2011_", City, ".csv"), row.names = FALSE)
}

# Delete data for enabling further operations
rm(wealth2011)
rm(households_2011)
