##### Calculating income segregation taking into account income per capita #####
# 01_load_and_process_data script
# 25/06/2024
#####

# load the required packages
library(haven)
library(ineq)
library(DescTools)
library(tidyverse)
library(OasisR)
library(seg)
library(sf)
library(sp)

# Do not use scientific notation
options(scipen = 9999999)

#### Local environment set at 500m ####

### Calculate Spatial Information Theory Index for income segregation in 2022 ###

# Load the data
# Household-level records for 2022; the steps below read the columns
# INHBESTINKH, INHAHL, FUA and VRLVIERKANT100M.
income2022 <- read.csv("raw_data/database_2022.csv") # only if not already loaded

# Define all possible FUAs
cities <- c("'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar","Almelo", "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem", "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven", "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden", "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal", "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal", "Venlo", "Zwolle")

# Prepare the household table in one pipeline
income2022 <- income2022 %>%
  # Delete missing values of income (coded as 9999999999 in the raw file);
  # filter() also drops rows for which the comparison is NA
  filter(INHBESTINKH != 9999999999) %>%
  # Calculate income per capita in each household
  mutate(Income_per_capita = INHBESTINKH / INHAHL) %>%
  # Order values according to their income within every FUA
  arrange(FUA, Income_per_capita) %>%
  # Calculate the income percentile rank every household belongs to in their
  # FUA, then ungroup so that later verbs do not silently keep running per FUA
  group_by(FUA) %>%
  mutate(income_per_capita_percentile = ntile(Income_per_capita, 100)) %>%
  ungroup() %>%
  # Clean rows of missing data (placeholder grid cell code)
  filter(VRLVIERKANT100M != "----------") %>%
  # Append "00" to the easting and the northing digits of the cell code, i.e.
  # multiply both by 100. "\\200" is backreference 2 followed by a literal
  # "00" (R replacements only know single-digit backreferences); same for
  # "\\400".
  # NOTE(review): presumably this converts 100 m grid units to metres - confirm.
  mutate(VRLVIERKANT100M = gsub("(E)(\\d+)(N)(\\d+)","\\1\\200\\3\\400", VRLVIERKANT100M))

# Loop over each city (sigma = 500m)
#
# For every FUA in `cities`: count households per grid cell and income
# percentile, split the population at each percentile threshold 1..99 into
# "up to" and "above" the threshold, compute the spatial information theory
# index (slot `h` of seg::spseg(), kernel smoothing, sigma = 500) for every
# split and write the 99 values to one CSV per FUA.
# Intermediate results live in lists; no per-threshold global variables
# (database_IncPer_<i>, SITI_result<i>) are created.

# Percentile thresholds to evaluate and the full set of percentile columns
indices <- seq(1, 99)
all_percentiles <- as.character(1:100)

for (city in cities) {
  # Set the variable city for the given city
  City <- city

  # Households of the FUA we are interested in. ungroup() is a no-op on
  # ungrouped input and stops a leftover FUA grouping from being re-added
  # by select().
  city_households <- income2022 %>%
    ungroup() %>%
    filter(FUA == City) %>%
    select(VRLVIERKANT100M, income_per_capita_percentile)

  # A FUA without any household cannot be pivoted; skip it loudly
  if (nrow(city_households) == 0) {
    warning("No households found for FUA ", City, " in 2022; skipping.", call. = FALSE)
    next
  }

  # Pivot table: one row per grid cell, one column per percentile, values are
  # household counts
  pivot <- city_households %>%
    count(VRLVIERKANT100M, income_per_capita_percentile, name = "Population") %>%
    tidyr::pivot_wider(names_from = income_per_capita_percentile, values_from = Population, values_fill = 0)

  # Parse the coordinates out of the cell code ("E<digits>N<digits>")
  cell_codes <- pivot$VRLVIERKANT100M
  coords <- cbind(
    X = as.numeric(sub("E", "", str_extract(cell_codes, "E\\d+"))),
    Y = as.numeric(sub("N", "", str_extract(cell_codes, "N\\d+")))
  )

  # Create SpatialPoints object (EPSG:28992)
  Spatial_grid_cells_sp <- SpatialPoints(coords, proj4string = CRS("+init=epsg:28992"))

  # Delete superfluous column so that only percentile counts remain
  pivot$VRLVIERKANT100M <- NULL

  # A percentile with no household in this FUA has no column yet; add it as
  # all-zero so that column position i always corresponds to percentile i
  for (p in setdiff(all_percentiles, colnames(pivot))) {
    pivot[[p]] <- 0L
  }

  # Order columns by percentile
  pivot <- pivot[, order(as.numeric(colnames(pivot)))]
  counts <- as.matrix(pivot)
  n_groups <- ncol(counts)

  # One two-group table per threshold: households up to percentile i versus
  # households above it, per grid cell
  list_of_databases <- lapply(indices, function(i) {
    data.frame(
      sumUpToCurrentPerc = rowSums(counts[, seq_len(i), drop = FALSE], na.rm = TRUE),
      sumAboveCurrentPerc = rowSums(counts[, (i + 1):n_groups, drop = FALSE], na.rm = TRUE)
    )
  })

  # Calculate SITI for every threshold
  SITI_results <- lapply(list_of_databases, function(db) {
    spseg(x = Spatial_grid_cells_sp, data = db, method = "information", smoothing = "kernel", sigma = 500, useC = FALSE)
  })
  names(SITI_results) <- paste0("SITI_result", indices)

  # Extract the index values into a single database, one row per threshold
  combined_data <- data.frame(
    Value = vapply(SITI_results, function(res) res@h, numeric(1), USE.NAMES = FALSE),
    Group = factor(as.character(indices), levels = as.character(indices))
  )

  # Save the data
  write.csv(combined_data, file = paste0("income_per_capita_segregation_500m_2022_", City, ".csv"), row.names = FALSE)
}

# Delete data for enabling further operations
rm(income2022)




### Calculate Spatial Information Theory Index for income segregation in 2021 ###

# Load the data
# Household-level records for 2021; the steps below read the columns
# INHBESTINKH, INHAHL, FUA and VRLVIERKANT100M.
income2021 <- read.csv("raw_data/database_2021.csv") # only if not already loaded

# Define all possible FUAs
cities <- c("'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar","Almelo", "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem", "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven", "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden", "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal", "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal", "Venlo", "Zwolle")

# Prepare the household table in one pipeline
income2021 <- income2021 %>%
  # Delete missing values of income (coded as 9999999999 in the raw file);
  # filter() also drops rows for which the comparison is NA
  filter(INHBESTINKH != 9999999999) %>%
  # Calculate income per capita in each household
  mutate(Income_per_capita = INHBESTINKH / INHAHL) %>%
  # Order values according to their income within every FUA
  arrange(FUA, Income_per_capita) %>%
  # Calculate the income percentile rank every household belongs to in their
  # FUA, then ungroup so that later verbs do not silently keep running per FUA
  group_by(FUA) %>%
  mutate(income_per_capita_percentile = ntile(Income_per_capita, 100)) %>%
  ungroup() %>%
  # Clean rows of missing data (placeholder grid cell code)
  filter(VRLVIERKANT100M != "----------") %>%
  # Append "00" to the easting and the northing digits of the cell code, i.e.
  # multiply both by 100. "\\200" is backreference 2 followed by a literal
  # "00" (R replacements only know single-digit backreferences); same for
  # "\\400".
  # NOTE(review): presumably this converts 100 m grid units to metres - confirm.
  mutate(VRLVIERKANT100M = gsub("(E)(\\d+)(N)(\\d+)","\\1\\200\\3\\400", VRLVIERKANT100M))

# Loop over each city (sigma = 500m)
#
# For every FUA in `cities`: count households per grid cell and income
# percentile, split the population at each percentile threshold 1..99 into
# "up to" and "above" the threshold, compute the spatial information theory
# index (slot `h` of seg::spseg(), kernel smoothing, sigma = 500) for every
# split and write the 99 values to one CSV per FUA.
# Intermediate results live in lists; no per-threshold global variables
# (database_IncPer_<i>, SITI_result<i>) are created.

# Percentile thresholds to evaluate and the full set of percentile columns
indices <- seq(1, 99)
all_percentiles <- as.character(1:100)

for (city in cities) {
  # Set the variable city for the given city
  City <- city

  # Households of the FUA we are interested in. ungroup() is a no-op on
  # ungrouped input and stops a leftover FUA grouping from being re-added
  # by select().
  city_households <- income2021 %>%
    ungroup() %>%
    filter(FUA == City) %>%
    select(VRLVIERKANT100M, income_per_capita_percentile)

  # A FUA without any household cannot be pivoted; skip it loudly
  if (nrow(city_households) == 0) {
    warning("No households found for FUA ", City, " in 2021; skipping.", call. = FALSE)
    next
  }

  # Pivot table: one row per grid cell, one column per percentile, values are
  # household counts
  pivot <- city_households %>%
    count(VRLVIERKANT100M, income_per_capita_percentile, name = "Population") %>%
    tidyr::pivot_wider(names_from = income_per_capita_percentile, values_from = Population, values_fill = 0)

  # Parse the coordinates out of the cell code ("E<digits>N<digits>")
  cell_codes <- pivot$VRLVIERKANT100M
  coords <- cbind(
    X = as.numeric(sub("E", "", str_extract(cell_codes, "E\\d+"))),
    Y = as.numeric(sub("N", "", str_extract(cell_codes, "N\\d+")))
  )

  # Create SpatialPoints object (EPSG:28992)
  Spatial_grid_cells_sp <- SpatialPoints(coords, proj4string = CRS("+init=epsg:28992"))

  # Delete superfluous column so that only percentile counts remain
  pivot$VRLVIERKANT100M <- NULL

  # A percentile with no household in this FUA has no column yet; add it as
  # all-zero so that column position i always corresponds to percentile i
  for (p in setdiff(all_percentiles, colnames(pivot))) {
    pivot[[p]] <- 0L
  }

  # Order columns by percentile
  pivot <- pivot[, order(as.numeric(colnames(pivot)))]
  counts <- as.matrix(pivot)
  n_groups <- ncol(counts)

  # One two-group table per threshold: households up to percentile i versus
  # households above it, per grid cell
  list_of_databases <- lapply(indices, function(i) {
    data.frame(
      sumUpToCurrentPerc = rowSums(counts[, seq_len(i), drop = FALSE], na.rm = TRUE),
      sumAboveCurrentPerc = rowSums(counts[, (i + 1):n_groups, drop = FALSE], na.rm = TRUE)
    )
  })

  # Calculate SITI for every threshold
  SITI_results <- lapply(list_of_databases, function(db) {
    spseg(x = Spatial_grid_cells_sp, data = db, method = "information", smoothing = "kernel", sigma = 500, useC = FALSE)
  })
  names(SITI_results) <- paste0("SITI_result", indices)

  # Extract the index values into a single database, one row per threshold
  combined_data <- data.frame(
    Value = vapply(SITI_results, function(res) res@h, numeric(1), USE.NAMES = FALSE),
    Group = factor(as.character(indices), levels = as.character(indices))
  )

  # Save the data
  write.csv(combined_data, file = paste0("income_per_capita_segregation_500m_2021_", City, ".csv"), row.names = FALSE)
}

# Delete data for enabling further operations
rm(income2021)




### Calculate Spatial Information Theory Index for income segregation in 2020 ###

# Load the data
# Household-level records for 2020; the steps below read the columns
# INHBESTINKH, INHAHL, FUA and VRLVIERKANT100M.
income2020 <- read.csv("raw_data/database_2020.csv") # only if not already loaded

# Define all possible FUAs
cities <- c("'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar","Almelo", "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem", "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven", "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden", "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal", "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal", "Venlo", "Zwolle")

# Prepare the household table in one pipeline
income2020 <- income2020 %>%
  # Delete missing values of income (coded as 9999999999 in the raw file);
  # filter() also drops rows for which the comparison is NA
  filter(INHBESTINKH != 9999999999) %>%
  # Calculate income per capita in each household
  mutate(Income_per_capita = INHBESTINKH / INHAHL) %>%
  # Order values according to their income within every FUA
  arrange(FUA, Income_per_capita) %>%
  # Calculate the income percentile rank every household belongs to in their
  # FUA, then ungroup so that later verbs do not silently keep running per FUA
  group_by(FUA) %>%
  mutate(income_per_capita_percentile = ntile(Income_per_capita, 100)) %>%
  ungroup() %>%
  # Clean rows of missing data (placeholder grid cell code)
  filter(VRLVIERKANT100M != "----------") %>%
  # Append "00" to the easting and the northing digits of the cell code, i.e.
  # multiply both by 100. "\\200" is backreference 2 followed by a literal
  # "00" (R replacements only know single-digit backreferences); same for
  # "\\400".
  # NOTE(review): presumably this converts 100 m grid units to metres - confirm.
  mutate(VRLVIERKANT100M = gsub("(E)(\\d+)(N)(\\d+)","\\1\\200\\3\\400", VRLVIERKANT100M))

# Loop over each city (sigma = 500m)
#
# For every FUA in `cities`: count households per grid cell and income
# percentile, split the population at each percentile threshold 1..99 into
# "up to" and "above" the threshold, compute the spatial information theory
# index (slot `h` of seg::spseg(), kernel smoothing, sigma = 500) for every
# split and write the 99 values to one CSV per FUA.
# Intermediate results live in lists; no per-threshold global variables
# (database_IncPer_<i>, SITI_result<i>) are created.

# Percentile thresholds to evaluate and the full set of percentile columns
indices <- seq(1, 99)
all_percentiles <- as.character(1:100)

for (city in cities) {
  # Set the variable city for the given city
  City <- city

  # Households of the FUA we are interested in. ungroup() is a no-op on
  # ungrouped input and stops a leftover FUA grouping from being re-added
  # by select().
  city_households <- income2020 %>%
    ungroup() %>%
    filter(FUA == City) %>%
    select(VRLVIERKANT100M, income_per_capita_percentile)

  # A FUA without any household cannot be pivoted; skip it loudly
  if (nrow(city_households) == 0) {
    warning("No households found for FUA ", City, " in 2020; skipping.", call. = FALSE)
    next
  }

  # Pivot table: one row per grid cell, one column per percentile, values are
  # household counts
  pivot <- city_households %>%
    count(VRLVIERKANT100M, income_per_capita_percentile, name = "Population") %>%
    tidyr::pivot_wider(names_from = income_per_capita_percentile, values_from = Population, values_fill = 0)

  # Parse the coordinates out of the cell code ("E<digits>N<digits>")
  cell_codes <- pivot$VRLVIERKANT100M
  coords <- cbind(
    X = as.numeric(sub("E", "", str_extract(cell_codes, "E\\d+"))),
    Y = as.numeric(sub("N", "", str_extract(cell_codes, "N\\d+")))
  )

  # Create SpatialPoints object (EPSG:28992)
  Spatial_grid_cells_sp <- SpatialPoints(coords, proj4string = CRS("+init=epsg:28992"))

  # Delete superfluous column so that only percentile counts remain
  pivot$VRLVIERKANT100M <- NULL

  # A percentile with no household in this FUA has no column yet; add it as
  # all-zero so that column position i always corresponds to percentile i
  for (p in setdiff(all_percentiles, colnames(pivot))) {
    pivot[[p]] <- 0L
  }

  # Order columns by percentile
  pivot <- pivot[, order(as.numeric(colnames(pivot)))]
  counts <- as.matrix(pivot)
  n_groups <- ncol(counts)

  # One two-group table per threshold: households up to percentile i versus
  # households above it, per grid cell
  list_of_databases <- lapply(indices, function(i) {
    data.frame(
      sumUpToCurrentPerc = rowSums(counts[, seq_len(i), drop = FALSE], na.rm = TRUE),
      sumAboveCurrentPerc = rowSums(counts[, (i + 1):n_groups, drop = FALSE], na.rm = TRUE)
    )
  })

  # Calculate SITI for every threshold
  SITI_results <- lapply(list_of_databases, function(db) {
    spseg(x = Spatial_grid_cells_sp, data = db, method = "information", smoothing = "kernel", sigma = 500, useC = FALSE)
  })
  names(SITI_results) <- paste0("SITI_result", indices)

  # Extract the index values into a single database, one row per threshold
  combined_data <- data.frame(
    Value = vapply(SITI_results, function(res) res@h, numeric(1), USE.NAMES = FALSE),
    Group = factor(as.character(indices), levels = as.character(indices))
  )

  # Save the data
  write.csv(combined_data, file = paste0("income_per_capita_segregation_500m_2020_", City, ".csv"), row.names = FALSE)
}

# Delete data for enabling further operations
rm(income2020)


### Calculate Spatial Information Theory Index for income segregation in 2019 ###

# Load the data
# Household-level records for 2019; the steps below read the columns
# INHBESTINKH, INHAHL, FUA and VRLVIERKANT100M.
income2019 <- read.csv("raw_data/database_2019.csv") # only if not already loaded

# Define all possible FUAs
cities <- c("'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar","Almelo", "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem", "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven", "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden", "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal", "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal", "Venlo", "Zwolle")

# Prepare the household table in one pipeline
income2019 <- income2019 %>%
  # Delete missing values of income (coded as 9999999999 in the raw file);
  # filter() also drops rows for which the comparison is NA
  filter(INHBESTINKH != 9999999999) %>%
  # Calculate income per capita in each household
  mutate(Income_per_capita = INHBESTINKH / INHAHL) %>%
  # Order values according to their income within every FUA
  arrange(FUA, Income_per_capita) %>%
  # Calculate the income percentile rank every household belongs to in their
  # FUA, then ungroup so that later verbs do not silently keep running per FUA
  group_by(FUA) %>%
  mutate(income_per_capita_percentile = ntile(Income_per_capita, 100)) %>%
  ungroup() %>%
  # Clean rows of missing data (placeholder grid cell code)
  filter(VRLVIERKANT100M != "----------") %>%
  # Append "00" to the easting and the northing digits of the cell code, i.e.
  # multiply both by 100. "\\200" is backreference 2 followed by a literal
  # "00" (R replacements only know single-digit backreferences); same for
  # "\\400".
  # NOTE(review): presumably this converts 100 m grid units to metres - confirm.
  mutate(VRLVIERKANT100M = gsub("(E)(\\d+)(N)(\\d+)","\\1\\200\\3\\400", VRLVIERKANT100M))

# Loop over each city (sigma = 500m)
#
# For every FUA in `cities`: count households per grid cell and income
# percentile, split the population at each percentile threshold 1..99 into
# "up to" and "above" the threshold, compute the spatial information theory
# index (slot `h` of seg::spseg(), kernel smoothing, sigma = 500) for every
# split and write the 99 values to one CSV per FUA.
# Intermediate results live in lists; no per-threshold global variables
# (database_IncPer_<i>, SITI_result<i>) are created.

# Percentile thresholds to evaluate and the full set of percentile columns
indices <- seq(1, 99)
all_percentiles <- as.character(1:100)

for (city in cities) {
  # Set the variable city for the given city
  City <- city

  # Households of the FUA we are interested in. ungroup() is a no-op on
  # ungrouped input and stops a leftover FUA grouping from being re-added
  # by select().
  city_households <- income2019 %>%
    ungroup() %>%
    filter(FUA == City) %>%
    select(VRLVIERKANT100M, income_per_capita_percentile)

  # A FUA without any household cannot be pivoted; skip it loudly
  if (nrow(city_households) == 0) {
    warning("No households found for FUA ", City, " in 2019; skipping.", call. = FALSE)
    next
  }

  # Pivot table: one row per grid cell, one column per percentile, values are
  # household counts
  pivot <- city_households %>%
    count(VRLVIERKANT100M, income_per_capita_percentile, name = "Population") %>%
    tidyr::pivot_wider(names_from = income_per_capita_percentile, values_from = Population, values_fill = 0)

  # Parse the coordinates out of the cell code ("E<digits>N<digits>")
  cell_codes <- pivot$VRLVIERKANT100M
  coords <- cbind(
    X = as.numeric(sub("E", "", str_extract(cell_codes, "E\\d+"))),
    Y = as.numeric(sub("N", "", str_extract(cell_codes, "N\\d+")))
  )

  # Create SpatialPoints object (EPSG:28992)
  Spatial_grid_cells_sp <- SpatialPoints(coords, proj4string = CRS("+init=epsg:28992"))

  # Delete superfluous column so that only percentile counts remain
  pivot$VRLVIERKANT100M <- NULL

  # A percentile with no household in this FUA has no column yet; add it as
  # all-zero so that column position i always corresponds to percentile i
  for (p in setdiff(all_percentiles, colnames(pivot))) {
    pivot[[p]] <- 0L
  }

  # Order columns by percentile
  pivot <- pivot[, order(as.numeric(colnames(pivot)))]
  counts <- as.matrix(pivot)
  n_groups <- ncol(counts)

  # One two-group table per threshold: households up to percentile i versus
  # households above it, per grid cell
  list_of_databases <- lapply(indices, function(i) {
    data.frame(
      sumUpToCurrentPerc = rowSums(counts[, seq_len(i), drop = FALSE], na.rm = TRUE),
      sumAboveCurrentPerc = rowSums(counts[, (i + 1):n_groups, drop = FALSE], na.rm = TRUE)
    )
  })

  # Calculate SITI for every threshold
  SITI_results <- lapply(list_of_databases, function(db) {
    spseg(x = Spatial_grid_cells_sp, data = db, method = "information", smoothing = "kernel", sigma = 500, useC = FALSE)
  })
  names(SITI_results) <- paste0("SITI_result", indices)

  # Extract the index values into a single database, one row per threshold
  combined_data <- data.frame(
    Value = vapply(SITI_results, function(res) res@h, numeric(1), USE.NAMES = FALSE),
    Group = factor(as.character(indices), levels = as.character(indices))
  )

  # Save the data
  write.csv(combined_data, file = paste0("income_per_capita_segregation_500m_2019_", City, ".csv"), row.names = FALSE)
}

# Delete data for enabling further operations
rm(income2019)





### Calculate Spatial Information Theory Index for income segregation in 2018 ###

# Load the data
# Household-level records for 2018; the steps below read the columns
# INHBESTINKH, INHAHL, FUA and VRLVIERKANT100M.
income2018 <- read.csv("raw_data/database_2018.csv") # only if not already loaded

# Define all possible FUAs
cities <- c("'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar","Almelo", "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem", "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven", "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden", "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal", "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal", "Venlo", "Zwolle")

# Prepare the household table in one pipeline
income2018 <- income2018 %>%
  # Delete missing values of income (coded as 9999999999 in the raw file);
  # filter() also drops rows for which the comparison is NA
  filter(INHBESTINKH != 9999999999) %>%
  # Calculate income per capita in each household
  mutate(Income_per_capita = INHBESTINKH / INHAHL) %>%
  # Order values according to their income within every FUA
  arrange(FUA, Income_per_capita) %>%
  # Calculate the income percentile rank every household belongs to in their
  # FUA, then ungroup so that later verbs do not silently keep running per FUA
  group_by(FUA) %>%
  mutate(income_per_capita_percentile = ntile(Income_per_capita, 100)) %>%
  ungroup() %>%
  # Clean rows of missing data (placeholder grid cell code)
  filter(VRLVIERKANT100M != "----------") %>%
  # Append "00" to the easting and the northing digits of the cell code, i.e.
  # multiply both by 100. "\\200" is backreference 2 followed by a literal
  # "00" (R replacements only know single-digit backreferences); same for
  # "\\400".
  # NOTE(review): presumably this converts 100 m grid units to metres - confirm.
  mutate(VRLVIERKANT100M = gsub("(E)(\\d+)(N)(\\d+)","\\1\\200\\3\\400", VRLVIERKANT100M))

# Loop over each city (sigma = 500m)
#
# For every FUA in `cities`: count households per grid cell and income
# percentile, split the population at each percentile threshold 1..99 into
# "up to" and "above" the threshold, compute the spatial information theory
# index (slot `h` of seg::spseg(), kernel smoothing, sigma = 500) for every
# split and write the 99 values to one CSV per FUA.
# Intermediate results live in lists; no per-threshold global variables
# (database_IncPer_<i>, SITI_result<i>) are created.

# Percentile thresholds to evaluate and the full set of percentile columns
indices <- seq(1, 99)
all_percentiles <- as.character(1:100)

for (city in cities) {
  # Set the variable city for the given city
  City <- city

  # Households of the FUA we are interested in. ungroup() is a no-op on
  # ungrouped input and stops a leftover FUA grouping from being re-added
  # by select().
  city_households <- income2018 %>%
    ungroup() %>%
    filter(FUA == City) %>%
    select(VRLVIERKANT100M, income_per_capita_percentile)

  # A FUA without any household cannot be pivoted; skip it loudly
  if (nrow(city_households) == 0) {
    warning("No households found for FUA ", City, " in 2018; skipping.", call. = FALSE)
    next
  }

  # Pivot table: one row per grid cell, one column per percentile, values are
  # household counts
  pivot <- city_households %>%
    count(VRLVIERKANT100M, income_per_capita_percentile, name = "Population") %>%
    tidyr::pivot_wider(names_from = income_per_capita_percentile, values_from = Population, values_fill = 0)

  # Parse the coordinates out of the cell code ("E<digits>N<digits>")
  cell_codes <- pivot$VRLVIERKANT100M
  coords <- cbind(
    X = as.numeric(sub("E", "", str_extract(cell_codes, "E\\d+"))),
    Y = as.numeric(sub("N", "", str_extract(cell_codes, "N\\d+")))
  )

  # Create SpatialPoints object (EPSG:28992)
  Spatial_grid_cells_sp <- SpatialPoints(coords, proj4string = CRS("+init=epsg:28992"))

  # Delete superfluous column so that only percentile counts remain
  pivot$VRLVIERKANT100M <- NULL

  # A percentile with no household in this FUA has no column yet; add it as
  # all-zero so that column position i always corresponds to percentile i
  for (p in setdiff(all_percentiles, colnames(pivot))) {
    pivot[[p]] <- 0L
  }

  # Order columns by percentile
  pivot <- pivot[, order(as.numeric(colnames(pivot)))]
  counts <- as.matrix(pivot)
  n_groups <- ncol(counts)

  # One two-group table per threshold: households up to percentile i versus
  # households above it, per grid cell
  list_of_databases <- lapply(indices, function(i) {
    data.frame(
      sumUpToCurrentPerc = rowSums(counts[, seq_len(i), drop = FALSE], na.rm = TRUE),
      sumAboveCurrentPerc = rowSums(counts[, (i + 1):n_groups, drop = FALSE], na.rm = TRUE)
    )
  })

  # Calculate SITI for every threshold
  SITI_results <- lapply(list_of_databases, function(db) {
    spseg(x = Spatial_grid_cells_sp, data = db, method = "information", smoothing = "kernel", sigma = 500, useC = FALSE)
  })
  names(SITI_results) <- paste0("SITI_result", indices)

  # Extract the index values into a single database, one row per threshold
  combined_data <- data.frame(
    Value = vapply(SITI_results, function(res) res@h, numeric(1), USE.NAMES = FALSE),
    Group = factor(as.character(indices), levels = as.character(indices))
  )

  # Save the data
  write.csv(combined_data, file = paste0("income_per_capita_segregation_500m_2018_", City, ".csv"), row.names = FALSE)
}

# Delete data for enabling further operations
rm(income2018)




### Calculate Spatial Information Theory Index for income segregation in 2017 ###

# Load the data
# Household-level records for 2017; the steps below read the columns
# INHBESTINKH, INHAHL, FUA and VRLVIERKANT100M.
income2017 <- read.csv("raw_data/database_2017.csv") # only if not already loaded

# Define all possible FUAs
cities <- c("'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar","Almelo", "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem", "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven", "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden", "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal", "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal", "Venlo", "Zwolle")

# Prepare the household table in one pipeline
income2017 <- income2017 %>%
  # Delete missing values of income (coded as 9999999999 in the raw file);
  # filter() also drops rows for which the comparison is NA
  filter(INHBESTINKH != 9999999999) %>%
  # Calculate income per capita in each household
  mutate(Income_per_capita = INHBESTINKH / INHAHL) %>%
  # Order values according to their income within every FUA
  arrange(FUA, Income_per_capita) %>%
  # Calculate the income percentile rank every household belongs to in their
  # FUA, then ungroup so that later verbs do not silently keep running per FUA
  group_by(FUA) %>%
  mutate(income_per_capita_percentile = ntile(Income_per_capita, 100)) %>%
  ungroup() %>%
  # Clean rows of missing data (placeholder grid cell code)
  filter(VRLVIERKANT100M != "----------") %>%
  # Append "00" to the easting and the northing digits of the cell code, i.e.
  # multiply both by 100. "\\200" is backreference 2 followed by a literal
  # "00" (R replacements only know single-digit backreferences); same for
  # "\\400".
  # NOTE(review): presumably this converts 100 m grid units to metres - confirm.
  mutate(VRLVIERKANT100M = gsub("(E)(\\d+)(N)(\\d+)","\\1\\200\\3\\400", VRLVIERKANT100M))

# Loop over each city (sigma = 500m)
for (city in cities) {
  # Set the variable city for the given city
  City <- city
  
  # Re-start the original database
  df <- income2017
  
  # Create the dataframe out of the income database  for the FUA we are interested on
  df <- df %>%
    filter(FUA == City)
  
  # Keep only relevant columns
  df <- df %>%
    select(VRLVIERKANT100M, income_per_capita_percentile)
  df$FUA <- NULL
  
  # create a pivot table
  pivot <- df %>%
    group_by(VRLVIERKANT100M, income_per_capita_percentile) %>%
    summarise(Population = n()) %>%
    tidyr::pivot_wider(names_from = income_per_capita_percentile, values_from = Population, values_fill = 0)
  
  # Create an element for only the coordinates of the grid cells
  grid_cells <- pivot %>%
    select(VRLVIERKANT100M)
  
  # Parse the coordinates
  grid_cells <- grid_cells %>%
    mutate(
      x = as.numeric(sub("E", "", str_extract(VRLVIERKANT100M, "E\\d+"))),
      y = as.numeric(sub("N", "", str_extract(VRLVIERKANT100M, "N\\d+"))),
    )
  
  # Delete superflous columns for coordinates
  grid_cells$VRLVIERKANT100M <- NULL
  
  # Create a spatial object for grid cells
  Spatial_grid_cells <- st_as_sf(grid_cells, coords = c("x", "y"), crs = 28992)
  coords <- st_coordinates(Spatial_grid_cells)
  
  # Create SpatialPoints object
  Spatial_grid_cells_sp <- SpatialPoints(coords, proj4string = CRS("+init=epsg:28992"))
  
  # Delete superflous columns for the income percentiles data
  pivot$VRLVIERKANT100M <- NULL
  
  # Order columns in the right order
  col_names <- as.numeric(colnames(pivot))
  
  # Re-order the dataframe
  pivot <- pivot[, order(col_names)]
  
  # Initialize a list to store each database for the income percentile pairwise calculation
  list_of_databases <- list()
  
  # Loop through each combination
  for(i in 1:(ncol(pivot)-1)) {
    # Calculate the cumulative sum of people up to the current percentile
    sumUpToCurrentPerc <- rowSums(pivot[, 1:i], na.rm = TRUE)
    
    # Calculate the sum of people above the current percentile
    sumAboveCurrentPerc <- rowSums(pivot[, (i+1):ncol(pivot)], na.rm = TRUE)
    
    # Create a new dataframe for the current percentile
    newData <- data.frame(sumUpToCurrentPerc = sumUpToCurrentPerc, sumAboveCurrentPerc = sumAboveCurrentPerc)
    
    # Store it in the list
    list_of_databases[[i]] <- newData
  }
  
  # Extract specific databases from it
  indices <- seq(1,99)
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    assign(database_name,list_of_databases[[i]])
  }
  
  # Initialize a list for storing SITI values
  SITI_results <- list()
  
  # Loop through the indices and calculate SITI results
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    SITI_results_name <- paste0("SITI_result", i)
    
    # Calculate SITI
    assign(SITI_results_name, spseg(x = Spatial_grid_cells_sp, data = get(database_name), method = "information", smoothing = "kernel", sigma = 500, useC = FALSE))
    
    # Store the result in the list
    SITI_results[[SITI_results_name]] <- get(SITI_results_name)
    
  }
  
  # Transform the results into something readable
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    
    #Transform the SITI result
    assign(SITI_result_name, get(SITI_result_name)@h)
  }
  
  
  # Prepare them for being saved and plotted
  combined_data_list <- list()
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    df <- data.frame(Value = get(SITI_result_name), Group = as.character(i))
    combined_data_list[[SITI_result_name]] <- df
  }
  
  # Make them be in a single database
  combined_data <- do.call(rbind, combined_data_list)
  
  # Convert the Group column to a factor with custom levels
  combined_data$Group <- factor(combined_data$Group, levels = as.character(indices))
  
  # Save the data
  write.csv(combined_data, file = paste0("income_per_capita_segregation_500m_2017_", City, ".csv"), row.names = FALSE)
}

# Drop the 2017 table from the workspace before the next year is loaded
rm(list = "income2017")




### Calculate Spatial Information Theory Index for income segregation in 2016 ###

# Load the 2016 household data
income2016 <- read.csv("raw_data/database_2016.csv") # only if not already loaded

# Define all possible FUAs
cities <- c(
  "'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar", "Almelo",
  "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem",
  "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven",
  "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden",
  "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal",
  "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal",
  "Venlo", "Zwolle"
)

income2016 <- income2016 %>%
  # Delete missing values of income (sentinel 9999999999). filter() also drops
  # NA incomes, whereas base logical subsetting would turn them into all-NA rows
  filter(INHBESTINKH != 9999999999) %>%
  # Income per capita: household income divided by INHAHL
  # (presumably the household size -- confirm against the codebook)
  mutate(Income_per_capita = INHBESTINKH / INHAHL) %>%
  # Order values according to their income within every FUA; ntile() breaks
  # ties by row position, so this ordering is part of the result
  arrange(FUA, Income_per_capita) %>%
  # Income percentile rank (1-100) of every household within its FUA
  group_by(FUA) %>%
  mutate(income_per_capita_percentile = ntile(Income_per_capita, 100)) %>%
  # Drop the grouping so later verbs do not silently carry FUA along
  ungroup() %>%
  # Clean rows without a grid cell code
  filter(VRLVIERKANT100M != "----------") %>%
  # Append "00" to the E and N numbers of the cell code, i.e. multiply both
  # coordinates by 100 so that they are expressed in meters
  # (the code presumably stores them in units of 100 m -- confirm)
  mutate(
    VRLVIERKANT100M = gsub(
      "(E)(\\d+)(N)(\\d+)", "\\1\\200\\3\\400", VRLVIERKANT100M
    )
  )

# Loop over each city (sigma = 500m)
# For every FUA: count households per grid cell and income percentile, then
# for each threshold 1..99 compute the spatial information theory index (SITI)
# between the households at or below the threshold and those above it, and
# write the 99 values to one CSV file per city.
for (city in cities) {
  # Set the variable city for the given city
  City <- city

  # Households of the current FUA; only the grid cell and the percentile are
  # needed. ungroup() makes this step independent of any grouping that is
  # still attached to the yearly table.
  df <- income2016 %>%
    ungroup() %>%
    filter(FUA == City) %>%
    select(VRLVIERKANT100M, income_per_capita_percentile)

  # A FUA without households would otherwise abort the whole run
  if (nrow(df) == 0) {
    warning(
      "No households found for FUA '", City, "' in 2016; city skipped",
      call. = FALSE
    )
    next
  }

  # Pivot table: one row per grid cell, one column per income percentile,
  # values are household counts
  pivot <- df %>%
    count(
      VRLVIERKANT100M, income_per_capita_percentile,
      name = "Population"
    ) %>%
    tidyr::pivot_wider(
      names_from = income_per_capita_percentile,
      values_from = Population,
      values_fill = 0
    )

  # Parse the coordinates out of the grid cell codes
  cell_codes <- pivot$VRLVIERKANT100M
  grid_cells <- data.frame(
    x = as.numeric(sub("E", "", str_extract(cell_codes, "E\\d+"))),
    y = as.numeric(sub("N", "", str_extract(cell_codes, "N\\d+")))
  )

  # Create a spatial object for grid cells and the SpatialPoints object that
  # spseg() receives
  Spatial_grid_cells <- st_as_sf(grid_cells, coords = c("x", "y"), crs = 28992)
  coords <- st_coordinates(Spatial_grid_cells)
  Spatial_grid_cells_sp <- SpatialPoints(
    coords,
    proj4string = CRS("+init=epsg:28992")
  )

  # Keep only the count columns
  pivot$VRLVIERKANT100M <- NULL

  # Guarantee one column per percentile 1..100, in numeric order. A percentile
  # with no household left in this city gets an all-zero column, so column
  # position always equals percentile and thresholds 1..99 always exist.
  percentile_cols <- as.character(1:100)
  for (missing_col in setdiff(percentile_cols, colnames(pivot))) {
    pivot[[missing_col]] <- 0L
  }
  other_cols <- setdiff(colnames(pivot), percentile_cols)
  pivot <- pivot[, c(percentile_cols, other_cols)]

  # NOTE(review): households with an NA percentile (an "NA" column, if any)
  # are still counted in the "above" group, as before -- confirm this is
  # intended.
  count_matrix <- as.matrix(pivot)
  cell_totals <- rowSums(count_matrix)

  # One two-group table per threshold: households in percentiles 1..i versus
  # all remaining households, per grid cell
  indices <- seq(1, 99)
  list_of_databases <- lapply(indices, function(i) {
    sum_up_to <- rowSums(count_matrix[, seq_len(i), drop = FALSE])
    data.frame(
      sumUpToCurrentPerc = sum_up_to,
      sumAboveCurrentPerc = cell_totals - sum_up_to
    )
  })

  # Calculate SITI for every threshold
  SITI_results <- lapply(list_of_databases, function(database) {
    spseg(
      x = Spatial_grid_cells_sp, data = database, method = "information",
      smoothing = "kernel", sigma = 500, useC = FALSE
    )
  })
  names(SITI_results) <- paste0("SITI_result", indices)

  # Extract the index (slot h) of every result together with its threshold
  combined_data_list <- Map(
    function(result, i) data.frame(Value = result@h, Group = as.character(i)),
    SITI_results, indices
  )

  # Make them be in a single database
  combined_data <- do.call(rbind, combined_data_list)

  # Convert the Group column to a factor with custom levels
  combined_data$Group <- factor(
    combined_data$Group,
    levels = as.character(indices)
  )

  # Save the data
  write.csv(combined_data, file = paste0("income_per_capita_segregation_500m_2016_", City, ".csv"), row.names = FALSE)
}

# Delete data for enabling further operations
rm(income2016)


### Calculate Spatial Information Theory Index for income segregation in 2015 ###

# Load the 2015 household data
income2015 <- read.csv("raw_data/database_2015.csv") # only if not already loaded

# Define all possible FUAs
cities <- c(
  "'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar", "Almelo",
  "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem",
  "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven",
  "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden",
  "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal",
  "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal",
  "Venlo", "Zwolle"
)

income2015 <- income2015 %>%
  # Delete missing values of income (sentinel 9999999999). filter() also drops
  # NA incomes, whereas base logical subsetting would turn them into all-NA rows
  filter(INHBESTINKH != 9999999999) %>%
  # Income per capita: household income divided by INHAHL
  # (presumably the household size -- confirm against the codebook)
  mutate(Income_per_capita = INHBESTINKH / INHAHL) %>%
  # Order values according to their income within every FUA; ntile() breaks
  # ties by row position, so this ordering is part of the result
  arrange(FUA, Income_per_capita) %>%
  # Income percentile rank (1-100) of every household within its FUA
  group_by(FUA) %>%
  mutate(income_per_capita_percentile = ntile(Income_per_capita, 100)) %>%
  # Drop the grouping so later verbs do not silently carry FUA along
  ungroup() %>%
  # Clean rows without a grid cell code
  filter(VRLVIERKANT100M != "----------") %>%
  # Append "00" to the E and N numbers of the cell code, i.e. multiply both
  # coordinates by 100 so that they are expressed in meters
  # (the code presumably stores them in units of 100 m -- confirm)
  mutate(
    VRLVIERKANT100M = gsub(
      "(E)(\\d+)(N)(\\d+)", "\\1\\200\\3\\400", VRLVIERKANT100M
    )
  )

# Loop over each city (sigma = 500m)
# For every FUA: count households per grid cell and income percentile, then
# for each threshold 1..99 compute the spatial information theory index (SITI)
# between the households at or below the threshold and those above it, and
# write the 99 values to one CSV file per city.
for (city in cities) {
  # Set the variable city for the given city
  City <- city

  # Households of the current FUA; only the grid cell and the percentile are
  # needed. ungroup() makes this step independent of any grouping that is
  # still attached to the yearly table.
  df <- income2015 %>%
    ungroup() %>%
    filter(FUA == City) %>%
    select(VRLVIERKANT100M, income_per_capita_percentile)

  # A FUA without households would otherwise abort the whole run
  if (nrow(df) == 0) {
    warning(
      "No households found for FUA '", City, "' in 2015; city skipped",
      call. = FALSE
    )
    next
  }

  # Pivot table: one row per grid cell, one column per income percentile,
  # values are household counts
  pivot <- df %>%
    count(
      VRLVIERKANT100M, income_per_capita_percentile,
      name = "Population"
    ) %>%
    tidyr::pivot_wider(
      names_from = income_per_capita_percentile,
      values_from = Population,
      values_fill = 0
    )

  # Parse the coordinates out of the grid cell codes
  cell_codes <- pivot$VRLVIERKANT100M
  grid_cells <- data.frame(
    x = as.numeric(sub("E", "", str_extract(cell_codes, "E\\d+"))),
    y = as.numeric(sub("N", "", str_extract(cell_codes, "N\\d+")))
  )

  # Create a spatial object for grid cells and the SpatialPoints object that
  # spseg() receives
  Spatial_grid_cells <- st_as_sf(grid_cells, coords = c("x", "y"), crs = 28992)
  coords <- st_coordinates(Spatial_grid_cells)
  Spatial_grid_cells_sp <- SpatialPoints(
    coords,
    proj4string = CRS("+init=epsg:28992")
  )

  # Keep only the count columns
  pivot$VRLVIERKANT100M <- NULL

  # Guarantee one column per percentile 1..100, in numeric order. A percentile
  # with no household left in this city gets an all-zero column, so column
  # position always equals percentile and thresholds 1..99 always exist.
  percentile_cols <- as.character(1:100)
  for (missing_col in setdiff(percentile_cols, colnames(pivot))) {
    pivot[[missing_col]] <- 0L
  }
  other_cols <- setdiff(colnames(pivot), percentile_cols)
  pivot <- pivot[, c(percentile_cols, other_cols)]

  # NOTE(review): households with an NA percentile (an "NA" column, if any)
  # are still counted in the "above" group, as before -- confirm this is
  # intended.
  count_matrix <- as.matrix(pivot)
  cell_totals <- rowSums(count_matrix)

  # One two-group table per threshold: households in percentiles 1..i versus
  # all remaining households, per grid cell
  indices <- seq(1, 99)
  list_of_databases <- lapply(indices, function(i) {
    sum_up_to <- rowSums(count_matrix[, seq_len(i), drop = FALSE])
    data.frame(
      sumUpToCurrentPerc = sum_up_to,
      sumAboveCurrentPerc = cell_totals - sum_up_to
    )
  })

  # Calculate SITI for every threshold
  SITI_results <- lapply(list_of_databases, function(database) {
    spseg(
      x = Spatial_grid_cells_sp, data = database, method = "information",
      smoothing = "kernel", sigma = 500, useC = FALSE
    )
  })
  names(SITI_results) <- paste0("SITI_result", indices)

  # Extract the index (slot h) of every result together with its threshold
  combined_data_list <- Map(
    function(result, i) data.frame(Value = result@h, Group = as.character(i)),
    SITI_results, indices
  )

  # Make them be in a single database
  combined_data <- do.call(rbind, combined_data_list)

  # Convert the Group column to a factor with custom levels
  combined_data$Group <- factor(
    combined_data$Group,
    levels = as.character(indices)
  )

  # Save the data
  write.csv(combined_data, file = paste0("income_per_capita_segregation_500m_2015_", City, ".csv"), row.names = FALSE)
}

# Delete data for enabling further operations
rm(income2015)



### Calculate Spatial Information Theory Index for income segregation in 2014 ###

# Load the 2014 household data
income2014 <- read.csv("raw_data/database_2014.csv") # only if not already loaded

# Define all possible FUAs
cities <- c(
  "'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar", "Almelo",
  "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem",
  "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven",
  "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden",
  "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal",
  "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal",
  "Venlo", "Zwolle"
)

income2014 <- income2014 %>%
  # Delete missing values of income (sentinel 9999999999). filter() also drops
  # NA incomes, whereas base logical subsetting would turn them into all-NA rows
  filter(INHBESTINKH != 9999999999) %>%
  # Income per capita: household income divided by INHAHL
  # (presumably the household size -- confirm against the codebook)
  mutate(Income_per_capita = INHBESTINKH / INHAHL) %>%
  # Order values according to their income within every FUA; ntile() breaks
  # ties by row position, so this ordering is part of the result
  arrange(FUA, Income_per_capita) %>%
  # Income percentile rank (1-100) of every household within its FUA
  group_by(FUA) %>%
  mutate(income_per_capita_percentile = ntile(Income_per_capita, 100)) %>%
  # Drop the grouping so later verbs do not silently carry FUA along
  ungroup() %>%
  # Clean rows without a grid cell code
  filter(VRLVIERKANT100M != "----------") %>%
  # Append "00" to the E and N numbers of the cell code, i.e. multiply both
  # coordinates by 100 so that they are expressed in meters
  # (the code presumably stores them in units of 100 m -- confirm)
  mutate(
    VRLVIERKANT100M = gsub(
      "(E)(\\d+)(N)(\\d+)", "\\1\\200\\3\\400", VRLVIERKANT100M
    )
  )

# Loop over each city (sigma = 500m)
# For every FUA: count households per grid cell and income percentile, then
# for each threshold 1..99 compute the spatial information theory index (SITI)
# between the households at or below the threshold and those above it, and
# write the 99 values to one CSV file per city.
for (city in cities) {
  # Set the variable city for the given city
  City <- city

  # Households of the current FUA; only the grid cell and the percentile are
  # needed. ungroup() makes this step independent of any grouping that is
  # still attached to the yearly table.
  df <- income2014 %>%
    ungroup() %>%
    filter(FUA == City) %>%
    select(VRLVIERKANT100M, income_per_capita_percentile)

  # A FUA without households would otherwise abort the whole run
  if (nrow(df) == 0) {
    warning(
      "No households found for FUA '", City, "' in 2014; city skipped",
      call. = FALSE
    )
    next
  }

  # Pivot table: one row per grid cell, one column per income percentile,
  # values are household counts
  pivot <- df %>%
    count(
      VRLVIERKANT100M, income_per_capita_percentile,
      name = "Population"
    ) %>%
    tidyr::pivot_wider(
      names_from = income_per_capita_percentile,
      values_from = Population,
      values_fill = 0
    )

  # Parse the coordinates out of the grid cell codes
  cell_codes <- pivot$VRLVIERKANT100M
  grid_cells <- data.frame(
    x = as.numeric(sub("E", "", str_extract(cell_codes, "E\\d+"))),
    y = as.numeric(sub("N", "", str_extract(cell_codes, "N\\d+")))
  )

  # Create a spatial object for grid cells and the SpatialPoints object that
  # spseg() receives
  Spatial_grid_cells <- st_as_sf(grid_cells, coords = c("x", "y"), crs = 28992)
  coords <- st_coordinates(Spatial_grid_cells)
  Spatial_grid_cells_sp <- SpatialPoints(
    coords,
    proj4string = CRS("+init=epsg:28992")
  )

  # Keep only the count columns
  pivot$VRLVIERKANT100M <- NULL

  # Guarantee one column per percentile 1..100, in numeric order. A percentile
  # with no household left in this city gets an all-zero column, so column
  # position always equals percentile and thresholds 1..99 always exist.
  percentile_cols <- as.character(1:100)
  for (missing_col in setdiff(percentile_cols, colnames(pivot))) {
    pivot[[missing_col]] <- 0L
  }
  other_cols <- setdiff(colnames(pivot), percentile_cols)
  pivot <- pivot[, c(percentile_cols, other_cols)]

  # NOTE(review): households with an NA percentile (an "NA" column, if any)
  # are still counted in the "above" group, as before -- confirm this is
  # intended.
  count_matrix <- as.matrix(pivot)
  cell_totals <- rowSums(count_matrix)

  # One two-group table per threshold: households in percentiles 1..i versus
  # all remaining households, per grid cell
  indices <- seq(1, 99)
  list_of_databases <- lapply(indices, function(i) {
    sum_up_to <- rowSums(count_matrix[, seq_len(i), drop = FALSE])
    data.frame(
      sumUpToCurrentPerc = sum_up_to,
      sumAboveCurrentPerc = cell_totals - sum_up_to
    )
  })

  # Calculate SITI for every threshold
  SITI_results <- lapply(list_of_databases, function(database) {
    spseg(
      x = Spatial_grid_cells_sp, data = database, method = "information",
      smoothing = "kernel", sigma = 500, useC = FALSE
    )
  })
  names(SITI_results) <- paste0("SITI_result", indices)

  # Extract the index (slot h) of every result together with its threshold
  combined_data_list <- Map(
    function(result, i) data.frame(Value = result@h, Group = as.character(i)),
    SITI_results, indices
  )

  # Make them be in a single database
  combined_data <- do.call(rbind, combined_data_list)

  # Convert the Group column to a factor with custom levels
  combined_data$Group <- factor(
    combined_data$Group,
    levels = as.character(indices)
  )

  # Save the data
  write.csv(combined_data, file = paste0("income_per_capita_segregation_500m_2014_", City, ".csv"), row.names = FALSE)
}

# Delete data for enabling further operations
rm(income2014)




### Calculate Spatial Information Theory Index for income segregation in 2013 ###

# Load the 2013 household data
income2013 <- read.csv("raw_data/database_2013.csv") # only if not already loaded

# Define all possible FUAs
cities <- c(
  "'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar", "Almelo",
  "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem",
  "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven",
  "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden",
  "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal",
  "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal",
  "Venlo", "Zwolle"
)

income2013 <- income2013 %>%
  # Delete missing values of income (sentinel 9999999999). filter() also drops
  # NA incomes, whereas base logical subsetting would turn them into all-NA rows
  filter(INHBESTINKH != 9999999999) %>%
  # Income per capita: household income divided by INHAHL
  # (presumably the household size -- confirm against the codebook)
  mutate(Income_per_capita = INHBESTINKH / INHAHL) %>%
  # Order values according to their income within every FUA; ntile() breaks
  # ties by row position, so this ordering is part of the result
  arrange(FUA, Income_per_capita) %>%
  # Income percentile rank (1-100) of every household within its FUA
  group_by(FUA) %>%
  mutate(income_per_capita_percentile = ntile(Income_per_capita, 100)) %>%
  # Drop the grouping so later verbs do not silently carry FUA along
  ungroup() %>%
  # Clean rows without a grid cell code
  filter(VRLVIERKANT100M != "----------") %>%
  # Append "00" to the E and N numbers of the cell code, i.e. multiply both
  # coordinates by 100 so that they are expressed in meters
  # (the code presumably stores them in units of 100 m -- confirm)
  mutate(
    VRLVIERKANT100M = gsub(
      "(E)(\\d+)(N)(\\d+)", "\\1\\200\\3\\400", VRLVIERKANT100M
    )
  )

# Loop over each city (sigma = 500m)
# For every FUA: count households per grid cell and income percentile, then
# for each threshold 1..99 compute the spatial information theory index (SITI)
# between the households at or below the threshold and those above it, and
# write the 99 values to one CSV file per city.
for (city in cities) {
  # Set the variable city for the given city
  City <- city

  # Households of the current FUA; only the grid cell and the percentile are
  # needed. ungroup() makes this step independent of any grouping that is
  # still attached to the yearly table.
  df <- income2013 %>%
    ungroup() %>%
    filter(FUA == City) %>%
    select(VRLVIERKANT100M, income_per_capita_percentile)

  # A FUA without households would otherwise abort the whole run
  if (nrow(df) == 0) {
    warning(
      "No households found for FUA '", City, "' in 2013; city skipped",
      call. = FALSE
    )
    next
  }

  # Pivot table: one row per grid cell, one column per income percentile,
  # values are household counts
  pivot <- df %>%
    count(
      VRLVIERKANT100M, income_per_capita_percentile,
      name = "Population"
    ) %>%
    tidyr::pivot_wider(
      names_from = income_per_capita_percentile,
      values_from = Population,
      values_fill = 0
    )

  # Parse the coordinates out of the grid cell codes
  cell_codes <- pivot$VRLVIERKANT100M
  grid_cells <- data.frame(
    x = as.numeric(sub("E", "", str_extract(cell_codes, "E\\d+"))),
    y = as.numeric(sub("N", "", str_extract(cell_codes, "N\\d+")))
  )

  # Create a spatial object for grid cells and the SpatialPoints object that
  # spseg() receives
  Spatial_grid_cells <- st_as_sf(grid_cells, coords = c("x", "y"), crs = 28992)
  coords <- st_coordinates(Spatial_grid_cells)
  Spatial_grid_cells_sp <- SpatialPoints(
    coords,
    proj4string = CRS("+init=epsg:28992")
  )

  # Keep only the count columns
  pivot$VRLVIERKANT100M <- NULL

  # Guarantee one column per percentile 1..100, in numeric order. A percentile
  # with no household left in this city gets an all-zero column, so column
  # position always equals percentile and thresholds 1..99 always exist.
  percentile_cols <- as.character(1:100)
  for (missing_col in setdiff(percentile_cols, colnames(pivot))) {
    pivot[[missing_col]] <- 0L
  }
  other_cols <- setdiff(colnames(pivot), percentile_cols)
  pivot <- pivot[, c(percentile_cols, other_cols)]

  # NOTE(review): households with an NA percentile (an "NA" column, if any)
  # are still counted in the "above" group, as before -- confirm this is
  # intended.
  count_matrix <- as.matrix(pivot)
  cell_totals <- rowSums(count_matrix)

  # One two-group table per threshold: households in percentiles 1..i versus
  # all remaining households, per grid cell
  indices <- seq(1, 99)
  list_of_databases <- lapply(indices, function(i) {
    sum_up_to <- rowSums(count_matrix[, seq_len(i), drop = FALSE])
    data.frame(
      sumUpToCurrentPerc = sum_up_to,
      sumAboveCurrentPerc = cell_totals - sum_up_to
    )
  })

  # Calculate SITI for every threshold
  SITI_results <- lapply(list_of_databases, function(database) {
    spseg(
      x = Spatial_grid_cells_sp, data = database, method = "information",
      smoothing = "kernel", sigma = 500, useC = FALSE
    )
  })
  names(SITI_results) <- paste0("SITI_result", indices)

  # Extract the index (slot h) of every result together with its threshold
  combined_data_list <- Map(
    function(result, i) data.frame(Value = result@h, Group = as.character(i)),
    SITI_results, indices
  )

  # Make them be in a single database
  combined_data <- do.call(rbind, combined_data_list)

  # Convert the Group column to a factor with custom levels
  combined_data$Group <- factor(
    combined_data$Group,
    levels = as.character(indices)
  )

  # Save the data
  write.csv(combined_data, file = paste0("income_per_capita_segregation_500m_2013_", City, ".csv"), row.names = FALSE)
}

# Delete data for enabling further operations
rm(income2013)








### Calculate Spatial Information Theory Index for income segregation in 2012 ###

# Load the 2012 household data
income2012 <- read.csv("raw_data/database_2012.csv") # only if not already loaded

# Define all possible FUAs
cities <- c(
  "'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar", "Almelo",
  "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem",
  "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven",
  "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden",
  "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal",
  "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal",
  "Venlo", "Zwolle"
)

income2012 <- income2012 %>%
  # Delete missing values of income (sentinel 9999999999). filter() also drops
  # NA incomes, whereas base logical subsetting would turn them into all-NA rows
  filter(INHBESTINKH != 9999999999) %>%
  # Income per capita: household income divided by INHAHL
  # (presumably the household size -- confirm against the codebook)
  mutate(Income_per_capita = INHBESTINKH / INHAHL) %>%
  # Order values according to their income within every FUA; ntile() breaks
  # ties by row position, so this ordering is part of the result
  arrange(FUA, Income_per_capita) %>%
  # Income percentile rank (1-100) of every household within its FUA
  group_by(FUA) %>%
  mutate(income_per_capita_percentile = ntile(Income_per_capita, 100)) %>%
  # Drop the grouping so later verbs do not silently carry FUA along
  ungroup() %>%
  # Clean rows without a grid cell code
  filter(VRLVIERKANT100M != "----------") %>%
  # Append "00" to the E and N numbers of the cell code, i.e. multiply both
  # coordinates by 100 so that they are expressed in meters
  # (the code presumably stores them in units of 100 m -- confirm)
  mutate(
    VRLVIERKANT100M = gsub(
      "(E)(\\d+)(N)(\\d+)", "\\1\\200\\3\\400", VRLVIERKANT100M
    )
  )

# Loop over each city (sigma = 500m)
# For every FUA: count households per grid cell and income percentile, then
# for each threshold 1..99 compute the spatial information theory index (SITI)
# between the households at or below the threshold and those above it, and
# write the 99 values to one CSV file per city.
for (city in cities) {
  # Set the variable city for the given city
  City <- city

  # Households of the current FUA; only the grid cell and the percentile are
  # needed. ungroup() makes this step independent of any grouping that is
  # still attached to the yearly table.
  df <- income2012 %>%
    ungroup() %>%
    filter(FUA == City) %>%
    select(VRLVIERKANT100M, income_per_capita_percentile)

  # A FUA without households would otherwise abort the whole run
  if (nrow(df) == 0) {
    warning(
      "No households found for FUA '", City, "' in 2012; city skipped",
      call. = FALSE
    )
    next
  }

  # Pivot table: one row per grid cell, one column per income percentile,
  # values are household counts
  pivot <- df %>%
    count(
      VRLVIERKANT100M, income_per_capita_percentile,
      name = "Population"
    ) %>%
    tidyr::pivot_wider(
      names_from = income_per_capita_percentile,
      values_from = Population,
      values_fill = 0
    )

  # Parse the coordinates out of the grid cell codes
  cell_codes <- pivot$VRLVIERKANT100M
  grid_cells <- data.frame(
    x = as.numeric(sub("E", "", str_extract(cell_codes, "E\\d+"))),
    y = as.numeric(sub("N", "", str_extract(cell_codes, "N\\d+")))
  )

  # Create a spatial object for grid cells and the SpatialPoints object that
  # spseg() receives
  Spatial_grid_cells <- st_as_sf(grid_cells, coords = c("x", "y"), crs = 28992)
  coords <- st_coordinates(Spatial_grid_cells)
  Spatial_grid_cells_sp <- SpatialPoints(
    coords,
    proj4string = CRS("+init=epsg:28992")
  )

  # Keep only the count columns
  pivot$VRLVIERKANT100M <- NULL

  # Guarantee one column per percentile 1..100, in numeric order. A percentile
  # with no household left in this city gets an all-zero column, so column
  # position always equals percentile and thresholds 1..99 always exist.
  percentile_cols <- as.character(1:100)
  for (missing_col in setdiff(percentile_cols, colnames(pivot))) {
    pivot[[missing_col]] <- 0L
  }
  other_cols <- setdiff(colnames(pivot), percentile_cols)
  pivot <- pivot[, c(percentile_cols, other_cols)]

  # NOTE(review): households with an NA percentile (an "NA" column, if any)
  # are still counted in the "above" group, as before -- confirm this is
  # intended.
  count_matrix <- as.matrix(pivot)
  cell_totals <- rowSums(count_matrix)

  # One two-group table per threshold: households in percentiles 1..i versus
  # all remaining households, per grid cell
  indices <- seq(1, 99)
  list_of_databases <- lapply(indices, function(i) {
    sum_up_to <- rowSums(count_matrix[, seq_len(i), drop = FALSE])
    data.frame(
      sumUpToCurrentPerc = sum_up_to,
      sumAboveCurrentPerc = cell_totals - sum_up_to
    )
  })

  # Calculate SITI for every threshold
  SITI_results <- lapply(list_of_databases, function(database) {
    spseg(
      x = Spatial_grid_cells_sp, data = database, method = "information",
      smoothing = "kernel", sigma = 500, useC = FALSE
    )
  })
  names(SITI_results) <- paste0("SITI_result", indices)

  # Extract the index (slot h) of every result together with its threshold
  combined_data_list <- Map(
    function(result, i) data.frame(Value = result@h, Group = as.character(i)),
    SITI_results, indices
  )

  # Make them be in a single database
  combined_data <- do.call(rbind, combined_data_list)

  # Convert the Group column to a factor with custom levels
  combined_data$Group <- factor(
    combined_data$Group,
    levels = as.character(indices)
  )

  # Save the data
  write.csv(combined_data, file = paste0("income_per_capita_segregation_500m_2012_", City, ".csv"), row.names = FALSE)
}

# Delete data for enabling further operations
rm(income2012)


### Calculate Spatial Information Theory Index for income segregation in 2011 ###

# Load the 2011 household data
income2011 <- read.csv("raw_data/database_2011.csv") # only if not already loaded

# Define all possible FUAs
cities <- c(
  "'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar", "Almelo",
  "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem",
  "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven",
  "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden",
  "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal",
  "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal",
  "Venlo", "Zwolle"
)

income2011 <- income2011 %>%
  # Delete missing values of income (sentinel 9999999999). filter() also drops
  # NA incomes, whereas base logical subsetting would turn them into all-NA rows
  filter(INHBESTINKH != 9999999999) %>%
  # Income per capita: household income divided by INHAHL
  # (presumably the household size -- confirm against the codebook)
  mutate(Income_per_capita = INHBESTINKH / INHAHL) %>%
  # Order values according to their income within every FUA; ntile() breaks
  # ties by row position, so this ordering is part of the result
  arrange(FUA, Income_per_capita) %>%
  # Income percentile rank (1-100) of every household within its FUA
  group_by(FUA) %>%
  mutate(income_per_capita_percentile = ntile(Income_per_capita, 100)) %>%
  # Drop the grouping so later verbs do not silently carry FUA along
  ungroup() %>%
  # Clean rows without a grid cell code
  filter(VRLVIERKANT100M != "----------") %>%
  # Append "00" to the E and N numbers of the cell code, i.e. multiply both
  # coordinates by 100 so that they are expressed in meters
  # (the code presumably stores them in units of 100 m -- confirm)
  mutate(
    VRLVIERKANT100M = gsub(
      "(E)(\\d+)(N)(\\d+)", "\\1\\200\\3\\400", VRLVIERKANT100M
    )
  )

# Loop over each city (sigma = 500m)
for (city in cities) {
  # Set the variable city for the given city
  City <- city
  
  # Re-start the original database
  df <- income2011
  
  # Create the data frame from the income database for the FUA we are interested in
  df <- df %>%
    filter(FUA == City)
  
  # Keep only relevant columns
  df <- df %>%
    select(VRLVIERKANT100M, income_per_capita_percentile)
  df$FUA <- NULL
  
  # create a pivot table
  pivot <- df %>%
    group_by(VRLVIERKANT100M, income_per_capita_percentile) %>%
    summarise(Population = n()) %>%
    tidyr::pivot_wider(names_from = income_per_capita_percentile, values_from = Population, values_fill = 0)
  
  # Create an element for only the coordinates of the grid cells
  grid_cells <- pivot %>%
    select(VRLVIERKANT100M)
  
  # Parse the coordinates
  grid_cells <- grid_cells %>%
    mutate(
      x = as.numeric(sub("E", "", str_extract(VRLVIERKANT100M, "E\\d+"))),
      y = as.numeric(sub("N", "", str_extract(VRLVIERKANT100M, "N\\d+"))),
    )
  
  # Delete superflous columns for coordinates
  grid_cells$VRLVIERKANT100M <- NULL
  
  # Create a spatial object for grid cells
  Spatial_grid_cells <- st_as_sf(grid_cells, coords = c("x", "y"), crs = 28992)
  coords <- st_coordinates(Spatial_grid_cells)
  
  # Create SpatialPoints object
  Spatial_grid_cells_sp <- SpatialPoints(coords, proj4string = CRS("+init=epsg:28992"))
  
  # Delete superflous columns for the income percentiles data
  pivot$VRLVIERKANT100M <- NULL
  
  # Order columns in the right order
  col_names <- as.numeric(colnames(pivot))
  
  # Re-order the dataframe
  pivot <- pivot[, order(col_names)]
  
  # Initialize a list to store each database for the income percentile pairwise calculation
  list_of_databases <- list()
  
  # Loop through each combination
  for(i in 1:(ncol(pivot)-1)) {
    # Calculate the cumulative sum of people up to the current percentile
    sumUpToCurrentPerc <- rowSums(pivot[, 1:i], na.rm = TRUE)
    
    # Calculate the sum of people above the current percentile
    sumAboveCurrentPerc <- rowSums(pivot[, (i+1):ncol(pivot)], na.rm = TRUE)
    
    # Create a new dataframe for the current percentile
    newData <- data.frame(sumUpToCurrentPerc = sumUpToCurrentPerc, sumAboveCurrentPerc = sumAboveCurrentPerc)
    
    # Store it in the list
    list_of_databases[[i]] <- newData
  }
  
  # Extract specific databases from it
  indices <- seq(1,99)
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    assign(database_name,list_of_databases[[i]])
  }
  
  # Initialize a list for storing SITI values
  SITI_results <- list()
  
  # Loop through the indices and calculate SITI results
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    SITI_results_name <- paste0("SITI_result", i)
    
    # Calculate SITI
    assign(SITI_results_name, spseg(x = Spatial_grid_cells_sp, data = get(database_name), method = "information", smoothing = "kernel", sigma = 500, useC = FALSE))
    
    # Store the result in the list
    SITI_results[[SITI_results_name]] <- get(SITI_results_name)
    
  }
  
  # Transform the results into something readable
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    
    #Transform the SITI result
    assign(SITI_result_name, get(SITI_result_name)@h)
  }
  
  
  # Prepare them for being saved and plotted
  combined_data_list <- list()
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    df <- data.frame(Value = get(SITI_result_name), Group = as.character(i))
    combined_data_list[[SITI_result_name]] <- df
  }
  
  # Make them be in a single database
  combined_data <- do.call(rbind, combined_data_list)
  
  # Convert the Group column to a factor with custom levels
  combined_data$Group <- factor(combined_data$Group, levels = as.character(indices))
  
  # Save the data
  write.csv(combined_data, file = paste0("income_per_capita_segregation_500m_2011_", City, ".csv"), row.names = FALSE)
}


# Delete data for enabling further operations
rm(income2011)



### Calculate Spatial Information Theory Index for income segregation in 2010 ###

# Load the data
income2010 <- read.csv("raw_data/database_2010.csv") # only if not already loaded

# Prepare the 2010 household table for the per-city segregation loop.
# Steps: drop households without a usable income, compute income per capita,
# rank households into 100 income groups within their FUA, drop rows without
# a grid cell code, and rescale the grid cell coordinates.

# Delete missing values of income
# (999999999 marks a missing income; filter() also drops rows whose income
# is NA instead of turning them into all-NA rows as `[` would)
income2010 <- income2010 %>%
  filter(BVRBESTINKH != 999999999)

# Define all possible FUAs
cities <- c(
  "'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar", "Almelo",
  "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem",
  "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven",
  "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden",
  "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal",
  "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal",
  "Venlo", "Zwolle"
)

income2010 <- income2010 %>%
  # Income per capita in each household (household income / household size)
  mutate(Income_per_capita = BVRBESTINKH / BVRAHL) %>%
  # Order values according to their income within every FUA
  arrange(FUA, Income_per_capita) %>%
  # Income percentile group (1-100) of every household within its FUA.
  # Note: the ranking is done before rows without a grid cell are removed.
  group_by(FUA) %>%
  mutate(income_per_capita_percentile = ntile(Income_per_capita, 100)) %>%
  # Drop the grouping so later verbs do not silently carry FUA along
  ungroup() %>%
  # Clean rows of missing data ("----------" is the placeholder grid code)
  filter(VRLVIERKANT100M != "----------") %>%
  # Append "00" to the easting and the northing of the grid code, i.e.
  # multiply both by 100 so the coordinates are expressed in meters
  mutate(VRLVIERKANT100M = gsub("(E)(\\d+)(N)(\\d+)","\\1\\200\\3\\400", VRLVIERKANT100M))

# Loop over each city (sigma = 500m)
# For every FUA in `cities`:
#   1. count households per grid cell and income percentile group,
#   2. for each percentile threshold 1..99 split the population of every grid
#      cell into "at or below the threshold" and "above the threshold",
#   3. run spseg() (method = "information", kernel smoothing, sigma = 500) on
#      that two-group table and keep slot `h` of the result,
#   4. write the 99 values to one CSV per city.
percentile_levels <- 1:100
indices <- seq(1, 99)

for (city in cities) {
  # Set the variable city for the given city
  City <- city

  # Households of the FUA we are interested in. ungroup() makes sure select()
  # returns exactly the two requested columns even if the table is grouped.
  df <- income2010 %>%
    ungroup() %>%
    filter(FUA == City) %>%
    select(VRLVIERKANT100M, income_per_capita_percentile)

  # A city without any household would break every step below; skip it
  # loudly instead of aborting the whole run
  if (nrow(df) == 0) {
    warning("No households found for FUA '", City, "' in 2010; skipping.", call. = FALSE)
    next
  }

  # Pivot table: one row per grid cell, one column per percentile group
  pivot <- df %>%
    count(VRLVIERKANT100M, income_per_capita_percentile, name = "Population") %>%
    tidyr::pivot_wider(names_from = income_per_capita_percentile, values_from = Population, values_fill = 0)

  # Parse the coordinates out of the grid cell code (digits after E and N)
  grid_cells <- pivot %>%
    transmute(
      x = as.numeric(sub("E", "", str_extract(VRLVIERKANT100M, "E\\d+"))),
      y = as.numeric(sub("N", "", str_extract(VRLVIERKANT100M, "N\\d+")))
    )

  # Create the SpatialPoints object expected by spseg()
  Spatial_grid_cells <- st_as_sf(grid_cells, coords = c("x", "y"), crs = 28992)
  coords <- st_coordinates(Spatial_grid_cells)
  Spatial_grid_cells_sp <- SpatialPoints(coords, proj4string = CRS("+init=epsg:28992"))

  # Keep only the household counts
  pivot$VRLVIERKANT100M <- NULL

  # Thresholds are addressed by column position, so every percentile group
  # needs a column: add all-zero columns for groups absent from this city
  for (absent in setdiff(as.character(percentile_levels), colnames(pivot))) {
    pivot[[absent]] <- 0L
  }

  # Order columns by percentile
  pivot <- pivot[, order(as.numeric(colnames(pivot))), drop = FALSE]
  n_groups <- ncol(pivot)

  # One SITI value per percentile threshold
  combined_data_list <- lapply(indices, function(p) {
    # Households at or below / above the current percentile in each grid cell
    newData <- data.frame(
      sumUpToCurrentPerc = rowSums(pivot[, seq_len(p), drop = FALSE], na.rm = TRUE),
      sumAboveCurrentPerc = rowSums(pivot[, (p + 1):n_groups, drop = FALSE], na.rm = TRUE)
    )

    # Calculate SITI; only slot `h` is kept so the full result object can be
    # released right away
    fit <- spseg(x = Spatial_grid_cells_sp, data = newData, method = "information", smoothing = "kernel", sigma = 500, useC = FALSE)
    data.frame(Value = fit@h, Group = as.character(p))
  })

  # Make them be in a single database
  combined_data <- do.call(rbind, combined_data_list)

  # Convert the Group column to a factor with custom levels
  combined_data$Group <- factor(combined_data$Group, levels = as.character(indices))

  # Save the data
  write.csv(combined_data, file = paste0("income_per_capita_segregation_500m_2010_", City, ".csv"), row.names = FALSE)
}


# Delete data for enabling further operations
rm(income2010)




### Calculate Spatial Information Theory Index for income segregation in 2009 ###

# Load the data
income2009 <- read.csv("raw_data/database_2009.csv") # only if not already loaded

# Prepare the 2009 household table for the per-city segregation loop.
# Steps: drop households without a usable income, compute income per capita,
# rank households into 100 income groups within their FUA, drop rows without
# a grid cell code, and rescale the grid cell coordinates.

# Delete missing values of income
# (999999999 marks a missing income; filter() also drops rows whose income
# is NA instead of turning them into all-NA rows as `[` would)
income2009 <- income2009 %>%
  filter(BVRBESTINKH != 999999999)

# Define all possible FUAs
cities <- c(
  "'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar", "Almelo",
  "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem",
  "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven",
  "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden",
  "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal",
  "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal",
  "Venlo", "Zwolle"
)

income2009 <- income2009 %>%
  # Income per capita in each household (household income / household size)
  mutate(Income_per_capita = BVRBESTINKH / BVRAHL) %>%
  # Order values according to their income within every FUA
  arrange(FUA, Income_per_capita) %>%
  # Income percentile group (1-100) of every household within its FUA.
  # Note: the ranking is done before rows without a grid cell are removed.
  group_by(FUA) %>%
  mutate(income_per_capita_percentile = ntile(Income_per_capita, 100)) %>%
  # Drop the grouping so later verbs do not silently carry FUA along
  ungroup() %>%
  # Clean rows of missing data ("----------" is the placeholder grid code)
  filter(VRLVIERKANT100M != "----------") %>%
  # Append "00" to the easting and the northing of the grid code, i.e.
  # multiply both by 100 so the coordinates are expressed in meters
  mutate(VRLVIERKANT100M = gsub("(E)(\\d+)(N)(\\d+)","\\1\\200\\3\\400", VRLVIERKANT100M))

# Loop over each city (sigma = 500m)
# For every FUA in `cities`:
#   1. count households per grid cell and income percentile group,
#   2. for each percentile threshold 1..99 split the population of every grid
#      cell into "at or below the threshold" and "above the threshold",
#   3. run spseg() (method = "information", kernel smoothing, sigma = 500) on
#      that two-group table and keep slot `h` of the result,
#   4. write the 99 values to one CSV per city.
percentile_levels <- 1:100
indices <- seq(1, 99)

for (city in cities) {
  # Set the variable city for the given city
  City <- city

  # Households of the FUA we are interested in. ungroup() makes sure select()
  # returns exactly the two requested columns even if the table is grouped.
  df <- income2009 %>%
    ungroup() %>%
    filter(FUA == City) %>%
    select(VRLVIERKANT100M, income_per_capita_percentile)

  # A city without any household would break every step below; skip it
  # loudly instead of aborting the whole run
  if (nrow(df) == 0) {
    warning("No households found for FUA '", City, "' in 2009; skipping.", call. = FALSE)
    next
  }

  # Pivot table: one row per grid cell, one column per percentile group
  pivot <- df %>%
    count(VRLVIERKANT100M, income_per_capita_percentile, name = "Population") %>%
    tidyr::pivot_wider(names_from = income_per_capita_percentile, values_from = Population, values_fill = 0)

  # Parse the coordinates out of the grid cell code (digits after E and N)
  grid_cells <- pivot %>%
    transmute(
      x = as.numeric(sub("E", "", str_extract(VRLVIERKANT100M, "E\\d+"))),
      y = as.numeric(sub("N", "", str_extract(VRLVIERKANT100M, "N\\d+")))
    )

  # Create the SpatialPoints object expected by spseg()
  Spatial_grid_cells <- st_as_sf(grid_cells, coords = c("x", "y"), crs = 28992)
  coords <- st_coordinates(Spatial_grid_cells)
  Spatial_grid_cells_sp <- SpatialPoints(coords, proj4string = CRS("+init=epsg:28992"))

  # Keep only the household counts
  pivot$VRLVIERKANT100M <- NULL

  # Thresholds are addressed by column position, so every percentile group
  # needs a column: add all-zero columns for groups absent from this city
  for (absent in setdiff(as.character(percentile_levels), colnames(pivot))) {
    pivot[[absent]] <- 0L
  }

  # Order columns by percentile
  pivot <- pivot[, order(as.numeric(colnames(pivot))), drop = FALSE]
  n_groups <- ncol(pivot)

  # One SITI value per percentile threshold
  combined_data_list <- lapply(indices, function(p) {
    # Households at or below / above the current percentile in each grid cell
    newData <- data.frame(
      sumUpToCurrentPerc = rowSums(pivot[, seq_len(p), drop = FALSE], na.rm = TRUE),
      sumAboveCurrentPerc = rowSums(pivot[, (p + 1):n_groups, drop = FALSE], na.rm = TRUE)
    )

    # Calculate SITI; only slot `h` is kept so the full result object can be
    # released right away
    fit <- spseg(x = Spatial_grid_cells_sp, data = newData, method = "information", smoothing = "kernel", sigma = 500, useC = FALSE)
    data.frame(Value = fit@h, Group = as.character(p))
  })

  # Make them be in a single database
  combined_data <- do.call(rbind, combined_data_list)

  # Convert the Group column to a factor with custom levels
  combined_data$Group <- factor(combined_data$Group, levels = as.character(indices))

  # Save the data
  write.csv(combined_data, file = paste0("income_per_capita_segregation_500m_2009_", City, ".csv"), row.names = FALSE)
}

# Delete data for enabling further operations
rm(income2009)




### Calculate Spatial Information Theory Index for income segregation in 2008 ###

# Load the data
income2008 <- read.csv("raw_data/database_2008.csv") # only if not already loaded

# Prepare the 2008 household table for the per-city segregation loop.
# Steps: drop households without a usable income, compute income per capita,
# rank households into 100 income groups within their FUA, drop rows without
# a grid cell code, and rescale the grid cell coordinates.

# Delete missing values of income
# (999999999 marks a missing income; filter() also drops rows whose income
# is NA instead of turning them into all-NA rows as `[` would)
income2008 <- income2008 %>%
  filter(BVRBESTINKH != 999999999)

# Define all possible FUAs
cities <- c(
  "'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar", "Almelo",
  "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem",
  "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven",
  "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden",
  "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal",
  "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal",
  "Venlo", "Zwolle"
)

income2008 <- income2008 %>%
  # Income per capita in each household (household income / household size)
  mutate(Income_per_capita = BVRBESTINKH / BVRAHL) %>%
  # Order values according to their income within every FUA
  arrange(FUA, Income_per_capita) %>%
  # Income percentile group (1-100) of every household within its FUA.
  # Note: the ranking is done before rows without a grid cell are removed.
  group_by(FUA) %>%
  mutate(income_per_capita_percentile = ntile(Income_per_capita, 100)) %>%
  # Drop the grouping so later verbs do not silently carry FUA along
  ungroup() %>%
  # Clean rows of missing data ("----------" is the placeholder grid code)
  filter(VRLVIERKANT100M != "----------") %>%
  # Append "00" to the easting and the northing of the grid code, i.e.
  # multiply both by 100 so the coordinates are expressed in meters
  mutate(VRLVIERKANT100M = gsub("(E)(\\d+)(N)(\\d+)","\\1\\200\\3\\400", VRLVIERKANT100M))

# Loop over each city (sigma = 500m)
# For every FUA in `cities`:
#   1. count households per grid cell and income percentile group,
#   2. for each percentile threshold 1..99 split the population of every grid
#      cell into "at or below the threshold" and "above the threshold",
#   3. run spseg() (method = "information", kernel smoothing, sigma = 500) on
#      that two-group table and keep slot `h` of the result,
#   4. write the 99 values to one CSV per city.
percentile_levels <- 1:100
indices <- seq(1, 99)

for (city in cities) {
  # Set the variable city for the given city
  City <- city

  # Households of the FUA we are interested in. ungroup() makes sure select()
  # returns exactly the two requested columns even if the table is grouped.
  df <- income2008 %>%
    ungroup() %>%
    filter(FUA == City) %>%
    select(VRLVIERKANT100M, income_per_capita_percentile)

  # A city without any household would break every step below; skip it
  # loudly instead of aborting the whole run
  if (nrow(df) == 0) {
    warning("No households found for FUA '", City, "' in 2008; skipping.", call. = FALSE)
    next
  }

  # Pivot table: one row per grid cell, one column per percentile group
  pivot <- df %>%
    count(VRLVIERKANT100M, income_per_capita_percentile, name = "Population") %>%
    tidyr::pivot_wider(names_from = income_per_capita_percentile, values_from = Population, values_fill = 0)

  # Parse the coordinates out of the grid cell code (digits after E and N)
  grid_cells <- pivot %>%
    transmute(
      x = as.numeric(sub("E", "", str_extract(VRLVIERKANT100M, "E\\d+"))),
      y = as.numeric(sub("N", "", str_extract(VRLVIERKANT100M, "N\\d+")))
    )

  # Create the SpatialPoints object expected by spseg()
  Spatial_grid_cells <- st_as_sf(grid_cells, coords = c("x", "y"), crs = 28992)
  coords <- st_coordinates(Spatial_grid_cells)
  Spatial_grid_cells_sp <- SpatialPoints(coords, proj4string = CRS("+init=epsg:28992"))

  # Keep only the household counts
  pivot$VRLVIERKANT100M <- NULL

  # Thresholds are addressed by column position, so every percentile group
  # needs a column: add all-zero columns for groups absent from this city
  for (absent in setdiff(as.character(percentile_levels), colnames(pivot))) {
    pivot[[absent]] <- 0L
  }

  # Order columns by percentile
  pivot <- pivot[, order(as.numeric(colnames(pivot))), drop = FALSE]
  n_groups <- ncol(pivot)

  # One SITI value per percentile threshold
  combined_data_list <- lapply(indices, function(p) {
    # Households at or below / above the current percentile in each grid cell
    newData <- data.frame(
      sumUpToCurrentPerc = rowSums(pivot[, seq_len(p), drop = FALSE], na.rm = TRUE),
      sumAboveCurrentPerc = rowSums(pivot[, (p + 1):n_groups, drop = FALSE], na.rm = TRUE)
    )

    # Calculate SITI; only slot `h` is kept so the full result object can be
    # released right away
    fit <- spseg(x = Spatial_grid_cells_sp, data = newData, method = "information", smoothing = "kernel", sigma = 500, useC = FALSE)
    data.frame(Value = fit@h, Group = as.character(p))
  })

  # Make them be in a single database
  combined_data <- do.call(rbind, combined_data_list)

  # Convert the Group column to a factor with custom levels
  combined_data$Group <- factor(combined_data$Group, levels = as.character(indices))

  # Save the data
  write.csv(combined_data, file = paste0("income_per_capita_segregation_500m_2008_", City, ".csv"), row.names = FALSE)
}

# Delete data for enabling further operations
rm(income2008)




### Calculate Spatial Information Theory Index for income segregation in 2007 ###

# Load the data
income2007 <- read.csv("raw_data/database_2007.csv") # only if not already loaded

# Prepare the 2007 household table for the per-city segregation loop.
# Steps: drop households without a usable income, compute income per capita,
# rank households into 100 income groups within their FUA, drop rows without
# a grid cell code, and rescale the grid cell coordinates.

# Delete missing values of income
# (999999999 marks a missing income; filter() also drops rows whose income
# is NA instead of turning them into all-NA rows as `[` would)
income2007 <- income2007 %>%
  filter(BVRBESTINKH != 999999999)

# Define all possible FUAs
cities <- c(
  "'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar", "Almelo",
  "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem",
  "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven",
  "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden",
  "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal",
  "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal",
  "Venlo", "Zwolle"
)

income2007 <- income2007 %>%
  # Income per capita in each household (household income / household size)
  mutate(Income_per_capita = BVRBESTINKH / BVRAHL) %>%
  # Order values according to their income within every FUA
  arrange(FUA, Income_per_capita) %>%
  # Income percentile group (1-100) of every household within its FUA.
  # Note: the ranking is done before rows without a grid cell are removed.
  group_by(FUA) %>%
  mutate(income_per_capita_percentile = ntile(Income_per_capita, 100)) %>%
  # Drop the grouping so later verbs do not silently carry FUA along
  ungroup() %>%
  # Clean rows of missing data ("----------" is the placeholder grid code)
  filter(VRLVIERKANT100M != "----------") %>%
  # Append "00" to the easting and the northing of the grid code, i.e.
  # multiply both by 100 so the coordinates are expressed in meters
  mutate(VRLVIERKANT100M = gsub("(E)(\\d+)(N)(\\d+)","\\1\\200\\3\\400", VRLVIERKANT100M))

# Loop over each city (sigma = 500m)
# For every FUA in `cities`:
#   1. count households per grid cell and income percentile group,
#   2. for each percentile threshold 1..99 split the population of every grid
#      cell into "at or below the threshold" and "above the threshold",
#   3. run spseg() (method = "information", kernel smoothing, sigma = 500) on
#      that two-group table and keep slot `h` of the result,
#   4. write the 99 values to one CSV per city.
percentile_levels <- 1:100
indices <- seq(1, 99)

for (city in cities) {
  # Set the variable city for the given city
  City <- city

  # Households of the FUA we are interested in. ungroup() makes sure select()
  # returns exactly the two requested columns even if the table is grouped.
  df <- income2007 %>%
    ungroup() %>%
    filter(FUA == City) %>%
    select(VRLVIERKANT100M, income_per_capita_percentile)

  # A city without any household would break every step below; skip it
  # loudly instead of aborting the whole run
  if (nrow(df) == 0) {
    warning("No households found for FUA '", City, "' in 2007; skipping.", call. = FALSE)
    next
  }

  # Pivot table: one row per grid cell, one column per percentile group
  pivot <- df %>%
    count(VRLVIERKANT100M, income_per_capita_percentile, name = "Population") %>%
    tidyr::pivot_wider(names_from = income_per_capita_percentile, values_from = Population, values_fill = 0)

  # Parse the coordinates out of the grid cell code (digits after E and N)
  grid_cells <- pivot %>%
    transmute(
      x = as.numeric(sub("E", "", str_extract(VRLVIERKANT100M, "E\\d+"))),
      y = as.numeric(sub("N", "", str_extract(VRLVIERKANT100M, "N\\d+")))
    )

  # Create the SpatialPoints object expected by spseg()
  Spatial_grid_cells <- st_as_sf(grid_cells, coords = c("x", "y"), crs = 28992)
  coords <- st_coordinates(Spatial_grid_cells)
  Spatial_grid_cells_sp <- SpatialPoints(coords, proj4string = CRS("+init=epsg:28992"))

  # Keep only the household counts
  pivot$VRLVIERKANT100M <- NULL

  # Thresholds are addressed by column position, so every percentile group
  # needs a column: add all-zero columns for groups absent from this city
  for (absent in setdiff(as.character(percentile_levels), colnames(pivot))) {
    pivot[[absent]] <- 0L
  }

  # Order columns by percentile
  pivot <- pivot[, order(as.numeric(colnames(pivot))), drop = FALSE]
  n_groups <- ncol(pivot)

  # One SITI value per percentile threshold
  combined_data_list <- lapply(indices, function(p) {
    # Households at or below / above the current percentile in each grid cell
    newData <- data.frame(
      sumUpToCurrentPerc = rowSums(pivot[, seq_len(p), drop = FALSE], na.rm = TRUE),
      sumAboveCurrentPerc = rowSums(pivot[, (p + 1):n_groups, drop = FALSE], na.rm = TRUE)
    )

    # Calculate SITI; only slot `h` is kept so the full result object can be
    # released right away
    fit <- spseg(x = Spatial_grid_cells_sp, data = newData, method = "information", smoothing = "kernel", sigma = 500, useC = FALSE)
    data.frame(Value = fit@h, Group = as.character(p))
  })

  # Make them be in a single database
  combined_data <- do.call(rbind, combined_data_list)

  # Convert the Group column to a factor with custom levels
  combined_data$Group <- factor(combined_data$Group, levels = as.character(indices))

  # Save the data
  write.csv(combined_data, file = paste0("income_per_capita_segregation_500m_2007_", City, ".csv"), row.names = FALSE)
}

# Delete data for enabling further operations
rm(income2007)



### Calculate Spatial Information Theory Index for income segregation in 2006 ###

# Load the data
income2006 <- read.csv("raw_data/database_2006.csv") # only if not already loaded

# Prepare the 2006 household table for the per-city segregation loop.
# Steps: drop households without a usable income, compute income per capita,
# rank households into 100 income groups within their FUA, drop rows without
# a grid cell code, and rescale the grid cell coordinates.

# Delete missing values of income
# (999999999 marks a missing income; filter() also drops rows whose income
# is NA instead of turning them into all-NA rows as `[` would)
income2006 <- income2006 %>%
  filter(BVRBESTINKH != 999999999)

# Define all possible FUAs
cities <- c(
  "'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar", "Almelo",
  "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem",
  "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven",
  "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden",
  "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal",
  "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal",
  "Venlo", "Zwolle"
)

income2006 <- income2006 %>%
  # Income per capita in each household (household income / household size)
  mutate(Income_per_capita = BVRBESTINKH / BVRAHL) %>%
  # Order values according to their income within every FUA
  arrange(FUA, Income_per_capita) %>%
  # Income percentile group (1-100) of every household within its FUA.
  # Note: the ranking is done before rows without a grid cell are removed.
  group_by(FUA) %>%
  mutate(income_per_capita_percentile = ntile(Income_per_capita, 100)) %>%
  # Drop the grouping so later verbs do not silently carry FUA along
  ungroup() %>%
  # Clean rows of missing data ("----------" is the placeholder grid code)
  filter(VRLVIERKANT100M != "----------") %>%
  # Append "00" to the easting and the northing of the grid code, i.e.
  # multiply both by 100 so the coordinates are expressed in meters
  mutate(VRLVIERKANT100M = gsub("(E)(\\d+)(N)(\\d+)","\\1\\200\\3\\400", VRLVIERKANT100M))

# Loop over each city (sigma = 500m)
# For every FUA in `cities`:
#   1. count households per grid cell and income percentile group,
#   2. for each percentile threshold 1..99 split the population of every grid
#      cell into "at or below the threshold" and "above the threshold",
#   3. run spseg() (method = "information", kernel smoothing, sigma = 500) on
#      that two-group table and keep slot `h` of the result,
#   4. write the 99 values to one CSV per city.
percentile_levels <- 1:100
indices <- seq(1, 99)

for (city in cities) {
  # Set the variable city for the given city
  City <- city

  # Households of the FUA we are interested in. ungroup() makes sure select()
  # returns exactly the two requested columns even if the table is grouped.
  df <- income2006 %>%
    ungroup() %>%
    filter(FUA == City) %>%
    select(VRLVIERKANT100M, income_per_capita_percentile)

  # A city without any household would break every step below; skip it
  # loudly instead of aborting the whole run
  if (nrow(df) == 0) {
    warning("No households found for FUA '", City, "' in 2006; skipping.", call. = FALSE)
    next
  }

  # Pivot table: one row per grid cell, one column per percentile group
  pivot <- df %>%
    count(VRLVIERKANT100M, income_per_capita_percentile, name = "Population") %>%
    tidyr::pivot_wider(names_from = income_per_capita_percentile, values_from = Population, values_fill = 0)

  # Parse the coordinates out of the grid cell code (digits after E and N)
  grid_cells <- pivot %>%
    transmute(
      x = as.numeric(sub("E", "", str_extract(VRLVIERKANT100M, "E\\d+"))),
      y = as.numeric(sub("N", "", str_extract(VRLVIERKANT100M, "N\\d+")))
    )

  # Create the SpatialPoints object expected by spseg()
  Spatial_grid_cells <- st_as_sf(grid_cells, coords = c("x", "y"), crs = 28992)
  coords <- st_coordinates(Spatial_grid_cells)
  Spatial_grid_cells_sp <- SpatialPoints(coords, proj4string = CRS("+init=epsg:28992"))

  # Keep only the household counts
  pivot$VRLVIERKANT100M <- NULL

  # Thresholds are addressed by column position, so every percentile group
  # needs a column: add all-zero columns for groups absent from this city
  for (absent in setdiff(as.character(percentile_levels), colnames(pivot))) {
    pivot[[absent]] <- 0L
  }

  # Order columns by percentile
  pivot <- pivot[, order(as.numeric(colnames(pivot))), drop = FALSE]
  n_groups <- ncol(pivot)

  # One SITI value per percentile threshold
  combined_data_list <- lapply(indices, function(p) {
    # Households at or below / above the current percentile in each grid cell
    newData <- data.frame(
      sumUpToCurrentPerc = rowSums(pivot[, seq_len(p), drop = FALSE], na.rm = TRUE),
      sumAboveCurrentPerc = rowSums(pivot[, (p + 1):n_groups, drop = FALSE], na.rm = TRUE)
    )

    # Calculate SITI; only slot `h` is kept so the full result object can be
    # released right away
    fit <- spseg(x = Spatial_grid_cells_sp, data = newData, method = "information", smoothing = "kernel", sigma = 500, useC = FALSE)
    data.frame(Value = fit@h, Group = as.character(p))
  })

  # Make them be in a single database
  combined_data <- do.call(rbind, combined_data_list)

  # Convert the Group column to a factor with custom levels
  combined_data$Group <- factor(combined_data$Group, levels = as.character(indices))

  # Save the data
  write.csv(combined_data, file = paste0("income_per_capita_segregation_500m_2006_", City, ".csv"), row.names = FALSE)
}

# Delete data for enabling further operations
rm(income2006)




#### Local environment set at 4000m ####

### Calculate Spatial Information Theory Index for income segregation in 2022 ###

# Load the data
income2022 <- read.csv("raw_data/database_2022.csv") # only if not already loaded

# Prepare the 2022 household table for the per-city segregation loop.
# Steps: drop households without a usable income, compute income per capita,
# rank households into 100 income groups within their FUA, drop rows without
# a grid cell code, and rescale the grid cell coordinates.

# Delete missing values of income
# (9999999999 marks a missing income; filter() also drops rows whose income
# is NA instead of turning them into all-NA rows as `[` would)
income2022 <- income2022 %>%
  filter(INHBESTINKH != 9999999999)

# Define all possible FUAs
cities <- c(
  "'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar", "Almelo",
  "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem",
  "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven",
  "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden",
  "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal",
  "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal",
  "Venlo", "Zwolle"
)

income2022 <- income2022 %>%
  # Income per capita in each household (household income / household size)
  mutate(Income_per_capita = INHBESTINKH / INHAHL) %>%
  # Order values according to their income within every FUA
  arrange(FUA, Income_per_capita) %>%
  # Income percentile group (1-100) of every household within its FUA.
  # Note: the ranking is done before rows without a grid cell are removed.
  group_by(FUA) %>%
  mutate(income_per_capita_percentile = ntile(Income_per_capita, 100)) %>%
  # Drop the grouping so later verbs do not silently carry FUA along
  ungroup() %>%
  # Clean rows of missing data ("----------" is the placeholder grid code)
  filter(VRLVIERKANT100M != "----------") %>%
  # Append "00" to the easting and the northing of the grid code, i.e.
  # multiply both by 100 so the coordinates are expressed in meters
  mutate(VRLVIERKANT100M = gsub("(E)(\\d+)(N)(\\d+)","\\1\\200\\3\\400", VRLVIERKANT100M))

# Loop over each city (sigma = 4000m)
for (city in cities) {
  # Set the variable city for the given city
  City <- city
  
  # Re-start the original database
  df <- income2022
  
  # Create the dataframe out of the income database  for the FUA we are interested on
  df <- df %>%
    filter(FUA == City)
  
  # Keep only relevant columns
  df <- df %>%
    select(VRLVIERKANT100M, income_per_capita_percentile)
  df$FUA <- NULL
  
  # create a pivot table
  pivot <- df %>%
    group_by(VRLVIERKANT100M, income_per_capita_percentile) %>%
    summarise(Population = n()) %>%
    tidyr::pivot_wider(names_from = income_per_capita_percentile, values_from = Population, values_fill = 0)
  
  # Create an element for only the coordinates of the grid cells
  grid_cells <- pivot %>%
    select(VRLVIERKANT100M)
  
  # Parse the coordinates
  grid_cells <- grid_cells %>%
    mutate(
      x = as.numeric(sub("E", "", str_extract(VRLVIERKANT100M, "E\\d+"))),
      y = as.numeric(sub("N", "", str_extract(VRLVIERKANT100M, "N\\d+"))),
    )
  
  # Delete superflous columns for coordinates
  grid_cells$VRLVIERKANT100M <- NULL
  
  # Create a spatial object for grid cells
  Spatial_grid_cells <- st_as_sf(grid_cells, coords = c("x", "y"), crs = 28992)
  coords <- st_coordinates(Spatial_grid_cells)
  
  # Create SpatialPoints object
  Spatial_grid_cells_sp <- SpatialPoints(coords, proj4string = CRS("+init=epsg:28992"))
  
  # Delete superflous columns for the income percentiles data
  pivot$VRLVIERKANT100M <- NULL
  
  # Order columns in the right order
  col_names <- as.numeric(colnames(pivot))
  
  # Re-order the dataframe
  pivot <- pivot[, order(col_names)]
  
  # Initialize a list to store each database for the income percentile pairwise calculation
  list_of_databases <- list()
  
  # Loop through each combination
  for(i in 1:(ncol(pivot)-1)) {
    # Calculate the cumulative sum of people up to the current percentile
    sumUpToCurrentPerc <- rowSums(pivot[, 1:i], na.rm = TRUE)
    
    # Calculate the sum of people above the current percentile
    sumAboveCurrentPerc <- rowSums(pivot[, (i+1):ncol(pivot)], na.rm = TRUE)
    
    # Create a new dataframe for the current percentile
    newData <- data.frame(sumUpToCurrentPerc = sumUpToCurrentPerc, sumAboveCurrentPerc = sumAboveCurrentPerc)
    
    # Store it in the list
    list_of_databases[[i]] <- newData
  }
  
  # Extract specific databases from it
  indices <- seq(1,99)
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    assign(database_name,list_of_databases[[i]])
  }
  
  # Initialize a list for storing SITI values
  SITI_results <- list()
  
  # Loop through the indices and calculate SITI results
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    SITI_results_name <- paste0("SITI_result", i)
    
    # Calculate SITI
    assign(SITI_results_name, spseg(x = Spatial_grid_cells_sp, data = get(database_name), method = "information", smoothing = "kernel", sigma = 4000, useC = FALSE))
    
    # Store the result in the list
    SITI_results[[SITI_results_name]] <- get(SITI_results_name)
    
  }
  
  # Transform the results into something readable
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    
    #Transform the SITI result
    assign(SITI_result_name, get(SITI_result_name)@h)
  }
  
  
  # Prepare them for being saved and plotted
  combined_data_list <- list()
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    df <- data.frame(Value = get(SITI_result_name), Group = as.character(i))
    combined_data_list[[SITI_result_name]] <- df
  }
  
  # Make them be in a single database
  combined_data <- do.call(rbind, combined_data_list)
  
  # Convert the Group column to a factor with custom levels
  combined_data$Group <- factor(combined_data$Group, levels = as.character(indices))
  
  # Save the data
  write.csv(combined_data, file = paste0("income_per_capita_segregation_4000m_2022_", City, ".csv"), row.names = FALSE)
}

# Delete data for enabling further operations
rm(income2022)




### Calculate Spatial Information Theory Index for income segregation in 2021 ###

# Load the data
income2021 <- read.csv("raw_data/database_2021.csv") # only if not already loaded

# Delete missing values of income (coded as 9999999999). filter() is used
# instead of logical `[` indexing so that rows whose income is NA are dropped
# rather than turned into all-NA rows.
income2021 <- income2021 %>%
  filter(INHBESTINKH != 9999999999)

# Calculate income per capita in each household (INHBESTINKH / INHAHL)
income2021$Income_per_capita <- income2021$INHBESTINKH / income2021$INHAHL

# Define all possible FUAs
cities <- c("'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar","Almelo", "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem", "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven", "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden", "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal", "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal", "Venlo", "Zwolle")

# Percentile thresholds at which the index is computed (same for every city)
indices <- seq(1, 99)

# Order households by income within every FUA, assign each one to one of 100
# income-per-capita bins inside its FUA, and drop the grouping again so that
# later select() calls do not silently re-add the FUA column.
income2021 <- income2021 %>%
  arrange(FUA, Income_per_capita) %>%
  group_by(FUA) %>%
  mutate(income_per_capita_percentile = ntile(Income_per_capita, 100)) %>%
  ungroup()

# Drop rows without a grid-cell code, then append "00" to the E and N numbers
# of the code (i.e. multiply both by 100) so the coordinates are in meters.
# NOTE(review): a factor of 100 implies the raw codes are in units of 100 m
# rather than kilometers -- confirm.
income2021 <- income2021 %>%
  filter(VRLVIERKANT100M != "----------") %>%
  mutate(VRLVIERKANT100M = gsub("(E)(\\d+)(N)(\\d+)","\\1\\200\\3\\400", VRLVIERKANT100M))

# Loop over each city (sigma = 4000m)
for (city in cities) {
  # Set the variable city for the given city
  City <- city

  # Households of this FUA, reduced to grid-cell code and percentile
  df <- income2021 %>%
    filter(FUA == City) %>%
    select(VRLVIERKANT100M, income_per_capita_percentile)

  # A FUA without any usable household cannot be processed; skip it with a
  # warning instead of aborting the whole batch.
  if (nrow(df) == 0) {
    warning("No households with a grid cell found for FUA ", City, " in 2021; skipping.", call. = FALSE)
    next
  }

  # Pivot table: one row per grid cell, one column per percentile, cells hold
  # the number of households
  pivot <- df %>%
    group_by(VRLVIERKANT100M, income_per_capita_percentile) %>%
    summarise(Population = n(), .groups = "drop") %>%
    tidyr::pivot_wider(names_from = income_per_capita_percentile, values_from = Population, values_fill = 0)

  # Parse the grid-cell code into numeric x (E) and y (N) coordinates
  grid_cells <- pivot %>%
    transmute(
      x = as.numeric(sub("E", "", str_extract(VRLVIERKANT100M, "E\\d+"))),
      y = as.numeric(sub("N", "", str_extract(VRLVIERKANT100M, "N\\d+")))
    )

  # Create a spatial object for grid cells
  Spatial_grid_cells <- st_as_sf(grid_cells, coords = c("x", "y"), crs = 28992)
  coords <- st_coordinates(Spatial_grid_cells)

  # Create SpatialPoints object (the class spseg() is called with)
  Spatial_grid_cells_sp <- SpatialPoints(coords, proj4string = CRS("+init=epsg:28992"))

  # Keep only the percentile count columns
  pivot$VRLVIERKANT100M <- NULL

  # Make sure every percentile 1..100 has a column. A percentile that has no
  # household left in this FUA would otherwise shift the column positions, so
  # that threshold i no longer means "percentiles 1..i", or leave fewer than
  # the 99 thresholds requested below.
  for (missing_percentile in setdiff(as.character(seq_len(100)), colnames(pivot))) {
    pivot[[missing_percentile]] <- 0L
  }

  # Order the columns by percentile number
  pivot <- pivot[, order(as.numeric(colnames(pivot)))]

  # For every threshold i build a two-group table per grid cell:
  # households in columns 1..i versus households in the remaining columns
  n_cols <- ncol(pivot)
  list_of_databases <- lapply(seq_len(n_cols - 1), function(i) {
    data.frame(
      sumUpToCurrentPerc = rowSums(pivot[, seq_len(i)], na.rm = TRUE),
      sumAboveCurrentPerc = rowSums(pivot[, (i + 1):n_cols], na.rm = TRUE)
    )
  })

  # Calculate SITI for every threshold; results are kept in a named list
  # instead of dynamically named global variables
  SITI_results <- lapply(indices, function(i) {
    spseg(x = Spatial_grid_cells_sp, data = list_of_databases[[i]], method = "information", smoothing = "kernel", sigma = 4000, useC = FALSE)
  })
  names(SITI_results) <- paste0("SITI_result", indices)

  # Keep only the `h` slot of each result (the value this script stores as
  # SITI) and label it with its threshold
  combined_data_list <- lapply(indices, function(i) {
    data.frame(Value = SITI_results[[paste0("SITI_result", i)]]@h, Group = as.character(i))
  })

  # Make them be in a single database
  combined_data <- do.call(rbind, combined_data_list)

  # Convert the Group column to a factor with custom levels
  combined_data$Group <- factor(combined_data$Group, levels = as.character(indices))

  # Save the data
  write.csv(combined_data, file = paste0("income_per_capita_segregation_4000m_2021_", City, ".csv"), row.names = FALSE)
}

# Delete data for enabling further operations
rm(income2021)




### Calculate Spatial Information Theory Index for income segregation in 2020 ###

# Load the data
income2020 <- read.csv("raw_data/database_2020.csv") # only if not already loaded

# Delete missing values of income (coded as 9999999999). filter() is used
# instead of logical `[` indexing so that rows whose income is NA are dropped
# rather than turned into all-NA rows.
income2020 <- income2020 %>%
  filter(INHBESTINKH != 9999999999)

# Calculate income per capita in each household (INHBESTINKH / INHAHL)
income2020$Income_per_capita <- income2020$INHBESTINKH / income2020$INHAHL

# Define all possible FUAs
cities <- c("'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar","Almelo", "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem", "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven", "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden", "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal", "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal", "Venlo", "Zwolle")

# Percentile thresholds at which the index is computed (same for every city)
indices <- seq(1, 99)

# Order households by income within every FUA, assign each one to one of 100
# income-per-capita bins inside its FUA, and drop the grouping again so that
# later select() calls do not silently re-add the FUA column.
income2020 <- income2020 %>%
  arrange(FUA, Income_per_capita) %>%
  group_by(FUA) %>%
  mutate(income_per_capita_percentile = ntile(Income_per_capita, 100)) %>%
  ungroup()

# Drop rows without a grid-cell code, then append "00" to the E and N numbers
# of the code (i.e. multiply both by 100) so the coordinates are in meters.
# NOTE(review): a factor of 100 implies the raw codes are in units of 100 m
# rather than kilometers -- confirm.
income2020 <- income2020 %>%
  filter(VRLVIERKANT100M != "----------") %>%
  mutate(VRLVIERKANT100M = gsub("(E)(\\d+)(N)(\\d+)","\\1\\200\\3\\400", VRLVIERKANT100M))

# Loop over each city (sigma = 4000m)
for (city in cities) {
  # Set the variable city for the given city
  City <- city

  # Households of this FUA, reduced to grid-cell code and percentile
  df <- income2020 %>%
    filter(FUA == City) %>%
    select(VRLVIERKANT100M, income_per_capita_percentile)

  # A FUA without any usable household cannot be processed; skip it with a
  # warning instead of aborting the whole batch.
  if (nrow(df) == 0) {
    warning("No households with a grid cell found for FUA ", City, " in 2020; skipping.", call. = FALSE)
    next
  }

  # Pivot table: one row per grid cell, one column per percentile, cells hold
  # the number of households
  pivot <- df %>%
    group_by(VRLVIERKANT100M, income_per_capita_percentile) %>%
    summarise(Population = n(), .groups = "drop") %>%
    tidyr::pivot_wider(names_from = income_per_capita_percentile, values_from = Population, values_fill = 0)

  # Parse the grid-cell code into numeric x (E) and y (N) coordinates
  grid_cells <- pivot %>%
    transmute(
      x = as.numeric(sub("E", "", str_extract(VRLVIERKANT100M, "E\\d+"))),
      y = as.numeric(sub("N", "", str_extract(VRLVIERKANT100M, "N\\d+")))
    )

  # Create a spatial object for grid cells
  Spatial_grid_cells <- st_as_sf(grid_cells, coords = c("x", "y"), crs = 28992)
  coords <- st_coordinates(Spatial_grid_cells)

  # Create SpatialPoints object (the class spseg() is called with)
  Spatial_grid_cells_sp <- SpatialPoints(coords, proj4string = CRS("+init=epsg:28992"))

  # Keep only the percentile count columns
  pivot$VRLVIERKANT100M <- NULL

  # Make sure every percentile 1..100 has a column. A percentile that has no
  # household left in this FUA would otherwise shift the column positions, so
  # that threshold i no longer means "percentiles 1..i", or leave fewer than
  # the 99 thresholds requested below.
  for (missing_percentile in setdiff(as.character(seq_len(100)), colnames(pivot))) {
    pivot[[missing_percentile]] <- 0L
  }

  # Order the columns by percentile number
  pivot <- pivot[, order(as.numeric(colnames(pivot)))]

  # For every threshold i build a two-group table per grid cell:
  # households in columns 1..i versus households in the remaining columns
  n_cols <- ncol(pivot)
  list_of_databases <- lapply(seq_len(n_cols - 1), function(i) {
    data.frame(
      sumUpToCurrentPerc = rowSums(pivot[, seq_len(i)], na.rm = TRUE),
      sumAboveCurrentPerc = rowSums(pivot[, (i + 1):n_cols], na.rm = TRUE)
    )
  })

  # Calculate SITI for every threshold; results are kept in a named list
  # instead of dynamically named global variables
  SITI_results <- lapply(indices, function(i) {
    spseg(x = Spatial_grid_cells_sp, data = list_of_databases[[i]], method = "information", smoothing = "kernel", sigma = 4000, useC = FALSE)
  })
  names(SITI_results) <- paste0("SITI_result", indices)

  # Keep only the `h` slot of each result (the value this script stores as
  # SITI) and label it with its threshold
  combined_data_list <- lapply(indices, function(i) {
    data.frame(Value = SITI_results[[paste0("SITI_result", i)]]@h, Group = as.character(i))
  })

  # Make them be in a single database
  combined_data <- do.call(rbind, combined_data_list)

  # Convert the Group column to a factor with custom levels
  combined_data$Group <- factor(combined_data$Group, levels = as.character(indices))

  # Save the data
  write.csv(combined_data, file = paste0("income_per_capita_segregation_4000m_2020_", City, ".csv"), row.names = FALSE)
}

# Delete data for enabling further operations
rm(income2020)


### Calculate Spatial Information Theory Index for income segregation in 2019 ###

# Load the data
income2019 <- read.csv("raw_data/database_2019.csv") # only if not already loaded

# Delete missing values of income (coded as 9999999999). filter() is used
# instead of logical `[` indexing so that rows whose income is NA are dropped
# rather than turned into all-NA rows.
income2019 <- income2019 %>%
  filter(INHBESTINKH != 9999999999)

# Calculate income per capita in each household (INHBESTINKH / INHAHL)
income2019$Income_per_capita <- income2019$INHBESTINKH / income2019$INHAHL

# Define all possible FUAs
cities <- c("'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar","Almelo", "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem", "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven", "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden", "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal", "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal", "Venlo", "Zwolle")

# Percentile thresholds at which the index is computed (same for every city)
indices <- seq(1, 99)

# Order households by income within every FUA, assign each one to one of 100
# income-per-capita bins inside its FUA, and drop the grouping again so that
# later select() calls do not silently re-add the FUA column.
income2019 <- income2019 %>%
  arrange(FUA, Income_per_capita) %>%
  group_by(FUA) %>%
  mutate(income_per_capita_percentile = ntile(Income_per_capita, 100)) %>%
  ungroup()

# Drop rows without a grid-cell code, then append "00" to the E and N numbers
# of the code (i.e. multiply both by 100) so the coordinates are in meters.
# NOTE(review): a factor of 100 implies the raw codes are in units of 100 m
# rather than kilometers -- confirm.
income2019 <- income2019 %>%
  filter(VRLVIERKANT100M != "----------") %>%
  mutate(VRLVIERKANT100M = gsub("(E)(\\d+)(N)(\\d+)","\\1\\200\\3\\400", VRLVIERKANT100M))

# Loop over each city (sigma = 4000m)
for (city in cities) {
  # Set the variable city for the given city
  City <- city

  # Households of this FUA, reduced to grid-cell code and percentile
  df <- income2019 %>%
    filter(FUA == City) %>%
    select(VRLVIERKANT100M, income_per_capita_percentile)

  # A FUA without any usable household cannot be processed; skip it with a
  # warning instead of aborting the whole batch.
  if (nrow(df) == 0) {
    warning("No households with a grid cell found for FUA ", City, " in 2019; skipping.", call. = FALSE)
    next
  }

  # Pivot table: one row per grid cell, one column per percentile, cells hold
  # the number of households
  pivot <- df %>%
    group_by(VRLVIERKANT100M, income_per_capita_percentile) %>%
    summarise(Population = n(), .groups = "drop") %>%
    tidyr::pivot_wider(names_from = income_per_capita_percentile, values_from = Population, values_fill = 0)

  # Parse the grid-cell code into numeric x (E) and y (N) coordinates
  grid_cells <- pivot %>%
    transmute(
      x = as.numeric(sub("E", "", str_extract(VRLVIERKANT100M, "E\\d+"))),
      y = as.numeric(sub("N", "", str_extract(VRLVIERKANT100M, "N\\d+")))
    )

  # Create a spatial object for grid cells
  Spatial_grid_cells <- st_as_sf(grid_cells, coords = c("x", "y"), crs = 28992)
  coords <- st_coordinates(Spatial_grid_cells)

  # Create SpatialPoints object (the class spseg() is called with)
  Spatial_grid_cells_sp <- SpatialPoints(coords, proj4string = CRS("+init=epsg:28992"))

  # Keep only the percentile count columns
  pivot$VRLVIERKANT100M <- NULL

  # Make sure every percentile 1..100 has a column. A percentile that has no
  # household left in this FUA would otherwise shift the column positions, so
  # that threshold i no longer means "percentiles 1..i", or leave fewer than
  # the 99 thresholds requested below.
  for (missing_percentile in setdiff(as.character(seq_len(100)), colnames(pivot))) {
    pivot[[missing_percentile]] <- 0L
  }

  # Order the columns by percentile number
  pivot <- pivot[, order(as.numeric(colnames(pivot)))]

  # For every threshold i build a two-group table per grid cell:
  # households in columns 1..i versus households in the remaining columns
  n_cols <- ncol(pivot)
  list_of_databases <- lapply(seq_len(n_cols - 1), function(i) {
    data.frame(
      sumUpToCurrentPerc = rowSums(pivot[, seq_len(i)], na.rm = TRUE),
      sumAboveCurrentPerc = rowSums(pivot[, (i + 1):n_cols], na.rm = TRUE)
    )
  })

  # Calculate SITI for every threshold; results are kept in a named list
  # instead of dynamically named global variables
  SITI_results <- lapply(indices, function(i) {
    spseg(x = Spatial_grid_cells_sp, data = list_of_databases[[i]], method = "information", smoothing = "kernel", sigma = 4000, useC = FALSE)
  })
  names(SITI_results) <- paste0("SITI_result", indices)

  # Keep only the `h` slot of each result (the value this script stores as
  # SITI) and label it with its threshold
  combined_data_list <- lapply(indices, function(i) {
    data.frame(Value = SITI_results[[paste0("SITI_result", i)]]@h, Group = as.character(i))
  })

  # Make them be in a single database
  combined_data <- do.call(rbind, combined_data_list)

  # Convert the Group column to a factor with custom levels
  combined_data$Group <- factor(combined_data$Group, levels = as.character(indices))

  # Save the data
  write.csv(combined_data, file = paste0("income_per_capita_segregation_4000m_2019_", City, ".csv"), row.names = FALSE)
}

# Delete data for enabling further operations
rm(income2019)





### Calculate Spatial Information Theory Index for income segregation in 2018 ###

# Load the data
income2018 <- read.csv("raw_data/database_2018.csv") # only if not already loaded

# Delete missing values of income (coded as 9999999999). filter() is used
# instead of logical `[` indexing so that rows whose income is NA are dropped
# rather than turned into all-NA rows.
income2018 <- income2018 %>%
  filter(INHBESTINKH != 9999999999)

# Calculate income per capita in each household (INHBESTINKH / INHAHL)
income2018$Income_per_capita <- income2018$INHBESTINKH / income2018$INHAHL

# Define all possible FUAs
cities <- c("'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar","Almelo", "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem", "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven", "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden", "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal", "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal", "Venlo", "Zwolle")

# Percentile thresholds at which the index is computed (same for every city)
indices <- seq(1, 99)

# Order households by income within every FUA, assign each one to one of 100
# income-per-capita bins inside its FUA, and drop the grouping again so that
# later select() calls do not silently re-add the FUA column.
income2018 <- income2018 %>%
  arrange(FUA, Income_per_capita) %>%
  group_by(FUA) %>%
  mutate(income_per_capita_percentile = ntile(Income_per_capita, 100)) %>%
  ungroup()

# Drop rows without a grid-cell code, then append "00" to the E and N numbers
# of the code (i.e. multiply both by 100) so the coordinates are in meters.
# NOTE(review): a factor of 100 implies the raw codes are in units of 100 m
# rather than kilometers -- confirm.
income2018 <- income2018 %>%
  filter(VRLVIERKANT100M != "----------") %>%
  mutate(VRLVIERKANT100M = gsub("(E)(\\d+)(N)(\\d+)","\\1\\200\\3\\400", VRLVIERKANT100M))

# Loop over each city (sigma = 4000m)
for (city in cities) {
  # Set the variable city for the given city
  City <- city

  # Households of this FUA, reduced to grid-cell code and percentile
  df <- income2018 %>%
    filter(FUA == City) %>%
    select(VRLVIERKANT100M, income_per_capita_percentile)

  # A FUA without any usable household cannot be processed; skip it with a
  # warning instead of aborting the whole batch.
  if (nrow(df) == 0) {
    warning("No households with a grid cell found for FUA ", City, " in 2018; skipping.", call. = FALSE)
    next
  }

  # Pivot table: one row per grid cell, one column per percentile, cells hold
  # the number of households
  pivot <- df %>%
    group_by(VRLVIERKANT100M, income_per_capita_percentile) %>%
    summarise(Population = n(), .groups = "drop") %>%
    tidyr::pivot_wider(names_from = income_per_capita_percentile, values_from = Population, values_fill = 0)

  # Parse the grid-cell code into numeric x (E) and y (N) coordinates
  grid_cells <- pivot %>%
    transmute(
      x = as.numeric(sub("E", "", str_extract(VRLVIERKANT100M, "E\\d+"))),
      y = as.numeric(sub("N", "", str_extract(VRLVIERKANT100M, "N\\d+")))
    )

  # Create a spatial object for grid cells
  Spatial_grid_cells <- st_as_sf(grid_cells, coords = c("x", "y"), crs = 28992)
  coords <- st_coordinates(Spatial_grid_cells)

  # Create SpatialPoints object (the class spseg() is called with)
  Spatial_grid_cells_sp <- SpatialPoints(coords, proj4string = CRS("+init=epsg:28992"))

  # Keep only the percentile count columns
  pivot$VRLVIERKANT100M <- NULL

  # Make sure every percentile 1..100 has a column. A percentile that has no
  # household left in this FUA would otherwise shift the column positions, so
  # that threshold i no longer means "percentiles 1..i", or leave fewer than
  # the 99 thresholds requested below.
  for (missing_percentile in setdiff(as.character(seq_len(100)), colnames(pivot))) {
    pivot[[missing_percentile]] <- 0L
  }

  # Order the columns by percentile number
  pivot <- pivot[, order(as.numeric(colnames(pivot)))]

  # For every threshold i build a two-group table per grid cell:
  # households in columns 1..i versus households in the remaining columns
  n_cols <- ncol(pivot)
  list_of_databases <- lapply(seq_len(n_cols - 1), function(i) {
    data.frame(
      sumUpToCurrentPerc = rowSums(pivot[, seq_len(i)], na.rm = TRUE),
      sumAboveCurrentPerc = rowSums(pivot[, (i + 1):n_cols], na.rm = TRUE)
    )
  })

  # Calculate SITI for every threshold; results are kept in a named list
  # instead of dynamically named global variables
  SITI_results <- lapply(indices, function(i) {
    spseg(x = Spatial_grid_cells_sp, data = list_of_databases[[i]], method = "information", smoothing = "kernel", sigma = 4000, useC = FALSE)
  })
  names(SITI_results) <- paste0("SITI_result", indices)

  # Keep only the `h` slot of each result (the value this script stores as
  # SITI) and label it with its threshold
  combined_data_list <- lapply(indices, function(i) {
    data.frame(Value = SITI_results[[paste0("SITI_result", i)]]@h, Group = as.character(i))
  })

  # Make them be in a single database
  combined_data <- do.call(rbind, combined_data_list)

  # Convert the Group column to a factor with custom levels
  combined_data$Group <- factor(combined_data$Group, levels = as.character(indices))

  # Save the data
  write.csv(combined_data, file = paste0("income_per_capita_segregation_4000m_2018_", City, ".csv"), row.names = FALSE)
}

# Delete data for enabling further operations
rm(income2018)




### Calculate Spatial Information Theory Index for income segregation in 2017 ###

# Load the data
income2017 <- read.csv("raw_data/database_2017.csv") # only if not already loaded

# Delete missing values of income
income2017 <- income2017[income2017$INHBESTINKH != 9999999999, ]

# Calculate income per capita in each household
income2017$Income_per_capita <- (income2017$INHBESTINKH/income2017$INHAHL)

# Define all possible FUAs
cities <- c("'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar","Almelo", "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem", "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven", "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden", "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal", "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal", "Venlo", "Zwolle")

# Order values according to their income within every FUA
income2017 <- income2017 %>%
  arrange(FUA, Income_per_capita)

# Calculate the income percentile rank every household belongs to in their FUA
income2017 <- income2017 %>%
  group_by(FUA) %>% 
  mutate(income_per_capita_percentile = ntile(Income_per_capita, 100))

# Clean rows of missing data 
income2017 <- income2017 %>%
  filter(VRLVIERKANT100M != "----------")

# Correct coordinates to make sure they represent meters and not kilometers
income2017 <- income2017 %>%
  mutate(VRLVIERKANT100M = gsub("(E)(\\d+)(N)(\\d+)","\\1\\200\\3\\400", VRLVIERKANT100M))

# Loop over each city (sigma = 4000m)
for (city in cities) {
  # Set the variable city for the given city
  City <- city
  
  # Re-start the original database
  df <- income2017
  
  # Create the dataframe out of the income database  for the FUA we are interested on
  df <- df %>%
    filter(FUA == City)
  
  # Keep only relevant columns
  df <- df %>%
    select(VRLVIERKANT100M, income_per_capita_percentile)
  df$FUA <- NULL
  
  # create a pivot table
  pivot <- df %>%
    group_by(VRLVIERKANT100M, income_per_capita_percentile) %>%
    summarise(Population = n()) %>%
    tidyr::pivot_wider(names_from = income_per_capita_percentile, values_from = Population, values_fill = 0)
  
  # Create an element for only the coordinates of the grid cells
  grid_cells <- pivot %>%
    select(VRLVIERKANT100M)
  
  # Parse the coordinates
  grid_cells <- grid_cells %>%
    mutate(
      x = as.numeric(sub("E", "", str_extract(VRLVIERKANT100M, "E\\d+"))),
      y = as.numeric(sub("N", "", str_extract(VRLVIERKANT100M, "N\\d+"))),
    )
  
  # Delete superflous columns for coordinates
  grid_cells$VRLVIERKANT100M <- NULL
  
  # Create a spatial object for grid cells
  Spatial_grid_cells <- st_as_sf(grid_cells, coords = c("x", "y"), crs = 28992)
  coords <- st_coordinates(Spatial_grid_cells)
  
  # Create SpatialPoints object
  Spatial_grid_cells_sp <- SpatialPoints(coords, proj4string = CRS("+init=epsg:28992"))
  
  # Delete superflous columns for the income percentiles data
  pivot$VRLVIERKANT100M <- NULL
  
  # Order columns in the right order
  col_names <- as.numeric(colnames(pivot))
  
  # Re-order the dataframe
  pivot <- pivot[, order(col_names)]
  
  # Initialize a list to store each database for the income percentile pairwise calculation
  list_of_databases <- list()
  
  # Loop through each combination
  for(i in 1:(ncol(pivot)-1)) {
    # Calculate the cumulative sum of people up to the current percentile
    sumUpToCurrentPerc <- rowSums(pivot[, 1:i], na.rm = TRUE)
    
    # Calculate the sum of people above the current percentile
    sumAboveCurrentPerc <- rowSums(pivot[, (i+1):ncol(pivot)], na.rm = TRUE)
    
    # Create a new dataframe for the current percentile
    newData <- data.frame(sumUpToCurrentPerc = sumUpToCurrentPerc, sumAboveCurrentPerc = sumAboveCurrentPerc)
    
    # Store it in the list
    list_of_databases[[i]] <- newData
  }
  
  # Extract specific databases from it
  indices <- seq(1,99)
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    assign(database_name,list_of_databases[[i]])
  }
  
  # Initialize a list for storing SITI values
  SITI_results <- list()
  
  # Loop through the indices and calculate SITI results
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    SITI_results_name <- paste0("SITI_result", i)
    
    # Calculate SITI
    assign(SITI_results_name, spseg(x = Spatial_grid_cells_sp, data = get(database_name), method = "information", smoothing = "kernel", sigma = 4000, useC = FALSE))
    
    # Store the result in the list
    SITI_results[[SITI_results_name]] <- get(SITI_results_name)
    
  }
  
  # Transform the results into something readable
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    
    #Transform the SITI result
    assign(SITI_result_name, get(SITI_result_name)@h)
  }
  
  
  # Prepare them for being saved and plotted
  combined_data_list <- list()
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    df <- data.frame(Value = get(SITI_result_name), Group = as.character(i))
    combined_data_list[[SITI_result_name]] <- df
  }
  
  # Make them be in a single database
  combined_data <- do.call(rbind, combined_data_list)
  
  # Convert the Group column to a factor with custom levels
  combined_data$Group <- factor(combined_data$Group, levels = as.character(indices))
  
  # Save the data
  write.csv(combined_data, file = paste0("income_per_capita_segregation_4000m_2017_", City, ".csv"), row.names = FALSE)
}

# Remove the 2017 household data from the workspace before the next year is loaded
rm(income2017)




### Calculate Spatial Information Theory Index for income segregation in 2016 ###

# Load the household-level income data for 2016
income2016 <- read.csv("raw_data/database_2016.csv") # only if not already loaded

# Delete missing values of income (coded as 9999999999) and calculate the
# income per capita of each household. filter() also drops rows whose income
# is NA, which logical indexing with `[` would have turned into all-NA rows.
income2016 <- income2016 %>%
  filter(INHBESTINKH != 9999999999) %>%
  mutate(Income_per_capita = INHBESTINKH / INHAHL)

# Define all possible FUAs
cities <- c(
  "'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar", "Almelo",
  "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem",
  "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven",
  "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden",
  "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal",
  "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal",
  "Venlo", "Zwolle"
)

# Order households by income within every FUA and assign each one the income
# percentile (1-100) it belongs to in its FUA. The grouping is removed again
# afterwards so that later steps do not silently carry FUA along.
income2016 <- income2016 %>%
  arrange(FUA, Income_per_capita) %>%
  group_by(FUA) %>%
  mutate(income_per_capita_percentile = ntile(Income_per_capita, 100)) %>%
  ungroup()

# Drop households without a grid cell ("----------") and append "00" to the
# easting and northing of every grid cell code, i.e. multiply both by 100
# (presumably the codes count in units of 100 m; the result is used as
# coordinates in metres).
income2016 <- income2016 %>%
  filter(VRLVIERKANT100M != "----------") %>%
  mutate(VRLVIERKANT100M = gsub("(E)(\\d+)(N)(\\d+)","\\1\\200\\3\\400", VRLVIERKANT100M))

# Loop over each city (sigma = 4000m)
# For every FUA this computes the Spatial Information Theory Index (SITI) of
# the split "households up to percentile i" versus "households above
# percentile i" for i = 1, ..., 99 and writes the 99 values to one CSV file.
for (city in cities) {
  # Set the variable city for the given city
  City <- city

  # Households of the FUA we are interested in, reduced to grid cell and
  # percentile. ungroup() removes any grouping left on the income table so
  # that select() does not silently re-attach FUA.
  df <- income2016 %>%
    ungroup() %>%
    filter(FUA == City) %>%
    select(VRLVIERKANT100M, income_per_capita_percentile)

  # A FUA without households cannot be analysed; skip it with a warning
  # instead of aborting the whole run with an indexing error further down.
  if (nrow(df) == 0) {
    warning("No households found for FUA '", City, "' in 2016; skipping it.", call. = FALSE)
    next
  }

  # Pivot table: one row per grid cell, one column per income percentile,
  # cells hold the number of households
  pivot <- df %>%
    group_by(VRLVIERKANT100M, income_per_capita_percentile) %>%
    summarise(Population = n(), .groups = "drop") %>%
    tidyr::pivot_wider(names_from = income_per_capita_percentile, values_from = Population, values_fill = 0)

  # Parse the easting (E) and northing (N) of every grid cell from its code
  coords <- cbind(
    X = as.numeric(sub("E", "", str_extract(pivot$VRLVIERKANT100M, "E\\d+"))),
    Y = as.numeric(sub("N", "", str_extract(pivot$VRLVIERKANT100M, "N\\d+")))
  )

  # Create SpatialPoints object
  Spatial_grid_cells_sp <- SpatialPoints(coords, proj4string = CRS("+init=epsg:28992"))

  # Delete the superfluous grid cell column so only the counts remain
  pivot$VRLVIERKANT100M <- NULL

  # A percentile without any household in this FUA has no column yet. Add it
  # as an all-zero column so that, after sorting, column i always holds
  # percentile i and all 99 thresholds below can be evaluated.
  for (missing_percentile in setdiff(as.character(1:100), colnames(pivot))) {
    pivot[[missing_percentile]] <- 0L
  }

  # Order the columns by percentile
  pivot <- pivot[, order(as.numeric(colnames(pivot)))]
  counts <- as.matrix(pivot)

  # For every threshold build the two-group table handed to spseg():
  # households up to the threshold percentile versus households above it
  indices <- seq(1, 99)
  list_of_databases <- lapply(indices, function(i) {
    data.frame(
      sumUpToCurrentPerc = rowSums(counts[, seq_len(i), drop = FALSE], na.rm = TRUE),
      sumAboveCurrentPerc = rowSums(counts[, (i + 1):ncol(counts), drop = FALSE], na.rm = TRUE)
    )
  })

  # Calculate SITI for every threshold; results are kept in a named list
  # rather than in dynamically named global variables
  SITI_results <- lapply(list_of_databases, function(groups) {
    spseg(
      x = Spatial_grid_cells_sp, data = groups, method = "information",
      smoothing = "kernel", sigma = 4000, useC = FALSE
    )
  })
  names(SITI_results) <- paste0("SITI_result", indices)

  # Extract the index value (slot h) of every result, labelled with its threshold
  combined_data_list <- Map(
    function(result, i) data.frame(Value = result@h, Group = as.character(i)),
    SITI_results, indices
  )

  # Make them be in a single database
  combined_data <- do.call(rbind, combined_data_list)

  # Convert the Group column to a factor with custom levels
  combined_data$Group <- factor(combined_data$Group, levels = as.character(indices))

  # Save the data
  write.csv(combined_data, file = paste0("income_per_capita_segregation_4000m_2016_", City, ".csv"), row.names = FALSE)
}

# Delete data for enabling further operations
rm(income2016)


### Calculate Spatial Information Theory Index for income segregation in 2015 ###

# Load the household-level income data for 2015
income2015 <- read.csv("raw_data/database_2015.csv") # only if not already loaded

# Delete missing values of income (coded as 9999999999) and calculate the
# income per capita of each household. filter() also drops rows whose income
# is NA, which logical indexing with `[` would have turned into all-NA rows.
income2015 <- income2015 %>%
  filter(INHBESTINKH != 9999999999) %>%
  mutate(Income_per_capita = INHBESTINKH / INHAHL)

# Define all possible FUAs
cities <- c(
  "'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar", "Almelo",
  "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem",
  "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven",
  "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden",
  "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal",
  "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal",
  "Venlo", "Zwolle"
)

# Order households by income within every FUA and assign each one the income
# percentile (1-100) it belongs to in its FUA. The grouping is removed again
# afterwards so that later steps do not silently carry FUA along.
income2015 <- income2015 %>%
  arrange(FUA, Income_per_capita) %>%
  group_by(FUA) %>%
  mutate(income_per_capita_percentile = ntile(Income_per_capita, 100)) %>%
  ungroup()

# Drop households without a grid cell ("----------") and append "00" to the
# easting and northing of every grid cell code, i.e. multiply both by 100
# (presumably the codes count in units of 100 m; the result is used as
# coordinates in metres).
income2015 <- income2015 %>%
  filter(VRLVIERKANT100M != "----------") %>%
  mutate(VRLVIERKANT100M = gsub("(E)(\\d+)(N)(\\d+)","\\1\\200\\3\\400", VRLVIERKANT100M))

# Loop over each city (sigma = 4000m)
# For every FUA this computes the Spatial Information Theory Index (SITI) of
# the split "households up to percentile i" versus "households above
# percentile i" for i = 1, ..., 99 and writes the 99 values to one CSV file.
for (city in cities) {
  # Set the variable city for the given city
  City <- city

  # Households of the FUA we are interested in, reduced to grid cell and
  # percentile. ungroup() removes any grouping left on the income table so
  # that select() does not silently re-attach FUA.
  df <- income2015 %>%
    ungroup() %>%
    filter(FUA == City) %>%
    select(VRLVIERKANT100M, income_per_capita_percentile)

  # A FUA without households cannot be analysed; skip it with a warning
  # instead of aborting the whole run with an indexing error further down.
  if (nrow(df) == 0) {
    warning("No households found for FUA '", City, "' in 2015; skipping it.", call. = FALSE)
    next
  }

  # Pivot table: one row per grid cell, one column per income percentile,
  # cells hold the number of households
  pivot <- df %>%
    group_by(VRLVIERKANT100M, income_per_capita_percentile) %>%
    summarise(Population = n(), .groups = "drop") %>%
    tidyr::pivot_wider(names_from = income_per_capita_percentile, values_from = Population, values_fill = 0)

  # Parse the easting (E) and northing (N) of every grid cell from its code
  coords <- cbind(
    X = as.numeric(sub("E", "", str_extract(pivot$VRLVIERKANT100M, "E\\d+"))),
    Y = as.numeric(sub("N", "", str_extract(pivot$VRLVIERKANT100M, "N\\d+")))
  )

  # Create SpatialPoints object
  Spatial_grid_cells_sp <- SpatialPoints(coords, proj4string = CRS("+init=epsg:28992"))

  # Delete the superfluous grid cell column so only the counts remain
  pivot$VRLVIERKANT100M <- NULL

  # A percentile without any household in this FUA has no column yet. Add it
  # as an all-zero column so that, after sorting, column i always holds
  # percentile i and all 99 thresholds below can be evaluated.
  for (missing_percentile in setdiff(as.character(1:100), colnames(pivot))) {
    pivot[[missing_percentile]] <- 0L
  }

  # Order the columns by percentile
  pivot <- pivot[, order(as.numeric(colnames(pivot)))]
  counts <- as.matrix(pivot)

  # For every threshold build the two-group table handed to spseg():
  # households up to the threshold percentile versus households above it
  indices <- seq(1, 99)
  list_of_databases <- lapply(indices, function(i) {
    data.frame(
      sumUpToCurrentPerc = rowSums(counts[, seq_len(i), drop = FALSE], na.rm = TRUE),
      sumAboveCurrentPerc = rowSums(counts[, (i + 1):ncol(counts), drop = FALSE], na.rm = TRUE)
    )
  })

  # Calculate SITI for every threshold; results are kept in a named list
  # rather than in dynamically named global variables
  SITI_results <- lapply(list_of_databases, function(groups) {
    spseg(
      x = Spatial_grid_cells_sp, data = groups, method = "information",
      smoothing = "kernel", sigma = 4000, useC = FALSE
    )
  })
  names(SITI_results) <- paste0("SITI_result", indices)

  # Extract the index value (slot h) of every result, labelled with its threshold
  combined_data_list <- Map(
    function(result, i) data.frame(Value = result@h, Group = as.character(i)),
    SITI_results, indices
  )

  # Make them be in a single database
  combined_data <- do.call(rbind, combined_data_list)

  # Convert the Group column to a factor with custom levels
  combined_data$Group <- factor(combined_data$Group, levels = as.character(indices))

  # Save the data
  write.csv(combined_data, file = paste0("income_per_capita_segregation_4000m_2015_", City, ".csv"), row.names = FALSE)
}

# Delete data for enabling further operations
rm(income2015)



### Calculate Spatial Information Theory Index for income segregation in 2014 ###

# Load the household-level income data for 2014
income2014 <- read.csv("raw_data/database_2014.csv") # only if not already loaded

# Delete missing values of income (coded as 9999999999) and calculate the
# income per capita of each household. filter() also drops rows whose income
# is NA, which logical indexing with `[` would have turned into all-NA rows.
income2014 <- income2014 %>%
  filter(INHBESTINKH != 9999999999) %>%
  mutate(Income_per_capita = INHBESTINKH / INHAHL)

# Define all possible FUAs
cities <- c(
  "'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar", "Almelo",
  "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem",
  "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven",
  "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden",
  "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal",
  "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal",
  "Venlo", "Zwolle"
)

# Order households by income within every FUA and assign each one the income
# percentile (1-100) it belongs to in its FUA. The grouping is removed again
# afterwards so that later steps do not silently carry FUA along.
income2014 <- income2014 %>%
  arrange(FUA, Income_per_capita) %>%
  group_by(FUA) %>%
  mutate(income_per_capita_percentile = ntile(Income_per_capita, 100)) %>%
  ungroup()

# Drop households without a grid cell ("----------") and append "00" to the
# easting and northing of every grid cell code, i.e. multiply both by 100
# (presumably the codes count in units of 100 m; the result is used as
# coordinates in metres).
income2014 <- income2014 %>%
  filter(VRLVIERKANT100M != "----------") %>%
  mutate(VRLVIERKANT100M = gsub("(E)(\\d+)(N)(\\d+)","\\1\\200\\3\\400", VRLVIERKANT100M))

# Loop over each city (sigma = 4000m)
# For every FUA this computes the Spatial Information Theory Index (SITI) of
# the split "households up to percentile i" versus "households above
# percentile i" for i = 1, ..., 99 and writes the 99 values to one CSV file.
for (city in cities) {
  # Set the variable city for the given city
  City <- city

  # Households of the FUA we are interested in, reduced to grid cell and
  # percentile. ungroup() removes any grouping left on the income table so
  # that select() does not silently re-attach FUA.
  df <- income2014 %>%
    ungroup() %>%
    filter(FUA == City) %>%
    select(VRLVIERKANT100M, income_per_capita_percentile)

  # A FUA without households cannot be analysed; skip it with a warning
  # instead of aborting the whole run with an indexing error further down.
  if (nrow(df) == 0) {
    warning("No households found for FUA '", City, "' in 2014; skipping it.", call. = FALSE)
    next
  }

  # Pivot table: one row per grid cell, one column per income percentile,
  # cells hold the number of households
  pivot <- df %>%
    group_by(VRLVIERKANT100M, income_per_capita_percentile) %>%
    summarise(Population = n(), .groups = "drop") %>%
    tidyr::pivot_wider(names_from = income_per_capita_percentile, values_from = Population, values_fill = 0)

  # Parse the easting (E) and northing (N) of every grid cell from its code
  coords <- cbind(
    X = as.numeric(sub("E", "", str_extract(pivot$VRLVIERKANT100M, "E\\d+"))),
    Y = as.numeric(sub("N", "", str_extract(pivot$VRLVIERKANT100M, "N\\d+")))
  )

  # Create SpatialPoints object
  Spatial_grid_cells_sp <- SpatialPoints(coords, proj4string = CRS("+init=epsg:28992"))

  # Delete the superfluous grid cell column so only the counts remain
  pivot$VRLVIERKANT100M <- NULL

  # A percentile without any household in this FUA has no column yet. Add it
  # as an all-zero column so that, after sorting, column i always holds
  # percentile i and all 99 thresholds below can be evaluated.
  for (missing_percentile in setdiff(as.character(1:100), colnames(pivot))) {
    pivot[[missing_percentile]] <- 0L
  }

  # Order the columns by percentile
  pivot <- pivot[, order(as.numeric(colnames(pivot)))]
  counts <- as.matrix(pivot)

  # For every threshold build the two-group table handed to spseg():
  # households up to the threshold percentile versus households above it
  indices <- seq(1, 99)
  list_of_databases <- lapply(indices, function(i) {
    data.frame(
      sumUpToCurrentPerc = rowSums(counts[, seq_len(i), drop = FALSE], na.rm = TRUE),
      sumAboveCurrentPerc = rowSums(counts[, (i + 1):ncol(counts), drop = FALSE], na.rm = TRUE)
    )
  })

  # Calculate SITI for every threshold; results are kept in a named list
  # rather than in dynamically named global variables
  SITI_results <- lapply(list_of_databases, function(groups) {
    spseg(
      x = Spatial_grid_cells_sp, data = groups, method = "information",
      smoothing = "kernel", sigma = 4000, useC = FALSE
    )
  })
  names(SITI_results) <- paste0("SITI_result", indices)

  # Extract the index value (slot h) of every result, labelled with its threshold
  combined_data_list <- Map(
    function(result, i) data.frame(Value = result@h, Group = as.character(i)),
    SITI_results, indices
  )

  # Make them be in a single database
  combined_data <- do.call(rbind, combined_data_list)

  # Convert the Group column to a factor with custom levels
  combined_data$Group <- factor(combined_data$Group, levels = as.character(indices))

  # Save the data
  write.csv(combined_data, file = paste0("income_per_capita_segregation_4000m_2014_", City, ".csv"), row.names = FALSE)
}

# Delete data for enabling further operations
rm(income2014)




### Calculate Spatial Information Theory Index for income segregation in 2013 ###

# Load the household-level income data for 2013
income2013 <- read.csv("raw_data/database_2013.csv") # only if not already loaded

# Delete missing values of income (coded as 9999999999) and calculate the
# income per capita of each household. filter() also drops rows whose income
# is NA, which logical indexing with `[` would have turned into all-NA rows.
income2013 <- income2013 %>%
  filter(INHBESTINKH != 9999999999) %>%
  mutate(Income_per_capita = INHBESTINKH / INHAHL)

# Define all possible FUAs
cities <- c(
  "'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar", "Almelo",
  "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem",
  "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven",
  "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden",
  "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal",
  "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal",
  "Venlo", "Zwolle"
)

# Order households by income within every FUA and assign each one the income
# percentile (1-100) it belongs to in its FUA. The grouping is removed again
# afterwards so that later steps do not silently carry FUA along.
income2013 <- income2013 %>%
  arrange(FUA, Income_per_capita) %>%
  group_by(FUA) %>%
  mutate(income_per_capita_percentile = ntile(Income_per_capita, 100)) %>%
  ungroup()

# Drop households without a grid cell ("----------") and append "00" to the
# easting and northing of every grid cell code, i.e. multiply both by 100
# (presumably the codes count in units of 100 m; the result is used as
# coordinates in metres).
income2013 <- income2013 %>%
  filter(VRLVIERKANT100M != "----------") %>%
  mutate(VRLVIERKANT100M = gsub("(E)(\\d+)(N)(\\d+)","\\1\\200\\3\\400", VRLVIERKANT100M))

# Loop over each city (sigma = 4000m)
# For every FUA this computes the Spatial Information Theory Index (SITI) of
# the split "households up to percentile i" versus "households above
# percentile i" for i = 1, ..., 99 and writes the 99 values to one CSV file.
for (city in cities) {
  # Set the variable city for the given city
  City <- city

  # Households of the FUA we are interested in, reduced to grid cell and
  # percentile. ungroup() removes any grouping left on the income table so
  # that select() does not silently re-attach FUA.
  df <- income2013 %>%
    ungroup() %>%
    filter(FUA == City) %>%
    select(VRLVIERKANT100M, income_per_capita_percentile)

  # A FUA without households cannot be analysed; skip it with a warning
  # instead of aborting the whole run with an indexing error further down.
  if (nrow(df) == 0) {
    warning("No households found for FUA '", City, "' in 2013; skipping it.", call. = FALSE)
    next
  }

  # Pivot table: one row per grid cell, one column per income percentile,
  # cells hold the number of households
  pivot <- df %>%
    group_by(VRLVIERKANT100M, income_per_capita_percentile) %>%
    summarise(Population = n(), .groups = "drop") %>%
    tidyr::pivot_wider(names_from = income_per_capita_percentile, values_from = Population, values_fill = 0)

  # Parse the easting (E) and northing (N) of every grid cell from its code
  coords <- cbind(
    X = as.numeric(sub("E", "", str_extract(pivot$VRLVIERKANT100M, "E\\d+"))),
    Y = as.numeric(sub("N", "", str_extract(pivot$VRLVIERKANT100M, "N\\d+")))
  )

  # Create SpatialPoints object
  Spatial_grid_cells_sp <- SpatialPoints(coords, proj4string = CRS("+init=epsg:28992"))

  # Delete the superfluous grid cell column so only the counts remain
  pivot$VRLVIERKANT100M <- NULL

  # A percentile without any household in this FUA has no column yet. Add it
  # as an all-zero column so that, after sorting, column i always holds
  # percentile i and all 99 thresholds below can be evaluated.
  for (missing_percentile in setdiff(as.character(1:100), colnames(pivot))) {
    pivot[[missing_percentile]] <- 0L
  }

  # Order the columns by percentile
  pivot <- pivot[, order(as.numeric(colnames(pivot)))]
  counts <- as.matrix(pivot)

  # For every threshold build the two-group table handed to spseg():
  # households up to the threshold percentile versus households above it
  indices <- seq(1, 99)
  list_of_databases <- lapply(indices, function(i) {
    data.frame(
      sumUpToCurrentPerc = rowSums(counts[, seq_len(i), drop = FALSE], na.rm = TRUE),
      sumAboveCurrentPerc = rowSums(counts[, (i + 1):ncol(counts), drop = FALSE], na.rm = TRUE)
    )
  })

  # Calculate SITI for every threshold; results are kept in a named list
  # rather than in dynamically named global variables
  SITI_results <- lapply(list_of_databases, function(groups) {
    spseg(
      x = Spatial_grid_cells_sp, data = groups, method = "information",
      smoothing = "kernel", sigma = 4000, useC = FALSE
    )
  })
  names(SITI_results) <- paste0("SITI_result", indices)

  # Extract the index value (slot h) of every result, labelled with its threshold
  combined_data_list <- Map(
    function(result, i) data.frame(Value = result@h, Group = as.character(i)),
    SITI_results, indices
  )

  # Make them be in a single database
  combined_data <- do.call(rbind, combined_data_list)

  # Convert the Group column to a factor with custom levels
  combined_data$Group <- factor(combined_data$Group, levels = as.character(indices))

  # Save the data
  write.csv(combined_data, file = paste0("income_per_capita_segregation_4000m_2013_", City, ".csv"), row.names = FALSE)
}

# Delete data for enabling further operations
rm(income2013)








### Calculate Spatial Information Theory Index for income segregation in 2012 ###

# Load the household-level income data for 2012
income2012 <- read.csv("raw_data/database_2012.csv") # only if not already loaded

# Delete missing values of income (coded as 9999999999) and calculate the
# income per capita of each household. filter() also drops rows whose income
# is NA, which logical indexing with `[` would have turned into all-NA rows.
income2012 <- income2012 %>%
  filter(INHBESTINKH != 9999999999) %>%
  mutate(Income_per_capita = INHBESTINKH / INHAHL)

# Define all possible FUAs
cities <- c(
  "'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar", "Almelo",
  "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem",
  "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven",
  "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden",
  "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal",
  "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal",
  "Venlo", "Zwolle"
)

# Order households by income within every FUA and assign each one the income
# percentile (1-100) it belongs to in its FUA. The grouping is removed again
# afterwards so that later steps do not silently carry FUA along.
income2012 <- income2012 %>%
  arrange(FUA, Income_per_capita) %>%
  group_by(FUA) %>%
  mutate(income_per_capita_percentile = ntile(Income_per_capita, 100)) %>%
  ungroup()

# Drop households without a grid cell ("----------") and append "00" to the
# easting and northing of every grid cell code, i.e. multiply both by 100
# (presumably the codes count in units of 100 m; the result is used as
# coordinates in metres).
income2012 <- income2012 %>%
  filter(VRLVIERKANT100M != "----------") %>%
  mutate(VRLVIERKANT100M = gsub("(E)(\\d+)(N)(\\d+)","\\1\\200\\3\\400", VRLVIERKANT100M))

# Loop over each city (sigma = 4000m)
# For every FUA this computes the Spatial Information Theory Index (SITI) of
# the split "households up to percentile i" versus "households above
# percentile i" for i = 1, ..., 99 and writes the 99 values to one CSV file.
for (city in cities) {
  # Set the variable city for the given city
  City <- city

  # Households of the FUA we are interested in, reduced to grid cell and
  # percentile. ungroup() removes any grouping left on the income table so
  # that select() does not silently re-attach FUA.
  df <- income2012 %>%
    ungroup() %>%
    filter(FUA == City) %>%
    select(VRLVIERKANT100M, income_per_capita_percentile)

  # A FUA without households cannot be analysed; skip it with a warning
  # instead of aborting the whole run with an indexing error further down.
  if (nrow(df) == 0) {
    warning("No households found for FUA '", City, "' in 2012; skipping it.", call. = FALSE)
    next
  }

  # Pivot table: one row per grid cell, one column per income percentile,
  # cells hold the number of households
  pivot <- df %>%
    group_by(VRLVIERKANT100M, income_per_capita_percentile) %>%
    summarise(Population = n(), .groups = "drop") %>%
    tidyr::pivot_wider(names_from = income_per_capita_percentile, values_from = Population, values_fill = 0)

  # Parse the easting (E) and northing (N) of every grid cell from its code
  coords <- cbind(
    X = as.numeric(sub("E", "", str_extract(pivot$VRLVIERKANT100M, "E\\d+"))),
    Y = as.numeric(sub("N", "", str_extract(pivot$VRLVIERKANT100M, "N\\d+")))
  )

  # Create SpatialPoints object
  Spatial_grid_cells_sp <- SpatialPoints(coords, proj4string = CRS("+init=epsg:28992"))

  # Delete the superfluous grid cell column so only the counts remain
  pivot$VRLVIERKANT100M <- NULL

  # A percentile without any household in this FUA has no column yet. Add it
  # as an all-zero column so that, after sorting, column i always holds
  # percentile i and all 99 thresholds below can be evaluated.
  for (missing_percentile in setdiff(as.character(1:100), colnames(pivot))) {
    pivot[[missing_percentile]] <- 0L
  }

  # Order the columns by percentile
  pivot <- pivot[, order(as.numeric(colnames(pivot)))]
  counts <- as.matrix(pivot)

  # For every threshold build the two-group table handed to spseg():
  # households up to the threshold percentile versus households above it
  indices <- seq(1, 99)
  list_of_databases <- lapply(indices, function(i) {
    data.frame(
      sumUpToCurrentPerc = rowSums(counts[, seq_len(i), drop = FALSE], na.rm = TRUE),
      sumAboveCurrentPerc = rowSums(counts[, (i + 1):ncol(counts), drop = FALSE], na.rm = TRUE)
    )
  })

  # Calculate SITI for every threshold; results are kept in a named list
  # rather than in dynamically named global variables
  SITI_results <- lapply(list_of_databases, function(groups) {
    spseg(
      x = Spatial_grid_cells_sp, data = groups, method = "information",
      smoothing = "kernel", sigma = 4000, useC = FALSE
    )
  })
  names(SITI_results) <- paste0("SITI_result", indices)

  # Extract the index value (slot h) of every result, labelled with its threshold
  combined_data_list <- Map(
    function(result, i) data.frame(Value = result@h, Group = as.character(i)),
    SITI_results, indices
  )

  # Make them be in a single database
  combined_data <- do.call(rbind, combined_data_list)

  # Convert the Group column to a factor with custom levels
  combined_data$Group <- factor(combined_data$Group, levels = as.character(indices))

  # Save the data
  write.csv(combined_data, file = paste0("income_per_capita_segregation_4000m_2012_", City, ".csv"), row.names = FALSE)
}

# Delete data for enabling further operations
rm(income2012)


### Calculate Spatial Information Theory Index for income segregation in 2011 ###

# Load the household-level income data for 2011
income2011 <- read.csv("raw_data/database_2011.csv") # only if not already loaded

# Delete missing values of income (coded as 9999999999) and calculate the
# income per capita of each household. filter() also drops rows whose income
# is NA, which logical indexing with `[` would have turned into all-NA rows.
income2011 <- income2011 %>%
  filter(INHBESTINKH != 9999999999) %>%
  mutate(Income_per_capita = INHBESTINKH / INHAHL)

# Define all possible FUAs
cities <- c(
  "'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar", "Almelo",
  "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem",
  "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven",
  "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden",
  "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal",
  "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal",
  "Venlo", "Zwolle"
)

# Order households by income within every FUA and assign each one the income
# percentile (1-100) it belongs to in its FUA. The grouping is removed again
# afterwards so that later steps do not silently carry FUA along.
income2011 <- income2011 %>%
  arrange(FUA, Income_per_capita) %>%
  group_by(FUA) %>%
  mutate(income_per_capita_percentile = ntile(Income_per_capita, 100)) %>%
  ungroup()

# Drop households without a grid cell ("----------") and append "00" to the
# easting and northing of every grid cell code, i.e. multiply both by 100
# (presumably the codes count in units of 100 m; the result is used as
# coordinates in metres).
income2011 <- income2011 %>%
  filter(VRLVIERKANT100M != "----------") %>%
  mutate(VRLVIERKANT100M = gsub("(E)(\\d+)(N)(\\d+)","\\1\\200\\3\\400", VRLVIERKANT100M))

# Loop over each city (sigma = 4000m)
for (city in cities) {
  # Set the variable city for the given city
  City <- city
  
  # Re-start the original database
  df <- income2011
  
  # Create the dataframe out of the income database  for the FUA we are interested on
  df <- df %>%
    filter(FUA == City)
  
  # Keep only relevant columns
  df <- df %>%
    select(VRLVIERKANT100M, income_per_capita_percentile)
  df$FUA <- NULL
  
  # create a pivot table
  pivot <- df %>%
    group_by(VRLVIERKANT100M, income_per_capita_percentile) %>%
    summarise(Population = n()) %>%
    tidyr::pivot_wider(names_from = income_per_capita_percentile, values_from = Population, values_fill = 0)
  
  # Create an element for only the coordinates of the grid cells
  grid_cells <- pivot %>%
    select(VRLVIERKANT100M)
  
  # Parse the coordinates
  grid_cells <- grid_cells %>%
    mutate(
      x = as.numeric(sub("E", "", str_extract(VRLVIERKANT100M, "E\\d+"))),
      y = as.numeric(sub("N", "", str_extract(VRLVIERKANT100M, "N\\d+"))),
    )
  
  # Delete superflous columns for coordinates
  grid_cells$VRLVIERKANT100M <- NULL
  
  # Create a spatial object for grid cells
  Spatial_grid_cells <- st_as_sf(grid_cells, coords = c("x", "y"), crs = 28992)
  coords <- st_coordinates(Spatial_grid_cells)
  
  # Create SpatialPoints object
  Spatial_grid_cells_sp <- SpatialPoints(coords, proj4string = CRS("+init=epsg:28992"))
  
  # Delete superflous columns for the income percentiles data
  pivot$VRLVIERKANT100M <- NULL
  
  # Order columns in the right order
  col_names <- as.numeric(colnames(pivot))
  
  # Re-order the dataframe
  pivot <- pivot[, order(col_names)]
  
  # Initialize a list to store each database for the income percentile pairwise calculation
  list_of_databases <- list()
  
  # Loop through each combination
  for(i in 1:(ncol(pivot)-1)) {
    # Calculate the cumulative sum of people up to the current percentile
    sumUpToCurrentPerc <- rowSums(pivot[, 1:i], na.rm = TRUE)
    
    # Calculate the sum of people above the current percentile
    sumAboveCurrentPerc <- rowSums(pivot[, (i+1):ncol(pivot)], na.rm = TRUE)
    
    # Create a new dataframe for the current percentile
    newData <- data.frame(sumUpToCurrentPerc = sumUpToCurrentPerc, sumAboveCurrentPerc = sumAboveCurrentPerc)
    
    # Store it in the list
    list_of_databases[[i]] <- newData
  }
  
  # Extract specific databases from it
  indices <- seq(1,99)
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    assign(database_name,list_of_databases[[i]])
  }
  
  # Initialize a list for storing SITI values
  SITI_results <- list()
  
  # Loop through the indices and calculate SITI results
  for (i in indices) {
    database_name <- paste0("database_IncPer_", i)
    SITI_results_name <- paste0("SITI_result", i)
    
    # Calculate SITI
    assign(SITI_results_name, spseg(x = Spatial_grid_cells_sp, data = get(database_name), method = "information", smoothing = "kernel", sigma = 4000, useC = FALSE))
    
    # Store the result in the list
    SITI_results[[SITI_results_name]] <- get(SITI_results_name)
    
  }
  
  # Transform the results into something readable
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    
    #Transform the SITI result
    assign(SITI_result_name, get(SITI_result_name)@h)
  }
  
  
  # Prepare them for being saved and plotted
  combined_data_list <- list()
  for (i in indices) {
    SITI_result_name <- paste0("SITI_result", i)
    df <- data.frame(Value = get(SITI_result_name), Group = as.character(i))
    combined_data_list[[SITI_result_name]] <- df
  }
  
  # Make them be in a single database
  combined_data <- do.call(rbind, combined_data_list)
  
  # Convert the Group column to a factor with custom levels
  combined_data$Group <- factor(combined_data$Group, levels = as.character(indices))
  
  # Save the data
  write.csv(combined_data, file = paste0("income_per_capita_segregation_4000m_2011_", City, ".csv"), row.names = FALSE)
}


# Delete data for enabling further operations
rm(income2011)



### Calculate Spatial Information Theory Index for income segregation in 2010 ###

# Load the data
income2010 <- read.csv("raw_data/database_2010.csv") # only if not already loaded

# Define all possible FUAs
cities <- c("'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar","Almelo", "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem", "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven", "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden", "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal", "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal", "Venlo", "Zwolle")

income2010 <- income2010 %>%
  # Delete missing values of income (coded as 999999999). filter() also drops
  # NA incomes, whereas logical `[` subsetting would turn them into all-NA rows.
  filter(BVRBESTINKH != 999999999) %>%
  # Income per capita in each household (BVRAHL is presumably the household
  # size -- TODO confirm)
  mutate(Income_per_capita = BVRBESTINKH / BVRAHL) %>%
  # Order values according to their income within every FUA
  arrange(FUA, Income_per_capita) %>%
  # Income percentile rank of every household within its FUA; computed before
  # households without a grid cell are removed
  group_by(FUA) %>%
  mutate(income_per_capita_percentile = ntile(Income_per_capita, 100)) %>%
  ungroup() %>%
  # Clean rows of missing grid-cell data
  filter(VRLVIERKANT100M != "----------") %>%
  # Append "00" to the easting and northing of the cell code (multiply by 100)
  # so that the parsed coordinates are in metres
  mutate(VRLVIERKANT100M = gsub("(E)(\\d+)(N)(\\d+)","\\1\\200\\3\\400", VRLVIERKANT100M))

# Loop over each city (sigma = 4000m)
# For every FUA: count households per grid cell and income percentile, compute
# the Spatial Information Theory Index (slot `h` of the spseg() result) for each
# of the 99 binary splits "percentile <= p" versus "percentile > p", and write
# the values to one CSV file per city.
for (city in cities) {
  # Households of the current FUA. ungroup() drops any grouping left on the
  # input, so select() does not silently re-add the FUA column.
  city_households <- income2010 %>%
    ungroup() %>%
    filter(FUA == city) %>%
    select(VRLVIERKANT100M, income_per_capita_percentile)

  # A FUA without households cannot be analysed: skip it instead of aborting
  # the whole run
  if (nrow(city_households) == 0) {
    warning("No households found for FUA '", city, "' in 2010; skipped.", call. = FALSE)
    next
  }

  # Pivot table: one row per grid cell, one column per income percentile
  pivot <- city_households %>%
    count(VRLVIERKANT100M, income_per_capita_percentile, name = "Population") %>%
    tidyr::pivot_wider(
      names_from = income_per_capita_percentile,
      values_from = Population,
      values_fill = 0
    )

  # Parse the grid-cell coordinates from the cell code ("E<x>N<y>")
  coords <- cbind(
    X = as.numeric(sub("E", "", str_extract(pivot$VRLVIERKANT100M, "E\\d+"))),
    Y = as.numeric(sub("N", "", str_extract(pivot$VRLVIERKANT100M, "N\\d+")))
  )

  # Create SpatialPoints object (rows are in the same order as the pivot table)
  Spatial_grid_cells_sp <- SpatialPoints(coords, proj4string = CRS("+init=epsg:28992"))

  # Household counts as a plain matrix whose columns are labelled by percentile
  counts <- as.matrix(pivot[, setdiff(colnames(pivot), "VRLVIERKANT100M")])
  percentiles <- as.numeric(colnames(counts))

  # Columns are selected by percentile label below, so a missing percentile no
  # longer shifts the thresholds; still report it because the split is degenerate
  missing_percentiles <- setdiff(1:100, percentiles)
  if (length(missing_percentiles) > 0) {
    warning("FUA '", city, "' (2010) has no households in percentile(s): ",
            paste(missing_percentiles, collapse = ", "), call. = FALSE)
  }

  # One SITI value per percentile threshold
  indices <- seq(1, 99)
  combined_data_list <- lapply(indices, function(p) {
    # Columns at or below the threshold; everything else (including an
    # unlabelled NA column, if any) counts as "above"
    low <- which(percentiles <= p)
    high <- setdiff(seq_along(percentiles), low)

    groups <- data.frame(
      sumUpToCurrentPerc = rowSums(counts[, low, drop = FALSE], na.rm = TRUE),
      sumAboveCurrentPerc = rowSums(counts[, high, drop = FALSE], na.rm = TRUE)
    )

    # Calculate SITI
    siti <- spseg(x = Spatial_grid_cells_sp, data = groups, method = "information",
                  smoothing = "kernel", sigma = 4000, useC = FALSE)

    data.frame(Value = siti@h, Group = as.character(p))
  })

  # Make them be in a single database
  combined_data <- do.call(rbind, combined_data_list)

  # Convert the Group column to a factor with custom levels
  combined_data$Group <- factor(combined_data$Group, levels = as.character(indices))

  # Save the data
  write.csv(combined_data, file = paste0("income_per_capita_segregation_4000m_2010_", city, ".csv"), row.names = FALSE)
}


# Delete data for enabling further operations
rm(income2010)




### Calculate Spatial Information Theory Index for income segregation in 2009 ###

# Load the data
income2009 <- read.csv("raw_data/database_2009.csv") # only if not already loaded

# Define all possible FUAs
cities <- c("'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar","Almelo", "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem", "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven", "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden", "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal", "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal", "Venlo", "Zwolle")

income2009 <- income2009 %>%
  # Delete missing values of income (coded as 999999999). filter() also drops
  # NA incomes, whereas logical `[` subsetting would turn them into all-NA rows.
  filter(BVRBESTINKH != 999999999) %>%
  # Income per capita in each household (BVRAHL is presumably the household
  # size -- TODO confirm)
  mutate(Income_per_capita = BVRBESTINKH / BVRAHL) %>%
  # Order values according to their income within every FUA
  arrange(FUA, Income_per_capita) %>%
  # Income percentile rank of every household within its FUA; computed before
  # households without a grid cell are removed
  group_by(FUA) %>%
  mutate(income_per_capita_percentile = ntile(Income_per_capita, 100)) %>%
  ungroup() %>%
  # Clean rows of missing grid-cell data
  filter(VRLVIERKANT100M != "----------") %>%
  # Append "00" to the easting and northing of the cell code (multiply by 100)
  # so that the parsed coordinates are in metres
  mutate(VRLVIERKANT100M = gsub("(E)(\\d+)(N)(\\d+)","\\1\\200\\3\\400", VRLVIERKANT100M))

# Loop over each city (sigma = 4000m)
# For every FUA: count households per grid cell and income percentile, compute
# the Spatial Information Theory Index (slot `h` of the spseg() result) for each
# of the 99 binary splits "percentile <= p" versus "percentile > p", and write
# the values to one CSV file per city.
for (city in cities) {
  # Households of the current FUA. ungroup() drops any grouping left on the
  # input, so select() does not silently re-add the FUA column.
  city_households <- income2009 %>%
    ungroup() %>%
    filter(FUA == city) %>%
    select(VRLVIERKANT100M, income_per_capita_percentile)

  # A FUA without households cannot be analysed: skip it instead of aborting
  # the whole run
  if (nrow(city_households) == 0) {
    warning("No households found for FUA '", city, "' in 2009; skipped.", call. = FALSE)
    next
  }

  # Pivot table: one row per grid cell, one column per income percentile
  pivot <- city_households %>%
    count(VRLVIERKANT100M, income_per_capita_percentile, name = "Population") %>%
    tidyr::pivot_wider(
      names_from = income_per_capita_percentile,
      values_from = Population,
      values_fill = 0
    )

  # Parse the grid-cell coordinates from the cell code ("E<x>N<y>")
  coords <- cbind(
    X = as.numeric(sub("E", "", str_extract(pivot$VRLVIERKANT100M, "E\\d+"))),
    Y = as.numeric(sub("N", "", str_extract(pivot$VRLVIERKANT100M, "N\\d+")))
  )

  # Create SpatialPoints object (rows are in the same order as the pivot table)
  Spatial_grid_cells_sp <- SpatialPoints(coords, proj4string = CRS("+init=epsg:28992"))

  # Household counts as a plain matrix whose columns are labelled by percentile
  counts <- as.matrix(pivot[, setdiff(colnames(pivot), "VRLVIERKANT100M")])
  percentiles <- as.numeric(colnames(counts))

  # Columns are selected by percentile label below, so a missing percentile no
  # longer shifts the thresholds; still report it because the split is degenerate
  missing_percentiles <- setdiff(1:100, percentiles)
  if (length(missing_percentiles) > 0) {
    warning("FUA '", city, "' (2009) has no households in percentile(s): ",
            paste(missing_percentiles, collapse = ", "), call. = FALSE)
  }

  # One SITI value per percentile threshold
  indices <- seq(1, 99)
  combined_data_list <- lapply(indices, function(p) {
    # Columns at or below the threshold; everything else (including an
    # unlabelled NA column, if any) counts as "above"
    low <- which(percentiles <= p)
    high <- setdiff(seq_along(percentiles), low)

    groups <- data.frame(
      sumUpToCurrentPerc = rowSums(counts[, low, drop = FALSE], na.rm = TRUE),
      sumAboveCurrentPerc = rowSums(counts[, high, drop = FALSE], na.rm = TRUE)
    )

    # Calculate SITI
    siti <- spseg(x = Spatial_grid_cells_sp, data = groups, method = "information",
                  smoothing = "kernel", sigma = 4000, useC = FALSE)

    data.frame(Value = siti@h, Group = as.character(p))
  })

  # Make them be in a single database
  combined_data <- do.call(rbind, combined_data_list)

  # Convert the Group column to a factor with custom levels
  combined_data$Group <- factor(combined_data$Group, levels = as.character(indices))

  # Save the data
  write.csv(combined_data, file = paste0("income_per_capita_segregation_4000m_2009_", city, ".csv"), row.names = FALSE)
}

# Delete data for enabling further operations
rm(income2009)




### Calculate Spatial Information Theory Index for income segregation in 2008 ###

# Load the data
income2008 <- read.csv("raw_data/database_2008.csv") # only if not already loaded

# Define all possible FUAs
cities <- c("'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar","Almelo", "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem", "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven", "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden", "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal", "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal", "Venlo", "Zwolle")

income2008 <- income2008 %>%
  # Delete missing values of income (coded as 999999999). filter() also drops
  # NA incomes, whereas logical `[` subsetting would turn them into all-NA rows.
  filter(BVRBESTINKH != 999999999) %>%
  # Income per capita in each household (BVRAHL is presumably the household
  # size -- TODO confirm)
  mutate(Income_per_capita = BVRBESTINKH / BVRAHL) %>%
  # Order values according to their income within every FUA
  arrange(FUA, Income_per_capita) %>%
  # Income percentile rank of every household within its FUA; computed before
  # households without a grid cell are removed
  group_by(FUA) %>%
  mutate(income_per_capita_percentile = ntile(Income_per_capita, 100)) %>%
  ungroup() %>%
  # Clean rows of missing grid-cell data
  filter(VRLVIERKANT100M != "----------") %>%
  # Append "00" to the easting and northing of the cell code (multiply by 100)
  # so that the parsed coordinates are in metres
  mutate(VRLVIERKANT100M = gsub("(E)(\\d+)(N)(\\d+)","\\1\\200\\3\\400", VRLVIERKANT100M))

# Loop over each city (sigma = 4000m)
# For every FUA: count households per grid cell and income percentile, compute
# the Spatial Information Theory Index (slot `h` of the spseg() result) for each
# of the 99 binary splits "percentile <= p" versus "percentile > p", and write
# the values to one CSV file per city.
for (city in cities) {
  # Households of the current FUA. ungroup() drops any grouping left on the
  # input, so select() does not silently re-add the FUA column.
  city_households <- income2008 %>%
    ungroup() %>%
    filter(FUA == city) %>%
    select(VRLVIERKANT100M, income_per_capita_percentile)

  # A FUA without households cannot be analysed: skip it instead of aborting
  # the whole run
  if (nrow(city_households) == 0) {
    warning("No households found for FUA '", city, "' in 2008; skipped.", call. = FALSE)
    next
  }

  # Pivot table: one row per grid cell, one column per income percentile
  pivot <- city_households %>%
    count(VRLVIERKANT100M, income_per_capita_percentile, name = "Population") %>%
    tidyr::pivot_wider(
      names_from = income_per_capita_percentile,
      values_from = Population,
      values_fill = 0
    )

  # Parse the grid-cell coordinates from the cell code ("E<x>N<y>")
  coords <- cbind(
    X = as.numeric(sub("E", "", str_extract(pivot$VRLVIERKANT100M, "E\\d+"))),
    Y = as.numeric(sub("N", "", str_extract(pivot$VRLVIERKANT100M, "N\\d+")))
  )

  # Create SpatialPoints object (rows are in the same order as the pivot table)
  Spatial_grid_cells_sp <- SpatialPoints(coords, proj4string = CRS("+init=epsg:28992"))

  # Household counts as a plain matrix whose columns are labelled by percentile
  counts <- as.matrix(pivot[, setdiff(colnames(pivot), "VRLVIERKANT100M")])
  percentiles <- as.numeric(colnames(counts))

  # Columns are selected by percentile label below, so a missing percentile no
  # longer shifts the thresholds; still report it because the split is degenerate
  missing_percentiles <- setdiff(1:100, percentiles)
  if (length(missing_percentiles) > 0) {
    warning("FUA '", city, "' (2008) has no households in percentile(s): ",
            paste(missing_percentiles, collapse = ", "), call. = FALSE)
  }

  # One SITI value per percentile threshold
  indices <- seq(1, 99)
  combined_data_list <- lapply(indices, function(p) {
    # Columns at or below the threshold; everything else (including an
    # unlabelled NA column, if any) counts as "above"
    low <- which(percentiles <= p)
    high <- setdiff(seq_along(percentiles), low)

    groups <- data.frame(
      sumUpToCurrentPerc = rowSums(counts[, low, drop = FALSE], na.rm = TRUE),
      sumAboveCurrentPerc = rowSums(counts[, high, drop = FALSE], na.rm = TRUE)
    )

    # Calculate SITI
    siti <- spseg(x = Spatial_grid_cells_sp, data = groups, method = "information",
                  smoothing = "kernel", sigma = 4000, useC = FALSE)

    data.frame(Value = siti@h, Group = as.character(p))
  })

  # Make them be in a single database
  combined_data <- do.call(rbind, combined_data_list)

  # Convert the Group column to a factor with custom levels
  combined_data$Group <- factor(combined_data$Group, levels = as.character(indices))

  # Save the data
  write.csv(combined_data, file = paste0("income_per_capita_segregation_4000m_2008_", city, ".csv"), row.names = FALSE)
}

# Delete data for enabling further operations
rm(income2008)




### Calculate Spatial Information Theory Index for income segregation in 2007 ###

# Load the data
income2007 <- read.csv("raw_data/database_2007.csv") # only if not already loaded

# Define all possible FUAs
cities <- c("'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar","Almelo", "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem", "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven", "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden", "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal", "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal", "Venlo", "Zwolle")

income2007 <- income2007 %>%
  # Delete missing values of income (coded as 999999999). filter() also drops
  # NA incomes, whereas logical `[` subsetting would turn them into all-NA rows.
  filter(BVRBESTINKH != 999999999) %>%
  # Income per capita in each household (BVRAHL is presumably the household
  # size -- TODO confirm)
  mutate(Income_per_capita = BVRBESTINKH / BVRAHL) %>%
  # Order values according to their income within every FUA
  arrange(FUA, Income_per_capita) %>%
  # Income percentile rank of every household within its FUA; computed before
  # households without a grid cell are removed
  group_by(FUA) %>%
  mutate(income_per_capita_percentile = ntile(Income_per_capita, 100)) %>%
  ungroup() %>%
  # Clean rows of missing grid-cell data
  filter(VRLVIERKANT100M != "----------") %>%
  # Append "00" to the easting and northing of the cell code (multiply by 100)
  # so that the parsed coordinates are in metres
  mutate(VRLVIERKANT100M = gsub("(E)(\\d+)(N)(\\d+)","\\1\\200\\3\\400", VRLVIERKANT100M))

# Loop over each city (sigma = 4000m)
# For every FUA: count households per grid cell and income percentile, compute
# the Spatial Information Theory Index (slot `h` of the spseg() result) for each
# of the 99 binary splits "percentile <= p" versus "percentile > p", and write
# the values to one CSV file per city.
for (city in cities) {
  # Households of the current FUA. ungroup() drops any grouping left on the
  # input, so select() does not silently re-add the FUA column.
  city_households <- income2007 %>%
    ungroup() %>%
    filter(FUA == city) %>%
    select(VRLVIERKANT100M, income_per_capita_percentile)

  # A FUA without households cannot be analysed: skip it instead of aborting
  # the whole run
  if (nrow(city_households) == 0) {
    warning("No households found for FUA '", city, "' in 2007; skipped.", call. = FALSE)
    next
  }

  # Pivot table: one row per grid cell, one column per income percentile
  pivot <- city_households %>%
    count(VRLVIERKANT100M, income_per_capita_percentile, name = "Population") %>%
    tidyr::pivot_wider(
      names_from = income_per_capita_percentile,
      values_from = Population,
      values_fill = 0
    )

  # Parse the grid-cell coordinates from the cell code ("E<x>N<y>")
  coords <- cbind(
    X = as.numeric(sub("E", "", str_extract(pivot$VRLVIERKANT100M, "E\\d+"))),
    Y = as.numeric(sub("N", "", str_extract(pivot$VRLVIERKANT100M, "N\\d+")))
  )

  # Create SpatialPoints object (rows are in the same order as the pivot table)
  Spatial_grid_cells_sp <- SpatialPoints(coords, proj4string = CRS("+init=epsg:28992"))

  # Household counts as a plain matrix whose columns are labelled by percentile
  counts <- as.matrix(pivot[, setdiff(colnames(pivot), "VRLVIERKANT100M")])
  percentiles <- as.numeric(colnames(counts))

  # Columns are selected by percentile label below, so a missing percentile no
  # longer shifts the thresholds; still report it because the split is degenerate
  missing_percentiles <- setdiff(1:100, percentiles)
  if (length(missing_percentiles) > 0) {
    warning("FUA '", city, "' (2007) has no households in percentile(s): ",
            paste(missing_percentiles, collapse = ", "), call. = FALSE)
  }

  # One SITI value per percentile threshold
  indices <- seq(1, 99)
  combined_data_list <- lapply(indices, function(p) {
    # Columns at or below the threshold; everything else (including an
    # unlabelled NA column, if any) counts as "above"
    low <- which(percentiles <= p)
    high <- setdiff(seq_along(percentiles), low)

    groups <- data.frame(
      sumUpToCurrentPerc = rowSums(counts[, low, drop = FALSE], na.rm = TRUE),
      sumAboveCurrentPerc = rowSums(counts[, high, drop = FALSE], na.rm = TRUE)
    )

    # Calculate SITI
    siti <- spseg(x = Spatial_grid_cells_sp, data = groups, method = "information",
                  smoothing = "kernel", sigma = 4000, useC = FALSE)

    data.frame(Value = siti@h, Group = as.character(p))
  })

  # Make them be in a single database
  combined_data <- do.call(rbind, combined_data_list)

  # Convert the Group column to a factor with custom levels
  combined_data$Group <- factor(combined_data$Group, levels = as.character(indices))

  # Save the data
  write.csv(combined_data, file = paste0("income_per_capita_segregation_4000m_2007_", city, ".csv"), row.names = FALSE)
}

# Delete data for enabling further operations
rm(income2007)



### Calculate Spatial Information Theory Index for income segregation in 2006 ###

# Load the data
income2006 <- read.csv("raw_data/database_2006.csv") # only if not already loaded

# Define all possible FUAs
cities <- c("'s-Gravenhage", "'s-Hertogenbosch", "Alkmaar","Almelo", "Alphen aan den Rijn", "Amersfoort", "Amsterdam", "Apeldoorn", "Arnhem", "Assen", "Bergen op Zoom", "Breda", "Deventer", "Ede", "Eindhoven", "Enschede", "Gouda", "Groningen", "Heerlen", "Leeuwarden", "Leiden", "Lelystad", "Maastricht", "Middelburg", "Nijmegen", "Oss", "Roosendaal", "Rotterdam", "Sittard-Geleen", "Soest", "Tilburg", "Utrecht", "Veenendaal", "Venlo", "Zwolle")

income2006 <- income2006 %>%
  # Delete missing values of income (coded as 999999999). filter() also drops
  # NA incomes, whereas logical `[` subsetting would turn them into all-NA rows.
  filter(BVRBESTINKH != 999999999) %>%
  # Income per capita in each household (BVRAHL is presumably the household
  # size -- TODO confirm)
  mutate(Income_per_capita = BVRBESTINKH / BVRAHL) %>%
  # Order values according to their income within every FUA
  arrange(FUA, Income_per_capita) %>%
  # Income percentile rank of every household within its FUA; computed before
  # households without a grid cell are removed
  group_by(FUA) %>%
  mutate(income_per_capita_percentile = ntile(Income_per_capita, 100)) %>%
  ungroup() %>%
  # Clean rows of missing grid-cell data
  filter(VRLVIERKANT100M != "----------") %>%
  # Append "00" to the easting and northing of the cell code (multiply by 100)
  # so that the parsed coordinates are in metres
  mutate(VRLVIERKANT100M = gsub("(E)(\\d+)(N)(\\d+)","\\1\\200\\3\\400", VRLVIERKANT100M))

# Loop over each city (sigma = 4000m)
# For every FUA: count households per grid cell and income percentile, compute
# the Spatial Information Theory Index (slot `h` of the spseg() result) for each
# of the 99 binary splits "percentile <= p" versus "percentile > p", and write
# the values to one CSV file per city.
for (city in cities) {
  # Households of the current FUA. ungroup() drops any grouping left on the
  # input, so select() does not silently re-add the FUA column.
  city_households <- income2006 %>%
    ungroup() %>%
    filter(FUA == city) %>%
    select(VRLVIERKANT100M, income_per_capita_percentile)

  # A FUA without households cannot be analysed: skip it instead of aborting
  # the whole run
  if (nrow(city_households) == 0) {
    warning("No households found for FUA '", city, "' in 2006; skipped.", call. = FALSE)
    next
  }

  # Pivot table: one row per grid cell, one column per income percentile
  pivot <- city_households %>%
    count(VRLVIERKANT100M, income_per_capita_percentile, name = "Population") %>%
    tidyr::pivot_wider(
      names_from = income_per_capita_percentile,
      values_from = Population,
      values_fill = 0
    )

  # Parse the grid-cell coordinates from the cell code ("E<x>N<y>")
  coords <- cbind(
    X = as.numeric(sub("E", "", str_extract(pivot$VRLVIERKANT100M, "E\\d+"))),
    Y = as.numeric(sub("N", "", str_extract(pivot$VRLVIERKANT100M, "N\\d+")))
  )

  # Create SpatialPoints object (rows are in the same order as the pivot table)
  Spatial_grid_cells_sp <- SpatialPoints(coords, proj4string = CRS("+init=epsg:28992"))

  # Household counts as a plain matrix whose columns are labelled by percentile
  counts <- as.matrix(pivot[, setdiff(colnames(pivot), "VRLVIERKANT100M")])
  percentiles <- as.numeric(colnames(counts))

  # Columns are selected by percentile label below, so a missing percentile no
  # longer shifts the thresholds; still report it because the split is degenerate
  missing_percentiles <- setdiff(1:100, percentiles)
  if (length(missing_percentiles) > 0) {
    warning("FUA '", city, "' (2006) has no households in percentile(s): ",
            paste(missing_percentiles, collapse = ", "), call. = FALSE)
  }

  # One SITI value per percentile threshold
  indices <- seq(1, 99)
  combined_data_list <- lapply(indices, function(p) {
    # Columns at or below the threshold; everything else (including an
    # unlabelled NA column, if any) counts as "above"
    low <- which(percentiles <= p)
    high <- setdiff(seq_along(percentiles), low)

    groups <- data.frame(
      sumUpToCurrentPerc = rowSums(counts[, low, drop = FALSE], na.rm = TRUE),
      sumAboveCurrentPerc = rowSums(counts[, high, drop = FALSE], na.rm = TRUE)
    )

    # Calculate SITI
    siti <- spseg(x = Spatial_grid_cells_sp, data = groups, method = "information",
                  smoothing = "kernel", sigma = 4000, useC = FALSE)

    data.frame(Value = siti@h, Group = as.character(p))
  })

  # Make them be in a single database
  combined_data <- do.call(rbind, combined_data_list)

  # Convert the Group column to a factor with custom levels
  combined_data$Group <- factor(combined_data$Group, levels = as.character(indices))

  # Save the data
  write.csv(combined_data, file = paste0("income_per_capita_segregation_4000m_2006_", city, ".csv"), row.names = FALSE)
}

# Delete data for enabling further operations
rm(income2006)


