#######################################
# Gateway Factor: Regression analysis #
#######################################

# NOTE: the workspace is deliberately not cleared with rm(list = ls()).
# That call deletes the caller's objects when this script is sourced, yet it
# does not detach packages or reset options, so it gives no real isolation.
# Run the script in a fresh R session instead (e.g. `Rscript`); every object
# used below is created by this script before it is read.
library(tidyverse)
library(stargazer)
library(knitr)
library(sf)
library(ggrepel)

# Load the inputs: model data per country, NUTS region polygons and the
# country identifier lookup table
## model data: keep only rows where logistics footprint per capita, exports and
## expenditure are strictly positive (rows with NA in any of these are dropped)
modeldata <- filter(
  read_csv("gateway_factor/processed/modeldata.csv"),
  ctp_percap > 0,
  Exports > 0,
  expenditure2022 > 0
)
## spatial data (shapefile) and country lookup (read with read_csv2)
nuts <- st_read("gateway_factor/data/NUTS_RG_20M_2021_3035/NUTS_RG_20M_2021_3035.shp")
country_id <- read_csv2("gateway_factor/data/country_id.csv")

# Check correlations between the independent variables
## pairwise correlation matrix of every column except the country name
cortable <- cor(select(modeldata, -country))
kable(cortable, caption = "Correlation table")
### Findings that drive the variable selection below:
### - Balance/cap, Imports/cap and Exports/cap correlate highly, so only
###   Exports/cap is kept.
### - Added value and income per capita correlate highly (> 0.7) with container
###   throughput (TEU) per capita and with trade (import + export).
### - Trade correlates more strongly with logistics footprint per capita, so the
###   Added value and TEU variables are dropped.

# Define the linear models on a selection of variables
## select the columns of interest, add trade per capita (imports + exports)
## and express population in millions
modeldata1 <- modeldata |>
  select(
    country, Pop2022, log_share, ind_share, Ecom_2022, ctp_percap,
    val_cap, teu_cap, bal_cap, imp_cap, exp_cap, expenditure2022
  ) |>
  mutate(
    trade = imp_cap + exp_cap,
    Pop2022 = Pop2022 / 1000000
  )
## A standardized version of the data was also tried; it gave no advantage in
## the regression and was dropped:
#### modeldata2 <- modeldata1 |> mutate_at(scale, .vars = vars(-country)) # standardize all variables except country name

## Two models explaining logistics footprint per capita:
## model1 uses all five covariates; model2 leaves out trade and log_share
model1 <- lm(
  formula = ctp_percap ~ trade + Ecom_2022 + ind_share + log_share + expenditure2022,
  data = modeldata1
)
model2 <- lm(
  formula = ctp_percap ~ Ecom_2022 + ind_share + expenditure2022,
  data = modeldata1
)

## regression results
### Side-by-side table of model1 and model2; the table is also written to the
### file given in `out`.
### covariate.labels are positional: they follow the order in which the
### covariates appear in the model formulas (trade, Ecom_2022, ind_share,
### log_share, expenditure2022), so keep both in sync when editing a model.
### Logical flags are spelled TRUE/FALSE because T and F are ordinary variables
### that can be reassigned.
stargazer(model1, model2,
          type = "text",
          header = FALSE,
          align = TRUE,
          title = "linear regression model",
          df = FALSE, no.space = TRUE, font.size = "small",
          single.row = TRUE,
          dep.var.labels = "logistics footprint per capita",
          covariate.labels = c("Trade volume per capita",
                               "E-commerce penetration", "Manufacturing share of economy",
                               "Logistics, wholesale and retail share of economy",
                               "Real expenditure per capita"),
          out = "gateway_factor/plots/regression_table.html"
)


# plots

## Figure 2.19 plot residuals
### Fitted values are requested with newdata = modeldata1 so they stay
### row-aligned with the countries even if lm() dropped incomplete rows; such
### rows get NA instead of breaking or misaligning the bind with the data.
predscore1 <- predict(model1, newdata = modeldata1)
predscore2 <- predict(model2, newdata = modeldata1)
### Residual = observed - fitted, the same sign convention as residuals():
### a positive value means the model under-predicts the footprint.
residualdata <- modeldata1 |>
  mutate(
    residual1 = ctp_percap - predscore1,
    residual2 = ctp_percap - predscore2
  ) |>
  select(country, residual1, residual2) |>
  pivot_longer(cols = -country)
### One point per country and model; the vertical line marks a perfect fit.
### Colours and legend labels are matched by name, not by position.
fig_2_19 <- ggplot(residualdata, aes(x = value, y = country, color = name)) +
  geom_vline(xintercept = 0, color = "gray70") +
  geom_point() +
  theme_bw(base_size = 12) +
  scale_color_manual(
    name = "Residuals",
    labels = c(residual1 = "model 1", residual2 = "model 2"),
    values = c(residual1 = "red", residual2 = "blue")
  ) +
  labs(x = "residual (unexplained) value per model", y = "")
print(fig_2_19)
### Pass the plot explicitly so the export does not depend on last_plot().
ggsave("gateway_factor/export/2.19.pdf", plot = fig_2_19,
       height = 2000, width = 3000, units = "px", dpi = 300)
ggsave("gateway_factor/export/2.19.png", plot = fig_2_19,
       height = 2000, width = 3000, units = "px", dpi = 300)

## Figure 2.20 DC area per capita, observed where available, otherwise predicted
### The unfiltered model data is re-read so that countries without an observed
### footprint are included and can be filled in with a prediction.
### NOTE(review): the prediction uses hard-coded, rounded coefficients of a
### simple trade + expenditure fit, not model1/model2 defined above. Confirm
### they are still in sync with the data; the expenditure coefficient keeps
### only one significant digit.
plotdata <- read_csv("gateway_factor/processed/modeldata.csv") |>
  select(country, ctp_percap, imp_cap, exp_cap, ind_share, log_share, Ecom_2022, expenditure2022) |>
  mutate(
    trade = imp_cap + exp_cap,
    # 1 = footprint missing and therefore predicted, 0 = observed data
    predicted = case_when(is.na(ctp_percap) ~ 1, TRUE ~ 0),
    percap = case_when(
      predicted == 1 ~ 16.622 * trade + 0.0001 * expenditure2022 - 2.225,
      TRUE ~ ctp_percap
    ),
    # fixed levels, so the legend mapping below is valid even when only one of
    # the two classes occurs in the data
    predicted = factor(predicted, levels = c(0, 1))
  )
### dot plot
### Colours and legend labels are matched to the factor levels by name rather
### than by position.
### NOTE(review): colour is mapped globally, so geom_smooth() fits one trend
### line per colour group - confirm that separate lines are intended.
fig_2_20 <- plotdata |>
  ggplot(aes(x = trade, y = percap, label = country, color = predicted)) +
  geom_point() +
  scale_color_manual(
    name = "",
    labels = c("0" = "data", "1" = "predicted"),
    values = c("0" = "cyan4", "1" = "cyan2")
  ) +
  geom_smooth(method = "lm") +
  geom_text_repel() + # keeps the country labels from overlapping
  labs(x = "trade per capita (imports + exports in billion Euro)", y = "logistics footprint per capita (sqm)") +
  theme_bw(base_size = 12)
print(fig_2_20)
### Pass the plot explicitly so the export does not depend on last_plot().
ggsave("gateway_factor/export/2.20.pdf", plot = fig_2_20,
       height = 2000, width = 3000, units = "px", dpi = 300)
ggsave("gateway_factor/export/2.20.png", plot = fig_2_20,
       height = 2000, width = 3000, units = "px", dpi = 300)

## Figure 2.21 EU map plot
### join data
### NOTE(review): both joins rely on the columns the tables happen to share
### (no explicit `by`) - confirm the intended key columns.
plotdata <- left_join(plotdata, country_id)
### keep only the LEVL_CODE == 0 polygons and attach the plot data to them
Fig2.21 <- left_join(filter(nuts, LEVL_CODE == 0), plotdata)
### map plot: fill each polygon by the per-capita value
ggplot() +
  geom_sf(data = Fig2.21, mapping = aes(fill = percap), color = "white") +
  coord_sf(
    xlim = c(2500000, 6000000),
    ylim = c(1500000, 5500000),
    expand = FALSE
  ) + ## limits checked against the coordinates in QGIS
  scale_fill_gradient(
    name = "DC class A per capita (sqm)",
    low = "azure2",
    high = "cyan3"
  ) +
  theme_classic(base_size = 12) +
  theme(
    legend.position = "right",
    axis.ticks = element_blank(),
    axis.text = element_blank(),
    axis.line = element_blank()
  )
### export the map as PNG and PDF (3000 x 3000 px at 300 dpi)
ggsave(filename = "gateway_factor/export/2.21.png",
       width = 3000, height = 3000, units = "px", dpi = 300)
ggsave(filename = "gateway_factor/export/2.21.pdf",
       width = 3000, height = 3000, units = "px", dpi = 300)
