# Raquel Viula, 2022
# "Discomfort Glare in Classrooms: an Investigation in Space"
# PhD Thesis, TU Delft

# explore bins for regression models

#### load required libraries ####
library("dplyr")

# identify working directory (of R project)
# NOTE: the project root is hard-coded here; setwd() changes the working
# directory for the remainder of the R session, so any relative path used
# after this point resolves against it.
setwd("~/glare-spatial-model")
# keep the path reported by getwd(); it is used further down as the prefix of
# the output file paths under outputs/ch8/
wd <- getwd()

#
#### linear regression of dgp, ugp and dgp_log_ev for disturbing and any glare using bins and testing cook's distance ####
#
#' Rank-bin each metric, regress the per-bin glare share on the bin mean,
#' plot the fit and append the results to the CSV files under outputs/ch8/.
#'
#' The function relies on three objects that are NOT arguments and must be
#' visible from the calling environment chain when it runs:
#'   * `new_inds`   - metric columns of `df` to loop over; its names are used
#'                    as plot/CSV labels.
#'   * `glare_inds` - glare columns of `df` to loop over; its names are used
#'                    as y-axis labels (presumably 0/1 vote indicators, since
#'                    their per-bin sum is divided by the bin size - confirm).
#'   * `wd`         - folder that contains `outputs/ch8/`.
#'
#' For every metric the rows are ranked (ties broken by order of appearance)
#' and the ranks are cut into `bin` quantile groups. Per bin the metric mean,
#' its standard deviation and the row count are computed; for every glare
#' column the per-bin share is regressed on the per-bin metric mean with
#' `lm()`. One row per model is appended to `metrics_glare_regression.csv` and
#' the binned data are appended to `metrics_glare_bins.csv` (no header is
#' written here; both files are opened in append mode).
#'
#' @param case Label written to the first column of both CSV files.
#' @param df Data frame holding the columns referenced by `new_inds` and
#'   `glare_inds`. A local `bin` column is added/overwritten on the copy.
#' @param fl Label of the data subset; written to the CSV files and used in
#'   the plot title.
#' @param bin Number of rank-quantile bins.
#' @param calc_cooks If `TRUE`, also draw a Cook's distance plot per model.
#' @return `NULL`, invisibly. Called for its plots and file output.
bin_regression <- function(case, df, fl, bin, calc_cooks = FALSE) {
  regression_file <- paste0(wd, "/outputs/ch8/metrics_glare_regression.csv")
  bins_file <- paste0(wd, "/outputs/ch8/metrics_glare_bins.csv")

  for (ind in new_inds) {
    ind_label <- names(which(new_inds == ind))
    # `[[` returns a plain vector for data frames and tibbles alike
    ind_values <- df[[ind]]

    # create bins: binning the ranks (not the raw values) splits ties and
    # gives bins of (near) equal size
    ranked_values <- rank(ind_values, ties.method = "first")
    bin_breaks <- quantile(ranked_values, (0:bin) / bin)
    df$bin <- as.integer(cut(ranked_values, bin_breaks, include.lowest = TRUE))
    bin_groups <- list(df$bin)

    # aggregate values for modelling
    df_agg <- aggregate(ind_values, by = bin_groups, FUN = mean)
    colnames(df_agg)[1:2] <- c("bin", "ind_mean")
    df_agg$ind_sd <- aggregate(ind_values, by = bin_groups, FUN = sd)$x
    # count one column only; counting every column of df is wasted work
    df_agg$size <- aggregate(df$bin, by = bin_groups, FUN = length)$x
    val_sd_mean <- mean(df_agg$ind_sd)
    val_sd_max <- max(df_agg$ind_sd)

    for (ind_glare in glare_inds) {
      glare_label <- names(which(glare_inds == ind_glare))
      glare_sum <- aggregate(df[[ind_glare]], by = bin_groups, FUN = sum)$x
      df_agg$pc_glare <- glare_sum / df_agg$size

      # linear regression; summarise once and read the statistics by name so
      # the values do not depend on the internal ordering of summary.lm
      lm_df_agg <- lm(pc_glare ~ ind_mean, data = df_agg)
      lm_summary <- summary(lm_df_agg)
      lm_coefs <- coef(lm_summary)
      val_r2 <- round(lm_summary$r.squared, 4)
      val_adjr2 <- round(lm_summary$adj.r.squared, 4)
      val_p <- round(lm_coefs["ind_mean", "Pr(>|t|)"], 5)
      val_intercept <- lm_coefs["(Intercept)", "Estimate"]
      val_slope <- lm_coefs["ind_mean", "Estimate"]
      val_interc_std_err <- lm_coefs["(Intercept)", "Std. Error"]
      val_slope_std_err <- lm_coefs["ind_mean", "Std. Error"]

      # draw a plot with interval bars (horizontal bars span +/- sd/2 of the
      # metric inside each bin)
      plot(
        df_agg$ind_mean, df_agg$pc_glare,
        col = "red", pch = 10, cex = 1, ylim = c(0, 1),
        xlab = ind_label,
        ylab = glare_label,
        main = bquote(.(fl) ~ .(bin) ~ " bins: " ~ R^2 == .(val_r2) ~ ", p-value: " ~ .(val_p))
      )
      axis(side = 2, at = seq(0, 1, by = 0.2))
      segments(
        df_agg$ind_mean - (df_agg$ind_sd / 2), df_agg$pc_glare,
        df_agg$ind_mean + (df_agg$ind_sd / 2), df_agg$pc_glare
      )
      lines(df_agg$ind_mean, predict(lm_df_agg), lty = 2)

      # cooks distance to validate linear regression
      if (isTRUE(calc_cooks)) {
        val_cooksd <- cooks.distance(lm_df_agg)
        sample_size <- nrow(df_agg)
        cutoff <- 4 / sample_size
        # labels are drawn one unit to the right of their point
        label_x <- seq_along(val_cooksd) + 1
        plot(val_cooksd, pch = "*", cex = 2,
             main = paste0(ind_label, " Influential Obs by Cooks distance"))
        abline(h = cutoff, col = "red")  # 4/n cutoff line
        abline(h = 1, col = "blue")      # absolute cutoff line
        text(x = label_x, y = val_cooksd,
             labels = ifelse(val_cooksd > cutoff & val_cooksd <= 1, names(val_cooksd), ""),
             col = "red")
        text(x = label_x, y = val_cooksd,
             labels = ifelse(val_cooksd > 1, names(val_cooksd), ""),
             col = "blue")
      }

      # write the model results (column order must match the header file)
      res <- data.frame(
        case, fl, ind_label, ind_glare, bin,
        val_sd_mean, val_sd_max, val_r2, val_adjr2, val_p,
        val_interc_std_err, val_slope_std_err, val_intercept, val_slope
      )
      write.table(res, file = regression_file, sep = ",",
                  append = TRUE, col.names = FALSE, row.names = FALSE)
      # write data results
      df_data <- data.frame(case_study = case, position = fl, metric = ind_label,
                            glare = ind_glare, df_agg)
      write.table(df_data, file = bins_file, sep = ",",
                  append = TRUE, col.names = FALSE, row.names = FALSE)
    }
  }
  invisible(NULL)
}
# results dataframe: zero-row template whose column names become the header
# row of the regression results file
metrics_regression_results <- data.frame(
  case_study = character(),
  position = character(),
  metric = character(),
  glare = character(),
  bins = integer(),
  val_sd_mean = double(),
  val_sd_max = double(),
  val_r2 = double(),
  val_adjr2 = double(),
  val_p = double(),
  se_intercept = double(),
  se_slope = double(),
  val_intercept = double(),
  val_slope = double(),
  stringsAsFactors = FALSE
)
# write the headers (overwrites any existing file; bin_regression() appends
# its rows afterwards). write.csv() always writes the column names and ignores
# a `col.names` argument with a warning, so it is not passed.
write.csv(
  metrics_regression_results,
  file = paste0(wd, "/outputs/ch8/metrics_glare_regression.csv"),
  row.names = FALSE
)
# vote bins dataframe: zero-row template for the per-bin data file
metrics_bins_data <- data.frame(
  case_study = character(),
  position = character(),
  metric = character(),
  glare = character(),
  bin = integer(),
  ind_mean = double(),
  ind_sd = double(),
  size = double(),
  pc_glare = double(),
  stringsAsFactors = FALSE
)
# write the headers (same remarks as for the regression results file)
write.csv(
  metrics_bins_data,
  file = paste0(wd, "/outputs/ch8/metrics_glare_bins.csv"),
  row.names = FALSE
)


# Run the binned regressions for Case study I: for each bin count, process the
# full data set followed by every positional subset, in a fixed order.
case <- "Case study I"
for (nbin in c(7, 10, 13)) {
  for (fl in c("full", "p1", "p2", "p3", "p4", "front", "back", "wall", "window")) {
    # switch() evaluates only the matching arm, so each data set is looked up
    # right before it is used
    df <- switch(fl,
      full   = vote_data,
      p1     = p1_vote_data,
      p2     = p2_vote_data,
      p3     = p3_vote_data,
      p4     = p4_vote_data,
      front  = front_vote_data,
      back   = back_vote_data,
      wall   = wall_vote_data,
      window = window_vote_data
    )
    bin_regression(case, df, fl, nbin)
  }
}
# close the current graphics device (presumably the one the plots above were
# drawn on - confirm where it is opened)
dev.off()


# clean up: drop the loop variables and the header templates from the workspace
rm(case, fl, nbin, df, metrics_regression_results, metrics_bins_data)
