# On-line system for pipe burst using Bayesian dynamic linear models

# Authors : Renato Henriques-Silva, Sophie Duchesne, Nicolas Fortin St-Gelais,
# Naysan Saran, Alexandra M. Schmidt

###########################################################################################
# The Kalman filter equations below are based on the model developed by Helio S. Migon 
# and Larissa Alves, authors of the paper entitled "Multivariate dynamic regression: modeling 
# and forecasting for intraday electricity load" (2013). We are grateful to them for
# sharing their script.
###########################################################################################

# Script to run Bayesian dynamic linear model 
# The dataset is available for download under the DOI: https://doi.org/10.4121/17169383.v1

# How to cite : 

# Henriques-Silva, R., Schmidt, A. M., Saran, N., Duchesne, S., & St-Gelais, N. F. (2022). 
# Data underlying the research on On-line warning system for pipe burst using Bayesian dynamic linear models.
# 4TU.ResearchData Dataset. https://doi.org/10.4121/17169383.v1 



# absolute path to were the data was downloaded
sv_dir 

# auxiliary function
`%notin%`= negate(`%in%`) # inverse of %in% function

###############################################################################################
# DATA
###############################################################################################
# read flow matrix (rows are days, columns are the 24 hours)
y <- as.matrix(read.table(file = paste0(sv_dir, 'dma1_flow.txt'), header = T, 
                          stringsAsFactors = F, row.names = NULL))
# read pressure matrix (rows are days, columns are the 24 hours)
x2 <- as.matrix(read.table(file = paste0(sv_dir, 'dma1_pressure.txt'), header = T, 
                           stringsAsFactors = F, row.names = NULL))
delta = 0.95 # discount factor (value obtained from sensitivity analysis)

# read Date vector (rows are days)
dates <- as.matrix(read.table(file = paste0(sv_dir, 'dates.txt'), header = T, 
                              stringsAsFactors = F, row.names = NULL))
# read temperature matrix (rows are days, columns are the 24 hours)
x <- as.matrix(read.table(file = paste0(sv_dir, 'temperature.txt'), header = T, 
                          stringsAsFactors = F, row.names = NULL))
# read weekend/workday dummies (rows are days, columns are the 2 dummies)
wk_dummy <- as.matrix(read.table(file = paste0(sv_dir, 'wk_dummy.txt'), header = T, 
                                 stringsAsFactors = F, row.names = NULL))

# data transformation
y <- log(y/10)
x <- scale(x)
x2 <- scale(x2)

###############################################################################################
# MODEL SPECIFICATION 
###############################################################################################
# Evolution matrix (GG)

# 1st component - trend
GG1 = diag(2)
GG1[1,2] = 1

# 2nd component - preditctors and auto-regressive order 1
GG2 = diag(3)

# 3rd component -  weekend / workday dummies
GG3 = diag(2)

p  =  7                               # Number of parameters for Gt
GG = matrix(0 ,nrow = p, ncol = p)    # Evolution Matrix (Gt)
GG[1:2,1:2] = GG1
GG[3:5,3:5] = GG2
GG[6:7,6:7] = GG3

# Matrice of regressors (Ft)
n_hours <- ncol(y)
n_days <- nrow(y) - 1 # first day is not filtered

X  = array(NA,dim=c(n_hours,p,n_days))  # Design Matrix (Ft)

# 1st component - trend
X[,1,] = 1
X[,2,] = 0

for(h in 1:n_hours){
  
  x_temp <- scale(x[,h])
  x_press <- scale(x2[,h])
  
  for(t in 1:n_days){
    
    # auto-regressive order 1 
    X[h,3,t] = y[t,h]
    if(is.na(X[h,3,t])){ # if measurement is missing, assign previous one
      X[h,3,t] = y[t-1,h]
    }
    # temperature
    X[h,4,t] = x_temp[(t+1)]
    # pressure
    X[h,5,t] = x_press[(t+1)]
    # workday dummy
    X[h,6,t] = wk_dummy[(t+1),1]
    # weekend dummy
    X[h,7,t] = wk_dummy[(t+1),2]
  } 
}

###############################################################################################
# INITIAL VALUES 
###############################################################################################
n0_init = 5 # degrees of freedom
d0_init = 1 # d0
S0_init = d0_init/n0_init # prior point estimate of variance
Y0 <- colMeans(y, na.rm = T) # mean of flow for each hour
m0_init = matrix(0,p,ncol(y),byrow=T); m0_init[1,] = Y0 # prior mean
C0_init = diag(p)*100 # prior covariance matrix
lvl_shift = 3 # level shift (h) 

###############################################################################################
# OUTPUT OBJECT INITIALIZATION
###############################################################################################

# Kalman filter and one-step-ahead forecast
MT_out = array(0, dim=c(n_days, p, n_hours))                             # posterior mean (m_t)
colnames(MT_out) <- colnames(X)
FT_out = matrix(0, n_days, n_hours, T)                                   # one-step ahead predictive mean (f_t)
QT_out = matrix(0,n_days, n_hours, T)                                    # predictive variance (Q_t)
ET_out = matrix(0,n_days, n_hours, T)                                    # error term (e_t)
NT_out = matrix(0,n_days, n_hours, T)                                    # degree of freedom (n_t)
ST_out = matrix(0,n_days, n_hours, T)                                    # Point estimate of variance (S_t)
CC_out = array(NA, dim=c(p, p, n_hours, n_days))                         # posterior covariance matrix (C_t)
RT_out = array(NA, dim=c(p, p, n_hours, n_days))                         # priori covariance matrix (R_t)
AA_out = array(NA, dim=c(n_days, p, n_hours))                            # prior mean (a_t)
Low_out = matrix(0, n_days, n_hours,T)                                   # 95% Lower Posterior Credible Interval
Up_out = matrix(0, n_days, n_hours,T)                                 # 95% Upper Posterior Credible Interval
HT <- matrix(NA, n_days, n_hours, T)                                     # Bayes factor


###############################################################################################
# KALMAN FILTER
###############################################################################################

# loop through hours (24 univariate DLMs)
for(h in 1:n_hours)
{
  # 1st timestep of each DLM
  
  aa = GG%*%m0_init[,h]                        # a-priori mean (a_1)
  
  RR = GG%*%C0_init%*%t(GG)/delta              # a-priori covatiance matrix (R_1)
  
  for(o in 1:(p-1))                            # Force symmetry  R_1
  {
    for(p in (o+1):p)
    {
      RR[o,p] = RR[p,o]
    }
  }
  
  FF = (X[h,,1])                                # define design matrix (F_1)
  ft = t(FF)%*%aa                               # predictive mean (f_1)
  et = y[2,h]-ft                                # predictive error (e_1)
  
  QQ = t(FF)%*%RR%*%FF+ S0_init                 # predictive variance (Q_1)
  At = RR%*%FF%*%(1/QQ)                         # adaptive vector (A_1)
  
  # if y is observed, parameters are updated
  if(!is.na(y[2,h])){
    
   
    n1 = n0_init+1                                           # n_1
    S1 = S0_init + (S0_init/n1)*(((et^2)/QQ) - 1)            # point estimate of observation variance (S_1)
   
    mt = aa + At%*%et                                        # posterior mean (m_1) 
    CC = as.numeric(S1/S0_init)*(RR - QQ[1]*(At%*%t(At)))    # posterior covariance matrix (C_1) 
    
    for(o in 1:(p-1))                                        # Force symmetry C_1
    {
      for(p in (o+1):p)
      {
        CC[o,p] = CC[p,o]
      }
    }
    # if y is missing, parameters are not updated; posterior = prior  
  }else{
    n1 = n0_init
    S1 = S0_init
    mt = aa
    CC = RR
  }
  
  # Log-Bayes factor
  H = log(exp(0.5*(lvl_shift^2 - 2*lvl_shift*(et/sqrt(QQ))))) 
  
  # Store results first time step
  MT_out[1,,h] = t(mt)                                                    
  FT_out[1,h] = t(ft)                                
  QT_out[1,h] = QQ                                    
  ET_out[1,h] = t(et)                   
  NT_out[1,h] = n1                                
  ST_out[1,h] = S1                                  
  CC_out[,,h,1] = CC                     
  RT_out[,,h,1] = RR                       
  AA_out[1,,h] = t(aa)              
  Low_out[1,h] = ft - 1.96*sqrt(QQ)                                  
  Up_out[1,h] = ft + 1.96*sqrt(QQ)                                 
  HT[1,h] = H                              
  
  
  # update parameter 'initial values' for next iteration
  S0  = S1
  n0  = n1
  m0 = mt
  C0 = CC
  
  # loop from timestep 2 up to n_days
  for (t in 2:n_days)
  {
    
    
    aa = GG%*%m0                         # a priori mean (a_t)
    RR = GG%*%C0%*%t(GG)/delta           # a-priori covatiance matrix (R_t)
    
    
    for(o in 1:(p-1))                    # Force simmetry R_t
    {
      for(p in (o+1):p)
      {
        RR[o,p] = RR[p,o]
      }
    }
    
    FF = (X[h,,t])                     # define F_t (design matrix)
    ft = t(FF)%*%aa                    # predictive mean (f_t)
    et = y[(t+1),h]-ft                 # predictive errors (e_t)
    
    QQ = t(FF)%*%RR%*%FF+S0            # predictive variance (Q_t)
    
    At = RR%*%FF%*%(1/QQ)              # adaptive vector (A_1)
    
    # if y is observed, parameters are updated
    if(!is.na(y[(t+1),h])){
      
      n1 = n0 + 1                                                       # n_t
      S1 = S0 + (S0/n1)*(((et^2)/QQ) - 1)                               # point estimate of observation variance (S_t)
      
      mt = aa + At%*%et                                                 # posterior mean (m_t)
      CC = as.numeric(S1/S0)*(RR - as.numeric(QQ)*(At%*%t(At)))         # posterior covariance matrix (C_t)
      
      for(o in 1:(p-1))                             # force symmetry C_t
      {
        for(p in (o+1):p)
        {
          CC[o,p] = CC[p,o]
        }
      }
      # if y is missing, parameters are not updated; posterior = prior  
    }else{
      mt = aa
      CC = RR
      n1 = n0
      S1 = S0
    }
    
    # Log-Bayes factor
    H = log(exp(0.5*(lvl_shift^2 - 2*lvl_shift*(et/sqrt(QQ))))) 
    
    # Store results time step 't'
    MT_out[t,,h] =  t(mt)                                                    
    FT_out[t,h] = t(ft)                                
    QT_out[t,h] = QQ                                    
    ET_out[t,h] = t(et)                   
    NT_out[t,h] = n1                                
    ST_out[t,h] = S1                                  
    CC_out[,,h,t] = CC                     
    RT_out[,,h,t] = RR                       
    AA_out[t,,h] = t(aa)              
    Low_out[t,h] = ft - 1.96*sqrt(QQ)                                  
    Up_out[t,h] = ft + 1.96*sqrt(QQ)                                 
    HT[t,h] = H
    
    # update parameter 'initial values' for next iteration
    C0 = CC
    m0 = mt
    S0  = S1
    n0  = n1
  }
}

###############################################################################################
# ORGANIZE OUTPUT
###############################################################################################
require(tidyverse)
require(lubridate)
require(scales)

# Build output data.frame
# remove first day
dates <- dates[-1]
y <- y[-1,]

# 'hours of the day' vector
hours <- 0:23

# Create a data.frame with date & hour features
df <- data.frame(Date = rep(dates, times = 24), hour = rep(hours, each = n_days))

# Observed values
df <- df %>% mutate(obs = as.vector(y))  
# Extract one-step ahead forecast values (f_t) from model output
df <- df %>% mutate(pred = as.vector(FT_out)) 
# Extract one-step ahead forecast 95% upper probability interval (LimSupDR2l) 
df <- df %>% mutate(upper = as.vector(Up_out)) 
# Extract one-step ahead forecast 95% lower probability interval (LimInfDR2l) 
df <- df %>% mutate(lower = as.vector(Low_out))
# Extract one-step ahead forecast error
df <- df %>% mutate(error = as.vector(ET_out))
# Extract one-step ahead forecast variance
df <- df %>% mutate(qt = as.vector(QT_out))
# Compute model innovation (i.e., standardized residuals)
df <- df %>% mutate(error_std =  error/sqrt(qt))
# Extract Bayes factor (model 1)
df <- df %>% mutate(HT = as.vector(HT))
# Sort data.frame by date, and then hour
df <- df %>% arrange(Date, hour)
# Convert hour feature from numeric to character
df <- df %>% mutate(hour = as.character(hour))
# Convert hour feature from 1 to 01:00, 2 to 02:00, ... and 23 to 23:00
df <- df %>% mutate(hour = ifelse(nchar(hour) == 1, paste0("0",hour,":00"),
                                  paste0(hour,":00")))
# Merge Date and hour feature into POSIXct object (date-time feature)
df <- df %>% mutate(Date = as.POSIXct(paste(Date,hour),
                                      format = "%Y-%m-%d %H:%M", tz = 'EST')) %>% select(-hour)
# transform back results to the original scale
df <- df %>% mutate(obs_original = exp(obs)*10)
df <- df %>% mutate(pred_original = exp(pred + 1/2*qt)*10)
df <- df %>% mutate(qt_original = (exp(2*pred + qt)*(exp(qt) - 1))*10)
df <- df %>% mutate(upper_original = exp(pred + 1.96*sqrt(qt))*10)
df <- df %>% mutate(lower_original = exp(pred - 1.96*sqrt(qt))*10)





###############################################################################################
# PLOT RESULTS
###############################################################################################
# Example : break date -> 16 September 2016

# plot results from six days before to six days after the break
date_start = as.Date("2016-09-16", tz = 'EST') - 6
date_end   = as.Date("2016-09-16", tz = 'EST') + 6

# highlight break date
RECT = data.frame(
  xmin=as.POSIXct(paste("2016-09-16","00:00:00"), format = '%Y-%m-%d %H:%M', tz ='EST'),
  xmax=as.POSIXct(paste("2016-09-16","23:30:00"), format = '%Y-%m-%d %H:%M', tz ='EST'),
  ymin=-Inf,
  ymax=Inf,
  fill=rep("gray20",1)
)

# filter selected dates
df.plot <- df %>% filter(between(as.Date(Date,tz ='EST'),
                                 as.Date(date_start, tz = 'EST'),as.Date(date_end, tz = 'EST')))  

# plot labels
labs_plot = c('One-step ahead forecast','Flow (m³/h)','estimated','observed')

# one-step ahead forecast
p1 <-  df.plot %>% mutate(label = labs_plot[1]) %>%
  ggplot() + geom_ribbon(aes(x = Date, ymin = lower_original, ymax = upper_original), alpha = 0.5, fill = 'steelblue', color = 'blue',
                         linetype = 'dashed')  +
  geom_line(aes(x = Date, y = pred_original, alpha = labs_plot[3]), color = 'red', size = 1.5) +
  geom_point(aes(x = Date, y = obs_original, alpha = labs_plot[4]), size = 2, shape = 21, color = 'black', fill = 'white', stroke = 1.5)  +
  ylab(labs_plot[2]) + facet_wrap(~label) + scale_x_datetime(breaks = scales::breaks_pretty(10)) +
  scale_alpha_manual(name = ylab(labs_plot[2]),
                     values = c(1, 1),
                     breaks = c(labs_plot[3], labs_plot[4]),
                     guide = guide_legend(override.aes = list(linetype = c(1,0),  shape = c(NA,21))))


p1 <- p1 + geom_rect(data=RECT,inherit.aes=FALSE,aes(xmin=xmin,xmax=xmax,ymin=ymin,ymax=ymax),
                     fill=rep(RECT$fill,1),alpha=0.4) 

# Bayes factor plot
df.HT <- df.plot %>% select(Date, HT)  %>% pivot_longer(cols = -Date, names_to = c('param'), values_to = 'value') %>%
  mutate(param = "log-Bayes factor")
df.hline <- data.frame(param = c("log-Bayes factor"), value = -2.5)

p2 <- df.HT  %>% 
  ggplot(aes(x = Date, y = value)) + geom_line(col = 'black', size = 1.5) + geom_point( col = 'red', shape = 21, size = 2, stroke = 1.5)+
  geom_hline(data = df.hline, aes(yintercept = value), color = 'red',linetype = 'dashed', size = 1.5) + ylab(substitute(paste("Log[",H[t],"]"))) +
  facet_wrap(~param, scales = 'free_y', ncol =1) 

# Adjust plot themes [cosmetic]
p2 <- p2 + theme_bw()  + 
  scale_x_datetime(breaks = breaks_pretty(10)) + theme(axis.title = element_text(size = 20), axis.text = element_text(size = 16),
                                                       strip.text = element_text(size = 26, color = '#FFFFFF'), strip.background =element_rect(fill="#228b22"))
p1 <- p1 +  theme_bw()  +
  theme(axis.title.y = element_text(size = 20), axis.text = element_text(size = 16),
        axis.title.x = element_blank(), axis.text.x = element_blank(),
        strip.text = element_text(size = 26, color = '#FFFFFF'), strip.background =element_rect(fill="#228b22"),
        legend.position = 'top', legend.title =  element_text(size = 20),
        legend.text = element_text(size = 16)) 


# join two graphs into one plot
g1 <- ggplotGrob(p1)
g2 <- ggplotGrob(p2)
grid::grid.newpage()
p <- grid::grid.draw(rbind(g1, g2))

# display
p


####################################################################################################################################
# COMPUTE MODEL METRICS (FALSE POSITIVE RATE, TRUE POSITIVE RATE)
####################################################################################################################################

# dates with burst events
burst_events <- c("2015-10-08","2015-10-27","2015-11-22","2016-01-26","2016-03-04","2016-07-12","2016-09-16","2016-10-27")
# convert to date format
burst_events <- as.Date(burst_events, tz = 'EST')
# break monitoring starts in march
df.monit <- df %>% filter(Date >= as.POSIXct('2015-03-01', tz = 'EST'))



####################################################################################################################################
# BAYES FACTOR APPROACH (uses HT - which is the log-Bayes Factor)
####################################################################################################################################
burst_detected = 0
bayes_threshold = -2
for(i in 1:length(burst_events)){
  
  # add previous 72h of each burst event 
  burst_period <- c(burst_events[i],(burst_events[i]- 1), (burst_events[i]-2),(burst_events[i]-3))
  
  # subset result data frame to the burst period  
  df.temp <- df.monit %>% filter(as.Date(Date, tz = 'EST') %in% burst_period)  
  
  # check if there is an outlier (HT < -2)
  if(length(which(df.temp$HT <  bayes_threshold))>0){
    # burst is detected
    burst_detected = burst_detected + 1
  }
}
# TRUE POSITIVE RATE
TPR = burst_detected/length(burst_events)

# FALSE POSITIVE RATE

# first remove all burst periods (any outlier flagged during these dates are considered true positives)
df.temp <- df.monit %>% filter(as.Date(Date,tz = 'EST') %notin% c(burst_events,(burst_events- 1), (burst_events-2),(burst_events-3)))

# now group by day, and compute the number of outliers in each date (i.e.,  the number of times HT < -2)
df.temp_FP <- df.temp %>% group_by(Date = as.Date(Date, tz ='EST')) %>%
  summarise(outliers = length(which(HT < bayes_threshold)))

# number of days where at least one outlier was flagged using Bayes-factor
false_positives = length(which(df.temp_FP$outliers > 0))

FPR = false_positives/nrow(df.temp_FP)

####################################################################################################################################
# MEAN + SD Threshold (from Ye and Fenner 2014)
####################################################################################################################################

# Ye, G., & Fenner, R. A. (2014). Study of Burst Alarming and 
# Data Sampling Frequency in Water Distribution Networks. 
# Journal of Water Resources Planning and Management, 140(6), 06014001. 


# Threshold T1 is based on average and standard-deviation of residuals for all estimated values from the previous month.
# Hence we compute T1 using data from February 2015 and it will be used as the threshold value for all 
# estimated values in March 2015. Than, we compute the T1 using data from March 2015 and it will be used as 
# the threshold value for all estimated values in April 2015, and so on. 
# T1 was compute for each hour of the day so given that the Kalman filter was applied to each hour individually

# slice data to start on 1st february
df.hist <- df %>% filter(is.finite(error)) %>% filter(Date >= as.POSIXct('2015-02-01', tz = 'EST'))
# from the historical dataset, also remove the dates of the breaks as well as the 2 previous days 
# These are considered as TRUE POSITIVES in this exercice
df.hist <- df.hist %>% filter(as.Date(Date, tz = 'EST') %notin% c(burst_events,(burst_events- 1), (burst_events-2),(burst_events-3)))
# now, for each year-month-hour combination, compute the average residual and the standard deviation of the residual  
df.hist <- df.hist %>% group_by(year = year(Date), month = month(Date), hour = hour(Date)) %>% 
  summarise(avg_resid = round(mean(error, na.rm = T),3), sd_resid = sd(error, na.rm = T))
# compute the threshold value T1 as Mean + 2*standard deviation
df.hist <- df.hist %>% mutate(T1 = avg_resid + 2*sd_resid)


# Now, shift each Threshold (T1) in df.hist to the next month. Meaning, all T1 calculates for 2015 February, 
# will be assigned to 2015 March, and so on

# first rename year and month
df.hist <- df.hist %>% rename(old_year = year, old_month = month)
# now shift year and month by 1 month
df.hist <- df.hist %>% mutate(month = ifelse(old_month < 12, old_month + 1, 1),
                              year = ifelse(old_month < 12, old_year, old_year + 1))

# add year, month and hour features so that we can left join the data frame with T using these features
df.monit <- df.monit %>% mutate(year = year(Date), month = month(Date), hour = hour(Date))

# left join df.hist into df.test
df.monit <- df.monit %>% left_join(select(df.hist,year, month, hour, T1), by = c('year', 'month','hour'))

# an alarm is triggered if the observed residual is larger than T
df.monit <- df.monit%>% mutate(alarm_triggered = ifelse(error > T1, 1, 0))

# TP is considered at the day level, and it will be calculated as the number of periods with burst 
# day of the burst in the historical record + 72h previous to the burst

burst_detected = 0

# minimum number of outliers considered to trigger alarm
min_num_outliers = 3

for(i in 1:length(burst_events)){

  # add previous 72h of each burst event 
  burst_period <- c(burst_events[i],(burst_events[i]- 1), (burst_events[i]-2),(burst_events[i]-3))
  
  # subset result data frame to the burst period  
  df.temp <- df.monit %>% filter(as.Date(Date, tz = 'EST') %in% burst_period)    
  
  if(sum(df.temp$alarm_triggered,na.rm=T)>=min_num_outliers){
    burst_detected = burst_detected + 1
  }
}  

# TRUE POSITIVE RATE
TPR = burst_detected/length(burst_events)  
  

# FALSE POSITIVE RATE

# set FP to 1 when alarms where triggered
df.monit <- df.monit %>% mutate(FP = ifelse(alarm_triggered == 1,1, 0))
# set FP to 0 on all periods of burst (periods of burst are TP)
df.monit <- df.monit %>% mutate(FP = ifelse(as.Date(Date, tz = 'EST') %in% 
                                             c(burst_events,(burst_events- 1), (burst_events-2),(burst_events-3)),
                                           0, FP))

# now group by day, and compute the number of outliers in each date (i.e.,  the number of times HT < -2)
df.temp_FP <- df.monit %>% group_by(Date = as.Date(Date, tz ='EST')) %>%
  summarise(n_outliers = sum(FP))

# number of days where the minimum number of outliers (min_num_outliers) were flagged using T1
false_positives = length(which(df.temp_FP$n_outliers >= min_num_outliers))

FPR = false_positives/nrow(df.temp_FP)
