library(mltools)
library(data.table)
library(car)
library(ggpubr)
library(Amelia)
library(lme4)
library(readr)
library(DataExplorer)
library(gridExtra)
library(mlmRev) 
library(broom)
library(glue)
library(readxl)
library(caret)
library(GGally)
library(plotly)
library(dplyr)
library(ggplot2)
library(corrplot)
library(MLeval)
library(DMwR2)
library(ROSE)
library(klaR)
library(loo)
library(ggthemes)
library(MLmetrics)

# Set theme for ggplot
mytheme = theme_bw()
theme_set(mytheme)

data <- read_csv("DataClassification.csv", show_col_types = FALSE)
data <- data[-c(1)]
head(data)

data <- data %>% mutate(Binary = as.factor(Binary), Multiclass = as.factor(Multiclass))

str(data)

#_______________________________________________________________________________
# FEATURE SELECTION USING CORRELATION MATRIX
#_______________________________________________________________________________

# calculate correlation matrix
correlationMatrix <- cor(data[,9:11])

# summarize the correlation matrix
print(correlationMatrix)

# find attributes that are highly corrected (ideally >0.75)
highlyCorrelated <- findCorrelation(correlationMatrix, cutoff=0.5)

# print indexes of highly correlated attributes
print(highlyCorrelated)

#_______________________________________________________________________________
# DATA SPLITTING
#_______________________________________________________________________________

i <- createDataPartition(y = data$Multiclass, p = 0.8, list = FALSE)
training <- data[i,]
test <- data[-i,]

SEED <- 78465

#_______________________________________________________________________________
# LEAVE-ONE-GROUP-OUT CROSS-VALIDATION
#_______________________________________________________________________________

group_cv <- function(x, k = length(unique(x))) {
  dat <- data.frame(index = seq(along = x), group = x)
  groups <- data.frame(group = unique(dat$group))
  group_folds <- createFolds(groups$group, returnTrain = TRUE, k = k)
  group_folds <- lapply(group_folds, function(x, y) y[x,,drop = FALSE], y = groups)
  dat_folds <- lapply(group_folds, function(x, y) merge(x, y), y = dat)
  lapply(dat_folds, function(x) sort(x$index))
}

set.seed(SEED)
groups <- training$ID
folds <- group_cv(groups)

# VISUALIZATION

in_model <- lapply(folds, function(indexing, grouping) grouping[indexing], grouping = groups)
in_model_df <- data.frame(Subject = unlist(in_model), data = "Used for Modeling")
in_model_df$Fold <- rep(names(in_model), times = unlist(lapply(in_model, length)))
holdout <- lapply(folds, function(indexing, grouping) grouping[-unique(indexing)], grouping = groups)
holdout_df <- data.frame(Subject = unlist(holdout), data = "Holdout")
holdout_df$Fold <- rep(names(holdout), times = unlist(lapply(holdout, length)))
both <- rbind(in_model_df, holdout_df)
ggplot(both, aes(x = Fold, y = Subject, fill = data)) + 
  geom_tile() + 
  scale_fill_tableau() + 
  theme(legend.position = "top")

#_______________________________________________________________________________

#_______________________________________________________________________________
# HYPERPARAMETER TUNING
#_______________________________________________________________________________

set.seed(SEED)

trControl <- trainControl(method = "cv", number = 19, index = folds, verboseIter = TRUE, summaryFunction = multiClassSummary, 
                          savePredictions ="final", classProbs = T)

# UPSAMPLING
trControl_up <- trainControl(method = "cv", number = 19, index = folds, verboseIter = TRUE, summaryFunction = multiClassSummary,
                             savePredictions ="final", classProbs = T, sampling = "up")

#_______________________________________________________________________________
#_______________________________________________________________________________
# BINARY CLASSIFICATION
#_______________________________________________________________________________
#_______________________________________________________________________________

#_______________________________________________________________________________
# K-NEAREST NEIGHBOURS
#_______________________________________________________________________________

set.seed(SEED)
model_knn <- train(Binary~Pelvis+Trunk+Separation, data = training, method = "knn", trControl = trControl, preProcess = c("center","scale")) 
model_knn

# CONFUSION MATRIX

knn_predictions <- predict(model_knn, newdata = test)
knn_confmatrix <- confusionMatrix(knn_predictions, test$Binary, mode = "everything")
knn_confmatrix

#_______________________________________________________________________________
# NAIVE BAYES
#_______________________________________________________________________________

set.seed(SEED)
model_nb <- train(Binary~Pelvis+Trunk+Separation, data = training, method = "nb", trControl = trControl, preProcess = c("center","scale")) 
model_nb

# CONFUSION MATRIX

nb_predictions <- predict(model_nb, newdata = test)
nb_confmatrix <- confusionMatrix(nb_predictions, test$Binary, mode = "everything")
nb_confmatrix

#_______________________________________________________________________________
# RANDOM FOREST
#_______________________________________________________________________________

set.seed(SEED)
model_rf <- train(Binary~Pelvis+Trunk+Separation, data = training, method = "rf", trControl = trControl, preProcess = c("center","scale")) 
model_rf

# CONFUSION MATRIX

rf_predictions <- predict(model_rf, newdata = test)
rf_confmatrix <- confusionMatrix(rf_predictions, test$Binary, mode = "everything")
rf_confmatrix

#_______________________________________________________________________________
# SUPPORT VECTOR MACHINE
#_______________________________________________________________________________

set.seed(SEED)
model_svm <- train(Binary~Pelvis+Trunk+Separation, data = training, method = "svmRadial", trControl = trControl, preProcess = c("center","scale")) 
model_svm

# CONFUSION MATRIX

svm_predictions <- predict(model_svm, newdata = test)
svm_confmatrix <- confusionMatrix(svm_predictions, test$Binary, mode = "everything")
svm_confmatrix

#_______________________________________________________________________________
# Logistic regression
#_______________________________________________________________________________

set.seed(SEED)
model_logreg <- train(Binary~Pelvis+Trunk+Separation, data = training, method = "glm", trControl = trControl, family = "binomial", preProcess = c("center","scale")) 
model_logreg

# CONFUSION MATRIX

logreg_predictions <- predict(model_logreg, newdata = test)
logreg_confmatrix <- confusionMatrix(logreg_predictions, test$Binary, mode = "everything")
logreg_confmatrix

#_______________________________________________________________________________

# CONFUSION MATRIX VISUALIZATION
# cm = model confusion matrix

draw_confusion_matrix_binary <- function(cm) {
  
  total <- sum(cm$table)
  res <- as.numeric(cm$table)
  
  # Generate color gradients. Palettes come from RColorBrewer.
  greenPalette <- c("#F7FCF5","#E5F5E0","#C7E9C0","#A1D99B","#74C476","#41AB5D","#238B45","#006D2C","#00441B")
  redPalette <- c("#FFF5F0","#FEE0D2","#FCBBA1","#FC9272","#FB6A4A","#EF3B2C","#CB181D","#A50F15","#67000D")
  getColor <- function (greenOrRed = "green", amount = 0) {
    if (amount == 0)
      return("#FFFFFF")
    palette <- greenPalette
    if (greenOrRed == "red")
      palette <- redPalette
    colorRampPalette(palette)(100)[10 + ceiling(90 * amount / total)]
  }
  
  # set the basic layout
  layout(matrix(c(1,1,2)))
  par(mar=c(2,2,4,4))
  plot(c(100, 345), c(300, 450), type = "n", xlab="", ylab="", xaxt='n', yaxt='n')
  title('Binary Classification\n', cex.main=1.5)
  
  # create the matrix 
  classes = colnames(cm$table)
  rect(150, 430, 240, 370, col=getColor("green", res[1]))
  text(195, 435, classes[1], cex=1.3)
  rect(250, 430, 340, 370, col=getColor("red", res[3]))
  text(295, 435, classes[2], cex=1.3)
  text(125, 370, 'Prediction', cex=1.3, srt=90, font=2)
  text(245, 445, 'Truth', cex=1.3, font=2)
  rect(150, 305, 240, 365, col=getColor("red", res[2]))
  rect(250, 305, 340, 365, col=getColor("green", res[4]))
  text(140, 400, classes[1], cex=1.3, srt=90)
  text(140, 335, classes[2], cex=1.3, srt=90)
  
  # add in the cm results
  text(195, 400, res[1], cex=1.3, font=2, col='black')
  text(195, 335, res[2], cex=1.3, font=2, col='black')
  text(295, 400, res[3], cex=1.3, font=2, col='black')
  text(295, 335, res[4], cex=1.3, font=2, col='black')
  
  # add in the specifics 
  plot(c(100, 0), c(100, 0), type = "n", xlab="", ylab="", main = "Performance metrics", xaxt='n', yaxt='n')
  text(10, 85, names(cm$byClass[1]), cex=1.2, font=2)
  text(10, 60, round(as.numeric(cm$byClass[1]), 3), cex=1.2)
  text(35, 85, names(cm$byClass[2]), cex=1.2, font=2)
  text(35, 60, round(as.numeric(cm$byClass[2]), 3), cex=1.2)
  text(57, 85, names(cm$byClass[5]), cex=1.2, font=2)
  text(57, 60, round(as.numeric(cm$byClass[5]), 3), cex=1.2)
  text(75, 85, names(cm$byClass[6]), cex=1.2, font=2)
  text(75, 60, round(as.numeric(cm$byClass[6]), 3), cex=1.2)
  text(95, 85, names(cm$byClass[7]), cex=1.2, font=2)
  text(95, 60, round(as.numeric(cm$byClass[7]), 3), cex=1.2)
  
  # add in the accuracy information 
  text(30, 35, names(cm$overall[1]), cex=1.2, font=2)
  text(30, 10, round(as.numeric(cm$overall[1]), 3), cex=1.2)
  text(70, 35, names(cm$overall[2]), cex=1.2, font=2)
  text(70, 10, round(as.numeric(cm$overall[2]), 3), cex=1.2)
}

#_______________________________________________________________________________
#_______________________________________________________________________________
# MULTICLASS CLASSIFICATION
#_______________________________________________________________________________
#_______________________________________________________________________________

#_______________________________________________________________________________
# K-NEAREST NEIGHBOURS
#_______________________________________________________________________________

set.seed(SEED)
mcmodel_knn_up <- train(Multiclass~Pelvis+Trunk+Separation, data = training, method = "knn", trControl = trControl_up, preProcess = c("center","scale")) 
mcmodel_knn_up

# CONFUSION MATRIX

mc_knn_predictions_up <- predict(mcmodel_knn_up, newdata = test)
mc_knn_confmatrix_up <- confusionMatrix(mc_knn_predictions_up, test$Multiclass, mode = "everything")
mc_knn_confmatrix_up

#_______________________________________________________________________________
# NAIVE BAYES
#_______________________________________________________________________________

set.seed(SEED)
mcmodel_nb_up <- train(Multiclass~Pelvis+Trunk+Separation, data = training, method = "nb", trControl = trControl_up, preProcess = c("center","scale")) 
mcmodel_nb_up

# CONFUSION MATRIX

mc_nb_predictions_up <- predict(mcmodel_nb_up, newdata = test)
mc_nb_confmatrix_up <- confusionMatrix(mc_nb_predictions_up, test$Multiclass, mode = "everything")
mc_nb_confmatrix_up

#_______________________________________________________________________________
# RANDOM FOREST
#_______________________________________________________________________________

set.seed(SEED)
mcmodel_rf_up <- train(Multiclass~Pelvis, data = training, method = "rf", trControl = trControl_up, preProcess = c("center","scale")) 
mcmodel_rf_up

# CONFUSION MATRIX

mc_rf_predictions_up <- predict(mcmodel_rf_up, newdata = test)
mc_rf_confmatrix_up <- confusionMatrix(mc_rf_predictions_up, test$Multiclass, mode = "everything")
mc_rf_confmatrix_up

#_______________________________________________________________________________
# SVM Radial
#_______________________________________________________________________________

set.seed(SEED)
mcmodel_svm_up <- train(Multiclass~Pelvis+Trunk+Separation, data = training, method = "svmRadial", trControl = trControl_up, preProcess = c("center","scale")) 
mcmodel_svm_up

# CONFUSION MATRIX

mc_svm_predictions_up <- predict(mcmodel_svm_up, newdata = test)
mc_svm_confmatrix_up <- confusionMatrix(mc_svm_predictions_up, test$Multiclass, mode = "everything")
mc_svm_confmatrix_up

#_______________________________________________________________________________
# Logistic regression
#_______________________________________________________________________________

# UPSAMPLING
set.seed(SEED)
mcmodel_multinom_up <- train(Multiclass~Pelvis+Trunk+Separation, data = training, method = "multinom", trControl = trControl_up, family = "binomial", preProcess = c("center","scale")) 
mcmodel_multinom_up

# CONFUSION MATRIX

mc_multinom_predictions_up <- predict(mcmodel_multinom_up, newdata = test)
mc_multinom_confmatrix_up <- confusionMatrix(mc_multinom_predictions_up, test$Multiclass, mode = "everything")
mc_multinom_confmatrix_up

#_______________________________________________________________________________

# CONFUSION MATRIX VISUALIZATION

draw_confusion_matrix_multiclass <- function(cm) {
  
  total <- sum(cm$table)
  res <- as.numeric(cm$table)
  
  # Generate color gradients. Palettes come from RColorBrewer.
  greenPalette <- c("#F7FCF5","#E5F5E0","#C7E9C0","#A1D99B","#74C476","#41AB5D","#238B45","#006D2C","#00441B")
  redPalette <- c("#FFF5F0","#FEE0D2","#FCBBA1","#FC9272","#FB6A4A","#EF3B2C","#CB181D","#A50F15","#67000D")
  getColor <- function (greenOrRed = "green", amount = 0) {
    if (amount == 0)
      return("#FFFFFF")
    palette <- greenPalette
    if (greenOrRed == "red")
      palette <- redPalette
    colorRampPalette(palette)(100)[10 + ceiling(90 * amount / total)]
  }
  
  # set the basic layout
  layout(matrix(c(1,1,2,2,nrow=2, byrow=TRUE)))
  plot(c(110, 300), c(280, 450), type = "n", xlab="", ylab="", xaxt='n', yaxt='n')
  title('Multiclass Classification\n', cex.main=1.5)
  
  # create the matrix 
  classes = colnames(cm$table)
  
  # First row of rectangles
  rect(140, 430, 180, 390, col=getColor("green", res[1]))
  text(160, 435, classes[1], cex=1.3)
  rect(185, 430, 225, 390, col=getColor("red", res[4]))
  text(205, 435, classes[2], cex=1.3)
  rect(230, 430, 270, 390, col=getColor("red", res[7]))
  text(250, 435, classes[3], cex=1.3)
  text(135, 410, classes[1], cex=1.3, srt=90)
  
  text(120, 365, 'Prediction', cex=1.3, srt=90, font=2)
  text(205, 445, 'Truth', cex=1.3, font=2)
  
  # Second row of rectangles
  rect(140, 385, 180, 345, col=getColor("red", res[2]))
  rect(185, 385, 225, 345, col=getColor("green", res[5]))
  rect(230, 385, 270, 345, col=getColor("red", res[8]))
  text(135, 365, classes[2], cex=1.3, srt=90)
  
  
  # Third row of rectangles
  rect(140, 340, 180, 300, col=getColor("red", res[2]))
  rect(185, 340, 225, 300, col=getColor("red", res[5]))
  rect(230, 340, 270, 300, col=getColor("green", res[8]))
  text(135, 320, classes[3], cex=1.3, srt=90)
  
  # add in the cm results
  # First column
  text(160, 410, res[1], cex=1.3, font=2, col='black')
  text(160, 365, res[2], cex=1.3, font=2, col='black')
  text(160, 320, res[3], cex=1.3, font=2, col='black')
  # Second column
  text(205, 410, res[4], cex=1.3, font=2, col='black')
  text(205, 365, res[5], cex=1.3, font=2, col='black')
  text(205, 320, res[6], cex=1.3, font=2, col='black')
  # Third column
  text(250, 410, res[7], cex=1.3, font=2, col='black')
  text(250, 365, res[8], cex=1.3, font=2, col='black')
  text(250, 320, res[9], cex=1.3, font=2, col='black')
}

# FEATURE IMPORTANCE FOR RANDOM FOREST

importance <- varImp(mcmodel_rf_up, scale = FALSE)
plot(importance)

