# Set Working Directory
# All relative "data/..." paths used further down are resolved against this
# directory.
# NOTE(review): this is a machine-specific absolute path; the script only runs
# where this folder exists. Consider a project-relative setup instead.
setwd("~/OneDrive/Uni/2020-2021 Den Haag/Thesis/02-Thesis Work/public-value-conflict-identification")

# Imports
library(stm)        # Package for structural topic modeling
library(igraph)     # Package for network analysis and visualisation
library(stmCorrViz) # Package for hierarchical correlation view of STMs
library(geometry)
library(Rtsne)
library(rsvd)
library(dplyr)
library(wordcloud)
library(stmCorrViz)
library(stminsights)



# Experiment identifiers: `name` labels the run and `date` is the day stamp of
# the previously saved workspace image that should be restored.
name <- "Contributions Advanced Stopword Removal"
date <- "2021-06-13"

# Shortcut: restore the workspace saved by that earlier run
load(
  paste0("data/02_experiments/RData/", name, "_", date, ".RData")
)

# Read the experiment input
data <- read.csv("data/02_experiments/input/e-17.csv")

# Keep contributions only. A subsequent `Contribution | Comment` filter would
# be a no-op after this one (comments are already gone), so the broader subset
# is kept only as a commented-out alternative; enable exactly one filter.
data <- dplyr::filter(data, Type == "Contribution")
# data <- dplyr::filter(data, Type == "Contribution" | Type == "Comment")
# data <- dplyr::filter(data, Name == "Lindenallee")

# Numeric key combining Process.ID and Node.ID
data$processnode <- data$Process.ID * 10000 + data$Node.ID

# Replace every missing value in the data frame with 0
data[is.na(data)] <- 0

# Tokenise the already-preprocessed trigram text. Stopword, number and
# punctuation removal as well as stemming are switched off here.
processed <- textProcessor(
  data$Preprocessed_trigrams,
  metadata = data,
  language = "german",
  removestopwords = FALSE,
  removenumbers = FALSE,
  removepunctuation = FALSE,
  stem = FALSE
)

# Inspect what would be dropped for lower thresholds from 20 to 50
plotRemoved(processed$documents, lower.thresh = seq(20, 50, by = 1))

# Vocabulary thresholds expressed as a share of the number of rows in `data`:
# 0.75 % as the lower bound and 90 % as the upper bound
lower_threshold <- round(nrow(data) * 0.0075)
upper_threshold <- round(nrow(data) * 0.90)

# Build the final documents / vocab / meta structure used by the model
out <- prepDocuments(
  processed$documents,
  processed$vocab,
  processed$meta,
  lower.thresh = lower_threshold,
  upper.thresh = upper_threshold
)

# Compare candidate topic counts (5, 10, ..., 60) using the same prevalence
# covariates as the final model; runs on 6 cores
storage <- searchK(
  out$documents,
  out$vocab,
  K = seq(from = 5, to = 60, by = 5),
  prevalence = ~ Category + compound + Rubric + Rating + Number.replies,
  data = out$meta,
  cores = 6
)
# Dispatches to the searchK plot method
plot(storage)


# Fit the structural topic model. K = 0 together with spectral initialisation
# requests automatic selection of the number of topics.
topicModel <- stm(
  documents = out$documents,
  vocab = out$vocab,
  K = 0,
  prevalence = ~ Category + compound + Rubric + Rating + Number.replies,
  max.em.its = 300,
  data = out$meta,
  init.type = "Spectral",
  verbose = TRUE
)


# Persist the whole workspace, stamped with today's date
save.image(
  file = paste0("data/02_experiments/RData/", name, "_", Sys.Date(), ".RData")
)

# Collect the 15 top words per topic under the prob, FREX and lift labelings
# and export them as one row per topic.
# NOTE(review): topics 1..100 are requested although the fitted K is chosen
# automatically — confirm this is intended.
topics <- labelTopics(topicModel, 1:100, n = 15)
num_topics <- nrow(topics$prob)
topic_df <- data.frame(
  topic_id = seq_len(num_topics),
  prob_words = topics$prob,
  frex_words = topics$frex,
  lift_words = topics$lift
)
# Placeholder column, initialised to "None" for every topic
topic_df$PublicValue <- "None"
write.csv(
  topic_df,
  paste0(
    "data/02_experiments/topicwords/", name, "_", Sys.Date(),
    "_rtopicwords.csv"
  )
)

# Detailed word list: the model's `beta$logbeta` values (log scale), with one
# column per vocabulary term.
word_prob <- data.frame(topicModel[["beta"]][["logbeta"]])
names(word_prob) <- topicModel$vocab
# Stamp the file with today's date, like the other exports of this run, rather
# than with `date`, which identifies the previously loaded workspace.
write.csv(
  word_prob,
  paste0(
    "data/02_experiments/topicwords/", name, "_", Sys.Date(),
    "_rtopicwords_detail.csv"
  )
)



# Combine the fitted model with the document metadata into one table and
# export it
dt <- make.dt(topicModel, out$meta)
write.csv(
  dt,
  paste0("data/02_experiments/topics/", name, "_", Sys.Date(), "_rtopics.csv")
)






# Convergence: trace of the bound stored on the fitted model
plot(
  topicModel$convergence$bound,
  type = "l",
  ylab = "Approximate Objective",
  main = "Convergence"
)

# Summary plot of the model, x axis limited to [0, 0.3]
plot(
  topicModel,
  type = "summary",
  xlim = c(0, 0.3)
)

# Word cloud for a single topic (topic 2)
cloud(
  topicModel,
  topic = 2,
  scale = c(2, 1, 0.5)
)

# Topic correlations: estimate, then plot
mod.out.corr <- topicCorr(topicModel)
plot(mod.out.corr)

# Print some example documents for one topic.
# Use the metadata returned by prepDocuments(): it was passed through
# textProcessor() and prepDocuments() together with the documents, so its rows
# line up with the documents the model was fitted on. Indexing `data` with
# `-out$docs.removed` instead would return zero rows whenever nothing was
# removed, and would ignore documents already dropped by textProcessor().
contributions <- out$meta
topic <- 7
thoughts3 <- findThoughts(
  topicModel,
  texts = contributions$Text_en,
  n = 2,
  topics = topic
)$docs[[1]]
# paste() already separates its arguments with a space
plotQuote(thoughts3, width = 30, main = paste("Topic", topic))

# Use the D3-based stmCorrViz package to write the topic-correlation view to
# "stmviz.html". The stray trailing comma (an empty third argument) is removed.
# NOTE(review): the package's documented usage also passes `documents_raw` and
# `documents_matrix` — confirm whether they should be supplied here.
stmCorrViz(topicModel, "stmviz.html")

# Regress topic prevalence on the `Name` covariate using the model's metadata
effects <- estimateEffect(
  formula = ~ Name,
  stmobj = topicModel,
  metadata = out$meta
)

# Build the LDAvis JSON representation of the fitted model.
# NOTE(review): the return value is neither assigned nor written to a file
# here, so nothing is actually saved by this line — confirm the intent.
toLDAvisJson(mod=topicModel, docs=out$documents)

# STM Insights
# Attach stminsights (again, so this section can be run on its own) and
# launch its application.
# NOTE(review): presumably the app is fed from a saved workspace image that
# contains the model, `out` and `effects` — confirm.
library(stminsights)
run_stminsights()

# Data Analysis
summary(data)

# Show the rows whose `compound` value is missing. The trailing comma is
# required: without it the row-length logical vector would be used to select
# columns instead of rows.
data[is.na(data$compound), ]

# Replace every missing value in the data frame with 0
data[is.na(data)] <- 0

# Print the 10 top words per topic via sageLabels
sageLabels(topicModel, n = 10)



