---
title: "Data input and exploration"
author: "Stijn Schreven"
date: "31 mei 2018"
output:
  html_document: default
  pdf_document: default
---

# Data input and exploration  
## Loading Data  

```{r}
# Directory holding the raw inputs (biom table, mapping file, tree). Defined
# once so the three reads below always point at the same folder.
input_dir <- "//wurnet.nl/homes/schre030/My Documents/Courses and workshops/Microbioma data analysis/SpringSchoolOwnData/input_data"

# Build the phyloseq object from the biom file and the sample mapping file.
# No separate taxonomy file is passed (taxonomy.file = NULL).
ps <- read_phyloseq(
  otu.file = file.path(input_dir, "Library_A_till_E_Firstrun_Silva132.biom1"),
  taxonomy.file = NULL,
  metadata.file = file.path(input_dir, "mapping_file_stijn_libABCDE_recoded2.csv"),
  type = "biom"
)

# tree file
treefile_p1 <- read.tree(
  file.path(input_dir, "all_otus_library_A_till_E_firstrun.tre")
)

# merge tree into phyloseq
ps <- merge_phyloseq(ps, treefile_p1)

print(ps)
```

## Clean data  

```{r}
datatable(tax_table(ps))
#knitr::kable(tax_table(ps))

# Strip rank prefixes of the form "<lowercase letter>__" from every taxonomy
# column.
tax_table(ps)[, colnames(tax_table(ps))] <- gsub(
  pattern = "[a-z]__",
  replacement = "",
  x = tax_table(ps)[, colnames(tax_table(ps))]
)

# Taxa left with an empty Phylum string are labelled explicitly.
tax_table(ps)[tax_table(ps)[, "Phylum"] == "", "Phylum"] <- "Unidentified"

# remove mitochondria and chloroplasts
# subset_taxa() filters through subset(), which discards rows where the
# condition evaluates to NA. A bare `Family != "Mitochondria"` would therefore
# also drop every taxon with a missing Family (same for Order), so taxa with
# NA at that rank are kept explicitly.
ps1 <- subset_taxa(ps, is.na(Family) | Family != "Mitochondria")
ps1 <- subset_taxa(ps1, is.na(Order) | Order != "Chloroplast")
print(ps1)

# number of taxa removed by the two filters above
ntaxa(ps) - ntaxa(ps1)

# store as RDS file, then read it back so later chunks work from the stored
# object
ps1_rds <- "//wurnet.nl/homes/schre030/My Documents/Courses and workshops/Microbioma data analysis/SpringSchoolOwnData/phyobjects/ps1.rds"
saveRDS(ps1, ps1_rds)
ps1 <- readRDS(ps1_rds)
```

## View data  

```{r}
# Histogram of total abundance per ASV
ps1_df_taxa <- data.table(
  tax_table(ps1),
  ASVabundance = taxa_sums(ps1),
  ASV = taxa_names(ps1)
)

ps1_tax_plot <- ggplot(ps1_df_taxa, aes(ASVabundance)) +
  geom_histogram() +
  scale_x_log10() +
  theme_bw() +
  labs(
    title = "Histogram of ASVs (unique sequence) counts",
    x = "Abundance (raw counts)",
    y = "Frequency of ASVs"
  )

print(ps1_tax_plot)

# Prevalence against abundance, grouped by Phylum
p <- plot_taxa_prevalence(ps1, "Phylum")
p
```

# Create datasets for different quality checks  

```{r}
# Experimental samples only (Spare == "no"), i.e. without the extra samples.
# This is the common dataset used for all other analyses.
ps1.exp <- subset_samples(ps1, Spare == "no")

# Drop taxa that have no reads left in the retained samples.
exp_taxa_present <- taxa_sums(otu_table(ps1.exp)) > 0
ps1.exp <- prune_taxa(exp_taxa_present, ps1.exp)
print(ps1.exp)

# Store the object and read it back from disk.
ps1_exp_rds <- "//wurnet.nl/homes/schre030/My Documents/Courses and workshops/Microbioma data analysis/SpringSchoolOwnData/phyobjects/ps1.exp.rds"
saveRDS(ps1.exp, ps1_exp_rds)
ps1.exp <- readRDS(ps1_exp_rds)

# Technical duplicates: are they consistent?
tech_descriptions <- c("7.C.", "6.A.", "13.G.", "13.G", "6.A")
ps1.tech <- subset_samples(ps1, Description %in% tech_descriptions)
ps1.tech <- prune_taxa(taxa_sums(otu_table(ps1.tech)) > 0, ps1.tech)
print(ps1.tech)

# Biological duplicates: are they consistent?
# Substrate samples from the listed containers at timepoints 1 and 3.
ps1.biol <- subset_samples(
  ps1,
  Type == "substrate" &
    ContainerID %in% c(1, 13, 17, 29, 33, 45) &
    Timepoint %in% c(1, 3)
)
ps1.biol <- prune_taxa(taxa_sums(otu_table(ps1.biol)) > 0, ps1.biol)
print(ps1.biol)

# Negative controls and mocks (these have "na" for Type)
ps1.contr <- subset_samples(ps1, Type == "na")
ps1.contr <- prune_taxa(taxa_sums(otu_table(ps1.contr)) > 0, ps1.contr)
print(ps1.contr)

# Both comparisons below start from `ps`, not `ps1`, because they need the
# Mitochondria / Chloroplast taxa that were filtered out when `ps1` was built.
#
# subset_taxa() filters through subset(), which discards rows where the
# condition evaluates to NA. The "everything else" filters therefore keep taxa
# with a missing Family/Order explicitly; otherwise unclassified taxa would
# silently vanish from the totals.

# check reads of mitochondrial DNA in larvae samples over time + plot against
# total of other bacterial DNA: size relation / DNA dilution effect?
pslarv <- subset_samples(ps, Type == "larvae")
pslarv <- subset_samples(pslarv, Spare == "no")
pslarv.mit <- subset_taxa(pslarv, Family == "Mitochondria")
pslarv.mit <- aggregate_taxa(pslarv.mit, "Family")
pslarv.nmit <- subset_taxa(pslarv, is.na(Family) | Family != "Mitochondria")
pslarv.nmit <- aggregate_taxa(pslarv.nmit, "Phylum")
print(pslarv.nmit)

# check reads of chloroplast and mitochondrial DNA in substrate samples over
# time + plot against total of other bacterial DNA: initial DNA dilution effect?
pssub <- subset_samples(ps, Type == "substrate")
pssub <- subset_samples(pssub, Spare == "no")
pssub.mit <- subset_taxa(pssub, Family == "Mitochondria")
pssub.mit <- aggregate_taxa(pssub.mit, "Family")
pssub.chl <- subset_taxa(pssub, Order == "Chloroplast")
pssub.chl <- aggregate_taxa(pssub.chl, "Order")
pssub.rest <- subset_taxa(
  pssub,
  (is.na(Family) | Family != "Mitochondria") &
    (is.na(Order) | Order != "Chloroplast")
)
pssub.rest <- aggregate_taxa(pssub.rest, "Phylum")
print(pssub.rest)

# Biofilm versus corresponding substrate samples: are biofilm taxa also
# dominant in the substrate sample?
biofilm_ids <- c("5.H.", "17.H.", "21.H.", "24.H.", "25.H.", "41.H.")
substrate_ids <- c("5.B.", "17.B.", "21.B.", "24.B.", "25.B.", "41.B.")
ps1.film <- subset_samples(ps1, Description %in% c(biofilm_ids, substrate_ids))
ps1.film <- prune_taxa(taxa_sums(otu_table(ps1.film)) > 0, ps1.film)
print(ps1.film)

# Corresponding small and big larvae, and prepupae and larvae: are they similar?
size_ids <- c("14.H.", "32.H.", "44.H.", "14.G.", "32.G.", "44.G.")
ps1.size <- subset_samples(ps1, Description %in% size_ids)
ps1.size <- prune_taxa(taxa_sums(otu_table(ps1.size)) > 0, ps1.size)
print(ps1.size)

# Top vs bottom substrate compared with total substrate samples: are they
# similar?
depth_ids <- c("28.H.", "29.H.", "37.H.", "38.H.", "28.F.", "37.F.")
ps1.sdepth <- subset_samples(ps1, Description %in% depth_ids)
ps1.sdepth <- prune_taxa(taxa_sums(otu_table(ps1.sdepth)) > 0, ps1.sdepth)
print(ps1.sdepth)

# Eggs and adults: which taxa overlap / dominate, and what is the difference
# between surface-sterilized and non-sterilized female adults?
ps1.eggad <- subset_samples(ps1, Type %in% c("eggs", "adults"))
ps1.eggad <- prune_taxa(taxa_sums(otu_table(ps1.eggad)) > 0, ps1.eggad)
print(ps1.eggad)

# save all subsets
# Each object is written to <phyobj_dir>/<object name>.rds. The list names are
# the file stems, so adding a subset only needs one new entry here.
phyobj_dir <- "//wurnet.nl/homes/schre030/My Documents/Courses and workshops/Microbioma data analysis/SpringSchoolOwnData/phyobjects"

subsets_to_save <- list(
  ps1.tech = ps1.tech,
  ps1.biol = ps1.biol,
  ps1.contr = ps1.contr,
  pslarv.mit = pslarv.mit,
  pslarv.nmit = pslarv.nmit,
  pssub.mit = pssub.mit,
  pssub.chl = pssub.chl,
  pssub.rest = pssub.rest,
  ps1.film = ps1.film,
  ps1.size = ps1.size,
  ps1.sdepth = ps1.sdepth,
  ps1.eggad = ps1.eggad
)

for (subset_name in names(subsets_to_save)) {
  saveRDS(
    subsets_to_save[[subset_name]],
    file.path(phyobj_dir, paste0(subset_name, ".rds"))
  )
}

```



## Coefficient of variation (CV)  

```{r}
# Scatter plot of the coefficient of variation for the experimental dataset,
# displayed with a log10 x axis.
p1 <- plot_taxa_cv(
  ps1.exp,
  plot.type = "scatter"
)
p1 +
  scale_x_log10()
```

## Sequencing depth  

```{r}
# Density plot of reads per sample, grouped by the Timepoint variable.
p_seqdepth.time <- plot_read_distribution(
  ps1.exp,
  "Timepoint",
  "density"
)
p_seqdepth.time
```

# Alpha diversity  

## Variation in reads  

```{r}
# Spread of reads per sample in the experimental dataset
summary(sample_sums(ps1.exp))

# rarefaction curves
# rarecurve() is given the transposed abundance table. `sample` marks the
# smallest library size. `col` and `cex` are arguments of rarecurve() itself;
# they were previously trapped inside the min() call, which corrupted the
# `sample` value and left the styling unapplied.
otu_tab <- t(abundances(ps1.exp))
prar <- vegan::rarecurve(otu_tab,
                         step = 50, label = FALSE,
                         sample = min(rowSums(otu_tab)),
                         col = "blue", cex = 0.6)
```

## Normalize to lower sequence depth  
This will remove some samples (Nreads = 0 in some substrate samples of timepoint 0) and OTUs (for diverse samples with many reads). Try out different sample sizes (2000, 5000, 10000, 100000).

```{r}
# Fix the RNG seed first so the filtering and normalisation can be reproduced.
set.seed(9242)

# Rarefy the experimental dataset to 2000 reads per sample.
ps0.rar <- rarefy_even_depth(
  ps1.exp,
  sample.size = 2000
)

ps0_rar_rds <- "//wurnet.nl/homes/schre030/My Documents/Courses and workshops/Microbioma data analysis/SpringSchoolOwnData/phyobjects/ps0.rar.rds"
saveRDS(ps0.rar, ps0_rar_rds)
```
Samples and OTUs removed at each rarefaction depth:

- 2000: removes 16 samples and 40 OTUs
- 5000: removes 19 samples and 42 OTUs
- 10000: removes 26 samples and 44 OTUs
- 100000: removes 70 samples and 230 OTUs

A depth of 2000 is chosen for now.

```{r}
# Reads per sample after rarefying (las = 2 turns the axis labels
# perpendicular to the axis)
rar_reads_per_sample <- sample_sums(ps0.rar)
barplot(rar_reads_per_sample, las = 2)

# Prevalence against abundance per Phylum on the rarefied data
p.rar <- plot_taxa_prevalence(ps0.rar, "Phylum")
p.rar
```

## Diversity indices  

```{r}
# Table with all diversity indices, computed on the rarefied object ps0.rar
bsf.div <- diversities(ps0.rar, index = "all")

datatable(bsf.div)

# Sample metadata as a separate data frame
bsf.meta <- meta(ps0.rar)

# Copy the row names of both tables into a shared key column so they can be
# joined on it.
bsf.meta[["sam_name"]] <- rownames(bsf.meta)
bsf.div[["sam_name"]] <- rownames(bsf.div)

# Join diversity indices and metadata into one data frame
div.df <- merge(x = bsf.div, y = bsf.meta, by = "sam_name")

# Check the resulting columns
colnames(div.df)
```

```{r}
# Shannon diversity per Diet, one panel per Density
p <- ggboxplot(div.df, x = "Diet", y = "shannon",
               fill = "Diet", palette = "jco", facet.by = "Density")

# Keep the displayed plot in a variable and hand it to ggsave() explicitly,
# rather than relying on ggsave()'s default of "the last plot displayed".
p_shannon <- p + rotate_x_text()
p_shannon
ggsave("//wurnet.nl/homes/schre030/My Documents/Courses and workshops/Microbioma data analysis/SpringSchoolOwnData/figures/Shannon diversity.pdf",
       plot = p_shannon, height = 4, width = 10)
```

```{r}
# Alternative way
# Reshape the merged diversity table into long format: one row per sample and
# diversity index.
div.df2 <- div.df[, c("Diet", "Density", "inverse_simpson", "gini_simpson", "shannon", "fisher", "coverage")]

# the names are not pretty. we can replace them
colnames(div.df2) <- c("Diet", "Density", "Inverse Simpson", "Gini-Simpson", "Shannon", "Fisher", "Coverage")
colnames(div.df2)

# Name the id variables explicitly. Without id.vars, melt() guesses them from
# the column types, so a numeric Density column would be melted as if it were
# a diversity index.
div_df_melt <- reshape2::melt(div.df2, id.vars = c("Diet", "Density"))
head(div_df_melt)

# Now use this data frame to plot: one panel per diversity index, each with
# its own axis scales
p <- ggboxplot(div_df_melt, x = "Diet", y = "value",
               fill = "Diet",
               palette = "jco",
               legend = "right",
               facet.by = "variable",
               scales = "free")

p <- p + rotate_x_text()

# we will remove the x axis labels
p <- p + rremove("x.text")
p
# Pass the plot explicitly instead of relying on the last plot displayed.
ggsave("//wurnet.nl/homes/schre030/My Documents/Courses and workshops/Microbioma data analysis/SpringSchoolOwnData/figures/Diversities.pdf",
       plot = p, height = 4, width = 10)
```

```{r}
# Distinct Diet groups. factor() makes this work whether Diet is stored as a
# factor or as a character column (levels() on a character vector is NULL),
# and it ignores factor levels that have no observations.
lev <- levels(factor(div_df_melt$Diet))

# make a pairwise list that we want to compare.
L.pairs <- combn(lev, 2, simplify = FALSE)

# symnum-style coding needs exactly one more cutpoint than symbols. The former
# extra 0.1 cutpoint left the top p-value interval without a symbol, so the
# five symbols now map onto the five intervals between these six cutpoints.
p2 <- p + stat_compare_means(comparisons = L.pairs,
                             label = "p.signif",
                             symnum.args = list(cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 1),
                                                symbols = c("****", "***", "**", "*", "n.s")))

print(p2)
```

## Phylogenetic diversity  

```{r}
library(picante)

# Abundance table of the rarefied object as a plain data frame
ps0.rar.asvtab <- as.data.frame(otu_table(ps0.rar))

# Phylogenetic tree stored in the rarefied object
ps0.rar.tree <- phy_tree(ps0.rar)

# Print the tree summary to check whether the tree is rooted or not before
# computing phylogenetic diversity.
ps0.rar.tree
```

```{r}
# The tree is treated as rooted here (include.root = TRUE).
# NOTE(review): confirm against the tree summary printed in the previous chunk.
# The abundance table is transposed before it is passed to pd(); the tree is
# the one merged into the phyloseq object when the data were loaded.
df.pd <- pd(t(ps0.rar.asvtab), ps0.rar.tree, include.root = TRUE)

datatable(df.pd)
```

```{r}
# add new column with PD values
# NOTE(review): this assignment is positional; it assumes the rows of df.pd
# are in the same sample order as the rows of bsf.meta.
bsf.meta$Phylogenetic_Diversity <- df.pd$PD

# plot PD
pd.plot <- ggboxplot(bsf.meta, x = "Diet",
                     y = "Phylogenetic_Diversity",
                     fill = "Diet",
                     palette = "jco",
                     ylab = "Phylogenetic Diversity",
                     xlab = "Diet",
                     legend = "right")
pd.plot <- pd.plot + rotate_x_text()

# L.pairs (pairwise Diet comparisons) comes from an earlier chunk.
# symnum-style coding needs exactly one more cutpoint than symbols. The former
# extra 0.1 cutpoint left the top p-value interval without a symbol.
pd.plot + stat_compare_means(comparisons = L.pairs,
                             label = "p.signif",
                             symnum.args = list(cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 1),
                                                symbols = c("****", "***", "**", "*", "n.s")))
```
NB: These diversity figures only group the data by Diet; any underlying effects of Timepoint, Type and Density are hidden.

## Correlation between library size and richness  
Check for correlation between increasing library size (sequence depth, number of reads in a sample) and richness.

```{r}
# Diversity indices and richness on the non-rarefied experimental dataset
lib.div <- diversities(ps1.exp, index = "all")
lib.div2 <- richness(ps1.exp)

# Add the total number of reads per sample and the richness column named "0"
lib.div[["ReadsPerSample"]] <- sample_sums(ps1.exp)
lib.div[["Richness"]] <- lib.div2[["0"]]
colnames(lib.div)
```

```{r}
# Each panel plots a diversity measure (x) against reads per sample (y) and
# annotates the Pearson correlation.

# Shannon: no smoother added
p1 <- ggscatter(lib.div, x = "shannon", y = "ReadsPerSample") +
  stat_cor(method = "pearson")

# Inverse Simpson, with a loess smoother
p2 <- ggscatter(
  lib.div,
  x = "inverse_simpson",
  y = "ReadsPerSample",
  add = "loess"
) +
  stat_cor(method = "pearson")

# Richness, with a loess smoother and a fixed position for the label
p3 <- ggscatter(
  lib.div,
  x = "Richness",
  y = "ReadsPerSample",
  add = "loess"
) +
  stat_cor(method = "pearson", label.x = 100, label.y = 50000)

# Arrange the three panels on a 2 x 2 grid
ggarrange(p1, p2, p3, ncol = 2, nrow = 2)
```
There seem to be some correlations between sequencing depth and diversity, so it is probably better to normalize to a lower sequencing depth before comparing diversity between samples.

```{r}
# Print the R session details so the run is recorded in the rendered document.
sessionInfo()
```

