Hello everyone, my level / knowledge in R is super mega beginner, and it has cost me a lot to perform my statistical analyzes. It is not for vanity, but I would like to improve my graphics. Do you know what I could do to obtain a graph similar to the one attached below? Thanks in advance.
library(readxl)
Family <- read_excel("~/RSTUDIOPICRUST/NMDS.xlsx")
#NORMALIZATION OF RAW READ TABLE LOG2
#OTUS en filas, columnas son las muestras, es al reves que para el resto de pasos
data<- Family
replicates <- as.data.frame(colnames(data)[-1])
colnames(replicates) <- "replicates"
attach(Family)
rwnames <- OTU
data <- as.matrix(data[,-1])
rownames(data) <- rwnames
data[is.na(data)] <- 0
library(DESeq2)
#> Loading required package: S4Vectors
#> Loading required package: stats4
#> Loading required package: BiocGenerics
#> Loading required package: parallel
#>
#> Attaching package: 'BiocGenerics'
#> The following objects are masked from 'package:parallel':
#>
#> clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
#> clusterExport, clusterMap, parApply, parCapply, parLapply,
#> parLapplyLB, parRapply, parSapply, parSapplyLB
#> The following objects are masked from 'package:stats':
#>
#> IQR, mad, sd, var, xtabs
#> The following objects are masked from 'package:base':
#>
#> anyDuplicated, append, as.data.frame, basename, cbind, colMeans,
#> colnames, colSums, dirname, do.call, duplicated, eval, evalq,
#> Filter, Find, get, grep, grepl, intersect, is.unsorted, lapply,
#> lengths, Map, mapply, match, mget, order, paste, pmax, pmax.int,
#> pmin, pmin.int, Position, rank, rbind, Reduce, rowMeans, rownames,
#> rowSums, sapply, setdiff, sort, table, tapply, union, unique,
#> unsplit, which, which.max, which.min
#>
#> Attaching package: 'S4Vectors'
#> The following object is masked from 'package:base':
#>
#> expand.grid
#> Loading required package: IRanges
#>
#> Attaching package: 'IRanges'
#> The following object is masked from 'package:grDevices':
#>
#> windows
#> Loading required package: GenomicRanges
#> Loading required package: GenomeInfoDb
#> Loading required package: SummarizedExperiment
#> Loading required package: Biobase
#> Welcome to Bioconductor
#>
#> Vignettes contain introductory material; view with
#> 'browseVignettes()'. To cite Bioconductor, see
#> 'citation("Biobase")', and for packages 'citation("pkgname")'.
#> Loading required package: DelayedArray
#> Loading required package: matrixStats
#>
#> Attaching package: 'matrixStats'
#> The following objects are masked from 'package:Biobase':
#>
#> anyMissing, rowMedians
#> Loading required package: BiocParallel
#>
#> Attaching package: 'DelayedArray'
#> The following objects are masked from 'package:matrixStats':
#>
#> colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
#> The following objects are masked from 'package:base':
#>
#> aperm, apply
dds <- DESeqDataSetFromMatrix(data, replicates, ~ replicates)
#> converting counts to integer mode
cts <- counts(dds)
geoMeans <- apply(cts, 1, function(row) if (all(row == 0)) 0 else exp(mean(log(row[row != 0]))))
dds <- estimateSizeFactors(dds, geoMeans=geoMeans)
norm <- counts(dds, normalized=TRUE)
logcounts <- log2( counts(dds, normalized=TRUE) + 1 )# in case you want a log transformed normalized count
write.csv(logcounts, file = "~/RSTUDIOPICRUST/NDMS_NORMALIZADOS.csv")
#> Warning in file(file, ifelse(append, "a", "w")): no fue posible abrir el archivo
#> 'C:/Users/Osiris Díaz Torres/Documents/RSTUDIOPICRUST/NDMS_NORMALIZADOS.csv':
#> Permission denied
#> Error in file(file, ifelse(append, "a", "w")): no se puede abrir la conexión
#log counts are like variance stabilized counts (https://support.bioconductor.org/p/76712/)
#DATA ANALYSIS
#Introduce two tables, one has to have the samples in the first column and the
#species in the first line with the reads per sample.
#The second table is with environmental parameters, it has to have the samples
#in the first column and the environmental parameters in the first line.
#You can have the environmental parameters as presence/absence with 1 and 0 or
#with numbers.
library(readxl)
Family <- read_excel("~/RSTUDIOPICRUST/NMDS_TRANSPU.xlsx")
#Family <- t(Family)
Metadata<- read.csv("~/RSTUDIOPICRUST/Metadata-final.csv", row.names=1)
attach(Family)
Family <- Family[,-1]
rownames(Family) <- SampleID
#> Warning: Setting row names on a tibble is deprecated.
Metadata$Month <- factor(Metadata$Month,
levels = c("July", "August", "September"))
#remove the rare microbes. It keeps only the microbes that are present in at least 10% of the samples
dim(Family)
#> [1] 96 10415
Family <- Family[,colMeans(Family) >=.1]
dim(Family)
#> [1] 96 8291
library(vegan)
#> Loading required package: permute
#> Loading required package: lattice
#> This is vegan 2.5-6
#NMDS
#autotransform=F evita la transformaci?n de datos
#sratmax=0.999999 para que el an?lisis no se detenga antes de tiempo cuando se tienen muchos datos
Family.mds <- metaMDS(Family, k=2, distance="bray", trymax=100, zerodist="add", autotransform=F)
#> Run 0 stress 0.1274936
#> Run 1 stress 0.1274939
#> ... Procrustes: rmse 9.269493e-05 max resid 0.0005469849
#> ... Similar to previous best
#> Run 2 stress 0.1274941
#> ... Procrustes: rmse 0.0001485437 max resid 0.0007761965
#> ... Similar to previous best
#> Run 3 stress 0.1274912
#> ... New best solution
#> ... Procrustes: rmse 0.000960561 max resid 0.006058439
#> ... Similar to previous best
#> Run 4 stress 0.1631234
#> Run 5 stress 0.1631233
#> Run 6 stress 0.1274918
#> ... Procrustes: rmse 0.0002420235 max resid 0.001806596
#> ... Similar to previous best
#> Run 7 stress 0.1606099
#> Run 8 stress 0.1703349
#> Run 9 stress 0.1594046
#> Run 10 stress 0.1434982
#> Run 11 stress 0.1454402
#> Run 12 stress 0.1437091
#> Run 13 stress 0.1454402
#> Run 14 stress 0.1525878
#> Run 15 stress 0.1274927
#> ... Procrustes: rmse 0.00048292 max resid 0.004084371
#> ... Similar to previous best
#> Run 16 stress 0.1274944
#> ... Procrustes: rmse 0.001087916 max resid 0.006300639
#> ... Similar to previous best
#> Run 17 stress 0.156991
#> Run 18 stress 0.1800571
#> Run 19 stress 0.1274936
#> ... Procrustes: rmse 0.0009613467 max resid 0.006058546
#> ... Similar to previous best
#> Run 20 stress 0.1274937
#> ... Procrustes: rmse 0.0009853107 max resid 0.006129919
#> ... Similar to previous best
#> *** Solution reached
stressplot(Family.mds)
#add the metadata information
##THIS HAS TO BE IN THE SAME ORDER IN THE METADATA FILE AND IN THE COUNTS FILE
colvec<- c("yellowgreen","turquoise4", "tomato2")
plot(Family.mds, type="n", xlim=c(-.5,.5), ylim=c(-0.6,0.6))
pl <-ordiellipse(Family.mds, Metadata$Month, kind="se", conf=0.95, lwd=2, col="gray30", label=T)
#plot with ggplot
data.scores <- as.data.frame(scores(Family.mds))
data.scores$site <- rownames(data.scores)
data.scores$grp <- Metadata$Month
head(data.scores)
#> NMDS1 NMDS2 site grp
#> 1 -0.14279213 0.001943053 1 July
#> 2 -0.12419134 -0.030650515 2 July
#> 3 -0.10328531 -0.119223304 3 July
#> 4 -0.09456066 0.107006047 4 August
#> 5 0.04013923 0.030698429 5 August
#> 6 0.08413096 -0.003885208 6 August
library(ggplot2)
NMDS <- data.frame(MDS1 = Family.mds$points[,1], MDS2 = Family.mds$points[,2],group=Metadata$Month)
#NMDS.mean <- aggregate(NMDS[,1:2],list(group=group),mean)
veganCovEllipse<-function (cov, center = c(0, 0), scale = 1, npoints = 100)
{
theta <- (0:npoints) * 2 * pi/npoints
Circle <- cbind(cos(theta), sin(theta))
t(center + scale * t(Circle %*% chol(cov)))
}
#attach(NMDS.mean)
df_ell <- data.frame()
for(g in levels(NMDS$group)){
df_ell <- rbind(df_ell, cbind(as.data.frame(with(NMDS[NMDS$group==g,],
veganCovEllipse(pl[[g]]$cov,pl[[g]]$center,pl[[g]]$scale)))
,group=g))
}
ggplot() +
geom_path(data=df_ell, aes(x=NMDS1, y=NMDS2,colour=group), size=1, linetype=2)+
geom_point(data=data.scores,aes(x=NMDS1,y=NMDS2,colour=grp),size=1.5) + # add the point markers
theme(legend.title = element_blank()) +
ylab("NMDS2")+
xlab("NMDS1")+
#geom_text(data=data.scores,aes(x=NMDS1,y=NMDS2,label=site),size=6,vjust=0) +
scale_colour_manual(values=c("July" = "yellowgreen", "August" = "turquoise4", "September" = "pink"))
Created on 2021-06-01 by the reprex package (v0.3.0)