Remove values Based on a criteria and re-run average, sd and sem

dplyr
rstudio

#1

Hello-

I apologize if my question in confusing. I believe this coding is above my intermediate R knowledge level.

I know there is a previous thread that begins to touch on this but I do not know how to adapt the answer to my question.

I summarized a bunch of data using the following code in R. Now, I want to recalculate mean, sd, cv, and sem by removing any specific values that will allow the following logic to be true (sd <1.5).

I'm not sure if there is a way to do this, but I have added code below and shown an example that I created using excel. Please help me find if there is there a way to do this. I have 10 datasets of 9217 vales each and I'm going a bit nutty trying to do this in excel.

#load libraries

library(devtools)
library(plyr)
library(dplyr)
library(data.table)
library(xlsx)
library(readxl)
library(reshape2)
library(tibble)

#dummy data

IFC <- data.frame(stringsAsFactors=FALSE,
              ï..Position = c("A01", "A05", "A09", "D03", "D07", "D11", "A03", "A07",
                              "A11", "B01", "B05", "B09"),
          Mature.miRNA.ID = c("hsa-let-7a-5p", "hsa-let-7a-5p", "hsa-let-7a-5p",
                              "hsa-let-7c-5p", "hsa-let-7c-5p",
                              "hsa-let-7c-5p", "hsa-let-7d-3p", "hsa-let-7d-3p",
                              "hsa-let-7d-3p", "hsa-let-7d-5p", "hsa-let-7d-5p",
                              "hsa-let-7d-5p"),
              V1.RNA.0001 = c(5.06, 4.8, 5.39, 10.28, 5.05, 4.91, 5.77, 10.23, 5.48,
                              4.88, 8.39, 9.76),
              V1.RNA.0002 = c(4.8, 4.71, 9.26, 13.35, 4.18, 4.33, 4.5, 10.92, 4.44,
                              4.26, 9.62, 11.91),
              V1.RNA.0003 = c(5.36, 4.84, 9.16, 10.31, 4.92, 4.5, 9.15, 10.95, 4.81,
                              4.6, 9.29, 11.18),
              V1.RNA.0004 = c(4.85, 4.74, 11.12, 12.59, 4.83, 4.63, 11.93, 12.84,
                              4.73, 4.79, 10.87, 12.82),
              V1.RNA.0005 = c(4.93, 4.21, 4.85, 9.73, 4.48, 4.47, 4.96, 9.89, 4.62,
                              3.67, 4.92, 9.45),
              V1.RNA.0006 = c(5.6, 4.62, 11.77, 13.4, 4.86, 4.79, 12.57, 13.08, 4.8,
                              4.64, 11.62, 14.55),
              V1.RNA.0007 = c(6.87, 6.03, 12.54, 12.86, 4.84, 4.78, 11.71, 12.76,
                              4.83, 4.72, 11.65, 12.89),
              V1.RNA.0008 = c(4.26, 4.36, 10.99, 12.38, 4.95, 4.64, 11.15, 12.89,
                              3.95, 4.49, 10.88, 11.99),
              V1.RNA.0009 = c(7.69, 5.86, 12.58, 13.99, 4.88, 4.63, 13.36, 13.53,
                              8.32, 4.85, 12.27, 13.27),
              V1.RNA.0010 = c(5.26, 4.6, 11.73, 13.01, 4.89, 4.72, 10.94, 12.84, 4.83,
                              4.41, 11.53, 12.88)
       )


#Script to write all IFC Data

IFC_out <- function(x) {
  IFC_melt <- melt(IFC, id.vars=c("ï..Position","Mature.miRNA.ID"))
  IFC_melt <- IFC_melt[complete.cases(IFC_melt),]
  IFC_sum <- ddply(IFC_melt, c("Mature.miRNA.ID", "variable"), summarise, length=length(value),
                   mean = mean(value), sd = sd(value), coefficient_of_variation=sd(value)/mean(value),
                   sem = sd(value)/sqrt(length(value)))
  df <- x
  IFC_melt <- add_column(IFC_melt, plate = df, .before = "Mature.miRNA.ID")
  IFC_sum <- add_column(IFC_sum, plate = df, .before = "Mature.miRNA.ID")
  st=format(Sys.time(), "%Y-%m-%d")
  head(IFC)
  head(IFC_melt)
  head(IFC_sum)
  write.xlsx(IFC, file = paste(df,"_", st,".xlsx", sep=""), sheetName="IFC_original", 
             col.names=TRUE, row.names=FALSE, append=FALSE)
  write.xlsx(IFC_melt, file = paste(df,"_", st,".xlsx", sep=""), sheetName="IFC_melt", 
             col.names=TRUE, row.names=FALSE, append=TRUE)
  write.xlsx(IFC_sum, file = paste(df,"_", st,".xlsx", sep=""), sheetName="IFC_sum1", 
             col.names=TRUE, row.names=FALSE, append=TRUE)
}

#example output before cleaning

tibble::tribble(
~plate, ~Mature.miRNA.ID,     ~variable, ~length,       ~mean,         ~sd, ~coefficient_of_variation,        ~sem,
"IFC1",  "hsa-let-7a-5p", "V1.RNA.0001",      3L, 5.083333333,  0.29569128,               0.058168776,  0.17071744,
"IFC1",  "hsa-let-7a-5p", "V1.RNA.0002",      3L, 6.256666667, 2.601352212,               0.415772863,   1.5018914,
"IFC1",  "hsa-let-7a-5p", "V1.RNA.0003",      3L, 6.453333333, 2.358417549,               0.365457265, 1.361633007,
"IFC1",  "hsa-let-7a-5p", "V1.RNA.0004",      3L, 6.903333333, 3.652154615,               0.529042194,  2.10857245,
"IFC1",  "hsa-let-7a-5p", "V1.RNA.0005",      3L, 4.663333333, 0.394630629,               0.084624152,   0.2278401,
"IFC1",  "hsa-let-7a-5p", "V1.RNA.0006",      3L,        7.33, 3.876248186,               0.528819671, 2.237952934,
"IFC1",  "hsa-let-7a-5p", "V1.RNA.0007",      3L,        8.48, 3.541059164,               0.417577732, 2.044431461,
"IFC1",  "hsa-let-7a-5p", "V1.RNA.0008",      3L, 6.536666667, 3.857023896,               0.590059749, 2.226853785,
"IFC1",  "hsa-let-7a-5p", "V1.RNA.0009",      3L,        8.71, 3.474176161,                0.39887212, 2.005816542,
"IFC1",  "hsa-let-7a-5p", "V1.RNA.0010",      3L, 7.196666667, 3.939826561,               0.547451583, 2.274659926
)

#example from excel on how to clean

ï..Position plate Mature.miRNA.ID variable value sd mean
A01 IFC1 hsa-let-7a-5p V1.RNA.0001 5.06
A05 IFC1 hsa-let-7a-5p V1.RNA.0001 4.8
A09 IFC1 hsa-let-7a-5p V1.RNA.0001 5.39 0.29569128 5.083333333
A01 IFC1 hsa-let-7a-5p V1.RNA.0002 4.8
A05 IFC1 hsa-let-7a-5p V1.RNA.0002 4.71
A09 IFC1 hsa-let-7a-5p V1.RNA.0002 0.06363961 4.755
A01 IFC1 hsa-let-7a-5p V1.RNA.0003 5.36
A05 IFC1 hsa-let-7a-5p V1.RNA.0003 4.84
A09 IFC1 hsa-let-7a-5p V1.RNA.0003 0.367695526 5.1

#2

Hi @steineri, I'm a newbie so might not be able to help you with your specific problem, but you might get more help from others if you format your code correctly for this forum and provide a reprex. Have a look at the following links to get you started :slight_smile::


#3

Thank you for your suggestion @mrblobby I reformatted the code using R Markdown. Unfortunately my institution will not allow me to link to open source data, but I have copy and pasted from the .csv document a sample set that I can share.


#4

Hey @steineri, you don't need to provide your real data, you can provide some dummy data. It's not quite clear to me exactly how you want to group the data from what you've posted, but here's an example of how I'd approach a very basic dataset using tidyverse functions (disclaimer: I am a newbie!):

library(tidyverse)

dummy_data <- tibble::tribble(
                ~Position, ~miRNA_ID, ~value1, ~value2, ~value3,
                    "A01",        1L,      1L,      1L,      1L,
                    "A01",        1L,      2L,      2L,      2L,
                    "A02",        2L,      1L,      5L,     20L,
                    "A03",        3L,      1L,      2L,      3L
                )

# Group by both Position and miRNA_ID, then summarise within each grouping to get mean, sd, etc.
summarised_data <- dummy_data %>%
  gather(key, value, -Position, -miRNA_ID) %>%
  group_by(Position, miRNA_ID) %>%
  summarise(mean = mean(value, na.rm = T), 
            sd = sd(value, na.rm = T),
            count = n())

summarised_data
#> # A tibble: 3 x 5
#> # Groups:   Position [?]
#>   Position miRNA_ID  mean     sd count
#>   <chr>       <int> <dbl>  <dbl> <int>
#> 1 A01             1  1.5   0.548     6
#> 2 A02             2  8.67 10.0       3
#> 3 A03             3  2     1         3

# Filter data where SD is less than 1.5.
summarised_data %>%
  filter(
    sd < 1.5
  )
#> # A tibble: 2 x 5
#> # Groups:   Position [2]
#>   Position miRNA_ID  mean    sd count
#>   <chr>       <int> <dbl> <dbl> <int>
#> 1 A01             1   1.5 0.548     6
#> 2 A03             3   2   1         3

Edit: PS, when creating dummy data, I find this RStudio plugin a godsend: https://github.com/MilesMcBain/datapasta. I create my data in excel, copy it to clipboard, then using datapasta I can 'paste' it into RStudio as code to create a tibble/dataframe


#5

Thank you for all your help @mrblobby. I believe I have it formatted correctly now.


#6

This still isn't 100% clear to me since your data after cleaning has extra columns and rows of data (e.g. where does the column value come from in your clean data), but are you wanting to keep the rows, but replace the values with blanks if SD is greater than 1.5 or remove those rows completely?

Edit: @steineri I've written the below based on the above. There are two summary functions, one that completely removes rows where sd < 1.5 and one which replaces the values for mean and sd with NA. Please note that I have used what I think is a tidyverse equivalent for complete.cases in this use. I assume you're wanting to remove rows where values are missing, so I've used filter(!is.na(value)) to achieve the same outcome. If that's not what you want, I can amend it. Also I haven't tried writing to xls spreadsheets, but I copied that from your script and assume it works :slight_smile

Edit 2: the umlaut was throwing an error for me so I replaced it with an i, you'll want to change that back! :

IFC_out <- function(x) {
  
  df <- x
  
  IFC_melt <- IFC %>%
    gather(key, value, -iPosition, -Mature.miRNA.ID) %>%
    add_column(plate = df, .before = "Mature.miRNA.ID")
  
  IFC_sum <- IFC_melt %>%
    filter(!is.na(value)) %>%
    group_by(iPosition, plate, Mature.miRNA.ID) %>%
    summarise(
      length = n(),
      mean = mean(value, na.rm = T), 
      sd = sd(value, na.rm = T),
      coefficient_of_variation=sd(value)/mean(value),
      sem = sd/sqrt(length)
      ) %>%
    filter(sd < 1.5)
  
  
  IFC_sum_with_blanks <- IFC_melt %>%
    filter(!is.na(value)) %>%
    group_by(iPosition, plate, Mature.miRNA.ID) %>%
    summarise(
      length = n(),
      mean = mean(value, na.rm = T), 
      sd = sd(value, na.rm = T),
      coefficient_of_variation=sd(value)/mean(value),
      sem = sd/sqrt(length)
      ) %>%
    mutate(
      mean = ifelse(sd < 1.5, mean, NA),
      sd = ifelse(sd < 1.5, sd, NA)
    )
    
  
  head(IFC)
  head(IFC_melt)
  head(IFC_sum)
  
  st=format(Sys.time(), "%Y-%m-%d")
  
  write.xlsx(IFC, file = paste(df,"_", st,".xlsx", sep=""), sheetName="IFC_original", 
             col.names=TRUE, row.names=FALSE, append=FALSE)
  write.xlsx(IFC_melt, file = paste(df,"_", st,".xlsx", sep=""), sheetName="IFC_melt", 
             col.names=TRUE, row.names=FALSE, append=TRUE)
  write.xlsx(IFC_sum, file = paste(df,"_", st,".xlsx", sep=""), sheetName="IFC_sum1", 
             col.names=TRUE, row.names=FALSE, append=TRUE)
}

#7

@mrblobby, thank you so much for your help, both in teaching me how to format my code in this forum and providing a solution. I truly appreciate your help!


#8

You're most welcome @steineri. Have you managed to try it out to make sure it works as you expect?

Also, if you want to learn more about the different tidyverse functions, I suggest reading this book (available freely online): https://r4ds.had.co.nz/. I found it extremely useful and it makes using R a lot less painful (and, dare I say, fun :wink: ).