Filtering rows in dataframe by multiple strings

So I have 3342 rows in my dataset and want to filter for rows containing certain phrases (as partial matches). I've tried using the filter command in different ways and keep getting errors.

> dput(head(full_data))
structure(list(`Identified Proteins (3404)` = c("Casein kinase II subunit beta OS=Tetradesmus obliquus OX=3088 GN=BQ4739_LOCUS9 PE=3 SV=1", 
"Uncharacterized protein OS=Tetradesmus obliquus OX=3088 GN=BQ4739_LOCUS7 PE=4 SV=1", 
"DNA helicase OS=Tetradesmus obliquus OX=3088 GN=BQ4739_LOCUS38 PE=3 SV=1", 
"Uncharacterized protein OS=Tetradesmus obliquus OX=3088 GN=BQ4739_LOCUS55 PE=3 SV=1", 
"Uncharacterized protein OS=Tetradesmus obliquus OX=3088 GN=BQ4739_LOCUS89 PE=4 SV=1", 
"Uncharacterized protein OS=Tetradesmus obliquus OX=3088 GN=BQ4739_LOCUS120 PE=4 SV=1"
), `Accession Number` = c("A0A383V1G7", "A0A383V1H7", "A0A383V1J2", 
"A0A383V1M7", "A0A383V1R6", "A0A383V1S1"), `Blast protein ID` = c("Casein kinase II subunit beta", 
"Uncharacterized", "DNA helicase", "Methanethiol oxidase", "Uncharacterized", 
"Thylakoid lumenal protein, chlorplastic"), `Blast Accession` = c("A0A383V1G7", 
"A0A383V1H7", "A0A383V1J2", "Q8VIF7", "A0A383V1R6", "P81760"), 
    `Blast taxonomy` = c("Tetradesmus obliquus", NA, "Tetradesmus obliquus", 
    "Rattus norvegicus", "Tetradesmus obliquus", "Arabidopsis thaliana"
    ), `Reference A` = c(0.0807, 0.0152, -0.1225, 0.0443, -0.0241, 
    -0.0056), `Reference B` = c(-0.1262, -0.0034, 0.113, -0.052, 
    0.0498, 0.0151), `Anaerobic_24h A` = c(0.4803, -0.247, 0.2999, 
    0.2437, -0.9771, -1.088), `Anaerobic_24h B` = c(0.2009, -0.4038, 
    0.3061, 0.2491, -1.0182, -1.0642), `Anaerobic_28d A` = c(-0.3963, 
    -1.0501, 0.9427, -0.1703, 0.4629, -1.3814), `Anaerobic_28d B` = c(0.0993, 
    -0.5398, 0.8354, -0.1983, 0.3515, -1.4052), `Citric_24h A` = c(0.1738, 
    0.2246, 0.1355, -0.1582, -0.9691, 0.2904), `Citric_24h B` = c(0.3209, 
    0.2025, -0.1242, -0.0307, -0.7478, 0.0986), `Citric_28d A` = c(0.6117, 
    -0.3526, 0.964, 0.0942, -0.4023, -0.9981), `Citric_28d B` = c(0.4925, 
    -0.3011, 0.5552, 0.0344, -0.3841, -0.7916), Taxonomy = c("Tetradesmus obliquus", 
    "unknown", "Tetradesmus obliquus", "Tetradesmus obliquus", 
    "unknown", "unknown"), reference_avg = c(-0.02275, 0.0059, 
    -0.00475, -0.00385, 0.01285, 0.00475), anaerobic_24h_avg = c(0.3406, 
    -0.3254, 0.303, 0.2464, -0.99765, -1.0761), anaerobic_28d_avg = c(-0.1485, 
    -0.79495, 0.88905, -0.1843, 0.4072, -1.3933), citric_24h_avg = c(0.24735, 
    0.21355, 0.00565, -0.09445, -0.85845, 0.1945), Citric_28d_avg = c(0.5521, 
    -0.32685, 0.7596, 0.0643, -0.3932, -0.89485)), class = c("rowwise_df", 
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -6L), groups = structure(list(
    .rows = structure(list(1L, 2L, 3L, 4L, 5L, 6L), ptype = integer(0), class = c("vctrs_list_of", 
    "vctrs_vctr", "list"))), row.names = c(NA, -6L), class = c("tbl_df", 
"tbl", "data.frame")))


# Filtering based on Blast Protein ID's

# First attempt: this is the call that raises the filter() error.
# `full_data` reaches filter() twice: once through the pipe and once more as
# the first argument written inside the call, so filter() is handed the whole
# data frame in the position where it expects a row condition.
# NOTE(review): `%in%` tests for exact equality with the listed names, so even
# without the extra argument it would not select partial matches.
tca_data<-full_data %>% 
  filter(full_data,`Blast protein ID`%in% c("Malate dehydrogenase",
                                            "Isocitrate dehydrogenase",
                                            "Phosphoenolpyruvate carboxylase",
                                            "Succinate dehydrogenase",
                                            "Dihydrolipoyllysine",
                                            "Succinate--CoA ligase",
                                            "Fumarate hydratase",
                                            "Oxoglutarate dehydrogeanase",
                                            "Isocitrate lyase","Malate synthase"))

Error in `filter()`:
ℹ In argument: `full_data`.
ℹ In row 1.
Caused by error:
! `..1` must be of size 1, not size 3342.
Run `rlang::last_trace()` to see where the error occurred.

# Second attempt: the test is applied to rownames(full_data), i.e. the row
# labels, rather than to the `Blast protein ID` column that holds the names.
# NOTE(review): `%like%` is not defined in this snippet (presumably
# data.table's operator) -- confirm how it treats a vector of several patterns.
tca_data<-full_data[rownames(full_data)%like%c("Malate dehydrogenase",
                                     "Isocitrate dehydrogenase",
                                     "Phosphoenolpyruvate carboxylase",
                                     "Succinate dehydrogenase",
                                     "Dihydrolipoyllysine",
                                     "Succinate--CoA ligase",
                                     "Fumarate hydratase",
                                     "Oxoglutarate dehydrogeanase",
                                     "Isocitrate lyase","Malate synthase"),]

# This last one provides all the column headings but no rows

Any advice would be really helpful

You are passing `full_data` to `filter()` twice: once through the pipe and again as the first argument inside the call. Remove the explicit `full_data` from inside `filter()`. Try:

# The pipe already supplies `full_data` as filter()'s first argument, so only
# the row condition is written inside the call.
# `%in%` keeps the rows whose `Blast protein ID` is exactly equal to one of
# the listed names.
tca_data <- full_data %>%
  filter(
    `Blast protein ID` %in% c(
      "Malate dehydrogenase",
      "Isocitrate dehydrogenase",
      "Phosphoenolpyruvate carboxylase",
      "Succinate dehydrogenase",
      "Dihydrolipoyllysine",
      "Succinate--CoA ligase",
      "Fumarate hydratase",
      "Oxoglutarate dehydrogeanase",
      "Isocitrate lyase",
      "Malate synthase"
    )
  )
# Reproducible copy of the six example rows, with shorter column names.
# Each *_avg column holds the mean of the matching A/B replicate columns.
d <- data.frame(
  id_prot = c(
    "Casein kinase II subunit beta OS=Tetradesmus obliquus OX=3088 GN=BQ4739_LOCUS9 PE=3 SV=1",
    "Uncharacterized protein OS=Tetradesmus obliquus OX=3088 GN=BQ4739_LOCUS7 PE=4 SV=1",
    "DNA helicase OS=Tetradesmus obliquus OX=3088 GN=BQ4739_LOCUS38 PE=3 SV=1",
    "Uncharacterized protein OS=Tetradesmus obliquus OX=3088 GN=BQ4739_LOCUS55 PE=3 SV=1",
    "Uncharacterized protein OS=Tetradesmus obliquus OX=3088 GN=BQ4739_LOCUS89 PE=4 SV=1",
    "Uncharacterized protein OS=Tetradesmus obliquus OX=3088 GN=BQ4739_LOCUS120 PE=4 SV=1"
  ),
  acc_no = c(
    "A0A383V1G7", "A0A383V1H7", "A0A383V1J2",
    "A0A383V1M7", "A0A383V1R6", "A0A383V1S1"
  ),
  blast_id_prot = c(
    "Casein kinase II subunit beta", "Uncharacterized",
    "DNA helicase", "Methanethiol oxidase",
    "Uncharacterized", "Thylakoid lumenal protein, chlorplastic"
  ),
  blast_acc_no = c("A0A383V1G7", "A0A383V1H7", "A0A383V1J2",
                   "Q8VIF7", "A0A383V1R6", "P81760"),
  blast_taxon = c("Tetradesmus obliquus", NA,
                  "Tetradesmus obliquus", "Rattus norvegicus",
                  "Tetradesmus obliquus", "Arabidopsis thaliana"),
  ref_a = c(0.0807, 0.0152, -0.1225, 0.0443, -0.0241, -0.0056),
  ref_b = c(-0.1262, -0.0034, 0.113, -0.052, 0.0498, 0.0151),
  anaerob_24hA = c(0.4803, -0.247, 0.2999, 0.2437, -0.9771, -1.088),
  anaerob_24hB = c(0.2009, -0.4038, 0.3061, 0.2491, -1.0182, -1.0642),
  # first value is -0.3963 as in the posted data (it had been shortened to
  # -0.396, which no longer averages with anaerob_28dB to anaerob_28d_avg)
  anaerob_28dA = c(-0.3963, -1.0501, 0.9427, -0.1703, 0.4629, -1.3814),
  anaerob_28dB = c(0.0993, -0.5398, 0.8354, -0.1983, 0.3515, -1.4052),
  citric_24hA = c(0.1738, 0.2246, 0.1355, -0.1582, -0.9691, 0.2904),
  citric_24hB = c(0.3209, 0.2025, -0.1242, -0.0307, -0.7478, 0.0986),
  citric_28dA = c(0.6117, -0.3526, 0.964, 0.0942, -0.4023, -0.9981),
  citric_28dB = c(0.4925, -0.3011, 0.5552, 0.0344, -0.3841, -0.7916),
  taxonomy = c("Tetradesmus obliquus", "unknown", "Tetradesmus obliquus",
               "Tetradesmus obliquus", "unknown", "unknown"),
  ref_avg = c(-0.02275, 0.0059, -0.00475, -0.00385, 0.01285, 0.00475),
  anaerob_24h_avg = c(0.3406, -0.3254, 0.303, 0.2464, -0.99765, -1.0761),
  anaerob_28d_avg = c(-0.1485, -0.79495, 0.88905, -0.1843, 0.4072, -1.3933),
  citric_24h_avg = c(0.24735, 0.21355, 0.00565, -0.09445, -0.85845, 0.1945),
  citric_28d_avg = c(0.5521, -0.32685, 0.7596, 0.0643, -0.3932, -0.89485)
)

# protein names to look up in d$blast_id_prot
q <- c(
  "Malate dehydrogenase",
  "Isocitrate dehydrogenase",
  "Phosphoenolpyruvate carboxylase",
  "Succinate dehydrogenase",
  "Dihydrolipoyllysine",
  "Succinate--CoA ligase",
  "Fumarate hydratase",
  "Oxoglutarate dehydrogeanase",
  "Isocitrate lyase",
  "Malate synthase"
)

# keep the rows whose blast_id_prot is exactly equal to one of the names in q
# (%in% never returns NA, so the logical vector can index the rows directly)
tca <- d[d$blast_id_prot %in% q, ]
tca
#>  [1] id_prot         acc_no          blast_id_prot   blast_acc_no   
#>  [5] blast_taxon     ref_a           ref_b           anaerob_24hA   
#>  [9] anaerob_24hB    anaerob_28dA    anaerob_28dB    citric_24hA    
#> [13] citric_24hB     citric_28dA     citric_28dB     taxonomy       
#> [17] ref_avg         anaerob_24h_avg anaerob_28d_avg citric_24h_avg 
#> [21] citric_28d_avg 
#> <0 rows> (or 0-length row.names)

# tca has zero rows because no value in the data frame column is
# identical to any entry of the query vector: compare the two

d[["blast_id_prot"]]
#> [1] "Casein kinase II subunit beta"          
#> [2] "Uncharacterized"                        
#> [3] "DNA helicase"                           
#> [4] "Methanethiol oxidase"                   
#> [5] "Uncharacterized"                        
#> [6] "Thylakoid lumenal protein, chlorplastic"

q
#>  [1] "Malate dehydrogenase"            "Isocitrate dehydrogenase"       
#>  [3] "Phosphoenolpyruvate carboxylase" "Succinate dehydrogenase"        
#>  [5] "Dihydrolipoyllysine"             "Succinate--CoA ligase"          
#>  [7] "Fumarate hydratase"              "Oxoglutarate dehydrogeanase"    
#>  [9] "Isocitrate lyase"                "Malate synthase"

# add a value that is known to occur in the data frame (row 4, column 3)
q <- c(q, d[4, 3])

# the same lookup now returns one row rather than none
tca <- d[d$blast_id_prot %in% q, ]
tca
#>                                                                               id_prot
#> 4 Uncharacterized protein OS=Tetradesmus obliquus OX=3088 GN=BQ4739_LOCUS55 PE=3 SV=1
#>       acc_no        blast_id_prot blast_acc_no       blast_taxon  ref_a  ref_b
#> 4 A0A383V1M7 Methanethiol oxidase       Q8VIF7 Rattus norvegicus 0.0443 -0.052
#>   anaerob_24hA anaerob_24hB anaerob_28dA anaerob_28dB citric_24hA citric_24hB
#> 4       0.2437       0.2491      -0.1703      -0.1983     -0.1582     -0.0307
#>   citric_28dA citric_28dB             taxonomy  ref_avg anaerob_24h_avg
#> 4      0.0942      0.0344 Tetradesmus obliquus -0.00385          0.2464
#>   anaerob_28d_avg citric_24h_avg citric_28d_avg
#> 4         -0.1843       -0.09445         0.0643

Created on 2023-03-17 with reprex v2.0.2

Hello,

That almost worked, except it didn't select the rows that were partial matches. How would you reword it to select the rows that are partial matches to the ones you have listed for q in your example?

Hello,

That almost worked, except it didn't select the rows that were partial matches. How would you reword it to select the rows that are partial matches to the ones you have listed for q in your example?

Fuzzy match with stringdist::ain

library(stringdist)

# Reproducible copy of the six example rows, with shorter column names.
# Each *_avg column holds the mean of the matching A/B replicate columns.
d <- data.frame(
  id_prot = c(
    "Casein kinase II subunit beta OS=Tetradesmus obliquus OX=3088 GN=BQ4739_LOCUS9 PE=3 SV=1",
    "Uncharacterized protein OS=Tetradesmus obliquus OX=3088 GN=BQ4739_LOCUS7 PE=4 SV=1",
    "DNA helicase OS=Tetradesmus obliquus OX=3088 GN=BQ4739_LOCUS38 PE=3 SV=1",
    "Uncharacterized protein OS=Tetradesmus obliquus OX=3088 GN=BQ4739_LOCUS55 PE=3 SV=1",
    "Uncharacterized protein OS=Tetradesmus obliquus OX=3088 GN=BQ4739_LOCUS89 PE=4 SV=1",
    "Uncharacterized protein OS=Tetradesmus obliquus OX=3088 GN=BQ4739_LOCUS120 PE=4 SV=1"
  ),
  acc_no = c(
    "A0A383V1G7", "A0A383V1H7", "A0A383V1J2",
    "A0A383V1M7", "A0A383V1R6", "A0A383V1S1"
  ),
  blast_id_prot = c(
    "Casein kinase II subunit beta", "Uncharacterized",
    "DNA helicase", "Methanethiol oxidase",
    "Uncharacterized", "Thylakoid lumenal protein, chlorplastic"
  ),
  blast_acc_no = c("A0A383V1G7", "A0A383V1H7", "A0A383V1J2",
                   "Q8VIF7", "A0A383V1R6", "P81760"),
  blast_taxon = c("Tetradesmus obliquus", NA,
                  "Tetradesmus obliquus", "Rattus norvegicus",
                  "Tetradesmus obliquus", "Arabidopsis thaliana"),
  ref_a = c(0.0807, 0.0152, -0.1225, 0.0443, -0.0241, -0.0056),
  ref_b = c(-0.1262, -0.0034, 0.113, -0.052, 0.0498, 0.0151),
  anaerob_24hA = c(0.4803, -0.247, 0.2999, 0.2437, -0.9771, -1.088),
  anaerob_24hB = c(0.2009, -0.4038, 0.3061, 0.2491, -1.0182, -1.0642),
  # first value is -0.3963 as in the posted data (it had been shortened to
  # -0.396, which no longer averages with anaerob_28dB to anaerob_28d_avg)
  anaerob_28dA = c(-0.3963, -1.0501, 0.9427, -0.1703, 0.4629, -1.3814),
  anaerob_28dB = c(0.0993, -0.5398, 0.8354, -0.1983, 0.3515, -1.4052),
  citric_24hA = c(0.1738, 0.2246, 0.1355, -0.1582, -0.9691, 0.2904),
  citric_24hB = c(0.3209, 0.2025, -0.1242, -0.0307, -0.7478, 0.0986),
  citric_28dA = c(0.6117, -0.3526, 0.964, 0.0942, -0.4023, -0.9981),
  citric_28dB = c(0.4925, -0.3011, 0.5552, 0.0344, -0.3841, -0.7916),
  taxonomy = c("Tetradesmus obliquus", "unknown", "Tetradesmus obliquus",
               "Tetradesmus obliquus", "unknown", "unknown"),
  ref_avg = c(-0.02275, 0.0059, -0.00475, -0.00385, 0.01285, 0.00475),
  anaerob_24h_avg = c(0.3406, -0.3254, 0.303, 0.2464, -0.99765, -1.0761),
  anaerob_28d_avg = c(-0.1485, -0.79495, 0.88905, -0.1843, 0.4072, -1.3933),
  citric_24h_avg = c(0.24735, 0.21355, 0.00565, -0.09445, -0.85845, 0.1945),
  citric_28d_avg = c(0.5521, -0.32685, 0.7596, 0.0643, -0.3932, -0.89485)
)

# Phrases to search for: a row is kept when its blast_id_prot contains any of
# them. ("Oxoglutarate dehydrogenase" had been misspelled "dehydrogeanase",
# which cannot occur inside a correctly spelled protein name.)
q <- c(
  "Malate dehydrogenase",
  "Isocitrate dehydrogenase",
  "Phosphoenolpyruvate carboxylase",
  "Succinate dehydrogenase",
  "Dihydrolipoyllysine",
  "Succinate--CoA ligase",
  "Fumarate hydratase",
  "Oxoglutarate dehydrogenase",
  "Isocitrate lyase",
  "Malate synthase"
)


# introduce an item we know to be in the data frame
q <- append(q, d[4, 3])

# create a row with an intentional partial match: its ID contains the
# phrase "Malate dehydrogenase" followed by extra text
d[7, ] <- d[6, ]
d[7, 3] <- "Malate dehydrogenase, mitochondrial"

# stringdist::ain(d$blast_id_prot, q) with its default settings only accepts
# (near-)identical strings, so it finds the full match in row 4 but not a
# partial one. Instead, test every phrase as a literal, case-insensitive
# substring of the ID and keep a row as soon as one phrase is found in it.
# fixed = TRUE stops characters such as "." or "(" in a phrase from being
# read as regular-expression syntax; the all-FALSE start value keeps the
# result well-defined when q is empty.
ids <- tolower(d$blast_id_prot)
hits <- Reduce(
  `|`,
  lapply(tolower(q), grepl, x = ids, fixed = TRUE),
  rep(FALSE, length(ids))
)

# now we get two rows instead of one--
# one with a full match and one with a
# partial match
(tca <- d[hits, ])
#>                                                                                id_prot
#> 4  Uncharacterized protein OS=Tetradesmus obliquus OX=3088 GN=BQ4739_LOCUS55 PE=3 SV=1
#> 7 Uncharacterized protein OS=Tetradesmus obliquus OX=3088 GN=BQ4739_LOCUS120 PE=4 SV=1
#>       acc_no                       blast_id_prot blast_acc_no
#> 4 A0A383V1M7                Methanethiol oxidase       Q8VIF7
#> 7 A0A383V1S1 Malate dehydrogenase, mitochondrial       P81760
#>            blast_taxon   ref_a   ref_b anaerob_24hA anaerob_24hB anaerob_28dA
#> 4    Rattus norvegicus  0.0443 -0.0520       0.2437       0.2491      -0.1703
#> 7 Arabidopsis thaliana -0.0056  0.0151      -1.0880      -1.0642      -1.3814
#>   anaerob_28dB citric_24hA citric_24hB citric_28dA citric_28dB
#> 4      -0.1983     -0.1582     -0.0307      0.0942      0.0344
#> 7      -1.4052      0.2904      0.0986     -0.9981     -0.7916
#>               taxonomy  ref_avg anaerob_24h_avg anaerob_28d_avg citric_24h_avg
#> 4 Tetradesmus obliquus -0.00385          0.2464         -0.1843       -0.09445
#> 7              unknown  0.00475         -1.0761         -1.3933        0.19450
#>   citric_28d_avg
#> 4        0.06430
#> 7       -0.89485

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.