I have a dataset with 3342 rows and want to keep only the rows whose `Blast protein ID` contains any of several phrases (as partial matches). I've tried using dplyr's `filter()` in different ways and keep getting errors.
> dput(head(full_data))
structure(list(`Identified Proteins (3404)` = c("Casein kinase II subunit beta OS=Tetradesmus obliquus OX=3088 GN=BQ4739_LOCUS9 PE=3 SV=1",
"Uncharacterized protein OS=Tetradesmus obliquus OX=3088 GN=BQ4739_LOCUS7 PE=4 SV=1",
"DNA helicase OS=Tetradesmus obliquus OX=3088 GN=BQ4739_LOCUS38 PE=3 SV=1",
"Uncharacterized protein OS=Tetradesmus obliquus OX=3088 GN=BQ4739_LOCUS55 PE=3 SV=1",
"Uncharacterized protein OS=Tetradesmus obliquus OX=3088 GN=BQ4739_LOCUS89 PE=4 SV=1",
"Uncharacterized protein OS=Tetradesmus obliquus OX=3088 GN=BQ4739_LOCUS120 PE=4 SV=1"
), `Accession Number` = c("A0A383V1G7", "A0A383V1H7", "A0A383V1J2",
"A0A383V1M7", "A0A383V1R6", "A0A383V1S1"), `Blast protein ID` = c("Casein kinase II subunit beta",
"Uncharacterized", "DNA helicase", "Methanethiol oxidase", "Uncharacterized",
"Thylakoid lumenal protein, chlorplastic"), `Blast Accession` = c("A0A383V1G7",
"A0A383V1H7", "A0A383V1J2", "Q8VIF7", "A0A383V1R6", "P81760"),
`Blast taxonomy` = c("Tetradesmus obliquus", NA, "Tetradesmus obliquus",
"Rattus norvegicus", "Tetradesmus obliquus", "Arabidopsis thaliana"
), `Reference A` = c(0.0807, 0.0152, -0.1225, 0.0443, -0.0241,
-0.0056), `Reference B` = c(-0.1262, -0.0034, 0.113, -0.052,
0.0498, 0.0151), `Anaerobic_24h A` = c(0.4803, -0.247, 0.2999,
0.2437, -0.9771, -1.088), `Anaerobic_24h B` = c(0.2009, -0.4038,
0.3061, 0.2491, -1.0182, -1.0642), `Anaerobic_28d A` = c(-0.3963,
-1.0501, 0.9427, -0.1703, 0.4629, -1.3814), `Anaerobic_28d B` = c(0.0993,
-0.5398, 0.8354, -0.1983, 0.3515, -1.4052), `Citric_24h A` = c(0.1738,
0.2246, 0.1355, -0.1582, -0.9691, 0.2904), `Citric_24h B` = c(0.3209,
0.2025, -0.1242, -0.0307, -0.7478, 0.0986), `Citric_28d A` = c(0.6117,
-0.3526, 0.964, 0.0942, -0.4023, -0.9981), `Citric_28d B` = c(0.4925,
-0.3011, 0.5552, 0.0344, -0.3841, -0.7916), Taxonomy = c("Tetradesmus obliquus",
"unknown", "Tetradesmus obliquus", "Tetradesmus obliquus",
"unknown", "unknown"), reference_avg = c(-0.02275, 0.0059,
-0.00475, -0.00385, 0.01285, 0.00475), anaerobic_24h_avg = c(0.3406,
-0.3254, 0.303, 0.2464, -0.99765, -1.0761), anaerobic_28d_avg = c(-0.1485,
-0.79495, 0.88905, -0.1843, 0.4072, -1.3933), citric_24h_avg = c(0.24735,
0.21355, 0.00565, -0.09445, -0.85845, 0.1945), Citric_28d_avg = c(0.5521,
-0.32685, 0.7596, 0.0643, -0.3932, -0.89485)), class = c("rowwise_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -6L), groups = structure(list(
.rows = structure(list(1L, 2L, 3L, 4L, 5L, 6L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame")))
# Attempt 1: filtering on the `Blast protein ID` column with dplyr::filter().
# This call fails with the error quoted directly below it.
# - full_data is supplied twice: once through %>% and again as the first
#   explicit argument, so filter() receives it as a filtering condition; that
#   is the argument the error message names.
# - full_data carries the "rowwise_df" class (see the dput output), which is
#   consistent with the "In row 1" / "must be of size 1" wording of the error.
# - %in% compares whole strings for equality, so even without the extra
#   argument it would not give the partial matches that are wanted.
# NOTE(review): "Oxoglutarate dehydrogeanase" looks like a misspelling of
#   "dehydrogenase" -- confirm before reusing this phrase list.
tca_data<-full_data %>%
filter(full_data,`Blast protein ID`%in% c("Malate dehydrogenase",
"Isocitrate dehydrogenase",
"Phosphoenolpyruvate carboxylase",
"Succinate dehydrogenase",
"Dihydrolipoyllysine",
"Succinate--CoA ligase",
"Fumarate hydratase",
"Oxoglutarate dehydrogeanase",
"Isocitrate lyase","Malate synthase"))
Error in `filter()`:
ℹ In argument: `full_data`.
ℹ In row 1.
Caused by error:
! `..1` must be of size 1, not size 3342.
Run `rlang::last_trace()` to see where the error occurred.
# Attempt 2: subsetting rows by matching the phrases against the row names.
# - rownames(full_data) are the automatic row numbers (the dput output of the
#   first six rows has row.names = c(NA, -6L)), not protein names, so none of
#   the phrases can occur in them.
# - NOTE(review): %like% is not base R; presumably it is data.table's, which
#   hands its right-hand side to grepl() as one pattern -- with a vector of
#   phrases only the first element would be used. Confirm which package
#   supplies it.
tca_data<-full_data[rownames(full_data)%like%c("Malate dehydrogenase",
"Isocitrate dehydrogenase",
"Phosphoenolpyruvate carboxylase",
"Succinate dehydrogenase",
"Dihydrolipoyllysine",
"Succinate--CoA ligase",
"Fumarate hydratase",
"Oxoglutarate dehydrogeanase",
"Isocitrate lyase","Malate synthase"),]
# This last attempt returns all the column headings but no rows.
Any advice would be really helpful.