New variable based on an increasing range of other variables

Hi,
I have this simple data file:

# Example survey data: one row per respondent with a free-text comment.
# The data frame is assigned to `source` because the code that follows reads
# `source$comment`.
source <- data.frame(
  stringsAsFactors = FALSE,
  Unique.respondent.number = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12),
  # The first comment is written across two lines, so the string itself
  # contains a line break followed by the second line's leading spaces.
  comment = c("I have seen many various charges in my life,
                                                                                                            but I don’t like your saving rates",
              "I like R Studio", "No comment",
              "Main benefit is having low charges",
              NA,
              "Charge could be an issue",
              "Issues with saving rates", "Good saving rates",
              "Many benefits like reasonable charges", "N/A", "-",
              "Nothing")
)

Now, I have to create new variables based on key words found in the "comment" str variable.
My R guru ( andresrcs) helped me to create this code:


library(dplyr)
library(stringr)

# Blank: 1 when the comment is missing or shorter than 3 characters, else 0.
# This runs after library(stringr) because str_length() comes from stringr.
source$Blank <- ifelse(
  is.na(source$comment) | str_length(source$comment) < 3,
  1,
  0
)

# Case-insensitive pattern for wording that signals a complaint.
negative_sentiments <- regex("don.?t\\slike|issue|bad", ignore_case = TRUE)

result <- source %>%
  mutate(
    # Comment mentions charge(s), unless it also mentions benefit(s).
    Charges_Fees = if_else(
      str_detect(comment, regex("charges?", ignore_case = TRUE)) &
        !str_detect(comment, regex("benefits?", ignore_case = TRUE)),
      1, 0
    ),
    # Comment mentions "saving rates" together with a negative phrase.
    Poor_Rates = if_else(
      str_detect(comment, negative_sentiments) &
        str_detect(comment, regex("saving\\srates", ignore_case = TRUE)),
      1, 0
    )
  ) %>%
  # A missing comment makes str_detect() return NA, so the flags above are NA
  # for that row; replace those NAs with 0.
  mutate_if(is.numeric, ~ if_else(is.na(.), 0, .))

result

Once the coding is done, I need to create a new variable called "Other" that equals 1 if none of the new variables above equals 1.
I know there are many ways of doing that but I am looking for a clever solution as the source data file is growing and I will have to create more and more variables in the code above.

Can I set up something like:
If Blank+Charges_Fees+Poor_Rates+"variable4"+"variable4"+...+"variable_n"=0 then "Other"=1?

I have a solution:

# "Other" category, built in two steps.

# Step 1: add up the category flags of each row.
result[["Other"]] <- result[["Charges_Fees"]] + result[["Good_Rates"]] +
  result[["Poor_Rates"]] + result[["Blank"]]
result

# Step 2: rows whose flags add up to zero get Other = 1, every other row 0.
result[["Other"]] <- as.numeric(result[["Other"]] == 0)
result

but that is a way around it and I was looking for something clever with a function taking into account variables 3 onwards (or ideally all variables created in the mutate function).

Still hoping to get a solution :thinking:

I haven't tested, but what happens if you try this?

# Flag rows in which no category column is set. Columns 1-2 (respondent id and
# comment) are skipped. An "Other" column left over from an earlier run is
# skipped too: if it were counted, no row would sum to zero and the new flag
# would come out as all 0s.
category_cols <- setdiff(names(result)[-(1:2)], "Other")
result$Other <- ifelse(rowSums(result[category_cols]) == 0, 1, 0)

It returns 0s :frowning_face:

It shouldn't. I just tried with the limited data set you've provided, and it works. Please check.

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(stringr)

# Twelve example survey responses; row 5 has no comment at all.
# The first comment is written across two lines, so the string itself contains
# a line break followed by the second line's leading spaces.
source <- data.frame(
  Unique.respondent.number = seq_len(12L),
  comment = c(
    "I have seen many various charges in my life,
                                                                                                            but I don’t like your saving rates",
    "I like R Studio",
    "No comment",
    "Main benefit is having low charges",
    NA,
    "Charge could be an issue",
    "Issues with saving rates",
    "Good saving rates",
    "Many benefits like reasonable charges",
    "N/A",
    "-",
    "Nothing"
  ),
  stringsAsFactors = FALSE
)

# Case-insensitive pattern for wording that signals a complaint.
negative_sentiments <- regex(pattern = "don.?t\\slike|issue|bad",
                             ignore_case = TRUE)

# Add one 0/1 column per category, then set Other = 1 where none applies.
source %>%
  mutate(
    # Blank: comment is missing or shorter than 3 characters. The column is
    # referenced as `comment` (not `source$comment`) so it is always taken
    # from the data being piped in.
    Blank = ifelse(test = is.na(x = comment),
                   yes = 1,
                   no = ifelse(test = str_length(string = comment) < 3,
                               yes = 1,
                               no = 0)),
    # Charges_Fees: mentions charge(s), unless it also mentions benefit(s).
    Charges_Fees = if_else(
      condition = str_detect(string = comment,
                             pattern = regex(pattern = "charges?",
                                             ignore_case = TRUE)) &
        !str_detect(string = comment,
                    pattern = regex(pattern = "benefits?",
                                    ignore_case = TRUE)),
      true = 1,
      false = 0
    ),
    # Poor_Rates: a negative phrase together with "saving rates".
    Poor_Rates = if_else(
      condition = str_detect(string = comment,
                             pattern = negative_sentiments) &
        str_detect(string = comment,
                   pattern = regex(pattern = "saving\\srates",
                                   ignore_case = TRUE)),
      true = 1,
      false = 0
    )
  ) %>%
  # A missing comment leaves NA in the flags above; replace those NAs with 0.
  mutate_if(.predicate = is.numeric,
            .funs = ~ ifelse(test = is.na(x = .),
                             yes = 0,
                             no = .)) %>%
  # `.` is the data built so far; [-(1:2)] drops the first two columns (id and
  # comment) so only the category flags are summed.
  mutate(Other = ifelse(test = (rowSums(x = .[-(1:2)]) == 0),
                        yes = 1,
                        no = 0))
#>    Unique.respondent.number
#> 1                         1
#> 2                         2
#> 3                         3
#> 4                         4
#> 5                         5
#> 6                         6
#> 7                         7
#> 8                         8
#> 9                         9
#> 10                       10
#> 11                       11
#> 12                       12
#>                                                                                                                                                                                         comment
#> 1  I have seen many various charges in my life,\n                                                                                                            but I don’t like your saving rates
#> 2                                                                                                                                                                               I like R Studio
#> 3                                                                                                                                                                                    No comment
#> 4                                                                                                                                                            Main benefit is having low charges
#> 5                                                                                                                                                                                          <NA>
#> 6                                                                                                                                                                      Charge could be an issue
#> 7                                                                                                                                                                      Issues with saving rates
#> 8                                                                                                                                                                             Good saving rates
#> 9                                                                                                                                                         Many benefits like reasonable charges
#> 10                                                                                                                                                                                          N/A
#> 11                                                                                                                                                                                            -
#> 12                                                                                                                                                                                      Nothing
#>    Blank Charges_Fees Poor_Rates Other
#> 1      0            1          1     0
#> 2      0            0          0     1
#> 3      0            0          0     1
#> 4      0            0          0     1
#> 5      1            0          0     0
#> 6      0            1          0     0
#> 7      0            0          1     0
#> 8      0            0          0     1
#> 9      0            0          0     1
#> 10     0            0          0     1
#> 11     1            0          0     0
#> 12     0            0          0     1

Created on 2019-07-10 by the reprex package (v0.3.0)

O, wow. Thank you!!!

Final 2 questions please

  1. what is "test" in the code?
  2. what rule is behind -(1:2)? Is this code taking into account all variables created in the mutate?

I'm just trying to learn your magic tricks...

There's no magic.

test is an argument to ifelse. It's analogous to condition of if_else.

In .[-(1:2)], . is the updated data set up to the last step, and [-(1:2)] instructs R to keep all but the 1st and the 2nd columns. I'm doing this because you're summing all columns starting from the 3rd one.

In this particular case, it does. But if you've reordered the columns, then it won't. I'm not aware of any technique that lets R distinguish the columns created by mutate from the others. You could give those columns a special naming pattern, though, and then select the columns matching that pattern to get "only the mutate columns". I'm not fond of this idea in case those columns are not related, but that's a personal preference.

Thank you very much for this explanation and for being so patient :slight_smile:

Final question:
Your code is referring to my initial idea of importing sentiment libraries, which is incorrect:

# Case-insensitive pattern for wording that signals a complaint.
negative_sentiments <- regex("don.?t\\slike|issue|bad", ignore_case = TRUE)

When I replace it by correct statement I have an error:

library(SentimentAnalysis)
# 1 when a comment counts as negative, else 0: either the three Negativity*
# columns of tmresult are all above 0 or a negative phrase is present, and the
# word "reasonable" does not occur.
# The result is a numeric vector, not a regex() pattern.
negative_sentiments <- if_else(
  (
    (tmresult$NegativityGI > 0 &
       tmresult$NegativityHE > 0 &
       tmresult$NegativityLM > 0) |
      str_detect(source$comment,
                 regex("don.?t\\slike|issue|bad", ignore_case = TRUE))
  ) &
    !str_detect(source$comment, regex("reasonable", ignore_case = TRUE)),
  1,
  0
)


Error in UseMethod("type") : 
  no applicable method for 'type' applied to an object of class "c('double', 'numeric')"

Are you patient enough to advise how I could fix it?

Solution:

library(stringr)
# Pattern for comments without content: "no comment", "nothing", or a comment
# consisting only of an n/a-style entry (case-insensitive).
blank_statements <- regex(
  "no\\scomment?|nothing|^\\s*n.?a.?\\s*$",
  ignore_case = TRUE
)


#  Sentiment set up

library(SentimentAnalysis)
# Result of analyzeSentiment() for the comment column.
tmresult <- analyzeSentiment(source$comment)


library(dplyr)
# 1 when a comment counts as negative, else 0: either the three Negativity*
# columns of tmresult are all above 0 or a negative phrase is present, and the
# word "reasonable" does not occur.
negative_sentiments <- if_else(
  (
    (tmresult$NegativityGI > 0 &
       tmresult$NegativityHE > 0 &
       tmresult$NegativityLM > 0) |
      str_detect(source$comment,
                 regex("don.?t\\slike|issue|bad", ignore_case = TRUE))
  ) &
    !str_detect(source$comment, regex("reasonable", ignore_case = TRUE)),
  1,
  0
)
# 1 when a comment counts as positive, else 0: either the three Positivity*
# columns of tmresult are all above 0 or "benefit"/"like" is present, and
# neither "bad" nor "more" occurs.
positive_sentiments <- if_else(
  (
    (tmresult$PositivityGI > 0 &
       tmresult$PositivityHE > 0 &
       tmresult$PositivityLM > 0) |
      str_detect(source$comment, regex("benefit|like", ignore_case = TRUE))
  ) &
    !str_detect(source$comment, regex("bad|more", ignore_case = TRUE)),
  1,
  0
)

library(dplyr)
library(stringr)


# Build one 0/1 column per category, then set Other = 1 where no category
# applies. tmresult, positive_sentiments and negative_sentiments are computed
# from source$comment before this point, one element per row of source.
result <- source %>%
  mutate(
    # Blank: the comment is missing, or it has fewer than 4 counted words and
    # is shorter than 3 characters, matches blank_statements, or repeats the
    # same character four or more times in a row.
    # `test` is passed as a plain named argument here; writing `(test = ...)`
    # inside parentheses would be an assignment instead.
    Blank = ifelse(
      test = is.na(comment),
      yes = 1,
      no = ifelse(
        test = (str_length(comment) < 3 |
                  str_detect(comment, blank_statements) |
                  str_detect(comment, "(.)\\1{3,}")) &
          tmresult$WordCount < 4,
        yes = 1,
        no = 0
      )
    ),
    # Charges_Fees: mentions charge(s), unless it also mentions benefit(s).
    Charges_Fees = if_else(
      str_detect(comment, regex("charges?", ignore_case = TRUE)) &
        !str_detect(comment, regex("benefits?", ignore_case = TRUE)),
      1, 0
    ),
    # Good_Rates / Poor_Rates: "saving rates" in a comment flagged as positive
    # or negative respectively.
    Good_Rates = if_else(
      str_detect(comment, regex("saving\\srates", ignore_case = TRUE)) &
        (positive_sentiments == 1),
      1, 0
    ),
    Poor_Rates = if_else(
      str_detect(comment, regex("saving\\srates", ignore_case = TRUE)) &
        (negative_sentiments == 1),
      1, 0
    ),
    # More_Products: asks for more products or mentions the product offering.
    More_Products = if_else(
      str_detect(
        comment,
        regex("more\\sproduct|product\\soffering", ignore_case = TRUE)
      ),
      1, 0
    )
  ) %>%
  # A missing comment leaves NA in the flags above; replace those NAs with 0.
  mutate_if(is.numeric, ~ if_else(is.na(.), 0, .)) %>%
  # `.` is the data built so far; [-(1:2)] drops the first two columns (id and
  # comment) so only the category flags are summed.
  mutate(Other = ifelse(test = (rowSums(x = .[-(1:2)]) == 0),
                        yes = 1,
                        no = 0))

result

Thank you very much for your help!

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.