OR operation on similar row items

dplyr

#1

Hi R experts,

Could you please help me perform an OR operation on the following table?

# One row per item (item names are the row names), one 0/1 column per month
input = data.frame(Jan=c(1,0,1,0,1,1),Feb= c(1,1,0,0,0,1), Mar = c(1,0,0,0,1,0), Apr = c(0,0,0,1,1,1))
rownames(input) = c("green apple ","orange","banana","green banana","plastic apple","rotten orange")
# Desired result: one row per fruit, with the 0/1 values OR-ed across its rows
expected_output = data.frame(Jan=c(1,1,1),Feb=c(1,1,0),Mar=c(1,0,0), Apr=c(1,1,1))
rownames(expected_output) = c("apple","orange","banana")

Basically, I want to perform an OR operation across similar items and retain one row per item.

I tried str_replace() and str_detect(), but these just replace the strings, which makes it ambiguous which row to choose. Is there a better way to obtain such results?

Thanks in advance
Abi


#2

I think you'll need to clarify your question a bit to attract answers :slightly_smiling_face:


#3

Starting from your example, here it is translated into a reproducible example (called a reprex), which is a terrific way to get quicker and better answers. It uses the following modifications.

# Original code (reformatted): item names are stored as row names
input = data.frame(
  Jan = c(1, 0, 1, 0, 1, 1),
  Feb = c(1, 1, 0, 0, 0, 1),
  Mar = c(1, 0, 0, 0, 1, 0),
  Apr = c(0, 0, 0, 1, 1, 1)
)
rownames(input) = c(
  "green apple ", "orange", "banana",
  "green banana", "plastic apple", "rotten orange"
)
# Result the asker wants: one row per fruit
expected_output = data.frame(
  Jan = c(1, 1, 1),
  Feb = c(1, 1, 0),
  Mar = c(1, 0, 0),
  Apr = c(1, 1, 1)
)
rownames(expected_output) = c("apple", "orange", "banana")

# Revised
# Install, if necessary, and load the following libraries
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(magrittr)
library(stringr)
library(tibble)

# Style varies on which assignment operator to use, <- or =
# It doesn't matter so long as you're consistent
# I like to use <- when defining an object like "input"
# and = when defining an attribute; be consistent with spaces/no spaces around =

# Keep the names in an ordinary column instead of in rownames, so they can be
# filtered like any other data frame or tibble column
input <- data.frame(
  fruit = c(
    "green apple ", "orange", "banana",
    "green banana", "plastic apple", "rotten orange"
  ),
  Jan = c(1, 0, 1, 0, 1, 1),
  Feb = c(1, 1, 0, 0, 0, 1),
  Mar = c(1, 0, 0, 0, 1, 0),
  Apr = c(0, 0, 0, 1, 1, 1),
  stringsAsFactors = FALSE
)

# print the revised input

input
#>           fruit Jan Feb Mar Apr
#> 1  green apple    1   1   1   0
#> 2        orange   0   1   0   0
#> 3        banana   1   0   0   0
#> 4  green banana   0   0   0   1
#> 5 plastic apple   1   0   1   1
#> 6 rotten orange   1   1   0   1

# define the problem simply: Eliminate all entries with more than one word, in your example

# build the output: keep only the rows whose name equals its own last word

output <- input %>%
  filter(fruit == word(fruit, -1))

# print the output

output
#>    fruit Jan Feb Mar Apr
#> 1 orange   0   1   0   0
#> 2 banana   1   0   0   0

# notice that "apple" is missing. What's different about it?

input$fruit
#> [1] "green apple "  "orange"        "banana"        "green banana" 
#> [5] "plastic apple" "rotten orange"

# do you see the trailing space?

Created on 2019-01-09 by the reprex package (v0.2.1)


#4

Thank you for great explanation.
Actually, I didn't mean to eliminate all entries with more than one word.
Instead
I want to choose the unique fruits and perform OR operation about their quantity in respective month.
E.g., green apple and plastic apple each have a quantity of 1 for Jan, so expected_output also shows 1. But if none of the similar fruits has an entry (i.e., banana and green banana both have no entry for Feb), expected_output is also 0.

Please refer the expected_output from the question


#5

I'm afraid I still don't understand your logic, and without a reproducible example, called a reprex (with "green apple " corrected to "green apple"), the only thing I can suggest is:

# Original code (reformatted): item names are stored as row names
input = data.frame(
  Jan = c(1, 0, 1, 0, 1, 1),
  Feb = c(1, 1, 0, 0, 0, 1),
  Mar = c(1, 0, 0, 0, 1, 0),
  Apr = c(0, 0, 0, 1, 1, 1)
)
rownames(input) = c(
  "green apple", "orange", "banana",
  "green banana", "plastic apple", "rotten orange"
)
expected_output = data.frame(
  Jan = c(1, 1, 1),
  Feb = c(1, 1, 0),
  Mar = c(1, 0, 0),
  Apr = c(1, 1, 1)
)
rownames(expected_output) = c("apple", "orange", "banana")

# Print the expected output

expected_output
#>        Jan Feb Mar Apr
#> apple    1   1   1   1
#> orange   1   1   0   1
#> banana   1   0   0   1

# Revised

# Install, if necessary, and load the following libraries

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(magrittr)
library(stringr)
library(tibble)

# Style varies on which assignment operator to use, <- or =
# It doesn't matter so long as you're consistent
# I like to use <- when defining an object like "input"
# and = when defining an attribute; be consistent with spaces/no spaces around =

# Keep the names in an ordinary column instead of in rownames, so they can be
# filtered like any other data frame or tibble column
input <- data.frame(
  fruit = c(
    "green apple", "orange", "banana",
    "green banana", "plastic apple", "rotten orange"
  ),
  Jan = c(1, 0, 1, 0, 1, 1),
  Feb = c(1, 1, 0, 0, 0, 1),
  Mar = c(1, 0, 0, 0, 1, 0),
  Apr = c(0, 0, 0, 1, 1, 1),
  stringsAsFactors = FALSE
)

# print the revised input

input
#>           fruit Jan Feb Mar Apr
#> 1   green apple   1   1   1   0
#> 2        orange   0   1   0   0
#> 3        banana   1   0   0   0
#> 4  green banana   0   0   0   1
#> 5 plastic apple   1   0   1   1
#> 6 rotten orange   1   1   0   1

# categorize fruit: take the last word of each name as its type and move
# that new column to the front

input <- input %>%
  mutate(type_fruit = word(fruit, -1)) %>%
  select(type_fruit, everything())

# print the revised input

input
#>   type_fruit         fruit Jan Feb Mar Apr
#> 1      apple   green apple   1   1   1   0
#> 2     orange        orange   0   1   0   0
#> 3     banana        banana   1   0   0   0
#> 4     banana  green banana   0   0   0   1
#> 5      apple plastic apple   1   0   1   1
#> 6     orange rotten orange   1   1   0   1

# total each month's values within every fruit type

output <- input %>%
  group_by(type_fruit) %>%
  summarize(
    Jan = sum(Jan),
    Feb = sum(Feb),
    Mar = sum(Mar),
    Apr = sum(Apr)
  )

# print the output

output
#> # A tibble: 3 x 5
#>   type_fruit   Jan   Feb   Mar   Apr
#>   <chr>      <dbl> <dbl> <dbl> <dbl>
#> 1 apple          2     1     2     1
#> 2 banana         1     0     0     1
#> 3 orange         1     2     0     1

# Doesn't match the expected output because you don't specify whether you want <lgl> 
# or <int>, and I've assumed <int>

Created on 2019-01-09 by the reprex package (v0.2.1)


#6

You are right, I should have mentioned the expected_output to be logical values (result of OR operation).

Basically, perform an OR operation on the values of the similar (row) items.
Eg:
apple has 1110 in the first row
and 1011 in the 5th row,
resulting in 1111
Therefore, the code should look for different fruits and perform OR operation corresponding to that fruit.

I have now rephrased the question:
# Rephrased input: the fruit name is already reduced to its type
input <- data.frame(
  fruit = c("apple", "orange", "banana", "banana", "apple", "orange"),
  Jan = c(1, 0, 1, 0, 1, 1),
  Feb = c(1, 1, 0, 0, 0, 1),
  Mar = c(1, 0, 0, 0, 1, 0),
  Apr = c(0, 0, 0, 1, 1, 1),
  stringsAsFactors = FALSE
)

# Wanted result: one row per fruit
expected_output = data.frame(
  Jan = c(1, 1, 1),
  Feb = c(1, 1, 0),
  Mar = c(1, 0, 0),
  Apr = c(1, 1, 1)
)
rownames(expected_output) = c("apple", "orange", "banana")
Values of expected_output can be logical, but 1/0 would be better.

Thanks in advance


#7
library(magrittr)
library(stringr)
library(tibble)
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union

# Same data as before, with the monthly values stored as logicals
input <- data.frame(
  fruit = c(
    "green apple", "orange", "banana",
    "green banana", "plastic apple", "rotten orange"
  ),
  Jan = c(TRUE, FALSE, TRUE, FALSE, TRUE, TRUE),
  Feb = c(TRUE, TRUE, FALSE, FALSE, FALSE, TRUE),
  Mar = c(TRUE, FALSE, FALSE, FALSE, TRUE, FALSE),
  Apr = c(FALSE, FALSE, FALSE, TRUE, TRUE, TRUE),
  stringsAsFactors = FALSE
)

input
#>           fruit   Jan   Feb   Mar   Apr
#> 1   green apple  TRUE  TRUE  TRUE FALSE
#> 2        orange FALSE  TRUE FALSE FALSE
#> 3        banana  TRUE FALSE FALSE FALSE
#> 4  green banana FALSE FALSE FALSE  TRUE
#> 5 plastic apple  TRUE FALSE  TRUE  TRUE
#> 6 rotten orange  TRUE  TRUE FALSE  TRUE

# categorize fruit: take the last word of each name as its type and move
# that new column to the front

input <- input %>%
  mutate(type_fruit = word(fruit, -1)) %>%
  select(type_fruit, everything())

# print the revised input

input
#>   type_fruit         fruit   Jan   Feb   Mar   Apr
#> 1      apple   green apple  TRUE  TRUE  TRUE FALSE
#> 2     orange        orange FALSE  TRUE FALSE FALSE
#> 3     banana        banana  TRUE FALSE FALSE FALSE
#> 4     banana  green banana FALSE FALSE FALSE  TRUE
#> 5      apple plastic apple  TRUE FALSE  TRUE  TRUE
#> 6     orange rotten orange  TRUE  TRUE FALSE  TRUE

# summarize by month by fruit type: any() is the logical OR over each
# group's rows, and as.integer() turns TRUE/FALSE into 1/0

output <- input %>%
  group_by(type_fruit) %>%
  summarize(
    Jan = as.integer(any(Jan)),
    Feb = as.integer(any(Feb)),
    Mar = as.integer(any(Mar)),
    Apr = as.integer(any(Apr))
  )

# show revised output

output
#> # A tibble: 3 x 5
#>   type_fruit   Jan   Feb   Mar   Apr
#>   <chr>      <int> <int> <int> <int>
#> 1 apple          1     1     1     1
#> 2 banana         1     0     0     1
#> 3 orange         1     1     0     1

Created on 2019-01-10 by the reprex package (v0.2.1)


#8

Thanks Technocrat, this solves the purpose :slight_smile:


closed #9

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.