how to match one row value with other row value and convert if it matched?

Hi all
Good Evening
I have data like this

DATA I HAVE
IND SNP1 SNP2 SNP3
P1 A/A G/G T/T
P2 T/T C/C G/G
IND1 T/A G/C MISSING
IND2 A/T G/G G/T
IND3 T/T C/G T/T
IND4 NA MISSING T/G

if any IND contains either NA or MISSING then "-"

If IND1 match to P1 then A,

if IND1 matched to P2 then B otherwise H

i want to repeat this to all INDs in IND column

Then I want to remove both IND (rows) and SNPs(columns) having percentage of "-" more than 15

and i wish to have like to have like this
|I WISH TO HAVE|||||

I WISH TO HAVE
SNP1 SNP2 SNP3 COUNT OF - % -
H H - 1 33.33333333
H A H 0 0
B H A 0 0
- - H 2 66.66666667

i have posted my example data file here

i am doing this currently in excel with formulas but it is taking lot of time because i have more than 500 columns. is it possible to do it in R with dplyr or tidyverse package?
thanking you very much in advance for your help
Regards

I feel that's easier to work with if:

  1. you separate the reference (P1/P2) from the individuals
  2. you work with long data
library(tidyverse)

# read data
dat <- read.table(text = "IND   SNP1    SNP2    SNP3
P1  A/A     G/G     T/T
P2  T/T     C/C     G/G
IND1    T/A     G/C     MISSING
IND2    A/T     G/G     G/T
IND3    T/T     C/G     T/T
IND4    NA  MISSING     T/G",
header = TRUE)

# separate the reference P1/P2...
ref <- dat |>
  filter(IND %in% c("P1", "P2"))
ref
#>   IND SNP1 SNP2 SNP3
#> 1  P1  A/A  G/G  T/T
#> 2  P2  T/T  C/C  G/G

# ...from the test subjects
inds <- dat |>
  filter(startsWith(IND, "IND"))
inds
#>    IND SNP1    SNP2    SNP3
#> 1 IND1  T/A     G/C MISSING
#> 2 IND2  A/T     G/G     G/T
#> 3 IND3  T/T     C/G     T/T
#> 4 IND4 <NA> MISSING     T/G


# make ref to long format and code what we want
ref_long <- pivot_longer(ref,
                         cols = -IND,
                         names_to = "SNP_number",
                         values_to = "SNP") |>
  mutate(if_match = if_else(IND == "P1", "A", "B")) |>
  rename(IND_ref = IND)
ref_long
#> # A tibble: 6 × 4
#>   IND_ref SNP_number SNP   if_match
#>   <chr>   <chr>      <chr> <chr>   
#> 1 P1      SNP1       A/A   A       
#> 2 P1      SNP2       G/G   A       
#> 3 P1      SNP3       T/T   A       
#> 4 P2      SNP1       T/T   B       
#> 5 P2      SNP2       C/C   B       
#> 6 P2      SNP3       G/G   B



# long format
pivot_longer(inds,
             cols = -IND,
             names_to = "SNP_number",
             values_to = "SNP")
#> # A tibble: 12 × 3
#>    IND   SNP_number SNP    
#>    <chr> <chr>      <chr>  
#>  1 IND1  SNP1       T/A    
#>  2 IND1  SNP2       G/C    
#>  3 IND1  SNP3       MISSING
#>  4 IND2  SNP1       A/T    
#>  5 IND2  SNP2       G/G    
#>  6 IND2  SNP3       G/T    
#>  7 IND3  SNP1       T/T    
#>  8 IND3  SNP2       C/G    
#>  9 IND3  SNP3       T/T    
#> 10 IND4  SNP1       <NA>   
#> 11 IND4  SNP2       MISSING
#> 12 IND4  SNP3       T/G

# Properly identify all NA
pivot_longer(inds,
             cols = -IND,
             names_to = "SNP_number",
             values_to = "SNP") |>
  mutate(SNP = if_else(SNP == "MISSING", NA_character_, SNP))
#> # A tibble: 12 × 3
#>    IND   SNP_number SNP  
#>    <chr> <chr>      <chr>
#>  1 IND1  SNP1       T/A  
#>  2 IND1  SNP2       G/C  
#>  3 IND1  SNP3       <NA> 
#>  4 IND2  SNP1       A/T  
#>  5 IND2  SNP2       G/G  
#>  6 IND2  SNP3       G/T  
#>  7 IND3  SNP1       T/T  
#>  8 IND3  SNP2       C/G  
#>  9 IND3  SNP3       T/T  
#> 10 IND4  SNP1       <NA> 
#> 11 IND4  SNP2       <NA> 
#> 12 IND4  SNP3       T/G


# Assemble this data with the ref
pivot_longer(inds,
             cols = -IND,
             names_to = "SNP_number",
             values_to = "SNP") |>
  mutate(SNP = if_else(SNP == "MISSING", NA_character_, SNP)) |>
  left_join(ref_long,
            by = c("SNP_number", "SNP"))
#> # A tibble: 12 × 5
#>    IND   SNP_number SNP   IND_ref if_match
#>    <chr> <chr>      <chr> <chr>   <chr>   
#>  1 IND1  SNP1       T/A   <NA>    <NA>    
#>  2 IND1  SNP2       G/C   <NA>    <NA>    
#>  3 IND1  SNP3       <NA>  <NA>    <NA>    
#>  4 IND2  SNP1       A/T   <NA>    <NA>    
#>  5 IND2  SNP2       G/G   P1      A       
#>  6 IND2  SNP3       G/T   <NA>    <NA>    
#>  7 IND3  SNP1       T/T   P2      B       
#>  8 IND3  SNP2       C/G   <NA>    <NA>    
#>  9 IND3  SNP3       T/T   P1      A       
#> 10 IND4  SNP1       <NA>  <NA>    <NA>    
#> 11 IND4  SNP2       <NA>  <NA>    <NA>    
#> 12 IND4  SNP3       T/G   <NA>    <NA>

# Note that IND1-SNP2, IND3-SNP1 and IND3-SNP3 have one match,
# and the others have no matches, so they have a NA that we'll replace with H
# In this data, doesn't seem we have any SNP that matches both P1 and P2!
# if that were the case, you might need to do something about it.


# So, now we can put "H" instead of NA
pivot_longer(inds,
             cols = -IND,
             names_to = "SNP_number",
             values_to = "SNP") |>
  mutate(SNP = if_else(SNP == "MISSING", NA_character_, SNP)) |>
  left_join(ref_long,
            by = c("SNP_number", "SNP")) |>
  mutate(if_match = if_else(is.na(if_match), "H", if_match))
#> # A tibble: 12 × 5
#>    IND   SNP_number SNP   IND_ref if_match
#>    <chr> <chr>      <chr> <chr>   <chr>   
#>  1 IND1  SNP1       T/A   <NA>    H       
#>  2 IND1  SNP2       G/C   <NA>    H       
#>  3 IND1  SNP3       <NA>  <NA>    H       
#>  4 IND2  SNP1       A/T   <NA>    H       
#>  5 IND2  SNP2       G/G   P1      A       
#>  6 IND2  SNP3       G/T   <NA>    H       
#>  7 IND3  SNP1       T/T   P2      B       
#>  8 IND3  SNP2       C/G   <NA>    H       
#>  9 IND3  SNP3       T/T   P1      A       
#> 10 IND4  SNP1       <NA>  <NA>    H       
#> 11 IND4  SNP2       <NA>  <NA>    H       
#> 12 IND4  SNP3       T/G   <NA>    H

# Oh no! We are forgetting about the fact that the SNP can be NA itself!
pivot_longer(inds,
             cols = -IND,
             names_to = "SNP_number",
             values_to = "SNP") |>
  mutate(SNP = if_else(SNP == "MISSING", NA_character_, SNP)) |>
  left_join(ref_long,
            by = c("SNP_number", "SNP")) |>
  mutate(if_match = if_else(is.na(if_match), "H", if_match),
         if_match = if_else(is.na(SNP), "-", if_match))
#> # A tibble: 12 × 5
#>    IND   SNP_number SNP   IND_ref if_match
#>    <chr> <chr>      <chr> <chr>   <chr>   
#>  1 IND1  SNP1       T/A   <NA>    H       
#>  2 IND1  SNP2       G/C   <NA>    H       
#>  3 IND1  SNP3       <NA>  <NA>    -       
#>  4 IND2  SNP1       A/T   <NA>    H       
#>  5 IND2  SNP2       G/G   P1      A       
#>  6 IND2  SNP3       G/T   <NA>    H       
#>  7 IND3  SNP1       T/T   P2      B       
#>  8 IND3  SNP2       C/G   <NA>    H       
#>  9 IND3  SNP3       T/T   P1      A       
#> 10 IND4  SNP1       <NA>  <NA>    -       
#> 11 IND4  SNP2       <NA>  <NA>    -       
#> 12 IND4  SNP3       T/G   <NA>    H


# And put it back in wide format if that's what you need later
pivot_longer(inds,
             cols = -IND,
             names_to = "SNP_number",
             values_to = "SNP") |>
  mutate(SNP = if_else(SNP == "MISSING", NA_character_, SNP)) |>
  left_join(ref_long,
            by = c("SNP_number", "SNP")) |>
  mutate(if_match = if_else(is.na(if_match), "H", if_match),
         if_match = if_else(is.na(SNP), "-", if_match)) |>
  pivot_wider(id_cols = IND,
              names_from = "SNP_number",
              values_from = "if_match")
#> # A tibble: 4 × 4
#>   IND   SNP1  SNP2  SNP3 
#>   <chr> <chr> <chr> <chr>
#> 1 IND1  H     H     -    
#> 2 IND2  H     A     H    
#> 3 IND3  B     H     A    
#> 4 IND4  -     -     H


# Finally the percentage of "-" is easier on a long format,
# or can be done rowwise on the wide format (with rowwise and c_across)
prop_dash <- pivot_longer(inds,
             cols = -IND,
             names_to = "SNP_number",
             values_to = "SNP") |>
  mutate(SNP = if_else(SNP == "MISSING", NA_character_, SNP)) |>
  left_join(ref_long,
            by = c("SNP_number", "SNP")) |>
  mutate(if_match = if_else(is.na(if_match), "H", if_match),
         if_match = if_else(is.na(SNP), "-", if_match)) |>
  group_by(IND) |>
  summarize(prop_dash = 100 * mean(if_match == "-"))
prop_dash
#> # A tibble: 4 × 2
#>   IND   prop_dash
#>   <chr>     <dbl>
#> 1 IND1       33.3
#> 2 IND2        0  
#> 3 IND3        0  
#> 4 IND4       66.7

# And we can reassemble them
pivot_longer(inds,
             cols = -IND,
             names_to = "SNP_number",
             values_to = "SNP") |>
  mutate(SNP = if_else(SNP == "MISSING", NA_character_, SNP)) |>
  left_join(ref_long,
            by = c("SNP_number", "SNP")) |>
  mutate(if_match = if_else(is.na(if_match), "H", if_match),
         if_match = if_else(is.na(SNP), "-", if_match)) |>
  pivot_wider(id_cols = IND,
              names_from = "SNP_number",
              values_from = "if_match") |>
  left_join(prop_dash,
            by = "IND")
#> # A tibble: 4 × 5
#>   IND   SNP1  SNP2  SNP3  prop_dash
#>   <chr> <chr> <chr> <chr>     <dbl>
#> 1 IND1  H     H     -          33.3
#> 2 IND2  H     A     H           0  
#> 3 IND3  B     H     A           0  
#> 4 IND4  -     -     H          66.7

Created on 2023-05-18 with reprex v2.0.2

The filtering of INDs with too many dashes is trivial with mutate(), if you also want to filter out SNPs you might want to do it separately.

2 Likes

Dear AlexisW
Good Afternoon
Thanks a lot for your help its working as i expected :slight_smile:
Regards

1 Like

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.