unifying values based on a condition

I have a numeric variable that contains '--undefined--' values. I want to replace '--undefined--' by a numeric value based on a condition. Here is an illustrative example.

 percentvoiced phoneme word   word_start word_end interval Duration..msec.
 <fct>         <chr>   <chr>       <dbl>    <dbl> <chr>              <dbl>
1 100           b       baarid       1.29     1.66 B                   5.06
2 --undefined-- b       baarid       1.29     1.66 ASP                 6.27
3 --undefined-- b       baarid       3.1      3.46 B                   5.50
4 100           b       baarid       3.1      3.46 ASP                 6.60
5 0             t       taaba3       4.50     4.86 B                   5.06
6 50            t       taaba3       4.50     4.86 ASP                24.3 
7 --undefined-- t       taaba3       5.84     6.18 B                   2.13
8 0             t       taaba3       5.84     6.18 ASP                29.4 
9 100           d       daafa3       7.34     7.71 B                   4.66
10 --undefined-- d       daafa3       7.34     7.71 ASP                 8.35

As is seen, percentvoiced has some --undefined-- values. Note that row 1 and 2 share the same value in word_start, and so on for 3-4, 5-6, 7-8, etc. Now I want to use word_start as a condition where i can unify the values in percentvoiced.

The output I'm looking for is similar to this one:

 percentvoiced phoneme word   word_start word_end interval Duration..msec.
 <fct>         <chr>   <chr>       <dbl>    <dbl> <chr>              <dbl>
1 100           b       baarid       1.29     1.66 B                   5.06
2 100           b       baarid       1.29     1.66 ASP                 6.27
3 100           b       baarid       3.1      3.46 B                   5.50
4 100           b       baarid       3.1      3.46 ASP                 6.60
5 0             t       taaba3       4.50     4.86 B                   5.06
6 50            t       taaba3       4.50     4.86 ASP                24.3 
7 0             t       taaba3       5.84     6.18 B                   2.13
8 0             t       taaba3       5.84     6.18 ASP                29.4 
9 100           d       daafa3       7.34     7.71 B                   4.66
10 100          d       daafa3       7.34     7.71 ASP                 8.35

Thank you in advance!

This looks to be a factor not a numeric variable. Could you suplly us with some sample data in dput() format? See ?dput. If you have a very large data set then something like head(dput(myfile), 100) will likely supply enough data for us to work with.

Sure, here it is.

structure(
  list(
    percentvoiced = structure(
      c(
        4L,
        1L,
        1L,
        4L,
        2L,
        14L,
        1L,
        2L,
        4L,
        1L,
        1L,
        4L,
        1L,
        2L,
        1L,
        2L,
        1L,
        4L,
        1L,
        4L,
        4L,
        4L,
        4L,
        4L,
        2L,
        2L,
        2L,
        2L,
        1L,
        4L,
        4L,
        1L,
        2L,
        14L,
        2L,
        2L,
        1L,
        4L,
        4L,
        4L,
        4L,
        4L,
        1L,
        1L,
        2L,
        4L,
        4L,
        1L,
        2L,
        1L,
        2L,
        4L,
        4L,
        1L,
        4L,
        1L,
        2L,
        1L,
        2L,
        4L,
        4L,
        2L,
        2L,
        1L,
        2L,
        1L,
        2L,
        1L,
        2L,
        2L,
        1L,
        2L,
        4L,
        1L,
        4L,
        1L,
        2L,
        2L,
        2L,
        2L,
        2L,
        1L,
        2L,
        1L,
        2L,
        1L,
        4L,
        4L,
        4L,
        1L,
        1L,
        2L,
        1L,
        2L,
        1L,
        4L,
        1L,
        4L,
        2L,
        9L
      ),
      .Label = c(
        "--undefined--",
        "0",
        "10",
        "100",
        "12.5",
        "14.286",
        "16.667",
        "18.75",
        "20",
        "25",
        "28.571",
        "33.333",
        "40",
        "50",
        "60",
        "62.5",
        "66.667",
        "71.429",
        "80",
        "83.333"
      ),
      class = "factor"
    ),
    phoneme = c(
      "b",
      "b",
      "b",
      "b",
      "t",
      "t",
      "t",
      "t",
      "d",
      "d",
      "d",
      "d",
      "k",
      "k",
      "k",
      "k",
      "g",
      "g",
      "g",
      "g",
      "b",
      "b",
      "b",
      "b",
      "t",
      "t",
      "t",
      "t",
      "d",
      "d",
      "d",
      "d",
      "k",
      "k",
      "k",
      "k",
      "g",
      "g",
      "g",
      "g",
      "b",
      "b",
      "b",
      "t",
      "t",
      "t",
      "t",
      "d",
      "d",
      "d",
      "d",
      "d",
      "d",
      "d",
      "d",
      "k",
      "k",
      "k",
      "k",
      "g",
      "g",
      "g",
      "g",
      "b",
      "b",
      "b",
      "b",
      "t",
      "t",
      "t",
      "t",
      "d",
      "d",
      "d",
      "d",
      "k",
      "k",
      "k",
      "k",
      "k",
      "k",
      "k",
      "k",
      "g",
      "g",
      "g",
      "b",
      "b",
      "b",
      "b",
      "t",
      "t",
      "t",
      "t",
      "d",
      "d",
      "d",
      "d",
      "k",
      "k"
    ),
    word = c(
      "baarid",
      "baarid",
      "baarid",
      "baarid",
      "taaba3",
      "taaba3",
      "taaba3",
      "taaba3",
      "daafa3",
      "daafa3",
      "daafa3",
      "daafa3",
      "kaatib",
      "kaatib",
      "kaatib",
      "kaatib",
      "gaarib",
      "gaarib",
      "gaarib",
      "gaarib",
      "biis/a",
      "biis/a",
      "biis/a",
      "biis/a",
      "tiin",
      "tiin",
      "tiin",
      "tiin",
      "diirat",
      "diirat",
      "diirat",
      "diirat",
      "kiis",
      "kiis",
      "kiis",
      "kiis",
      "giimat",
      "giimat",
      "giimat",
      "giimat",
      "buuma",
      "buuma",
      "buuma",
      "tuuna",
      "tuuna",
      "tuuna",
      "tuuna",
      "duuda",
      "duuda",
      "duuda",
      "duuda",
      "duuda",
      "duuda",
      "duuda",
      "duuda",
      "kuura",
      "kuura",
      "kuura",
      "kuura",
      "guuli",
      "guuli",
      "guuli",
      "guuli",
      "Guraab",
      "Guraab",
      "Guraab",
      "Guraab",
      "nabaat",
      "nabaat",
      "nabaat",
      "nabaat",
      "baraad",
      "baraad",
      "baraad",
      "baraad",
      "asmaak",
      "asmaak",
      "asmaak",
      "asmaak",
      "asmaak",
      "asmaak",
      "asmaak",
      "asmaak",
      "nifaag",
      "nifaag",
      "nifaag",
      "najiib",
      "najiib",
      "najiib",
      "najiib",
      "nasiit",
      "nasiit",
      "nasiit",
      "nasiit",
      "fariid",
      "fariid",
      "fariid",
      "fariid",
      "s/ariik",
      "s/ariik"
    ),
    word_start = c(
      1.29,
      1.29,
      3.1,
      3.1,
      4.497547849,
      4.497547849,
      5.84,
      5.84,
      7.33799164,
      7.33799164,
      8.69,
      8.69,
      10.56,
      10.56,
      12.33931889,
      12.33931889,
      14.51429437,
      14.51429437,
      15.98722058,
      15.98722058,
      17.74128888,
      17.74128888,
      19.07923981,
      19.07923981,
      20.8,
      20.8,
      22.24101925,
      22.24101925,
      24.93996795,
      24.93996795,
      26.57171821,
      26.57171821,
      28.95,
      28.95,
      30.80824885,
      30.80824885,
      33.34892913,
      33.34892913,
      35.63400123,
      35.63400123,
      37.63,
      37.63,
      39.83,
      43.15775737,
      43.15775737,
      45.70355722,
      45.70355722,
      47.82177575,
      47.82177575,
      47.82177575,
      47.82177575,
      49.81596858,
      49.81596858,
      49.81596858,
      49.81596858,
      51.6610037,
      51.6610037,
      52.90933923,
      52.90933923,
      54.50020389,
      54.50020389,
      57.4624365,
      57.4624365,
      60.2,
      60.2,
      62.22,
      62.22,
      64.4,
      64.4,
      66.25,
      66.25,
      67.86,
      67.86,
      69.26,
      69.26,
      70.52,
      70.52,
      72.02,
      72.02,
      73.07,
      73.07,
      74.52,
      74.52,
      76.09,
      76.09,
      78.144989,
      81.1,
      81.1,
      82.55,
      82.55,
      84.5,
      84.5,
      85.94,
      85.94,
      87.53,
      87.53,
      88.82,
      88.82,
      91.42,
      91.42
    ),
    word_end = c(
      1.66,
      1.66,
      3.46,
      3.46,
      4.86,
      4.86,
      6.18,
      6.18,
      7.71,
      7.71,
      9.04,
      9.04,
      10.94,
      10.94,
      12.69647947,
      12.69647947,
      14.86,
      14.86,
      16.33,
      16.33,
      18.15724845,
      18.15724845,
      19.49,
      19.49,
      21.08,
      21.08,
      22.51,
      22.51,
      25.29,
      25.29,
      26.9,
      26.9,
      29.26,
      29.26,
      31.1,
      31.1,
      33.72,
      33.72,
      36,
      36,
      38.01,
      38.01,
      40.204989,
      43.51,
      43.51,
      46,
      46,
      48.1,
      48.1,
      48.1,
      48.1,
      50.11,
      50.11,
      50.11,
      50.11,
      52.04,
      52.04,
      53.21,
      53.21,
      54.79,
      54.79,
      57.7,
      57.7,
      60.59343395,
      60.59343395,
      62.63,
      62.63,
      64.79766196,
      64.79766196,
      66.60918452,
      66.60918452,
      68.23530817,
      68.23530817,
      69.58599747,
      69.58599747,
      70.90290686,
      70.90290686,
      72.44065838,
      72.44065838,
      73.47,
      73.47,
      74.90445164,
      74.90445164,
      76.44453771,
      76.44453771,
      78.60096772,
      81.54306709,
      81.54306709,
      83.05,
      83.05,
      84.89680986,
      84.89680986,
      86.36102683,
      86.36102683,
      87.89443425,
      87.89443425,
      89.22698569,
      89.22698569,
      91.93319481,
      91.93319481
    ),
    interval = c(
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP"
    ),
    Duration..msec. = c(
      5.062940718,
      6.27231462,
      5.503196433,
      6.60383572,
      5.063547359,
      24.28125574,
      2.128053784,
      29.40809068,
      4.663215096,
      8.349349004,
      3.469092202,
      4.38621359,
      1.189181135,
      21.14528332,
      6.498880688,
      42.638088,
      2.456465096,
      6.199650003,
      7.786682742,
      14.56863223,
      5.87347944,
      12.07326329,
      4.540773542,
      6.191963922,
      25.86031547,
      25.85931547,
      1.40662656,
      41.39501019,
      2.736534659,
      7.273421066,
      4.207320514,
      7.007995115,
      20.06852322,
      12.8773024,
      12.52008252,
      21.56742859,
      4.74736464,
      11.52931413,
      8.700243039,
      6.631239497,
      8.176177558,
      21.50825999,
      7.074256002,
      2.010285959,
      34.71966771,
      5.088304725,
      19.7771844,
      3.941462697,
      11.74761059,
      2.087993879,
      10.8204471,
      6.482937831,
      8.880250902,
      3.014199771,
      8.590469348,
      4.165135943,
      49.52590816,
      4.427105914,
      28.38371451,
      10.16833878,
      15.78759144,
      11.98209296,
      8.784196628,
      5.01870151,
      5.386336749,
      5.6505437,
      7.960745519,
      1.529281071,
      8.889661094,
      3.459212702,
      5.967106564,
      5.073104667,
      8.641504292,
      3.767749714,
      6.028399543,
      5.174277408,
      22.14290465,
      5.076448556,
      23.55179185,
      5.989967746,
      21.85494911,
      2.924394756,
      15.89967435,
      1.7582832,
      7.0331328,
      8.303479476,
      7.385538906,
      17.33340764,
      5.435074193,
      6.65655934,
      5.59507289,
      56.44321591,
      2.468831335,
      31.96235314,
      3.611888255,
      13.2778415,
      4.688066535,
      5.671275084,
      5.156275597,
      47.23342261
    )
  ),
  class = c("grouped_df",
            "tbl_df", "tbl", "data.frame"),
  row.names = c(NA,-100L),
  groups = structure(
    list(
      word = c(
        "asmaak",
        "baarid",
        "baraad",
        "biis/a",
        "buuma",
        "daafa3",
        "diirat",
        "duuda",
        "fariid",
        "gaarib",
        "giimat",
        "Guraab",
        "guuli",
        "kaatib",
        "kiis",
        "kuura",
        "nabaat",
        "najiib",
        "nasiit",
        "nifaag",
        "s/ariik",
        "taaba3",
        "tiin",
        "tuuna"
      ),
      .rows = structure(
        list(
          76:83,
          1:4,
          72:75,
          21:24,
          41:43,
          9:12,
          29:32,
          48:55,
          95:98,
          17:20,
          37:40,
          64:67,
          60:63,
          13:16,
          33:36,
          56:59,
          68:71,
          87:90,
          91:94,
          84:86,
          99:100,
          5:8,
          25:28,
          44:47
        ),
        ptype = integer(0),
        class = c("vctrs_list_of",
                  "vctrs_vctr", "list")
      )
    ),
    class = c("tbl_df", "tbl", "data.frame"),
    row.names = c(NA,-24L),
    .drop = TRUE
  )
)

Can you use dplyr::case_when?

Not sure how to approach this given that each cell in percentvoiced has a different value but each two row such as 1-2, 3-4, 5-6, etc. has a sharing value in word_start. Any advice?

Am I reading this pattern correctly?

Each percentvoiced that is --undefined-- has a corresponding *percentvoiced * with a numeric value where "word_start" & "word_end" are the same. The row with --undefined-- can appear immediately before or after the the row wint the numeric value of *percentvoiced *

Can we have more than one percentvoiced that is --undefined-- that has a corresponding *percentvoiced * with a numeric value where "word_start" & "word_end"?

Regarding the pattern, yes this is the correct pattern you described.
Regarding the second question, yes, but I fixed that in the below dataset as that was an avoidable repetition. In the following dataset, each word_start & word_end has only two corresponding cells in percentvoiced, and some of these corresponding cells in percentvoiced contains --undefined--.

c <- structure(
  list(
    percentvoiced = structure(
      c(
        4L,
        1L,
        1L,
        4L,
        2L,
        14L,
        1L,
        2L,
        4L,
        1L,
        1L,
        4L,
        1L,
        2L,
        1L,
        2L,
        1L,
        4L,
        1L,
        4L,
        4L,
        4L,
        4L,
        4L,
        2L,
        2L,
        2L,
        2L,
        1L,
        4L,
        4L,
        1L,
        2L,
        14L,
        2L,
        2L,
        1L,
        4L,
        4L,
        4L
      ),
      .Label = c(
        "--undefined--",
        "0",
        "10",
        "100",
        "12.5",
        "14.286",
        "16.667",
        "18.75",
        "20",
        "25",
        "28.571",
        "33.333",
        "40",
        "50",
        "60",
        "62.5",
        "66.667",
        "71.429",
        "80",
        "83.333"
      ),
      class = "factor"
    ),
    phoneme = c(
      "b",
      "b",
      "b",
      "b",
      "t",
      "t",
      "t",
      "t",
      "d",
      "d",
      "d",
      "d",
      "k",
      "k",
      "k",
      "k",
      "g",
      "g",
      "g",
      "g",
      "b",
      "b",
      "b",
      "b",
      "t",
      "t",
      "t",
      "t",
      "d",
      "d",
      "d",
      "d",
      "k",
      "k",
      "k",
      "k",
      "g",
      "g",
      "g",
      "g"
    ),
    word = c(
      "baarid",
      "baarid",
      "baarid",
      "baarid",
      "taaba3",
      "taaba3",
      "taaba3",
      "taaba3",
      "daafa3",
      "daafa3",
      "daafa3",
      "daafa3",
      "kaatib",
      "kaatib",
      "kaatib",
      "kaatib",
      "gaarib",
      "gaarib",
      "gaarib",
      "gaarib",
      "biis/a",
      "biis/a",
      "biis/a",
      "biis/a",
      "tiin",
      "tiin",
      "tiin",
      "tiin",
      "diirat",
      "diirat",
      "diirat",
      "diirat",
      "kiis",
      "kiis",
      "kiis",
      "kiis",
      "giimat",
      "giimat",
      "giimat",
      "giimat"
    ),
    word_start = c(
      1.29,
      1.29,
      3.1,
      3.1,
      4.497547849,
      4.497547849,
      5.84,
      5.84,
      7.33799164,
      7.33799164,
      8.69,
      8.69,
      10.56,
      10.56,
      12.33931889,
      12.33931889,
      14.51429437,
      14.51429437,
      15.98722058,
      15.98722058,
      17.74128888,
      17.74128888,
      19.07923981,
      19.07923981,
      20.8,
      20.8,
      22.24101925,
      22.24101925,
      24.93996795,
      24.93996795,
      26.57171821,
      26.57171821,
      28.95,
      28.95,
      30.80824885,
      30.80824885,
      33.34892913,
      33.34892913,
      35.63400123,
      35.63400123
    ),
    word_end = c(
      1.66,
      1.66,
      3.46,
      3.46,
      4.86,
      4.86,
      6.18,
      6.18,
      7.71,
      7.71,
      9.04,
      9.04,
      10.94,
      10.94,
      12.69647947,
      12.69647947,
      14.86,
      14.86,
      16.33,
      16.33,
      18.15724845,
      18.15724845,
      19.49,
      19.49,
      21.08,
      21.08,
      22.51,
      22.51,
      25.29,
      25.29,
      26.9,
      26.9,
      29.26,
      29.26,
      31.1,
      31.1,
      33.72,
      33.72,
      36,
      36
    ),
    interval = c(
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP",
      "B",
      "ASP"
    ),
    Duration..msec. = c(
      5.062940718,
      6.27231462,
      5.503196433,
      6.60383572,
      5.063547359,
      24.28125574,
      2.128053784,
      29.40809068,
      4.663215096,
      8.349349004,
      3.469092202,
      4.38621359,
      1.189181135,
      21.14528332,
      6.498880688,
      42.638088,
      2.456465096,
      6.199650003,
      7.786682742,
      14.56863223,
      5.87347944,
      12.07326329,
      4.540773542,
      6.191963922,
      25.86031547,
      25.85931547,
      1.40662656,
      41.39501019,
      2.736534659,
      7.273421066,
      4.207320514,
      7.007995115,
      20.06852322,
      12.8773024,
      12.52008252,
      21.56742859,
      4.74736464,
      11.52931413,
      8.700243039,
      6.631239497
    )
  ),
  class = c("grouped_df",
            "tbl_df", "tbl", "data.frame"),
  row.names = c(NA,-40L),
  groups = structure(
    list(
      word = c(
        "baarid",
        "biis/a",
        "daafa3",
        "diirat",
        "gaarib",
        "giimat",
        "kaatib",
        "kiis",
        "taaba3",
        "tiin"
      ),
      .rows = structure(
        list(1:4, 21:24, 9:12, 29:32, 17:20, 37:40, 13:16, 33:36,
             5:8, 25:28),
        ptype = integer(0),
        class = c("vctrs_list_of",
                  "vctrs_vctr", "list")
      )
    ),
    class = c("tbl_df", "tbl", "data.frame"),
    row.names = c(NA,-10L),
    .drop = TRUE
  )
)
)

If this is correct, then could you do something like this?

library(dplyr)
library(tidyr)

df %>% 
    mutate(
        percentvoiced = case_when(
            percentvoiced == '--undefined--' ~ NA,
            T ~ percentvoiced
        )
    ) %>% 
    group_by(word_start, word_end) %>% 
    fill(
        percentvoiced,
        .direction = 'downup'
    )

That would make sure that all unique pairs of word_start and word_end share a value. Am I misunderstanding the issue?

I got this error:

Error: Problem with mutate() column percentvoiced.
i percentvoiced = case_when(percentvoiced == "--undefined--" ~ NA, T ~ percentvoiced).
x must be a logical vector, not a factor object.
i The error occurred in group 1: word = "baarid".

Whoops I didn't notice that column was a factor. If it's not necessary to your workflow that it's a factor I would change it to be a character object. This should do the trick I think:

library(dplyr)
library(tidyr)

df %>% 
    mutate(
        percentvoiced = case_when(
            as.character(percentvoiced) == '--undefined--' ~ NA,
            T ~ percentvoiced
        )
    ) %>% 
    group_by(word_start, word_end) %>% 
    fill(
        percentvoiced,
        .direction = 'downup'
    )

Still the same

Error: Problem with mutate() column percentvoiced.
i percentvoiced = case_when(...).
x must be a logical vector, not a character vector.
i The error occurred in group 1: word = "baarid".

Ok last fix - sorry - use NA_character_ instead of NA. The problem is that NA is a logical.

Still the same.

Error: Problem with mutate() column percentvoiced.
i percentvoiced = case_when(...).
x must be a character vector, not a factor object.
i The error occurred in group 1: interval = "a".

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.