Find and Filter data

I need to check whether values from one data.frame (called "trajectory" in the example) are present in another data.frame (called "segment" in the example) and, if they are not, remove the corresponding rows from the first data.frame.

To illustrate my issue, here is a short reprex:

# Reprex input: seven trajectory rows, each tagged with a segment ID in
# newID_segmentos.
trajectory <- data.frame(stringsAsFactors=FALSE,
             Lon_i = c(9.23027777777778, 6.39722222222222, 7.35638888888889,
                       -0.0458333333333333, 0.516666666666667,
                       -0.922777777777778, 11.1397222222222),
             Lat_i = c(53.5088888888889, 49.7688888888889, 49.4375,
                       50.1633333333333, 51.35, 51.2816666666667,
                       46.0202777777778),
             Lon_f = c(8.87472222222222, 6.51944444444444, 7.83944444444444, 0,
                       0.298055555555556, -0.625, 11.3766666666667),
             Lat_f = c(53.3472222222222, 49.7286111111111, 49.2652777777778,
                       50.1, 51.495, 51.0538888888889, 45.6411111111111),
    Rumbo_circular = c(53, 297, 299, 335, 137, 321, 336),
   newID_segmentos = c("%%EDH_WSN", "%DIPA_PITES", "%DIPI_LADAT",
                       "%DRSI_SITET", "*%200_BAKER", "**EG1_MID",
                       "**TNT_IBUGO")
)
# Lookup table of the segment IDs that were found.
# No trailing comma after the last column: an empty argument makes
# data.frame() fail instead of building the table.
segment <- data.frame(stringsAsFactors=FALSE,
   segmentos_Found = c("%%EDH_WSN", "AAAB_AAAA", "**EG1_MID", "XDCF_!HSHJ",
                       "%DIPI_LADAT")
)

I need to write code that checks whether each "trajectory$newID_segmentos" value is present in the "segment" data. If a value is not included in the "segment" data, the complete row should be deleted from "trajectory". For this example, the final data frame would be:

# Expected result for the example: only the rows whose newID_segmentos value
# is listed in the segment data remain.
trajectory <- data.frame(
  Lon_i = c(9.23027777777778, -0.922777777777778, 7.35638888888889),
  Lat_i = c(53.5088888888889, 51.2816666666667, 49.4375),
  Lon_f = c(8.87472222222222, -0.625, 7.83944444444444),
  Lat_f = c(53.3472222222222, 51.0538888888889, 49.2652777777778),
  Rumbo_circular = c(53, 321, 299),
  newID_segmentos = c("%%EDH_WSN", "**EG1_MID", "%DIPI_LADAT"),
  stringsAsFactors = FALSE
)

As can be seen, the rows whose "trajectory$newID_segmentos" value is "%DIPA_PITES", "%DRSI_SITET", "*%200_BAKER" or "**TNT_IBUGO" have been removed because those values do not appear in the "segment" data. Therefore, only "%%EDH_WSN", "**EG1_MID" and "%DIPI_LADAT" now appear in the "trajectory$newID_segmentos" column. It is important to note that the two data.frames have different numbers of rows and columns: the real "trajectory" has about 2174502 rows and 23 columns, and the real "segment" has about 56174 rows and 7 columns.

You can use semi_join() from the dplyr package.

library(dplyr)
#> Warning: package 'dplyr' was built under R version 3.5.3
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
# Example input: trajectory rows, each carrying a segment ID.
trajectory <- data.frame(
  Lon_i = c(
    9.23027777777778, 6.39722222222222, 7.35638888888889,
    -0.0458333333333333, 0.516666666666667,
    -0.922777777777778, 11.1397222222222
  ),
  Lat_i = c(
    53.5088888888889, 49.7688888888889, 49.4375,
    50.1633333333333, 51.35, 51.2816666666667,
    46.0202777777778
  ),
  Lon_f = c(
    8.87472222222222, 6.51944444444444, 7.83944444444444, 0,
    0.298055555555556, -0.625, 11.3766666666667
  ),
  Lat_f = c(
    53.3472222222222, 49.7286111111111, 49.2652777777778,
    50.1, 51.495, 51.0538888888889, 45.6411111111111
  ),
  Rumbo_circular = c(53, 297, 299, 335, 137, 321, 336),
  newID_segmentos = c(
    "%%EDH_WSN", "%DIPA_PITES", "%DIPI_LADAT",
    "%DRSI_SITET", "*%200_BAKER", "**EG1_MID",
    "**TNT_IBUGO"
  ),
  stringsAsFactors = FALSE
)
# Example input: the segment IDs to match against.
segment <- data.frame(
  segmentos_Found = c(
    "%%EDH_WSN", "AAAB_AAAA", "**EG1_MID", "XDCF_!HSHJ",
    "%DIPI_LADAT"
  ),
  stringsAsFactors = FALSE
)
# Filtering join: keep the rows of `trajectory` whose newID_segmentos has a
# match in segment$segmentos_Found; no columns from `segment` are added.
trajectory <- trajectory %>%
  semi_join(segment, by = c("newID_segmentos" = "segmentos_Found"))
print(trajectory)
#>        Lon_i    Lat_i     Lon_f    Lat_f Rumbo_circular newID_segmentos
#> 1  9.2302778 53.50889  8.874722 53.34722             53       %%EDH_WSN
#> 2  7.3563889 49.43750  7.839444 49.26528            299     %DIPI_LADAT
#> 3 -0.9227778 51.28167 -0.625000 51.05389            321       **EG1_MID

Created on 2019-10-28 by the reprex package (v0.3.0.9000)

1 Like

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.