I need more help on this topic.
I am trying to merge two data frames by using full_join() by a variable name called "CUSTOMER_NUMBER".
I got help from Martin.R before to work in this:
- First, I converted CUSTOMER_NUMBER into numeric
as.numeric(as.character(orginal_data1$CUSTOMER_NUMBER))
- The reason why he suggested me doing that was that there existed an issue when I merged two data frames together, there were new rows coming out of nowhere.
Now, the issue is, the original data set has 84,961 unique customers and after I did convert that column, there is one row less (i.e 84, 960).
If I do this ...%>% na.omit(), then there will be even one more row less (i.e 84, 959).
I did View(data)
and there are a few customers ID 0
but my original data set has only 1 customer ID 000000
.
What is the issue?
library(tidyverse)
library(dplyr)
library(reprex)
data = structure(list(CUSTOMER_NUMBER = c("240464833", "240464833",
"304200341", "328200716", "328804035", "341165440", "344200356",
"344200356", "344257899", "345114321", "368200262", "371200189",
"373128923", "373156382", "373182845", "373200097", "373200163",
"373305193", "373305193", "373654228", "373655997", "373655997",
"373746845", "374200116", "374200116", "374200245", "374200567",
"374547687", "374547687", "328200227", "328200258", "328200324",
"328200449", "328200665", "328200854", "328200930", "328432553",
"328484858", "328503426", "360545696", "373102040", "373461662",
"328200940", "342170407", "422201038", "422201038", "422295043",
"456200526", "462626501", "632198401", "656558734", "728468964",
"743200015", "760553604", "764326192", "374547687", "374770073",
"375315324", "402200379", "408488309", "408748740", "410200334",
"410200334", "410527343", "422112098", "422761648", "438200798",
"438461197", "373552475", "373552475", "373552475", "373746845",
"374615602", "375200313", "375360703", "410574873", "422886800",
"444701998", "448200314", "448267635", "456442639", "768152705",
"768200259", "209200110", "209200110", "209200320", "209426442",
"209733492", "212149526", "220739357", "230257275", "240464878",
"240506379", "246232660", "444474214", "448361237", "448455525",
"450504833", "454200908", "454803092", "458503336", "458879597",
"460447273", "462387101", "610152257", "610152257", "632569663",
"458346229", "458575623", "632200352", "656493726", "704200304",
"704201434", "710198201", "710743579", "728119488", "728201030",
"744538875", "760378672", "760378672", "763385790", "246524598",
"304106135", "328546392", "328689306", "332200105", "336608543",
"336608543", "336608543", "342782537", "344200356", "344257899",
"360200004", "360217257", "632735606", "642221462", "704200334",
"704557802", "704576852", "704615849", "704746835", "708505217",
"710155532", "710339414", "710421769", "728279411", "728343667",
"764157626", "766319650", "768200171", "768200516")), class = c("data.table",
"data.frame"), row.names = c(NA, -151L))
data$CUSTOMER_NUMBER %>% unique() %>% length()
#> [1] 132
data$CUSTOMER_NUMBER = as.numeric(as.character(data$CUSTOMER_NUMBER))
data$CUSTOMER_NUMBER %>% unique() %>% length()
#> [1] 132
Created on 2018-07-18 by the reprex
package (v0.2.0).