Automatic recode of the most frequent names

Hi,
I have this sample of my database with Mazda models where I am trying to find the most frequent ones.

source <- data.frame(
  stringsAsFactors = FALSE,
                id = c(1,2,3,4,5,6,7,8,9,10,
                       11,12,13,14,15,16,17,18,19,20,21,22,23,24,
                       25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,
                       40,41,42,43,44,45,46,47,48,49,50,51,52,53,
                       54,55,56,57,58,59,60,61,62,63,64,65,66,67,
                       68,69,70,71,72,73,74,75,76,77,78,79,80,81,
                       82,83,84,85,86,87,88,89,90,91,92,93,94,95,96),
             Model = c("194ps AWD GT Sport Auto","2",
                       "2","2","2","2","2 - 1.5 cc petrol",
                       "2  1.3 5dr  BLACK LIMITED EDIT","2 (90) 1.","2 1.","3","3","3",
                       "3","3","3","3","3","3 - 1.6 Diesel Manual",
                       "3 (Blk Lthr) 5DR 120PS SPORT NAV + S/P","3 1.",
                       "5Dr Tourer 2.0 165ps Sport","6","6","6","6",
                       "6  2.2d 175 Sport Nav Auto Estate - FUNDED #20","6 (136hp) Ts","6 (150) 2.",
                       "cx-3","Cx-3","CX-3","CX-3","CX-3",
                       "Cx-3  2.0 120ps Sport NAV Cx-3 2.0 120ps Sport NAV",
                       "Cx-3  2.0 121PS Se-l Nav+  AUTO","Cx-3  Sport Nav +","CX-3 (120) 2.0",
                       "Cx-30 2.0 e-Skyactiv G MHEV Sport Lux 5dr Auto Hatchback",
                       "cx-5","Cx-5","CX-5","CX-5","CX-5","cx-5 - black",
                       "CX-5 - SPORT NAV PLUS","CX-5  -  RED - SPORT NAV",
                       "CX-5     2..2  diesel","Cx-5  2.0 Sport Nav 2WD",
                       "Cx-5  2.2D [175] Sport Nav AWD Auto",
                       "Cx-6",
                       "CX-5  Sport Nav+ Auto 2WD 2.0","CX-5 (150) 2.2","MAZDA - 2",
                       "MAZDA - 2 GT","MAZDA - 2 GT SPORT 1.5l NAV MHEV PETROL",
                       "MAZDA - 2 GT SPORT NAV","MAZDA - 3 TS D SILVER",
                       "MAZDA - 3 TS GREY, 5 DOOR HATCHBACK - AUTOMATIC",
                       "MAZDA - 8",
                       "MAZDA - 6 D SE","MAZDA - 6 D SE-L","Mazda  MX5",
                       "Mazda \t2 TS2","mazda 2","Mazda 2","MAZDA 2",
                       "Mazda 2 - GT - Blue","mazda 2      1.4 petrol","mazda 3","Mazda 3",
                       "MAZDA 3","Mazda 3 - 2.0 - Petrol","MAZDA 3 - BLUE",
                       "Mazda MX-3035.5kWh 145ps GT Sport Tech Auto","Mazda mx-5",
                       "Mazda Mx-5","Mazda MX-5","MAZDA MX-5",
                       "MAZDA MX-5 - BLACK","Mazda2 5dr 1.5 75ps SE","Mazda2 5Dr 1.5 75ps SE",
                       "Mazda3 5dr 2.2D 150ps SE-L Nav",
                       "Mazda7","Mazda6 Diesel Saloon 2.2",
                       "MAZDA6 Diesel Saloon 2.2d [175] Sport Nav 4dr Auto 2(2013)",
                       "MX-5 1.5 132ps Se +","Mx-5 1.8 Opt Pk LT AC        *","MX-5","MX-5",
                       "MX-5","MX-5","MX-5"),
           Mileage = c(26,71,615,185,528,365,518,
                       80,664,395,514,262,635,554,300,14,238,271,
                       584,256,196,593,48,84,678,182,364,433,595,550,
                       612,347,669,628,110,524,224,311,243,94,224,291,
                       247,767,744,228,171,648,222,301,548,652,545,
                       33,517,564,63,568,421,441,320,561,459,580,22,
                       306,543,577,538,659,558,294,739,647,84,604,584,
                       679,645,508,83,576,67,759,276,487,485,393,488,
                       424,28,530,710,412,282,409),
           MOTProp = c(1,1,1,0,0,1,0,1,0,1,0,
                       1,0,1,0,1,0,0,1,0,1,0,1,0,1,0,1,1,0,0,
                       1,0,1,0,1,0,1,0,1,0,0,1,0,1,0,1,0,1,0,
                       1,1,0,0,1,0,1,0,1,0,1,0,1,1,0,1,0,1,0,
                       1,1,0,0,1,0,1,0,1,0,1,1,0,1,0,1,0,1,1,
                       0,0,1,0,1,0,1,0,1)
)

# Finding main models

library(dplyr)
library(forcats)

source.Models <- source %>%
  add_count(Model, name = "Mod.Freq") %>%
  mutate(Mod.Freq = Mod.Freq/n()) 

Models.with_freq <- left_join(source, source.Models)

data_2022.models <- Models.with_freq %>%
  mutate(Main.models = case_when(
    is.na(Mod.Freq) ~ "Other",
    Mod.Freq < 0.03 ~ "Other",
    Mod.Freq >= 0.03 ~ Model))

Main.models <- table(data_2022.models$Model, data_2022.models$Main.models)
Main.models

I can see the most frequent models are 2, 3, 6, CX-3 and MX-5.
The problem I have is that within the list of models, all names starting with 2, Mazda2, Mazda 2, MAZDA - 2 should be the same as model 2. The same relates to model 3 (3, Mazda3, mazda 3, MAZDA -3) and 6, CX-3 and MX-5.
Is there a clever way of recoding all similar models before mutating their frequency? Perhaps any form of regular expressions should be done prior to mutating Main.models?

Here is one way to clean and reclassify the models. A new variable model2 is added to the data frame, which is a transformed version of Model (renamed solely so you could compare to Model; if you want, you can change model2 to Model). Once transformed, a case_when() is used to classify to the desired model name. For cases where model 2/3/6 or CX-3/MX-5 are not present, the current classification returns "other to classify". This means you would need to add new logic to have these cases properly identified. If you want to keep them as-is, change the final line of the case_when() to TRUE ~ model2. Hope this provides a useful framework to build upon.

source = source %>%
  # transform the strings
  mutate(model2 = tolower(Model),
         model2 = str_squish(model2),
         model2 = str_replace_all(model2, '-| - ', ''),
         model2 = str_replace_all(model2, 'mazda ', 'mazda')
         ) %>%
  # classify - add further classifications here
  mutate(model2 = case_when(
    substr(model2,1,1) == '2' | str_detect(model2, 'mazda2') ~ 'model 2',
    substr(model2,1,1) == '3' | str_detect(model2, 'mazda3') ~ 'model 3',
    substr(model2,1,1) == '6' | str_detect(model2, 'mazda6') ~ 'model 6',
    str_detect(model2, 'cx3') ~ 'CX-3',
    str_detect(model2, 'mx5') ~ 'MX-5',
    TRUE ~ 'other to classify'
  )) %>%
  mutate(Model = model2)

Main.models
#>                    
#>                     CX-3 model 2 model 3 model 6 MX-5 other to classify
#>   CX-3                10       0       0       0    0                 0
#>   model 2              0      24       0       0    0                 0
#>   model 3              0       0      19       0    0                 0
#>   model 6              0       0       0      11    0                 0
#>   MX-5                 0       0       0       0   13                 0
#>   other to classify    0       0       0       0    0                19

Created on 2023-01-05 with reprex v2.0.2.9000

Perhaps:

  1. changing all names in "Model" to upper case
  2. removing "MAZDA", "MAZDA - " and "MAZDA " in "Model"
  3. creating a new variable (like Model.Recoded) with names starting with a set of string up to the first space (so 194ps, 2, 3, 5Dr, 6, CX-3 etc.)
  4. running source.Models with the new variable (above) instead of "Model"
    would solve the problem?

If it would, I need someone to help me with a code to follow the steps above

Thank you. I modified the first part:

source = source %>%
  # transform the strings
  mutate(model2 = toupper(Model),
         model2 = str_squish(model2),
         model2 = str_replace_all(model2, 'MAZDA - ', ''),
         model2 = str_replace_all(model2, 'MAZDA ', ''),
         model2 = str_replace_all(model2, 'MAZDA', ''),
         model3 = substr(model2, 1, 5),
         model3 = str_trim(model3)
  )

source

1 Like

This topic was automatically closed 42 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.