Hi,
I have this sample of my database with Mazda models where I am trying to find the most frequent ones.
source <- data.frame(
stringsAsFactors = FALSE,
id = c(1,2,3,4,5,6,7,8,9,10,
11,12,13,14,15,16,17,18,19,20,21,22,23,24,
25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,
40,41,42,43,44,45,46,47,48,49,50,51,52,53,
54,55,56,57,58,59,60,61,62,63,64,65,66,67,
68,69,70,71,72,73,74,75,76,77,78,79,80,81,
82,83,84,85,86,87,88,89,90,91,92,93,94,95,96),
Model = c("194ps AWD GT Sport Auto","2",
"2","2","2","2","2 - 1.5 cc petrol",
"2 1.3 5dr BLACK LIMITED EDIT","2 (90) 1.","2 1.","3","3","3",
"3","3","3","3","3","3 - 1.6 Diesel Manual",
"3 (Blk Lthr) 5DR 120PS SPORT NAV + S/P","3 1.",
"5Dr Tourer 2.0 165ps Sport","6","6","6","6",
"6 2.2d 175 Sport Nav Auto Estate - FUNDED #20","6 (136hp) Ts","6 (150) 2.",
"cx-3","Cx-3","CX-3","CX-3","CX-3",
"Cx-3 2.0 120ps Sport NAV Cx-3 2.0 120ps Sport NAV",
"Cx-3 2.0 121PS Se-l Nav+ AUTO","Cx-3 Sport Nav +","CX-3 (120) 2.0",
"Cx-30 2.0 e-Skyactiv G MHEV Sport Lux 5dr Auto Hatchback",
"cx-5","Cx-5","CX-5","CX-5","CX-5","cx-5 - black",
"CX-5 - SPORT NAV PLUS","CX-5 - RED - SPORT NAV",
"CX-5 2..2 diesel","Cx-5 2.0 Sport Nav 2WD",
"Cx-5 2.2D [175] Sport Nav AWD Auto",
"Cx-6",
"CX-5 Sport Nav+ Auto 2WD 2.0","CX-5 (150) 2.2","MAZDA - 2",
"MAZDA - 2 GT","MAZDA - 2 GT SPORT 1.5l NAV MHEV PETROL",
"MAZDA - 2 GT SPORT NAV","MAZDA - 3 TS D SILVER",
"MAZDA - 3 TS GREY, 5 DOOR HATCHBACK - AUTOMATIC",
"MAZDA - 8",
"MAZDA - 6 D SE","MAZDA - 6 D SE-L","Mazda MX5",
"Mazda \t2 TS2","mazda 2","Mazda 2","MAZDA 2",
"Mazda 2 - GT - Blue","mazda 2 1.4 petrol","mazda 3","Mazda 3",
"MAZDA 3","Mazda 3 - 2.0 - Petrol","MAZDA 3 - BLUE",
"Mazda MX-3035.5kWh 145ps GT Sport Tech Auto","Mazda mx-5",
"Mazda Mx-5","Mazda MX-5","MAZDA MX-5",
"MAZDA MX-5 - BLACK","Mazda2 5dr 1.5 75ps SE","Mazda2 5Dr 1.5 75ps SE",
"Mazda3 5dr 2.2D 150ps SE-L Nav",
"Mazda7","Mazda6 Diesel Saloon 2.2",
"MAZDA6 Diesel Saloon 2.2d [175] Sport Nav 4dr Auto 2(2013)",
"MX-5 1.5 132ps Se +","Mx-5 1.8 Opt Pk LT AC *","MX-5","MX-5",
"MX-5","MX-5","MX-5"),
Mileage = c(26,71,615,185,528,365,518,
80,664,395,514,262,635,554,300,14,238,271,
584,256,196,593,48,84,678,182,364,433,595,550,
612,347,669,628,110,524,224,311,243,94,224,291,
247,767,744,228,171,648,222,301,548,652,545,
33,517,564,63,568,421,441,320,561,459,580,22,
306,543,577,538,659,558,294,739,647,84,604,584,
679,645,508,83,576,67,759,276,487,485,393,488,
424,28,530,710,412,282,409),
MOTProp = c(1,1,1,0,0,1,0,1,0,1,0,
1,0,1,0,1,0,0,1,0,1,0,1,0,1,0,1,1,0,0,
1,0,1,0,1,0,1,0,1,0,0,1,0,1,0,1,0,1,0,
1,1,0,0,1,0,1,0,1,0,1,0,1,1,0,1,0,1,0,
1,1,0,0,1,0,1,0,1,0,1,1,0,1,0,1,0,1,1,
0,0,1,0,1,0,1,0,1)
)
# Finding main models
library(dplyr)
library(forcats)
source.Models <- source %>%
add_count(Model, name = "Mod.Freq") %>%
mutate(Mod.Freq = Mod.Freq/n())
Models.with_freq <- left_join(source, source.Models)
data_2022.models <- Models.with_freq %>%
mutate(Main.models = case_when(
is.na(Mod.Freq) ~ "Other",
Mod.Freq < 0.03 ~ "Other",
Mod.Freq >= 0.03 ~ Model))
Main.models <- table(data_2022.models$Model, data_2022.models$Main.models)
Main.models
I can see the most frequent models are 2, 3, 6, CX-3 and MX-5.
The problem I have is that within the list of models, all names starting with 2, Mazda2, Mazda 2, MAZDA - 2 should be the same as model 2. The same relates to model 3 (3, Mazda3, mazda 3, MAZDA -3) and 6, CX-3 and MX-5.
Is there a clever way of recoding all similar models before mutating their frequency? Perhaps any form of regular expressions should be done prior to mutating Main.models?